namespace ViE.Unicode
/-- Ranges of characters that have a visual width of 2 (Fullwidth/Wide).
Based on EastAsianWidth.txt (Wide or Fullwidth). -/
def wideRangesBmp : Array (Nat × Nat) := #[
(0x1100, 0x115F), -- Hangul Jamo
(0x2329, 0x232A), -- Angle brackets
(0x2E80, 0x303E), -- CJK Radicals / Punctuation
(0x3040, 0x3247), -- Hiragana, Katakana, Enclosed
(0x3250, 0x4DBF), -- Enclosed CJK / CJK Extension A
(0x4E00, 0xA4C6), -- CJK Unified Ideographs / Yi
(0xA960, 0xA97C), -- Hangul Jamo Extended-A
(0xAC00, 0xD7A3), -- Hangul Syllables
(0xF900, 0xFAFF), -- CJK Compatibility Ideographs
(0xFE10, 0xFE19), -- Vertical forms
(0xFE30, 0xFE6F), -- CJK Compatibility Forms
(0xFF01, 0xFF60), -- Fullwidth Forms
(0xFFE0, 0xFFE6) -- Fullwidth currency/signs
]
def wideRangesSupplemental : Array (Nat × Nat) := #[
(0x1F300, 0x1F64F), -- Miscellaneous Symbols and Pictographs (Emojis)
(0x1F900, 0x1F9FF) -- Supplemental Symbols and Pictographs (some, generic range)
]
/-- Check if a character code is in a wide range. -/
def isWide (c : Nat) : Bool :=
if c < 0x1100 then
false
else
let ranges := if c <= 0xFFFF then wideRangesBmp else wideRangesSupplemental
let rec loop (lo hi : Nat) : Bool :=
if lo < hi then
let mid := (lo + hi) / 2
let (start, stop) := ranges[mid]!
if c < start then
loop lo mid
else if c > stop then
loop (mid + 1) hi
else
true
else
false
termination_by hi - lo
loop 0 ranges.size
/-- Get the visual width of a character.
Returns 2 for wide characters, 0 for null/combining (simplified), 1 for others. -/
def charWidth (c : Char) : Nat :=
let n := c.toNat
if n == 0 then 0 -- Null is 0 width? Or usually displayed as something?
-- Usually invalid in text, but let's say 0.
else if n < 0x20 then 2 -- Control characters shown as ^A
else if n == 0x7F then 2 -- DEL shown as ^?
else if isWide n then 2
else 1
/-- Get the display string for a character.
- Control characters (0x00-0x1F) -> "^A" notation.
- 0x7F -> "^?"
- Others -> character itself as string. -/
def getDisplayString (c : Char) : String :=
let n := c.toNat
if n < 0x20 then
-- 0x01 is ^A (A is 0x41). 0x01 + 0x40 = 0x41.
-- 0x00 is ^@.
let base := (n + 0x40).toUInt8
"^" ++ (Char.ofNat base.toNat).toString
else if n == 0x7F then
"^?"
else
c.toString
/-- Calculate the total visual width of a string. -/
def stringWidth (s : String) : Nat :=
s.foldl (fun acc c => acc + charWidth c) 0
/-- Calculate the total visual width of a substring. -/
def substringWidth (s : Substring.Raw) : Nat :=
s.foldl (fun acc c => acc + charWidth c) 0
/-- Get the byte length of a UTF-8 sequence starting with `b`.
Returns 0 if `b` is a continuation byte or invalid start. -/
def utf8ByteLength (b : UInt8) : Nat :=
if b < 0x80 then 1
else if (b &&& 0xE0) == 0xC0 then 2
else if (b &&& 0xF0) == 0xE0 then 3
else if (b &&& 0xF8) == 0xF0 then 4
else 0 -- Invalid or continuation
/-- Count newlines in a byte array slice. -/
def countNewlines (data : ByteArray) (start : Nat) (len : Nat) : Nat :=
let endPos := start + len
let rec loop (i : Nat) (count : Nat) : Nat :=
if i >= endPos then count
else
if data[i]! == 10 then loop (i + 1) (count + 1) -- 10 is \n
else loop (i + 1) count
loop start 0
/-- Count UTF-8 characters in a byte array range. -/
def countChars (bytes : ByteArray) (start len : Nat) : Nat :=
let stop := start + len
let rec loop (i : Nat) (cnt : Nat) : Nat :=
if i >= stop then cnt
else
let b := bytes[i]!
if (b &&& 0xC0) != 0x80 then
loop (i + 1) (cnt + 1)
else
loop (i + 1) cnt
loop start 0
/-- Convert a display column to a character index (0-based).
If the column is in the middle of a wide character, it snaps to that character's index. -/
def displayColToCharIndex (s : String) (col : Nat) : Nat :=
let rec loop (cs : List Char) (idx width : Nat) : Nat :=
match cs with
| [] => idx
| c :: rest =>
let w := charWidth c
if width + w > col then
idx
else
loop rest (idx + 1) (width + w)
loop s.toList 0 0
/-- Display width of the first `idx` characters in a string. -/
def displayWidthAtCharIndex (s : String) (idx : Nat) : Nat :=
let rec loop (cs : List Char) (i acc : Nat) : Nat :=
match cs with
| [] => acc
| c :: rest =>
if i >= idx then
acc
else
loop rest (i + 1) (acc + charWidth c)
loop s.toList 0 0
/-- Convert a display column to a UTF-8 byte offset within a string. -/
def displayColToByteOffset (s : String) (col : Nat) : Nat :=
let rec loop (cs : List Char) (byteAcc widthAcc : Nat) : Nat :=
match cs with
| [] => byteAcc
| c :: rest =>
let w := charWidth c
let b := c.toString.toUTF8.size
if widthAcc + w > col then
byteAcc
else
loop rest (byteAcc + b) (widthAcc + w)
loop s.toList 0 0
/-- Build an index mapping display columns to UTF-8 byte offsets at character boundaries.
The result includes the starting boundary (0,0) and the final boundary (totalWidth,totalBytes). -/
def buildDisplayByteIndex (s : String) : Array (Nat × Nat) :=
let rec loop (cs : List Char) (disp bytes : Nat) (acc : Array (Nat × Nat)) : Array (Nat × Nat) :=
match cs with
| [] => acc
| c :: rest =>
let w := charWidth c
let b := c.toString.toUTF8.size
let disp' := disp + w
let bytes' := bytes + b
loop rest disp' bytes' (acc.push (disp', bytes'))
loop s.toList 0 0 #[(0, 0)]
/-- Convert a display column to a UTF-8 byte offset using a precomputed index. -/
def displayColToByteOffsetFromIndex (idx : Array (Nat × Nat)) (col : Nat) : Nat :=
if idx.isEmpty then
0
else
let rec loop (lo hi : Nat) (best : Nat) : Nat :=
if lo < hi then
let mid := (lo + hi) / 2
let (disp, bytes) := idx[mid]!
if col < disp then
loop lo mid best
else
loop (mid + 1) hi bytes
else
best
termination_by hi - lo
loop 0 idx.size 0
/-- Compute the next display column (advancing by one character). -/
def nextDisplayCol (s : String) (col : Nat) : Nat :=
let idx := displayColToCharIndex s col
let startWidth := displayWidthAtCharIndex s idx
if col < startWidth then
startWidth
else
let chars := s.toList
if idx < chars.length then
let w := charWidth (chars[idx]!)
startWidth + w
else
col
/-- Compute the previous display column (moving back by one character). -/
def prevDisplayCol (s : String) (col : Nat) : Nat :=
if col == 0 then
0
else
let idx := displayColToCharIndex s col
let startWidth := displayWidthAtCharIndex s idx
if col > startWidth then
startWidth
else if idx == 0 then
0
else
displayWidthAtCharIndex s (idx - 1)
/-- Drop `width` display columns from the start of a substring. -/
def dropByDisplayWidth (s : Substring.Raw) (width : Nat) : Substring.Raw :=
let rec loop (i : String.Pos.Raw) (acc : Nat) : Substring.Raw :=
if h : i.byteIdx < s.bsize then
let c := s.get i
let w := charWidth c
if acc + w > width then
s.extract i ⟨s.bsize⟩
else
let i' := s.next i
have := Nat.sub_lt_sub_left h (Substring.Raw.lt_next s i h)
loop i' (acc + w)
else
s.extract i ⟨s.bsize⟩
termination_by s.bsize - i.1
loop 0 0
/-- Take characters from substring until visual width limit is reached. -/
def takeByDisplayWidth (s : Substring.Raw) (width : Nat) : String :=
let rec loop (i : String.Pos.Raw) (currW : Nat) (acc : String) : String :=
if h : i.byteIdx < s.bsize then
let c := s.get i
let w := charWidth c
if currW + w <= width then
let i' := s.next i
have := Nat.sub_lt_sub_left h (Substring.Raw.lt_next s i h)
loop i' (currW + w) (acc ++ getDisplayString c)
else
acc
else
acc
termination_by s.bsize - i.1
loop 0 0 ""
/-- Normalize invalid tab stop values.
Tab stop 0 would break modulo logic, so clamp to 1. -/
def normalizeTabStop (tabStop : Nat) : Nat :=
if tabStop == 0 then 1 else tabStop
/-- Display width contribution of a tab at a given display column. -/
def tabWidthAt (tabStop : Nat) (col : Nat) : Nat :=
let ts := normalizeTabStop tabStop
let rem := col % ts
if rem == 0 then ts else ts - rem
/-- Character width with tab-stop awareness. -/
def charWidthAt (tabStop : Nat) (col : Nat) (c : Char) : Nat :=
if c == '\t' then
tabWidthAt tabStop col
else
charWidth c
/-- Display text for a character with tab-stop awareness. -/
def getDisplayStringAt (tabStop : Nat) (col : Nat) (c : Char) : String :=
if c == '\t' then
"".pushn ' ' (tabWidthAt tabStop col)
else
getDisplayString c
/-- Calculate the total visual width of a string with tab-stop awareness. -/
def stringWidthWithTabStop (s : String) (tabStop : Nat) : Nat :=
let rec loop (cs : List Char) (acc : Nat) : Nat :=
match cs with
| [] => acc
| c :: rest =>
let w := charWidthAt tabStop acc c
loop rest (acc + w)
loop s.toList 0
/-- Calculate the total visual width of a substring with tab-stop awareness. -/
def substringWidthWithTabStop (s : Substring.Raw) (tabStop : Nat) : Nat :=
let rec loop (i : String.Pos.Raw) (acc : Nat) : Nat :=
if h : i.byteIdx < s.bsize then
let c := s.get i
let i' := s.next i
have := Nat.sub_lt_sub_left h (Substring.Raw.lt_next s i h)
let w := charWidthAt tabStop acc c
loop i' (acc + w)
else
acc
termination_by s.bsize - i.1
loop 0 0
/-- Convert a display column to a character index (0-based), tab-stop aware. -/
def displayColToCharIndexWithTabStop (s : String) (tabStop : Nat) (col : Nat) : Nat :=
let rec loop (cs : List Char) (idx width : Nat) : Nat :=
match cs with
| [] => idx
| c :: rest =>
let w := charWidthAt tabStop width c
if width + w > col then
idx
else
loop rest (idx + 1) (width + w)
loop s.toList 0 0
/-- Display width of the first `idx` characters in a string, tab-stop aware. -/
def displayWidthAtCharIndexWithTabStop (s : String) (tabStop : Nat) (idx : Nat) : Nat :=
let rec loop (cs : List Char) (i acc : Nat) : Nat :=
match cs with
| [] => acc
| c :: rest =>
if i >= idx then
acc
else
let w := charWidthAt tabStop acc c
loop rest (i + 1) (acc + w)
loop s.toList 0 0
/-- Convert a display column to a UTF-8 byte offset within a string, tab-stop aware. -/
def displayColToByteOffsetWithTabStop (s : String) (tabStop : Nat) (col : Nat) : Nat :=
let rec loop (cs : List Char) (byteAcc widthAcc : Nat) : Nat :=
match cs with
| [] => byteAcc
| c :: rest =>
let w := charWidthAt tabStop widthAcc c
let b := c.toString.toUTF8.size
if widthAcc + w > col then
byteAcc
else
loop rest (byteAcc + b) (widthAcc + w)
loop s.toList 0 0
/-- Build display-column to byte-offset index with tab-stop awareness. -/
def buildDisplayByteIndexWithTabStop (s : String) (tabStop : Nat) : Array (Nat × Nat) :=
let rec loop (cs : List Char) (disp bytes : Nat) (acc : Array (Nat × Nat)) : Array (Nat × Nat) :=
match cs with
| [] => acc
| c :: rest =>
let w := charWidthAt tabStop disp c
let b := c.toString.toUTF8.size
let disp' := disp + w
let bytes' := bytes + b
loop rest disp' bytes' (acc.push (disp', bytes'))
loop s.toList 0 0 #[(0, 0)]
/-- Compute next display column (one character advance), tab-stop aware. -/
def nextDisplayColWithTabStop (s : String) (tabStop : Nat) (col : Nat) : Nat :=
let idx := displayColToCharIndexWithTabStop s tabStop col
let startWidth := displayWidthAtCharIndexWithTabStop s tabStop idx
if col < startWidth then
startWidth
else
let chars := s.toList
if idx < chars.length then
let w := charWidthAt tabStop startWidth (chars[idx]!)
startWidth + w
else
col
/-- Compute previous display column (one character back), tab-stop aware. -/
def prevDisplayColWithTabStop (s : String) (tabStop : Nat) (col : Nat) : Nat :=
if col == 0 then
0
else
let idx := displayColToCharIndexWithTabStop s tabStop col
let startWidth := displayWidthAtCharIndexWithTabStop s tabStop idx
if col > startWidth then
startWidth
else if idx == 0 then
0
else
displayWidthAtCharIndexWithTabStop s tabStop (idx - 1)
/-- Drop `width` display columns from substring start, tab-stop aware. -/
def dropByDisplayWidthWithTabStop (s : Substring.Raw) (tabStop : Nat) (width : Nat) : Substring.Raw :=
let rec loop (i : String.Pos.Raw) (acc : Nat) : Substring.Raw :=
if h : i.byteIdx < s.bsize then
let c := s.get i
let w := charWidthAt tabStop acc c
if acc + w > width then
s.extract i ⟨s.bsize⟩
else
let i' := s.next i
have := Nat.sub_lt_sub_left h (Substring.Raw.lt_next s i h)
loop i' (acc + w)
else
s.extract i ⟨s.bsize⟩
termination_by s.bsize - i.1
loop 0 0
/-- Take characters from substring until visual width limit, tab-stop aware. -/
def takeByDisplayWidthWithTabStop (s : Substring.Raw) (tabStop : Nat) (width : Nat) : String :=
let rec loop (i : String.Pos.Raw) (currW : Nat) (acc : String) : String :=
if h : i.byteIdx < s.bsize then
let c := s.get i
let w := charWidthAt tabStop currW c
if currW + w <= width then
let i' := s.next i
have := Nat.sub_lt_sub_left h (Substring.Raw.lt_next s i h)
loop i' (currW + w) (acc ++ getDisplayStringAt tabStop currW c)
else
acc
else
acc
termination_by s.bsize - i.1
loop 0 0 ""
end ViE.Unicode