Treat CJK Ideographs as letters in isAlpha() (#23651)

Because of a bug in `tools/unicode_parsedata.nim`, CJK Ideographs were not considered letters by `isAlpha()`, even though they have category Lo. This is because they are specified as a range in `UnicodeData.txt`, not as separate characters:

```
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9FEF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
```

The parser was not prepared to parse such ranges and thus omitted almost all CJK Ideographs from consideration. To fix this, `tools/unicode_parsedata.nim` needs to treat each `<..., First>` / `<..., Last>` pair of lines in `UnicodeData.txt` as a range of code points.
2026-06-05 03:14:08 +00:00 · 2024-05-29 06:42:07 +02:00
parent d923c581c1
commit b172b34a24
3 changed files with 1996 additions and 1983 deletions
--- a/lib/pure/includes/unicode_ranges.nim
+++ b/lib/pure/includes/unicode_ranges.nim
--- a/tests/stdlib/tunicode.nim
+++ b/tests/stdlib/tunicode.nim
@@ -57,6 +57,7 @@ doAssert isAlpha("r")
 doAssert isAlpha("α")
 doAssert isAlpha("ϙ")
 doAssert isAlpha("ஶ")
+doAssert isAlpha("网")
 doAssert(not isAlpha("$"))
 doAssert(not isAlpha(""))

@@ -66,6 +67,7 @@ doAssert isAlpha("𐌼𐌰𐌲𐌲𐌻𐌴𐍃𐍄𐌰𐌽")
 doAssert isAlpha("ὕαλονϕαγεῖνδύναμαιτοῦτοοὔμεβλάπτει")
 doAssert isAlpha("Јамогујестистаклоитоминештети")
 doAssert isAlpha("Կրնամապակիուտեևինծիանհանգիստչըներ")
+doAssert isAlpha("编程语言")
 doAssert(not isAlpha("$Foo✓"))
 doAssert(not isAlpha("⠙⠕⠑⠎⠝⠞"))

--- a/tools/unicode_parsedata.nim
+++ b/tools/unicode_parsedata.nim
@@ -26,34 +26,54 @@ var


 proc parseData(data: seq[string]) =
-  for line in data:
+  proc doAdd(firstCode, lastCode: int, category, uc, lc, tc: string) =
+    if category notin spaces and category notin letters:
+      return
+
+    if firstCode != lastCode:
+      doAssert uc == "" and lc == "" and tc == ""
+    if uc.len > 0:
+      let diff = 500 + uc.parseHexInt() - firstCode
+      toUpper.add (firstCode, diff)
+    if lc.len > 0:
+      let diff = 500 + lc.parseHexInt() - firstCode
+      toLower.add (firstCode, diff)
+    if tc.len > 0 and tc != uc:
+      # if titlecase is different than uppercase
+      let diff = 500 + tc.parseHexInt() - firstCode
+      if diff != 500:
+        toTitle.add (firstCode, diff)
+
+    for code in firstCode..lastCode:
+      if category in spaces:
+        unispaces.add code
+      else:
+        alphas.add code
+
+  var idx = 0
+  while idx < data.len:
    let
+      line = data[idx]
      fields = line.split(';')
      code = fields[0].parseHexInt()
+      name = fields[1]
      category = fields[2]
      uc = fields[12]
      lc = fields[13]
      tc = fields[14]
-
-    if category notin spaces and category notin letters:
-      continue
-
-    if uc.len > 0:
-      let diff = 500 + uc.parseHexInt() - code
-      toUpper.add (code, diff)
-    if lc.len > 0:
-      let diff = 500 + lc.parseHexInt() - code
-      toLower.add (code, diff)
-    if tc.len > 0 and tc != uc:
-      # if titlecase is different than uppercase
-      let diff = 500 + tc.parseHexInt() - code
-      if diff != 500:
-        toTitle.add (code, diff)
-
-    if category in spaces:
-      unispaces.add code
+    inc(idx)
+    if name.endsWith(", First>"):
+      doAssert idx < data.len
+      let
+        nextLine = data[idx]
+        nextFields = nextLine.split(';')
+        nextCode = nextFields[0].parseHexInt()
+        nextName = nextFields[1]
+      inc(idx)
+      doAssert nextName.endsWith(", Last>")
+      doAdd(code, nextCode, category, uc, lc, tc)
    else:
-      alphas.add code
+      doAdd(code, code, category, uc, lc, tc)

 proc splitRanges(a: seq[Singlets], r: var seq[Ranges], s: var seq[Singlets]) =
  ## Splits `toLower`, `toUpper` and `toTitle` into separate sequences:
@@ -153,18 +173,18 @@ proc createHeader(output: var string) =

 proc `$`(r: Ranges): string =
  let
-    start = "0x" & toHex(r.start, 5)
-    stop = "0x" & toHex(r.stop, 5)
+    start = "0x" & toHex(r.start, 5) & "'i32"
+    stop = "0x" & toHex(r.stop, 5) & "'i32"
  result = "$#, $#, $#,\n" % [start, stop, $r.diff]

 proc `$`(r: Singlets): string =
-  let code = "0x" & toHex(r.code, 5)
+  let code = "0x" & toHex(r.code, 5) & "'i32"
  result = "$#, $#,\n" % [code, $r.diff]

 proc `$`(r: NonLetterRanges): string =
  let
-    start = "0x" & toHex(r.start, 5)
-    stop = "0x" & toHex(r.stop, 5)
+    start = "0x" & toHex(r.start, 5) & "'i32"
+    stop = "0x" & toHex(r.stop, 5) & "'i32"
  result = "$#, $#,\n" % [start, stop]


@@ -178,7 +198,7 @@ proc outputSeq(s: seq[Ranges|Singlets|NonLetterRanges], name: string,
 proc outputSeq(s: seq[int], name: string, output: var string) =
  output.add "  $# = [\n" % name
  for i in s:
-    output.add "    0x$#,\n" % toHex(i, 5)
+    output.add "    0x$#'i32,\n" % toHex(i, 5)
  output.add "  ]\n\n"

 proc outputSpaces(s: seq[int], name: string, output: var string) =