Treat CJK Ideographs as letters in isAlpha() (#23651)

Because of a bug in `tools/parse_unicodedata.nim`, CJK Ideographs were
not considered letters by `isAlpha()`, even though they have category
Lo. This is because they are specified as a range in `UnicodeData.txt`,
not as separate characters:

```
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9FEF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
```

The parser was not prepared to parse such ranges and thus omitted almost
all CJK Ideographs from consideration.

To fix this, we need to consider ranges from `UnicodeData.txt` in
`tools/parse_unicodedata.nim`.
This commit is contained in:
Alexander Kernozhitsky
2024-05-29 06:42:07 +02:00
committed by GitHub
parent d923c581c1
commit b172b34a24
3 changed files with 1996 additions and 1983 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -57,6 +57,7 @@ doAssert isAlpha("r")
doAssert isAlpha("α")
doAssert isAlpha("ϙ")
doAssert isAlpha("")
doAssert isAlpha("")
doAssert(not isAlpha("$"))
doAssert(not isAlpha(""))
@@ -66,6 +67,7 @@ doAssert isAlpha("𐌼𐌰𐌲𐌲𐌻𐌴𐍃𐍄𐌰𐌽")
doAssert isAlpha("ὕαλονϕαγεῖνδύναμαιτοῦτοοὔμεβλάπτει")
doAssert isAlpha("Јамогујестистаклоитоминештети")
doAssert isAlpha("Կրնամապակիուտեևինծիանհանգիստչըներ")
doAssert isAlpha("编程语言")
doAssert(not isAlpha("$Foo"))
doAssert(not isAlpha("⠙⠕⠑⠎⠝⠞"))

View File

@@ -26,34 +26,54 @@ var
proc parseData(data: seq[string]) =
  ## Parses the lines of `UnicodeData.txt` and fills the module-level
  ## tables (`toUpper`, `toLower`, `toTitle`, `unispaces`, `alphas`).
  ##
  ## Each line is a `;`-separated record. The fields read here are:
  ## `[0]` code point (hex), `[1]` name, `[2]` general category,
  ## `[12]` uppercase, `[13]` lowercase and `[14]` titlecase mapping (hex,
  ## may be empty).
  ##
  ## A record whose name ends in `, First>` opens a range and must be
  ## directly followed by the matching `, Last>` record; every code point
  ## in that inclusive range is registered with the category of the
  ## `First` record.

  proc doAdd(firstCode, lastCode: int, category, uc, lc, tc: string) =
    ## Registers the inclusive range `firstCode..lastCode` (a single code
    ## point when both are equal). Entries whose category is neither in
    ## `spaces` nor in `letters` are ignored.
    if category notin spaces and category notin letters:
      return

    # A multi-code-point range cannot carry per-character case mappings.
    if firstCode != lastCode:
      doAssert uc == "" and lc == "" and tc == ""

    # Case mappings are stored as a delta from the code point, biased by
    # 500. NOTE(review): presumably the consumer subtracts 500 again -
    # confirm against the code that reads these tables.
    if uc.len > 0:
      let diff = 500 + uc.parseHexInt() - firstCode
      toUpper.add (firstCode, diff)
    if lc.len > 0:
      let diff = 500 + lc.parseHexInt() - firstCode
      toLower.add (firstCode, diff)
    if tc.len > 0 and tc != uc:
      # if titlecase is different than uppercase
      let diff = 500 + tc.parseHexInt() - firstCode
      if diff != 500:
        toTitle.add (firstCode, diff)

    for code in firstCode..lastCode:
      if category in spaces:
        unispaces.add code
      else:
        alphas.add code

  # An index loop (not `for line in data`) is required because a range
  # consumes two consecutive lines.
  var idx = 0
  while idx < data.len:
    let
      line = data[idx]
      fields = line.split(';')
      code = fields[0].parseHexInt()
      name = fields[1]
      category = fields[2]
      uc = fields[12]
      lc = fields[13]
      tc = fields[14]
    inc(idx)
    if name.endsWith(", First>"):
      # The closing `, Last>` record must be the very next line.
      doAssert idx < data.len
      let
        nextLine = data[idx]
        nextFields = nextLine.split(';')
        nextCode = nextFields[0].parseHexInt()
        nextName = nextFields[1]
      inc(idx)
      doAssert nextName.endsWith(", Last>")
      doAdd(code, nextCode, category, uc, lc, tc)
    else:
      doAdd(code, code, category, uc, lc, tc)
proc splitRanges(a: seq[Singlets], r: var seq[Ranges], s: var seq[Singlets]) =
## Splits `toLower`, `toUpper` and `toTitle` into separate sequences:
@@ -153,18 +173,18 @@ proc createHeader(output: var string) =
proc `$`(r: Ranges): string =
  ## Renders `r` as one `start, stop, diff,` line of generated source.
  ## Both bounds are written as 5-digit hex literals carrying the `'i32`
  ## type suffix; `diff` is written in decimal via its own `$`.
  let
    start = "0x" & toHex(r.start, 5) & "'i32"
    stop = "0x" & toHex(r.stop, 5) & "'i32"
  result = "$#, $#, $#,\n" % [start, stop, $r.diff]
proc `$`(r: Singlets): string =
  ## Renders `r` as one `code, diff,` line of generated source. The code
  ## point is written as a 5-digit hex literal carrying the `'i32` type
  ## suffix; `diff` is written in decimal via its own `$`.
  let code = "0x" & toHex(r.code, 5) & "'i32"
  result = "$#, $#,\n" % [code, $r.diff]
proc `$`(r: NonLetterRanges): string =
  ## Renders `r` as one `start, stop,` line of generated source. Both
  ## bounds are written as 5-digit hex literals carrying the `'i32` type
  ## suffix.
  let
    start = "0x" & toHex(r.start, 5) & "'i32"
    stop = "0x" & toHex(r.stop, 5) & "'i32"
  result = "$#, $#,\n" % [start, stop]
@@ -178,7 +198,7 @@ proc outputSeq(s: seq[Ranges|Singlets|NonLetterRanges], name: string,
proc outputSeq(s: seq[int], name: string, output: var string) =
  ## Appends to `output` an array literal called `name` that holds every
  ## element of `s`, one per line, as a 5-digit hex literal carrying the
  ## `'i32` type suffix. An empty `s` yields an empty array literal.
  output.add " $# = [\n" % name
  for i in s:
    # Exactly one entry per element.
    output.add " 0x$#'i32,\n" % toHex(i, 5)
  output.add " ]\n\n"
proc outputSpaces(s: seq[int], name: string, output: var string) =