mirror of
https://github.com/nim-lang/Nim.git
synced 2026-04-19 05:50:30 +00:00
Treat CJK Ideographs as letters in isAlpha() (#23651)
Because of a bug in `tools/parse_unicodedata.nim`, CJK Ideographs were not considered letters in `isAlpha()`, even though they have category Lo. This is because they are specified as a range in `UnicodeData.txt`, not as separate characters:
```
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9FEF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
```
The parser was not prepared to parse such ranges and thus omitted almost all CJK Ideographs from consideration. To fix this, we need to handle the ranges from `UnicodeData.txt` in `tools/parse_unicodedata.nim`.
This commit is contained in:
committed by
GitHub
parent
d923c581c1
commit
b172b34a24
File diff suppressed because it is too large
Load Diff
@@ -57,6 +57,7 @@ doAssert isAlpha("r")
|
||||
doAssert isAlpha("α")
|
||||
doAssert isAlpha("ϙ")
|
||||
doAssert isAlpha("ஶ")
|
||||
doAssert isAlpha("网")
|
||||
doAssert(not isAlpha("$"))
|
||||
doAssert(not isAlpha(""))
|
||||
|
||||
@@ -66,6 +67,7 @@ doAssert isAlpha("𐌼𐌰𐌲𐌲𐌻𐌴𐍃𐍄𐌰𐌽")
|
||||
doAssert isAlpha("ὕαλονϕαγεῖνδύναμαιτοῦτοοὔμεβλάπτει")
|
||||
doAssert isAlpha("Јамогујестистаклоитоминештети")
|
||||
doAssert isAlpha("Կրնամապակիուտեևինծիանհանգիստչըներ")
|
||||
doAssert isAlpha("编程语言")
|
||||
doAssert(not isAlpha("$Foo✓"))
|
||||
doAssert(not isAlpha("⠙⠕⠑⠎⠝⠞"))
|
||||
|
||||
|
||||
@@ -26,34 +26,54 @@ var
|
||||
|
||||
|
||||
proc parseData(data: seq[string]) =
|
||||
for line in data:
|
||||
proc doAdd(firstCode, lastCode: int, category, uc, lc, tc: string) =
|
||||
if category notin spaces and category notin letters:
|
||||
return
|
||||
|
||||
if firstCode != lastCode:
|
||||
doAssert uc == "" and lc == "" and tc == ""
|
||||
if uc.len > 0:
|
||||
let diff = 500 + uc.parseHexInt() - firstCode
|
||||
toUpper.add (firstCode, diff)
|
||||
if lc.len > 0:
|
||||
let diff = 500 + lc.parseHexInt() - firstCode
|
||||
toLower.add (firstCode, diff)
|
||||
if tc.len > 0 and tc != uc:
|
||||
# if titlecase is different than uppercase
|
||||
let diff = 500 + tc.parseHexInt() - firstCode
|
||||
if diff != 500:
|
||||
toTitle.add (firstCode, diff)
|
||||
|
||||
for code in firstCode..lastCode:
|
||||
if category in spaces:
|
||||
unispaces.add code
|
||||
else:
|
||||
alphas.add code
|
||||
|
||||
var idx = 0
|
||||
while idx < data.len:
|
||||
let
|
||||
line = data[idx]
|
||||
fields = line.split(';')
|
||||
code = fields[0].parseHexInt()
|
||||
name = fields[1]
|
||||
category = fields[2]
|
||||
uc = fields[12]
|
||||
lc = fields[13]
|
||||
tc = fields[14]
|
||||
|
||||
if category notin spaces and category notin letters:
|
||||
continue
|
||||
|
||||
if uc.len > 0:
|
||||
let diff = 500 + uc.parseHexInt() - code
|
||||
toUpper.add (code, diff)
|
||||
if lc.len > 0:
|
||||
let diff = 500 + lc.parseHexInt() - code
|
||||
toLower.add (code, diff)
|
||||
if tc.len > 0 and tc != uc:
|
||||
# if titlecase is different than uppercase
|
||||
let diff = 500 + tc.parseHexInt() - code
|
||||
if diff != 500:
|
||||
toTitle.add (code, diff)
|
||||
|
||||
if category in spaces:
|
||||
unispaces.add code
|
||||
inc(idx)
|
||||
if name.endsWith(", First>"):
|
||||
doAssert idx < data.len
|
||||
let
|
||||
nextLine = data[idx]
|
||||
nextFields = nextLine.split(';')
|
||||
nextCode = nextFields[0].parseHexInt()
|
||||
nextName = nextFields[1]
|
||||
inc(idx)
|
||||
doAssert nextName.endsWith(", Last>")
|
||||
doAdd(code, nextCode, category, uc, lc, tc)
|
||||
else:
|
||||
alphas.add code
|
||||
doAdd(code, code, category, uc, lc, tc)
|
||||
|
||||
proc splitRanges(a: seq[Singlets], r: var seq[Ranges], s: var seq[Singlets]) =
|
||||
## Splits `toLower`, `toUpper` and `toTitle` into separate sequences:
|
||||
@@ -153,18 +173,18 @@ proc createHeader(output: var string) =
|
||||
|
||||
proc `$`(r: Ranges): string =
|
||||
let
|
||||
start = "0x" & toHex(r.start, 5)
|
||||
stop = "0x" & toHex(r.stop, 5)
|
||||
start = "0x" & toHex(r.start, 5) & "'i32"
|
||||
stop = "0x" & toHex(r.stop, 5) & "'i32"
|
||||
result = "$#, $#, $#,\n" % [start, stop, $r.diff]
|
||||
|
||||
proc `$`(r: Singlets): string =
|
||||
let code = "0x" & toHex(r.code, 5)
|
||||
let code = "0x" & toHex(r.code, 5) & "'i32"
|
||||
result = "$#, $#,\n" % [code, $r.diff]
|
||||
|
||||
proc `$`(r: NonLetterRanges): string =
|
||||
let
|
||||
start = "0x" & toHex(r.start, 5)
|
||||
stop = "0x" & toHex(r.stop, 5)
|
||||
start = "0x" & toHex(r.start, 5) & "'i32"
|
||||
stop = "0x" & toHex(r.stop, 5) & "'i32"
|
||||
result = "$#, $#,\n" % [start, stop]
|
||||
|
||||
|
||||
@@ -178,7 +198,7 @@ proc outputSeq(s: seq[Ranges|Singlets|NonLetterRanges], name: string,
|
||||
proc outputSeq(s: seq[int], name: string, output: var string) =
|
||||
output.add " $# = [\n" % name
|
||||
for i in s:
|
||||
output.add " 0x$#,\n" % toHex(i, 5)
|
||||
output.add " 0x$#'i32,\n" % toHex(i, 5)
|
||||
output.add " ]\n\n"
|
||||
|
||||
proc outputSpaces(s: seq[int], name: string, output: var string) =
|
||||
|
||||
Reference in New Issue
Block a user