mirror of
https://github.com/nim-lang/Nim.git
synced 2026-06-06 20:04:18 +00:00
update unicode.nim (#10921)
* update unicode.nim * create a script to create the needed unicode data * make unicode.nim compatible with Unicode v12.0.0 * slightly improve unicode.nim documentation (fixes #4795) * more documentation
This commit is contained in:
2016
lib/pure/includes/unicode_ranges.nim
Normal file
2016
lib/pure/includes/unicode_ranges.nim
Normal file
File diff suppressed because it is too large
Load Diff
1523
lib/pure/unicode.nim
1523
lib/pure/unicode.nim
File diff suppressed because it is too large
Load Diff
32840
tools/UnicodeData.txt
Normal file
32840
tools/UnicodeData.txt
Normal file
File diff suppressed because it is too large
Load Diff
204
tools/unicode_parsedata.nim
Normal file
204
tools/unicode_parsedata.nim
Normal file
@@ -0,0 +1,204 @@
|
||||
import strutils, algorithm
|
||||
|
||||
let
|
||||
# this file was obtained from:
|
||||
# https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
|
||||
filename = "tools/UnicodeData.txt"
|
||||
data = readFile(filename).strip.splitLines()
|
||||
|
||||
const
|
||||
# see the table here:
|
||||
# https://www.unicode.org/reports/tr44/#GC_Values_Table
|
||||
letters = ["Lu", "Ll", "Lt", "Lm", "Lo"]
|
||||
spaces = ["Zs", "Zl", "Zp"]
|
||||
|
||||
type
|
||||
Ranges = tuple[start, stop, diff: int]
|
||||
Singlets = tuple[code, diff: int]
|
||||
NonLetterRanges = tuple[start, stop: int]
|
||||
|
||||
var
|
||||
toUpper = newSeq[Singlets]()
|
||||
toLower = newSeq[Singlets]()
|
||||
toTitle = newSeq[Singlets]()
|
||||
alphas = newSeq[int]()
|
||||
unispaces = newSeq[int]()
|
||||
|
||||
|
||||
proc parseData(data: seq[string]) =
|
||||
for line in data:
|
||||
let
|
||||
fields = line.split(';')
|
||||
code = fields[0].parseHexInt()
|
||||
category = fields[2]
|
||||
uc = fields[12]
|
||||
lc = fields[13]
|
||||
tc = fields[14]
|
||||
|
||||
if category notin spaces and category notin letters:
|
||||
continue
|
||||
|
||||
if uc.len > 0:
|
||||
let diff = 500 + uc.parseHexInt() - code
|
||||
toUpper.add (code, diff)
|
||||
if lc.len > 0:
|
||||
let diff = 500 + lc.parseHexInt() - code
|
||||
toLower.add (code, diff)
|
||||
if tc.len > 0 and tc != uc:
|
||||
# if titlecase is different than uppercase
|
||||
let diff = 500 + tc.parseHexInt() - code
|
||||
if diff != 500:
|
||||
toTitle.add (code, diff)
|
||||
|
||||
if category in spaces:
|
||||
unispaces.add code
|
||||
else:
|
||||
alphas.add code
|
||||
|
||||
proc splitRanges(a: seq[Singlets], r: var seq[Ranges], s: var seq[Singlets]) =
|
||||
## Splits `toLower`, `toUpper` and `toTitle` into separate sequences:
|
||||
## - `r` contains continuous ranges with the same characteristics
|
||||
## (their upper/lower version is the same distance away)
|
||||
## - `s` contains single code points
|
||||
var i, j: int
|
||||
while i < a.len:
|
||||
j = 1
|
||||
let
|
||||
startCode = a[i].code
|
||||
startDiff = a[i].diff
|
||||
while i + j <= a.len:
|
||||
if i+j >= a.len or a[i+j].code != startCode+j or a[i+j].diff != startDiff:
|
||||
if j == 1:
|
||||
s.add (startCode, startDiff)
|
||||
else:
|
||||
r.add (startCode, a[i+j-1].code, startDiff)
|
||||
i += j-1
|
||||
break
|
||||
else:
|
||||
inc j
|
||||
inc i
|
||||
|
||||
proc splitRanges(a: seq[int], r: var seq[NonLetterRanges], s: var seq[int]) =
|
||||
## Splits `alphas` and `unispaces` into separate sequences:
|
||||
## - `r` contains continuous ranges
|
||||
## - `s` contains single code points
|
||||
var i, j: int
|
||||
while i < a.len:
|
||||
j = 1
|
||||
let startCode = a[i]
|
||||
while i + j <= a.len:
|
||||
if i+j >= a.len or a[i+j] != startCode+j:
|
||||
if j == 1:
|
||||
s.add startCode
|
||||
else:
|
||||
r.add (startCode, a[i+j-1])
|
||||
i += j-1
|
||||
break
|
||||
else:
|
||||
inc j
|
||||
inc i
|
||||
|
||||
proc splitSpaces(a: seq[int], r: var seq[NonLetterRanges], s: var seq[int]) =
|
||||
## Spaces are special because of the way how `isWhiteSpace` and `split`
|
||||
## are implemented.
|
||||
##
|
||||
## All spaces are added both to `r` (ranges) and `s` (singlets).
|
||||
var i, j: int
|
||||
while i < a.len:
|
||||
j = 1
|
||||
let startCode = a[i]
|
||||
while i + j <= a.len:
|
||||
if i+j >= a.len or a[i+j] != startCode+j:
|
||||
r.add (startCode, a[i+j-1])
|
||||
i += j-1
|
||||
break
|
||||
else:
|
||||
inc j
|
||||
inc i
|
||||
s = a
|
||||
|
||||
|
||||
var
|
||||
toupperRanges = newSeq[Ranges]()
|
||||
toupperSinglets = newSeq[Singlets]()
|
||||
tolowerRanges = newSeq[Ranges]()
|
||||
tolowerSinglets = newSeq[Singlets]()
|
||||
totitleRanges = newSeq[Ranges]()
|
||||
totitleSinglets = newSeq[Singlets]()
|
||||
spaceRanges = newSeq[NonLetterRanges]()
|
||||
unicodeSpaces = newSeq[int]()
|
||||
alphaRanges = newSeq[NonLetterRanges]()
|
||||
alphaSinglets = newSeq[int]()
|
||||
|
||||
parseData(data)
|
||||
splitRanges(toLower, tolowerRanges, tolowerSinglets)
|
||||
splitRanges(toUpper, toUpperRanges, toUpperSinglets)
|
||||
splitRanges(toTitle, toTitleRanges, toTitleSinglets)
|
||||
splitRanges(alphas, alphaRanges, alphaSinglets)
|
||||
|
||||
# manually add "special" spaces
|
||||
for i in 9 .. 13:
|
||||
unispaces.add i
|
||||
unispaces.add 0x85
|
||||
unispaces.sort()
|
||||
|
||||
splitSpaces(unispaces, spaceRanges, unicodeSpaces)
|
||||
|
||||
|
||||
var output: string
|
||||
|
||||
proc createHeader(output: var string) =
|
||||
output.add "# This file was created from a script.\n\n"
|
||||
output.add "const\n"
|
||||
|
||||
proc `$`(r: Ranges): string =
|
||||
let
|
||||
start = "0x" & toHex(r.start, 5)
|
||||
stop = "0x" & toHex(r.stop, 5)
|
||||
result = "$#, $#, $#,\n" % [start, stop, $r.diff]
|
||||
|
||||
proc `$`(r: Singlets): string =
|
||||
let code = "0x" & toHex(r.code, 5)
|
||||
result = "$#, $#,\n" % [code, $r.diff]
|
||||
|
||||
proc `$`(r: NonLetterRanges): string =
|
||||
let
|
||||
start = "0x" & toHex(r.start, 5)
|
||||
stop = "0x" & toHex(r.stop, 5)
|
||||
result = "$#, $#,\n" % [start, stop]
|
||||
|
||||
|
||||
proc outputSeq(s: seq[Ranges|Singlets|NonLetterRanges], name: string,
|
||||
output: var string) =
|
||||
output.add " $# = [\n" % name
|
||||
for r in s:
|
||||
output.add " " & $r
|
||||
output.add " ]\n\n"
|
||||
|
||||
proc outputSeq(s: seq[int], name: string, output: var string) =
|
||||
output.add " $# = [\n" % name
|
||||
for i in s:
|
||||
output.add " 0x$#,\n" % toHex(i, 5)
|
||||
output.add " ]\n\n"
|
||||
|
||||
proc outputSpaces(s: seq[int], name: string, output: var string) =
|
||||
output.add " $# = [\n" % name
|
||||
for i in s:
|
||||
output.add " Rune 0x$#,\n" % toHex(i, 5)
|
||||
output.add " ]\n\n"
|
||||
|
||||
|
||||
output.createHeader()
|
||||
outputSeq(tolowerRanges, "toLowerRanges", output)
|
||||
outputSeq(tolowerSinglets, "toLowerSinglets", output)
|
||||
outputSeq(toupperRanges, "toUpperRanges", output)
|
||||
outputSeq(toupperSinglets, "toUpperSinglets", output)
|
||||
outputSeq(totitleSinglets, "toTitleSinglets", output)
|
||||
outputSeq(alphaRanges, "alphaRanges", output)
|
||||
outputSeq(alphaSinglets, "alphaSinglets", output)
|
||||
outputSeq(spaceRanges, "spaceRanges", output)
|
||||
outputSpaces(unispaces, "unicodeSpaces", output) # array of runes
|
||||
|
||||
|
||||
let outfile = "lib/pure/includes/unicode_ranges.nim"
|
||||
outfile.writeFile(output)
|
||||
Reference in New Issue
Block a user