mirror of
https://github.com/nim-lang/Nim.git
synced 2025-12-28 17:04:41 +00:00
* update unicode.nim * create a script to create the needed unicode data * make unicode.nim compatible with Unicode v12.0.0 * slightly improve unicode.nim documentation (fixes #4795) * more documentation
205 lines
5.5 KiB
Nim
205 lines
5.5 KiB
Nim
import strutils, algorithm
|
|
|
|
let
  # Path of the local copy of the Unicode Character Database file. The file
  # itself is published at:
  # https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
  filename = "tools/UnicodeData.txt"
  # The whole file, one element per line. Leading/trailing whitespace of the
  # file content is stripped before it is split into lines.
  data = splitLines(strip(readFile(filename)))
|
|
|
|
const
  # General_Category abbreviations, as listed in
  # https://www.unicode.org/reports/tr44/#GC_Values_Table

  # Categories whose code points are collected as letters.
  letters = [
    "Lu", # Uppercase_Letter
    "Ll", # Lowercase_Letter
    "Lt", # Titlecase_Letter
    "Lm", # Modifier_Letter
    "Lo", # Other_Letter
  ]
  # Categories whose code points are collected as spaces.
  spaces = [
    "Zs", # Space_Separator
    "Zl", # Line_Separator
    "Zp", # Paragraph_Separator
  ]
|
|
|
|
type
  # A run of consecutive code points `start .. stop` that all share the
  # same case-mapping `diff`.
  Ranges = tuple[start: int, stop: int, diff: int]
  # A single code point together with its case-mapping `diff`.
  Singlets = tuple[code: int, diff: int]
  # A run of consecutive code points `start .. stop` without extra data.
  NonLetterRanges = tuple[start: int, stop: int]
|
|
|
|
var
  # Accumulators filled by `parseData`; all of them start out empty.
  toUpper: seq[Singlets] = @[]   # code point -> biased uppercase distance
  toLower: seq[Singlets] = @[]   # code point -> biased lowercase distance
  toTitle: seq[Singlets] = @[]   # code point -> biased titlecase distance
  alphas: seq[int] = @[]         # code points in one of the `letters` groups
  unispaces: seq[int] = @[]      # code points in one of the `spaces` groups
|
|
|
|
|
|
proc parseData(data: seq[string]) =
  ## Parses the lines of `UnicodeData.txt` and fills the global sequences
  ## `toUpper`, `toLower`, `toTitle`, `alphas` and `unispaces`.
  ##
  ## Every line is a `;`-separated record. The fields used here are:
  ## 0 (code point, hex), 1 (name), 2 (general category) and 12, 13, 14
  ## (simple uppercase, lowercase and titlecase mapping, hex or empty).
  ##
  ## Only code points whose category is listed in `letters` or `spaces` are
  ## processed. A case mapping is stored as `(code, 500 + mapped - code)`,
  ## i.e. the distance to the mapped code point offset by 500. A titlecase
  ## mapping is stored only when it differs from the uppercase mapping and
  ## from the code point itself.
  ##
  ## `UnicodeData.txt` does not list every code point: large blocks (for
  ## example the CJK ideographs) are given as two lines named
  ## `<..., First>` and `<..., Last>`, and every code point between the two
  ## has the same category. Such pairs are expanded so that the whole range,
  ## not only its two endpoints, ends up in `alphas` / `unispaces`.
  var pendingFirst = -1 # code point of the preceding `First` line, else -1
  for line in data:
    let
      fields = line.split(';')
      code = fields[0].parseHexInt()
      name = fields[1]
      category = fields[2]
      uc = fields[12]
      lc = fields[13]
      tc = fields[14]

    # First code point this line contributes. For a `Last` line that closes
    # a range it is the one right after the `First` code point (which was
    # already recorded when its own line was processed); otherwise it is
    # just `code`.
    let firstNew =
      if pendingFirst >= 0 and pendingFirst < code and
          name.endsWith(", Last>"): pendingFirst + 1
      else: code
    pendingFirst = if name.endsWith(", First>"): code else: -1

    if category notin spaces and category notin letters:
      continue

    if uc.len > 0:
      let diff = 500 + uc.parseHexInt() - code
      toUpper.add (code, diff)
    if lc.len > 0:
      let diff = 500 + lc.parseHexInt() - code
      toLower.add (code, diff)
    if tc.len > 0 and tc != uc:
      # if titlecase is different than uppercase
      let diff = 500 + tc.parseHexInt() - code
      if diff != 500:
        toTitle.add (code, diff)

    # For ordinary lines this loop runs exactly once (firstNew == code).
    for c in firstNew .. code:
      if category in spaces:
        unispaces.add c
      else:
        alphas.add c
|
|
|
|
proc splitRanges(a: seq[Singlets], r: var seq[Ranges], s: var seq[Singlets]) =
  ## Partitions the case-mapping entries of `a` (this is used for `toLower`,
  ## `toUpper` and `toTitle`) into two output sequences:
  ## - `r` receives every run of two or more entries whose code points are
  ##   consecutive and whose `diff` is identical, as `(start, stop, diff)`
  ## - `s` receives every entry that is not part of such a run
  ##
  ## Results are appended; existing content of `r` and `s` is kept.
  var first = 0
  while first < a.len:
    let head = a[first]
    # Grow the run for as long as the next entry is the next code point and
    # maps by the same distance as the head of the run.
    var last = first
    while last + 1 < a.len and
        a[last + 1].code == head.code + (last + 1 - first) and
        a[last + 1].diff == head.diff:
      inc last
    if last == first:
      s.add head
    else:
      r.add (head.code, a[last].code, head.diff)
    first = last + 1
|
|
|
|
proc splitRanges(a: seq[int], r: var seq[NonLetterRanges], s: var seq[int]) =
  ## Partitions the code points of `a` (this is used for `alphas`) into two
  ## output sequences:
  ## - `r` receives every run of two or more consecutive code points, as
  ##   `(start, stop)`
  ## - `s` receives every code point that is not part of such a run
  ##
  ## Results are appended; existing content of `r` and `s` is kept.
  var first = 0
  while first < a.len:
    # Grow the run for as long as the next element is exactly one larger.
    var last = first
    while last + 1 < a.len and a[last + 1] == a[last] + 1:
      inc last
    if last == first:
      s.add a[first]
    else:
      r.add (a[first], a[last])
    first = last + 1
|
|
|
|
proc splitSpaces(a: seq[int], r: var seq[NonLetterRanges], s: var seq[int]) =
  ## Variant of `splitRanges` for spaces. Spaces are treated specially
  ## because of how `isWhiteSpace` and `split` are implemented.
  ##
  ## Every space ends up in both outputs:
  ## - `r` receives each run of consecutive code points as `(start, stop)`;
  ##   an isolated code point is appended as a run with `start == stop`
  ## - `s` is overwritten with a copy of `a`
  var first = 0
  while first < a.len:
    # Grow the run for as long as the next element is exactly one larger.
    var last = first
    while last + 1 < a.len and a[last + 1] == a[last] + 1:
      inc last
    r.add (a[first], a[last])
    first = last + 1
  s = a
|
|
|
|
|
|
var
  # Outputs of the split step; all of them start out empty.

  # case mappings, split into runs and single entries
  toupperRanges: seq[Ranges] = @[]
  toupperSinglets: seq[Singlets] = @[]
  tolowerRanges: seq[Ranges] = @[]
  tolowerSinglets: seq[Singlets] = @[]
  totitleRanges: seq[Ranges] = @[]
  totitleSinglets: seq[Singlets] = @[]
  # spaces: runs, plus the flat list of all space code points
  spaceRanges: seq[NonLetterRanges] = @[]
  unicodeSpaces: seq[int] = @[]
  # letters, split into runs and single code points
  alphaRanges: seq[NonLetterRanges] = @[]
  alphaSinglets: seq[int] = @[]
|
|
|
|
# Fill the global accumulators from the raw lines, then split each of them
# into runs and single entries.
parseData(data)
splitRanges(toLower, tolowerRanges, tolowerSinglets)
splitRanges(toUpper, toUpperRanges, toUpperSinglets)
splitRanges(toTitle, toTitleRanges, toTitleSinglets)
splitRanges(alphas, alphaRanges, alphaSinglets)

# Manually add the "special" spaces: code points 9 .. 13 and 0x85. They are
# appended after parsing, so the list has to be sorted again before it is
# split into runs.
for special in [9, 10, 11, 12, 13, 0x85]:
  unispaces.add special
sort(unispaces)

splitSpaces(unispaces, spaceRanges, unicodeSpaces)

# Text of the generated Nim source file; built up by the procs below.
var output = ""
|
|
|
|
proc createHeader(output: var string) =
  ## Appends the fixed preamble of the generated file to `output`: a comment
  ## saying the file was created by a script, an empty line, and the line
  ## that opens the `const` section.
  for part in ["# This file was created from a script.\n\n", "const\n"]:
    output.add(part)
|
|
|
|
proc `$`(r: Ranges): string =
  ## Renders `r` as one line of the generated table: `start` and `stop` as
  ## `0x`-prefixed 5-digit hex numbers, then `diff` in decimal, each followed
  ## by a comma, and a newline at the end.
  "$#, $#, $#,\n" % ["0x" & r.start.toHex(5), "0x" & r.stop.toHex(5),
                     $r.diff]
|
|
|
|
proc `$`(r: Singlets): string =
  ## Renders `r` as one line of the generated table: `code` as a
  ## `0x`-prefixed 5-digit hex number, then `diff` in decimal, each followed
  ## by a comma, and a newline at the end.
  "$#, $#,\n" % ["0x" & r.code.toHex(5), $r.diff]
|
|
|
|
proc `$`(r: NonLetterRanges): string =
  ## Renders `r` as one line of the generated table: `start` and `stop` as
  ## `0x`-prefixed 5-digit hex numbers, each followed by a comma, and a
  ## newline at the end.
  "$#, $#,\n" % ["0x" & r.start.toHex(5), "0x" & r.stop.toHex(5)]
|
|
|
|
|
|
proc outputSeq(s: seq[Ranges|Singlets|NonLetterRanges], name: string,
               output: var string) =
  ## Appends an array definition called `name` to `output`. Each element of
  ## `s` becomes one line, rendered with the `$` overload of its type, and
  ## the closing bracket is followed by an empty line.
  output.add(" $# = [\n" % [name])
  for entry in s:
    output.add(" ")
    output.add($entry)
  output.add(" ]\n\n")
|
|
|
|
proc outputSeq(s: seq[int], name: string, output: var string) =
  ## Appends an array definition called `name` to `output`. Each code point
  ## of `s` becomes one line holding a `0x`-prefixed 5-digit hex literal and
  ## a comma; the closing bracket is followed by an empty line.
  output.add(" $# = [\n" % [name])
  for codePoint in items(s):
    output.add(" 0x$#,\n" % [codePoint.toHex(5)])
  output.add(" ]\n\n")
|
|
|
|
proc outputSpaces(s: seq[int], name: string, output: var string) =
  ## Appends an array definition called `name` to `output`. Each code point
  ## of `s` becomes one line of the form `Rune 0x…,` with a 5-digit hex
  ## literal; the closing bracket is followed by an empty line.
  output.add(" $# = [\n" % [name])
  for codePoint in items(s):
    output.add(" Rune 0x$#,\n" % [codePoint.toHex(5)])
  output.add(" ]\n\n")
|
|
|
|
|
|
# Assemble the generated file: the header first, then one array per table.
# Note that `totitleRanges` is not written out, only `totitleSinglets`.
createHeader(output)
tolowerRanges.outputSeq("toLowerRanges", output)
tolowerSinglets.outputSeq("toLowerSinglets", output)
toupperRanges.outputSeq("toUpperRanges", output)
toupperSinglets.outputSeq("toUpperSinglets", output)
totitleSinglets.outputSeq("toTitleSinglets", output)
alphaRanges.outputSeq("alphaRanges", output)
alphaSinglets.outputSeq("alphaSinglets", output)
spaceRanges.outputSeq("spaceRanges", output)
unispaces.outputSpaces("unicodeSpaces", output) # array of runes

# Write the result to the include file, replacing its previous content.
let outfile = "lib/pure/includes/unicode_ranges.nim"
writeFile(outfile, output)
|