mirror of
https://github.com/nim-lang/Nim.git
synced 2026-01-01 02:42:05 +00:00
154 lines
5.2 KiB
Nim
154 lines
5.2 KiB
Nim
#
|
|
#
|
|
# Nimrod's Runtime Library
|
|
# (c) Copyright 2008 Andreas Rumpf
|
|
#
|
|
# See the file "copying.txt", included in this
|
|
# distribution, for details about the copyright.
|
|
#
|
|
|
|
## This module provides a way to handle various Unicode (or other) encodings.
|
|
|
|
type
  TUniChar* = int32   ## 32 bit integer; can hold any Unicode character
  TUniChar16* = int16 ## 16 bit integer; holds a 16 bit Unicode character
|
|
|
|
template ones(n: untyped): untyped =
  ## Expands to an integer whose `n` lowest bits are set; for example
  ## ``ones(5)`` is ``0b11111``.
  ##
  ## The explicit ``untyped`` result type is required so that the template
  ## can be used as an expression (e.g. as a bit mask operand).
  ((1 shl n) - 1)
|
|
|
|
proc uniCharLen*(s: string): int =
  ## Returns the number of Unicode characters of the UTF-8 string `s`.
  ##
  ## Only the lead byte of every character is inspected; the continuation
  ## bytes it announces are skipped without being validated. A byte that
  ## cannot start a UTF-8 sequence (a stray continuation byte or
  ## ``0xF8..0xFF``) is counted as one character of its own, so the scan
  ## always makes progress.
  var i = 0
  while i < len(s):
    let lead = ord(s[i])
    if lead <= 127: inc(i)                 # 0xxxxxxx: one byte
    elif lead shr 5 == 0b110: inc(i, 2)    # 110xxxxx: two bytes
    elif lead shr 4 == 0b1110: inc(i, 3)   # 1110xxxx: three bytes
    elif lead shr 3 == 0b11110: inc(i, 4)  # 11110xxx: four bytes
    else:
      # Invalid lead byte. The former `assert(false)` left `i` untouched,
      # which looped forever once assertions were compiled out.
      inc(i)
    inc(result)
|
|
|
|
proc uniCharAt*(s: string, i: int): TUniChar =
  ## Returns the Unicode character whose UTF-8 encoding starts at byte
  ## index `i` of `s`.
  ##
  ## The lead byte ``s[i]`` selects a sequence of one to four bytes. Every
  ## continuation byte is asserted to have the form ``10xxxxxx`` and a byte
  ## that cannot start a sequence trips ``assert(false)``. No length check
  ## is made before the continuation bytes are read.
  ##
  ## The decoded value is computed as `int` and converted explicitly to
  ## `TUniChar`, because an implicit `int` to `int32` narrowing is not
  ## accepted by the compiler.
  let lead = ord(s[i])
  if lead <= 127:
    # 0xxxxxxx: the byte is the character itself
    result = TUniChar(lead)
  elif lead shr 5 == 0b110:
    # 110xxxxx 10xxxxxx
    assert(ord(s[i+1]) shr 6 == 0b10)
    result = TUniChar((lead and ones(5)) shl 6 or
                      (ord(s[i+1]) and ones(6)))
  elif lead shr 4 == 0b1110:
    # 1110xxxx 10xxxxxx 10xxxxxx
    assert(ord(s[i+1]) shr 6 == 0b10)
    assert(ord(s[i+2]) shr 6 == 0b10)
    result = TUniChar((lead and ones(4)) shl 12 or
                      (ord(s[i+1]) and ones(6)) shl 6 or
                      (ord(s[i+2]) and ones(6)))
  elif lead shr 3 == 0b11110:
    # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    assert(ord(s[i+1]) shr 6 == 0b10)
    assert(ord(s[i+2]) shr 6 == 0b10)
    assert(ord(s[i+3]) shr 6 == 0b10)
    result = TUniChar((lead and ones(3)) shl 18 or
                      (ord(s[i+1]) and ones(6)) shl 12 or
                      (ord(s[i+2]) and ones(6)) shl 6 or
                      (ord(s[i+3]) and ones(6)))
  else:
    # not a valid lead byte
    assert(false)
|
|
|
|
iterator unichars*(s: string): TUniChar =
  ## Yields every Unicode character of the UTF-8 string `s`, front to back.
  ##
  ## The lead byte of each character selects a sequence of one to four
  ## bytes. Continuation bytes are not validated and no length check is
  ## made before they are read. A byte that cannot start a UTF-8 sequence
  ## is yielded as its own numeric value and skipped, so the iteration
  ## always terminates.
  var i = 0
  while i < len(s):
    let lead = ord(s[i])
    var ch: TUniChar
    if lead <= 127:
      # 0xxxxxxx
      ch = TUniChar(lead)
      inc(i)
    elif lead shr 5 == 0b110:
      # 110xxxxx 10xxxxxx
      ch = TUniChar((lead and ones(5)) shl 6 or
                    (ord(s[i+1]) and ones(6)))
      inc(i, 2)
    elif lead shr 4 == 0b1110:
      # 1110xxxx 10xxxxxx 10xxxxxx
      ch = TUniChar((lead and ones(4)) shl 12 or
                    (ord(s[i+1]) and ones(6)) shl 6 or
                    (ord(s[i+2]) and ones(6)))
      inc(i, 3)
    elif lead shr 3 == 0b11110:
      # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      ch = TUniChar((lead and ones(3)) shl 18 or
                    (ord(s[i+1]) and ones(6)) shl 12 or
                    (ord(s[i+2]) and ones(6)) shl 6 or
                    (ord(s[i+3]) and ones(6)))
      inc(i, 4)
    else:
      # Invalid lead byte. The former `assert(false)` did not advance `i`;
      # with assertions compiled out the loop yielded a stale value forever.
      ch = TUniChar(lead)
      inc(i)
    yield ch
|
|
|
|
type
  TCharacterSet = enum ## the 8 bit character sets this module knows about
    cs8859_1,
    cs8859_2
|
|
|
|
const
  # Official name of every character set, indexed by `TCharacterSet`.
  # The array must hold exactly one entry per enum value; the two surplus
  # empty strings of the earlier version made the lengths disagree.
  characterSetToName: array[TCharacterSet, string] = [
    "ISO/IEC 8859-1:1998",
    "ISO 8859-2:1999"
  ]

  # Unicode value of every ISO 8859-2 byte in the range 0xA1..0xFF, indexed
  # by the byte value itself (95 entries).
  cs8859_2toUnicode: array[0xA1..0xff, TUniChar16] = [
    0x0104'i16, 0x02D8'i16, 0x0141'i16, 0x00A4'i16, 0x013D'i16, 0x015A'i16,
    0x00A7'i16, 0x00A8'i16, 0x0160'i16, 0x015E'i16, 0x0164'i16, 0x0179'i16,
    0x00AD'i16, 0x017D'i16, 0x017B'i16, 0x00B0'i16, 0x0105'i16, 0x02DB'i16,
    0x0142'i16, 0x00B4'i16, 0x013E'i16, 0x015B'i16, 0x02C7'i16, 0x00B8'i16,
    0x0161'i16, 0x015F'i16, 0x0165'i16, 0x017A'i16, 0x02DD'i16, 0x017E'i16,
    0x017C'i16, 0x0154'i16, 0x00C1'i16, 0x00C2'i16, 0x0102'i16, 0x00C4'i16,
    0x0139'i16, 0x0106'i16, 0x00C7'i16, 0x010C'i16, 0x00C9'i16, 0x0118'i16,
    0x00CB'i16, 0x011A'i16, 0x00CD'i16, 0x00CE'i16, 0x010E'i16, 0x0110'i16,
    0x0143'i16, 0x0147'i16, 0x00D3'i16, 0x00D4'i16, 0x0150'i16, 0x00D6'i16,
    0x00D7'i16, 0x0158'i16, 0x016E'i16, 0x00DA'i16, 0x0170'i16, 0x00DC'i16,
    0x00DD'i16, 0x0162'i16, 0x00DF'i16, 0x0155'i16, 0x00E1'i16, 0x00E2'i16,
    0x0103'i16, 0x00E4'i16, 0x013A'i16, 0x0107'i16, 0x00E7'i16, 0x010D'i16,
    0x00E9'i16, 0x0119'i16, 0x00EB'i16, 0x011B'i16, 0x00ED'i16, 0x00EE'i16,
    0x010F'i16, 0x0111'i16, 0x0144'i16, 0x0148'i16, 0x00F3'i16, 0x00F4'i16,
    0x0151'i16, 0x00F6'i16, 0x00F7'i16, 0x0159'i16, 0x016F'i16, 0x00FA'i16,
    0x0171'i16, 0x00FC'i16, 0x00FD'i16, 0x0163'i16, 0x02D9'i16]
|
|
|
|
proc searchTable(tab: openarray[TUniChar16], u: TUniChar16): int8 =
  ## Returns the zero-based position of `u` within `tab`, reduced to 8 bits
  ## by `toU8`.
  ##
  ## `u` is expected to occur in `tab`; this is checked by an assertion.
  let idx = find(tab, u)
  # `find` signals a miss with -1, so position 0 (the first table entry) is
  # a legitimate hit. The earlier `idx > 0` check wrongly rejected it.
  assert(idx >= 0)
  result = toU8(idx)
|
|
|
|
proc csToUnicode(cs: TCharacterSet, c: int8): TUniChar16 =
  ## Maps the byte `c` of character set `cs` to its 16 bit Unicode value.
  ##
  ## `c` is treated as an unsigned byte throughout (`<=%`, `ze`, `ze16`).
  case cs
  of cs8859_1:
    # every byte maps to the Unicode value of the same number
    return ze16(c)
  of cs8859_2:
    if not (c <=% 0xA0'i8):
      # bytes above 0xA0 are looked up in the conversion table
      return cs8859_2toUnicode[ze(c)]
    # bytes up to and including 0xA0 map to themselves
    return ze16(c)
|
|
|
|
proc unicodeToCS(cs: TCharacterSet, u: TUniChar16): int8 =
  ## Maps the 16 bit Unicode character `u` to its byte in character set `cs`.
  ##
  ## For `cs8859_2` a value above 0xA0 is located in `cs8859_2toUnicode` via
  ## `searchTable`, which asserts that the value is present.
  case cs
  of cs8859_1: result = toU8(u) # no table lookup necessary
  of cs8859_2:
    if u <=% 0x00A0'i16:
      # values up to and including 0xA0 map to themselves
      result = toU8(u)
    else:
      # The table's first entry belongs to byte 0xA1, so that offset is
      # added (unsigned, wrapping) to the table position. The literal
      # suffix must be `'i8`; the former `'8` is not a valid type suffix.
      result = searchTable(cs8859_2toUnicode, u) +% 0xA1'i8
|
|
|
|
# Forward declarations of exported conversion routines. No implementation
# is visible in this part of the file.
# NOTE(review): going by the names these convert between UTF-8 and the
# locale's encoding, and from UTF-8 to UTF-16 -- confirm once bodies exist.
proc utf8toLocale*(s: string): string
proc localeToUtf8*(s: string): string

proc utf8toUtf16*(s: string): seq[TUniChar16]
|
|
proc utf8toUcs4*(s: string): seq[TUniChar] =
  ## Decodes the UTF-8 string `s` into a sequence that holds one `TUniChar`
  ## per character, in the order produced by the `unichars` iterator.
  result = @[]            # `@[]` builds an empty seq; a bare `[]` does not
  for u in unichars(s):
    # the loop previously had no body at all
    add(result, u)
|
|
|
|
# Forward declarations of module-private conversion routines. No
# implementation is visible in this part of the file.
# NOTE(review): "Uft16" in the last two names looks like a misspelling of
# "Utf16"; the names are kept as declared -- confirm before renaming.
proc ucs4ToUtf8(s: seq[TUniChar]): string
proc utf16ToUtf8(s: seq[TUniChar16]): string
proc ucs4toUft16(s: seq[TUniChar]): seq[TUniChar16]
proc uft16toUcs4(s: seq[TUniChar16]): seq[TUniChar]
|
|
|
|
proc cmpUnicode*(a, b: string): int =
  ## treats `a` and `b` as UTF-8 strings and compares them. Returns:
  ## | < 0 iff a < b
  ## | > 0 iff a > b
  ## | == 0 iff a == b
  ## This routine is useful for sorting UTF-8 strings.
  ##
  ## The strings are compared byte by byte. For well-formed UTF-8 this
  ## gives the same order as comparing the decoded characters one by one,
  ## because the encoding preserves code point order. Only the sign of the
  ## result is meaningful.
  result = cmp(a, b)
|
|
|