fixed UTF-16 to UTF-8 conversion in widestrs.nim

the source of the problem for issue #3228
also add tests for the entire range of valid UTF-16
and a test for invalid UTF-16 sequences
This commit is contained in:
jangko
2015-08-21 10:43:31 +07:00
parent c103eddc73
commit 7c757599f1
2 changed files with 55 additions and 13 deletions

View File

@@ -114,7 +114,7 @@ proc newWideCString*(s: cstring): WideCString =
proc newWideCString*(s: string): WideCString =
result = newWideCString(s, s.len)
proc `$`*(w: WideCString, estimate: int): string =
proc `$`*(w: WideCString, estimate: int, replacement: int = 0xFFFD): string =
result = newStringOfCap(estimate + estimate shr 2)
var i = 0
@@ -124,9 +124,18 @@ proc `$`*(w: WideCString, estimate: int): string =
if ch >= UNI_SUR_HIGH_START and ch <= UNI_SUR_HIGH_END:
# If the 16 bits following the high surrogate are in the source buffer...
let ch2 = int(cast[uint16](w[i]))
ch = (((ch and halfMask) shl halfShift) + (ch2 and halfMask)) + halfBase
inc i
# If it's a low surrogate, convert to UTF32:
if ch2 >= UNI_SUR_LOW_START and ch2 <= UNI_SUR_LOW_END:
ch = (((ch and halfMask) shl halfShift) + (ch2 and halfMask)) + halfBase
inc i
else:
#invalid UTF-16
ch = replacement
elif ch >= UNI_SUR_LOW_START and ch <= UNI_SUR_LOW_END:
#invalid UTF-16
ch = replacement
if ch < 0x80:
result.add chr(ch)
elif ch < 0x800:
@@ -136,11 +145,16 @@ proc `$`*(w: WideCString, estimate: int): string =
result.add chr((ch shr 12) or 0xe0)
result.add chr(((ch shr 6) and 0x3f) or 0x80)
result.add chr((ch and 0x3f) or 0x80)
else:
elif ch <= 0x10FFFF:
result.add chr((ch shr 18) or 0xf0)
result.add chr(((ch shr 12) and 0x3f) or 0x80)
result.add chr(((ch shr 6) and 0x3f) or 0x80)
result.add chr((ch and 0x3f) or 0x80)
else:
# replacement char(in case user give very large number):
result.add chr(0xFFFD shr 12 or 0b1110_0000)
result.add chr(0xFFFD shr 6 and ones(6) or 0b10_0000_00)
result.add chr(0xFFFD and ones(6) or 0b10_0000_00)
proc `$`*(s: WideCString): string =
result = s $ 80

View File

@@ -33,9 +33,9 @@ proc testCP(wc: WideCString, lo, hi: int) =
var x = 0
let chunk = 1024
for i in lo..hi:
wc[x] = cast[TUtf16Char](i)
wc[x] = cast[Utf16Char](i)
if (x >= chunk) or (i >= hi):
wc[x] = TUtf16Char(0)
wc[x] = Utf16Char(0)
var a = convertToUTF8(wc, int32(x))
var b = wc $ chunk
assert a == b
@@ -43,26 +43,26 @@ proc testCP(wc: WideCString, lo, hi: int) =
inc x
proc testCP2(wc: WideCString, lo, hi: int) =
assert ((lo >=0x10000) and (hi <= 0x10FFFF))
assert((lo >= 0x10000) and (hi <= 0x10FFFF))
var x = 0
let chunk = 1024
for i in lo..hi:
let ch = i - 0x10000
let W1 = 0xD800 or (ch shr 10)
let W2 = 0xDC00 or (0x3FF and ch)
wc[x] = cast[TUtf16Char](W1)
wc[x+1] = cast[TUtf16Char](W2)
wc[x] = cast[Utf16Char](W1)
wc[x+1] = cast[Utf16Char](W2)
inc(x, 2)
if (x >= chunk) or (i >= hi):
wc[x] = TUtf16Char(0)
wc[x] = Utf16Char(0)
var a = convertToUTF8(wc, int32(x))
var b = wc $ chunk
assert a == b
x = 0
#RFC-2781 "UTF-16, an encoding of ISO 10646"
var wc: WideCString
unsafeNew(wc, 1024 * 4 + 2)
@@ -75,4 +75,32 @@ wc.testCP(0xE000, 0xFFFF)
#U+10000 to U+10FFFF
wc.testCP2(0x10000, 0x10FFFF)
#invalid UTF-16
const
b = "\xEF\xBF\xBD"
c = "\xEF\xBF\xBF"
wc[0] = cast[Utf16Char](0xDC00)
wc[1] = Utf16Char(0)
var a = $wc
assert a == b
wc[0] = cast[Utf16Char](0xFFFF)
wc[1] = cast[Utf16Char](0xDC00)
wc[2] = Utf16Char(0)
a = $wc
assert a == c & b
wc[0] = cast[Utf16Char](0xD800)
wc[1] = Utf16Char(0)
a = $wc
assert a == b
wc[0] = cast[Utf16Char](0xD800)
wc[1] = cast[Utf16Char](0xFFFF)
wc[2] = Utf16Char(0)
a = $wc
assert a == b & c
echo "OK"