Make sure the json module decodes UTF-16 correctly

Javascript uses UTF-16 as its internal representation of strings,
so JSON does so as well. This means that we could have surrogate
pairs, with codepoints above 0xFFFF that take 2 escape codes to
decode.
This commit is contained in:
Daniel Hertz
2015-10-13 14:35:17 -04:00
parent 7f4f37eaa2
commit e5bcd287f8

View File

@@ -203,6 +203,15 @@ proc handleHexChar(c: char, x: var int): bool =
of 'A'..'F': x = (x shl 4) or (ord(c) - ord('A') + 10)
else: result = false # error
proc parseEscapedUTF16(buf: cstring, pos: var int): int =
  ## Reads the four hex digits of a `\uXXXX` escape from `buf`, starting at
  ## `pos`, and returns the value they encode.
  ##
  ## `pos` is advanced past every digit that is consumed. As soon as
  ## `handleHexChar` rejects a character, -1 is returned and `pos` is left
  ## pointing at that character.
  var codeUnit = 0
  # A UTF-16 escape always carries exactly four hex digits.
  var remaining = 4
  while remaining > 0:
    if not handleHexChar(buf[pos], codeUnit):
      return -1
    inc(pos)
    dec(remaining)
  result = codeUnit
proc parseString(my: var JsonParser): TokKind =
result = tkString
var pos = my.bufpos + 1
@@ -238,11 +247,22 @@ proc parseString(my: var JsonParser): TokKind =
inc(pos, 2)
of 'u':
inc(pos, 2)
var r: int
if handleHexChar(buf[pos], r): inc(pos)
if handleHexChar(buf[pos], r): inc(pos)
if handleHexChar(buf[pos], r): inc(pos)
if handleHexChar(buf[pos], r): inc(pos)
var r = parseEscapedUTF16(buf, pos)
if r < 0:
my.err = errInvalidToken
break
# Deal with surrogates
if (r and 0xfc00) == 0xd800:
if buf[pos] & buf[pos+1] != "\\u":
my.err = errInvalidToken
break
inc(pos, 2)
var s = parseEscapedUTF16(buf, pos)
if (s and 0xfc00) == 0xdc00 and s > 0:
r = 0x10000 + (((r - 0xd800) shl 10) or (s - 0xdc00))
else:
my.err = errInvalidToken
break
add(my.a, toUTF8(Rune(r)))
else:
# don't bother with the error