fixed UTF-16 to UTF-8 conversion in widestrs.nim

the source of the problem for issue #3228
also add tests for the entire range of valid UTF-16
and a test for invalid UTF-16 sequences
This commit is contained in:
jangko
2015-08-21 10:43:31 +07:00
parent c103eddc73
commit 7c757599f1
2 changed files with 55 additions and 13 deletions

View File

@@ -114,7 +114,7 @@ proc newWideCString*(s: cstring): WideCString =
proc newWideCString*(s: string): WideCString =
result = newWideCString(s, s.len)
proc `$`*(w: WideCString, estimate: int): string =
proc `$`*(w: WideCString, estimate: int, replacement: int = 0xFFFD): string =
result = newStringOfCap(estimate + estimate shr 2)
var i = 0
@@ -124,9 +124,18 @@ proc `$`*(w: WideCString, estimate: int): string =
if ch >= UNI_SUR_HIGH_START and ch <= UNI_SUR_HIGH_END:
# If the 16 bits following the high surrogate are in the source buffer...
let ch2 = int(cast[uint16](w[i]))
ch = (((ch and halfMask) shl halfShift) + (ch2 and halfMask)) + halfBase
inc i
# If it's a low surrogate, convert to UTF32:
if ch2 >= UNI_SUR_LOW_START and ch2 <= UNI_SUR_LOW_END:
ch = (((ch and halfMask) shl halfShift) + (ch2 and halfMask)) + halfBase
inc i
else:
#invalid UTF-16
ch = replacement
elif ch >= UNI_SUR_LOW_START and ch <= UNI_SUR_LOW_END:
#invalid UTF-16
ch = replacement
if ch < 0x80:
result.add chr(ch)
elif ch < 0x800:
@@ -136,11 +145,16 @@ proc `$`*(w: WideCString, estimate: int): string =
result.add chr((ch shr 12) or 0xe0)
result.add chr(((ch shr 6) and 0x3f) or 0x80)
result.add chr((ch and 0x3f) or 0x80)
else:
elif ch <= 0x10FFFF:
result.add chr((ch shr 18) or 0xf0)
result.add chr(((ch shr 12) and 0x3f) or 0x80)
result.add chr(((ch shr 6) and 0x3f) or 0x80)
result.add chr((ch and 0x3f) or 0x80)
else:
# replacement char(in case user give very large number):
result.add chr(0xFFFD shr 12 or 0b1110_0000)
result.add chr(0xFFFD shr 6 and ones(6) or 0b10_0000_00)
result.add chr(0xFFFD and ones(6) or 0b10_0000_00)
proc `$`*(s: WideCString): string =
result = s $ 80

View File

@@ -33,9 +33,9 @@ proc testCP(wc: WideCString, lo, hi: int) =
var x = 0
let chunk = 1024
for i in lo..hi:
wc[x] = cast[TUtf16Char](i)
wc[x] = cast[Utf16Char](i)
if (x >= chunk) or (i >= hi):
wc[x] = TUtf16Char(0)
wc[x] = Utf16Char(0)
var a = convertToUTF8(wc, int32(x))
var b = wc $ chunk
assert a == b
@@ -43,26 +43,26 @@ proc testCP(wc: WideCString, lo, hi: int) =
inc x
proc testCP2(wc: WideCString, lo, hi: int) =
assert ((lo >=0x10000) and (hi <= 0x10FFFF))
assert((lo >= 0x10000) and (hi <= 0x10FFFF))
var x = 0
let chunk = 1024
for i in lo..hi:
let ch = i - 0x10000
let W1 = 0xD800 or (ch shr 10)
let W2 = 0xDC00 or (0x3FF and ch)
wc[x] = cast[TUtf16Char](W1)
wc[x+1] = cast[TUtf16Char](W2)
wc[x] = cast[Utf16Char](W1)
wc[x+1] = cast[Utf16Char](W2)
inc(x, 2)
if (x >= chunk) or (i >= hi):
wc[x] = TUtf16Char(0)
wc[x] = Utf16Char(0)
var a = convertToUTF8(wc, int32(x))
var b = wc $ chunk
assert a == b
x = 0
#RFC-2781 "UTF-16, an encoding of ISO 10646"
var wc: WideCString
unsafeNew(wc, 1024 * 4 + 2)
@@ -75,4 +75,32 @@ wc.testCP(0xE000, 0xFFFF)
#U+10000 to U+10FFFF
wc.testCP2(0x10000, 0x10FFFF)
#invalid UTF-16
const
b = "\xEF\xBF\xBD"
c = "\xEF\xBF\xBF"
wc[0] = cast[Utf16Char](0xDC00)
wc[1] = Utf16Char(0)
var a = $wc
assert a == b
wc[0] = cast[Utf16Char](0xFFFF)
wc[1] = cast[Utf16Char](0xDC00)
wc[2] = Utf16Char(0)
a = $wc
assert a == c & b
wc[0] = cast[Utf16Char](0xD800)
wc[1] = Utf16Char(0)
a = $wc
assert a == b
wc[0] = cast[Utf16Char](0xD800)
wc[1] = cast[Utf16Char](0xFFFF)
wc[2] = Utf16Char(0)
a = $wc
assert a == b & c
echo "OK"