fixed UTF-16 to UTF-8 conversion in widestrs.nim

This was the source of the problem in issue #3228. Also add a test covering the entire range of valid UTF-16.
2026-06-03 18:34:43 +00:00 · 2015-08-20 20:30:14 +07:00
parent 493dbc8932
commit c103eddc73
2 changed files with 85 additions and 7 deletions
--- a/lib/system/widestrs.nim
+++ b/lib/system/widestrs.nim
@@ -119,20 +119,20 @@ proc `$`*(w: WideCString, estimate: int): string =

  var i = 0
  while w[i].int16 != 0'i16:
-    var ch = uint32(cast[uint16](w[i]))
+    var ch = int(cast[uint16](w[i]))
    inc i
-    if ch >= uint32(UNI_SUR_HIGH_START) and ch <= uint32(UNI_SUR_HIGH_END):
+    if ch >= UNI_SUR_HIGH_START and ch <= UNI_SUR_HIGH_END:
      # If the 16 bits following the high surrogate are in the source buffer...
-      let ch2 = uint32(cast[uint16](w[i]))
-      ch = (ch shl halfShift) + ch2 + halfBase
+      let ch2 = int(cast[uint16](w[i]))
+      ch = (((ch and halfMask) shl halfShift) + (ch2 and halfMask)) + halfBase
      inc i
    
-    if ch < 0x80'u32:
+    if ch < 0x80:
      result.add chr(ch)
-    elif ch < 0x800'u32:
+    elif ch < 0x800:
      result.add chr((ch shr 6) or 0xc0)
      result.add chr((ch and 0x3f) or 0x80)
-    elif ch < 0x10000'u32:
+    elif ch < 0x10000:
      result.add chr((ch shr 12) or 0xe0)
      result.add chr(((ch shr 6) and 0x3f) or 0x80)
      result.add chr((ch and 0x3f) or 0x80)
--- a/tests/stdlib/twchartoutf8.nim
+++ b/tests/stdlib/twchartoutf8.nim
@@ -0,0 +1,78 @@
+#assumes WideCharToMultiByte always produces correct results
+#windows only
+
+when not defined(windows):
+  {.error: "windows only".}
+  
+{.push gcsafe.}
+
+const CP_UTF8 = 65001'i32
+
+type
+  LPBOOL = ptr int32
+  LPWCSTR = ptr uint16
+  
+proc WideCharToMultiByte*(CodePage: int32, dwFlags: int32,
+                          lpWideCharStr: LPWCSTR, cchWideChar: int32,
+                          lpMultiByteStr: cstring, cchMultiByte: int32,
+                          lpDefaultChar: cstring, lpUsedDefaultChar: LPBOOL): int32{.
+    stdcall, dynlib: "kernel32", importc: "WideCharToMultiByte".}
+
+{.pop.}
+
+proc convertToUTF8(wc: WideCString, wclen: int32): string =
+  let size = WideCharToMultiByte(CP_UTF8, 0'i32, cast[LPWCSTR](addr(wc[0])), wclen, 
+    cstring(nil), 0'i32, cstring(nil), LPBOOL(nil))
+  result = newString(size)
+  let res = WideCharToMultiByte(CP_UTF8, 0'i32, cast[LPWCSTR](addr(wc[0])), wclen, 
+    cstring(result), size, cstring(nil), LPBOOL(nil))
+  result[size] = chr(0)
+  assert size == res
+   
+proc testCP(wc: WideCString, lo, hi: int) =
+  var x = 0
+  let chunk = 1024
+  for i in lo..hi:
+    wc[x] = cast[TUtf16Char](i)
+    if (x >= chunk) or (i >= hi):
+      wc[x] = TUtf16Char(0)
+      var a = convertToUTF8(wc, int32(x))
+      var b = wc $ chunk
+      assert a == b
+      x = 0
+    inc x
+
+proc testCP2(wc: WideCString, lo, hi: int) =
+  assert ((lo >=0x10000) and (hi <= 0x10FFFF))
+  var x = 0
+  let chunk = 1024
+  for i in lo..hi:
+    let ch = i - 0x10000
+    let W1 = 0xD800 or (ch shr 10)
+    let W2 = 0xDC00 or (0x3FF and ch)
+    wc[x] = cast[TUtf16Char](W1)
+    wc[x+1] = cast[TUtf16Char](W2)
+    inc(x, 2)
+    
+    if (x >= chunk) or (i >= hi):
+      wc[x] = TUtf16Char(0)
+      var a = convertToUTF8(wc, int32(x))
+      var b = wc $ chunk
+      assert a == b
+      x = 0
+
+#RFC-2781 "UTF-16, an encoding of ISO 10646"
+
+var wc: WideCString
+unsafeNew(wc, 1024 * 4 + 2)
+
+#U+0000 to U+D7FF
+#skip the U+0000
+wc.testCP(1, 0xD7FF)
+
+#U+E000 to U+FFFF
+wc.testCP(0xE000, 0xFFFF)
+
+#U+10000 to U+10FFFF
+wc.testCP2(0x10000, 0x10FFFF)
+echo "OK"