Merge pull request #4258 from Parashurama/add_unicode_escape

Adds support for Unicode `\uHHHH` (four hex digit) escapes in string literals; the code point is stored UTF-8 encoded.
2026-02-13 06:43:52 +00:00 · 2016-06-02 09:55:27 +02:00
parent c11de219e5 8ce9739f11
commit ca6986b89c
2 changed files with 26 additions and 3 deletions
--- a/compiler/lexer.nim
+++ b/compiler/lexer.nim
@@ -138,6 +138,8 @@ proc getLineInfo*(L: TLexer, tok: TToken): TLineInfo {.inline.} =
 proc isKeyword*(kind: TTokType): bool =
  result = (kind >= tokKeywordLow) and (kind <= tokKeywordHigh)

+template ones(n: expr): expr = ((1 shl n)-1) # for utf-8 conversion
+
 proc isNimIdentifier*(s: string): bool =
  if s[0] in SymStartChars:
    var i = 1
@@ -589,12 +591,29 @@ proc getEscapedChar(L: var TLexer, tok: var TToken) =
  of '\\':
    add(tok.literal, '\\')
    inc(L.bufpos)
-  of 'x', 'X':
+  of 'x', 'X', 'u', 'U':
+    var tp = L.buf[L.bufpos]
    inc(L.bufpos)
    var xi = 0
    handleHexChar(L, xi)
    handleHexChar(L, xi)
-    add(tok.literal, chr(xi))
+    if tp in {'u', 'U'}:
+      handleHexChar(L, xi)
+      handleHexChar(L, xi)
+      # inlined toUTF-8 to avoid unicode and strutils dependencies.
+      if xi <=% 127:
+        add(tok.literal, xi.char )
+      elif xi <=% 0x07FF:
+        add(tok.literal, ((xi shr 6) or 0b110_00000).char )
+        add(tok.literal, ((xi and ones(6)) or 0b10_0000_00).char )
+      elif xi <=% 0xFFFF:
+        add(tok.literal, (xi shr 12 or 0b1110_0000).char )
+        add(tok.literal, (xi shr 6 and ones(6) or 0b10_0000_00).char )
+        add(tok.literal, (xi and ones(6) or 0b10_0000_00).char )
+      else: # value is above 0xFFFF: fall back to U+FFFF (EF BF BF in UTF-8)
+        add(tok.literal, "\xef\xbf\xbf" )
+    else:
+      add(tok.literal, chr(xi))
  of '0'..'9':
    if matchTwoChars(L, '0', {'0'..'9'}):
      lexMessage(L, warnOctalEscape)
--- a/tests/lexer/tstrlits.nim
+++ b/tests/lexer/tstrlits.nim
@@ -1,6 +1,6 @@
 discard """
  file: "tstrlits.nim"
-  output: "a\"\"long string\"\"\"\"\"abc\"def"
+  output: "a\"\"long string\"\"\"\"\"abc\"def_'2'●"
 """
 # Test the new different string literals

@@ -11,9 +11,13 @@ const

  raw = r"abc""def"

+  escaped = "\x5f'\50'\u25cf"
+
+
 stdout.write(rawQuote)
 stdout.write(tripleEmpty)
 stdout.write(raw)
+stdout.write(escaped)
 #OUT a""long string"""""abc"def