highlite: fix #17890 - tokenize Nim escape seq-s (#17919)

* highlite: fix #17890 - tokenize Nim escape seq-s * Update tests/stdlib/thighlite.nim Co-authored-by: Timothee Cour <timothee.cour2@gmail.com> Co-authored-by: Timothee Cour <timothee.cour2@gmail.com>
2026-07-12 12:19:40 +00:00 · 2021-05-03 11:21:36 +03:00
parent 0dc534832e
commit 287f1170ba
2 changed files with 48 additions and 21 deletions
--- a/lib/packages/docutils/highlite.nim
+++ b/lib/packages/docutils/highlite.nim
@@ -190,31 +190,33 @@ proc nimNextToken(g: var GeneralTokenizer, keywords: openArray[string] = @[]) =
  var pos = g.pos
  g.start = g.pos
  if g.state == gtStringLit:
-    g.kind = gtStringLit
-    while true:
+    if g.buf[pos] == '\\':
+      g.kind = gtEscapeSequence
+      inc(pos)
      case g.buf[pos]
-      of '\\':
-        g.kind = gtEscapeSequence
+      of 'x', 'X':
        inc(pos)
-        case g.buf[pos]
-        of 'x', 'X':
-          inc(pos)
-          if g.buf[pos] in hexChars: inc(pos)
-          if g.buf[pos] in hexChars: inc(pos)
-        of '0'..'9':
-          while g.buf[pos] in {'0'..'9'}: inc(pos)
-        of '\0':
-          g.state = gtNone
-        else: inc(pos)
-        break
-      of '\0', '\r', '\n':
+        if g.buf[pos] in hexChars: inc(pos)
+        if g.buf[pos] in hexChars: inc(pos)
+      of '0'..'9':
+        while g.buf[pos] in {'0'..'9'}: inc(pos)
+      of '\0':
        g.state = gtNone
-        break
-      of '\"':
-        inc(pos)
-        g.state = gtNone
-        break
      else: inc(pos)
+    else:
+      g.kind = gtStringLit
+      while true:
+        case g.buf[pos]
+        of '\\':
+          break
+        of '\0', '\r', '\n':
+          g.state = gtNone
+          break
+        of '\"':
+          inc(pos)
+          g.state = gtNone
+          break
+        else: inc(pos)
  else:
    case g.buf[pos]
    of ' ', '\t'..'\r':
@@ -985,6 +987,18 @@ proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
  of langPython: pythonNextToken(g)
  of langCmd: cmdNextToken(g)

+proc tokenize*(text: string, lang: SourceLanguage): seq[(string, TokenClass)] =
+  var g: GeneralTokenizer
+  initGeneralTokenizer(g, text)
+  var prevPos = 0
+  while true:
+    getNextToken(g, lang)
+    if g.kind == gtEof:
+      break
+    var s = text[prevPos ..< g.pos]
+    result.add (s, g.kind)
+    prevPos = g.pos
+
 when isMainModule:
  var keywords: seq[string]
  # Try to work running in both the subdir or at the root.
--- a/tests/stdlib/thighlite.nim
+++ b/tests/stdlib/thighlite.nim
@@ -0,0 +1,13 @@
+
+import unittest
+import ../../lib/packages/docutils/highlite
+
+block: # Nim tokenizing"
+  test "string literals and escape seq":
+    check("\"ok1\\nok2\\nok3\"".tokenize(langNim) ==
+       @[("\"ok1", gtStringLit), ("\\n", gtEscapeSequence), ("ok2", gtStringLit),
+         ("\\n", gtEscapeSequence), ("ok3\"", gtStringLit)
+      ])
+    check("\"\"\"ok1\\nok2\\nok3\"\"\"".tokenize(langNim) ==
+       @[("\"\"\"ok1\\nok2\\nok3\"\"\"", gtLongStringLit)
+      ])