add RST highlighting for command line / shells (also fixes #16858) (#17789)

2026-02-13 06:43:52 +00:00 · 2021-04-21 17:57:54 +03:00
parent 80389b8053
commit 8f79bc5f3d
10 changed files with 291 additions and 140 deletions
--- a/lib/packages/docutils/highlite.nim
+++ b/lib/packages/docutils/highlite.nim
@@ -37,6 +37,18 @@
 ## .. code:: Nim
 ##   for l in ["C", "c++", "jAvA", "Nim", "c#"]: echo getSourceLanguage(l)
 ##
+## There is also a `Cmd` pseudo-language supported, which is a simple generic
+## shell/cmdline tokenizer (UNIX shell/Powershell/Windows Command):
+## no escaping, no programming language constructs besides variable definition
+## at the beginning of line. It supports these operators:
+##
+## .. code:: Cmd
+##    &  &&  |  ||  (  )  ''  ""  ;  # for comments
+##
+## Instead of escaping, always use quotes; for example,
+## `nimgrep --ext:'nim|nims' file.name`:cmd: shows how to input ``|``.
+## Any argument that contains ``.`` or ``/`` or ``\`` will be treated
+## as a file or directory.

 import
  strutils
@@ -45,7 +57,7 @@ from algorithm import binarySearch
 type
  SourceLanguage* = enum
    langNone, langNim, langCpp, langCsharp, langC, langJava,
-    langYaml, langPython
+    langYaml, langPython, langCmd
  TokenClass* = enum
    gtEof, gtNone, gtWhitespace, gtDecNumber, gtBinNumber, gtHexNumber,
    gtOctNumber, gtFloatNumber, gtIdentifier, gtKeyword, gtStringLit,
@@ -53,7 +65,7 @@ type
    gtOperator, gtPunctuation, gtComment, gtLongComment, gtRegularExpression,
    gtTagStart, gtTagEnd, gtKey, gtValue, gtRawData, gtAssembler,
    gtPreprocessor, gtDirective, gtCommand, gtRule, gtHyperlink, gtLabel,
-    gtReference, gtOther
+    gtReference, gtProgram, gtOption, gtOther
  GeneralTokenizer* = object of RootObj
    kind*: TokenClass
    start*, length*: int
@@ -64,14 +76,17 @@ type

 const
  sourceLanguageToStr*: array[SourceLanguage, string] = ["none",
-    "Nim", "C++", "C#", "C", "Java", "Yaml", "Python"]
+    "Nim", "C++", "C#", "C", "Java", "Yaml", "Python", "Cmd"]
  tokenClassToStr*: array[TokenClass, string] = ["Eof", "None", "Whitespace",
    "DecNumber", "BinNumber", "HexNumber", "OctNumber", "FloatNumber",
    "Identifier", "Keyword", "StringLit", "LongStringLit", "CharLit",
    "EscapeSequence", "Operator", "Punctuation", "Comment", "LongComment",
    "RegularExpression", "TagStart", "TagEnd", "Key", "Value", "RawData",
    "Assembler", "Preprocessor", "Directive", "Command", "Rule", "Hyperlink",
-    "Label", "Reference", "Other"]
+    "Label", "Reference",
+    # start from lower-case if there is a corresponding RST role (see rst.nim)
+    "program", "option",
+    "Other"]

  # The following list comes from doc/keywords.txt, make sure it is
  # synchronized with this array by running the module itself as a test case.
@@ -898,6 +913,65 @@ proc pythonNextToken(g: var GeneralTokenizer) =
      "with", "yield"]
  nimNextToken(g, keywords)

+proc cmdNextToken(g: var GeneralTokenizer) =  # scans one token for langCmd
+  var pos = g.pos  # scan cursor; written back to g.pos at the end
+  g.start = g.pos
+  if g.state == low(TokenClass):  # state still at the enum's first value
+    g.state = gtProgram  # so the next plain word is reported as gtProgram
+  case g.buf[pos]
+  of ' ', '\t'..'\r':
+    g.kind = gtWhitespace
+    while g.buf[pos] in {' ', '\t'..'\r'}:
+      if g.buf[pos] == '\n':
+        g.state = gtProgram  # new line: next plain word is a program again
+      inc(pos)
+  of '\'', '"':
+    g.kind = gtOption  # quoted text is always reported as gtOption
+    let q = g.buf[pos]  # the quote character that opened the string
+    inc(pos)
+    while g.buf[pos] notin {q, '\0'}:  # no escape handling inside quotes
+      inc(pos)
+    if g.buf[pos] == q: inc(pos)  # consume the closing quote if present
+  of '#':
+    g.kind = gtComment
+    while g.buf[pos] notin {'\n', '\0'}:  # comment runs to end of line
+      inc(pos)
+  of '&', '|':
+    g.kind = gtOperator
+    inc(pos)
+    if g.buf[pos] == g.buf[pos-1]: inc(pos)  # doubled form: && or ||
+    g.state = gtProgram  # next plain word becomes gtProgram
+  of '(':
+    g.kind = gtOperator
+    g.state = gtProgram  # next plain word becomes gtProgram
+    inc(pos)
+  of ')':
+    g.kind = gtOperator  # note: g.state is left unchanged here
+    inc(pos)
+  of ';':
+    g.state = gtProgram
+    g.kind = gtOperator
+    inc(pos)
+  of '\0': g.kind = gtEof  # end of buffer; pos is not advanced
+  else:
+    if g.state == gtProgram:
+      g.kind = gtProgram
+      g.state = gtOption  # later plain words default to gtOption
+    else:
+      g.kind = gtOption
+    while g.buf[pos] notin {' ', '\t'..'\r', '&', '|', '(', ')', '\'', '"', '\0'}:
+      if g.buf[pos] == ';' and g.buf[pos+1] == ' ':
+        # (check space because ';' can be used inside arguments in Win bat)
+        break
+      if g.kind == gtOption and g.buf[pos] in {'/', '\\', '.'}:
+        g.kind = gtIdentifier  # for file/dir name
+      elif g.kind == gtProgram and g.buf[pos] == '=':
+        g.kind = gtIdentifier  # for env variable setting at beginning of line
+        g.state = gtProgram  # the word after VAR=value is a program again
+      inc(pos)
+  g.length = pos - g.pos  # token spans [g.start, g.start + g.length)
+  g.pos = pos
+
 proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
  g.lang = lang
  case lang
@@ -909,6 +983,7 @@ proc getNextToken*(g: var GeneralTokenizer, lang: SourceLanguage) =
  of langJava: javaNextToken(g)
  of langYaml: yamlNextToken(g)
  of langPython: pythonNextToken(g)
+  of langCmd: cmdNextToken(g)

 when isMainModule:
  var keywords: seq[string]
--- a/lib/packages/docutils/rst.nim
+++ b/lib/packages/docutils/rst.nim
@@ -23,10 +23,10 @@
 ##
 ## Nim can output the result to HTML [#html]_ or Latex [#latex]_.
 ##
-## .. [#html] commands ``nim doc`` for ``*.nim`` files and
-##    ``nim rst2html`` for ``*.rst`` files
+## .. [#html] commands `nim doc`:cmd: for ``*.nim`` files and
+##    `nim rst2html`:cmd: for ``*.rst`` files
 ##
-## .. [#latex] command ``nim rst2tex`` for ``*.rst``.
+## .. [#latex] command `nim rst2tex`:cmd: for ``*.rst``.
 ##
 ## If you are new to RST please consider reading the following:
 ##
@@ -78,14 +78,21 @@
 ##
 ## * directives: ``code-block`` [cmp:Sphinx]_, ``title``,
 ##   ``index`` [cmp:Sphinx]_
-## * predefined roles ``:nim:`` (default), ``:c:`` (C programming language),
-##   ``:python:``, ``:yaml:``, ``:java:``, ``:cpp:`` (C++), ``:csharp`` (C#).
-##   That is every language that `highlite <highlite.html>`_ supports.
-##   They turn on appropriate syntax highlighting in inline code.
+## * predefined roles
+##   - ``:nim:`` (default), ``:c:`` (C programming language),
+##     ``:python:``, ``:yaml:``, ``:java:``, ``:cpp:`` (C++), ``:csharp:`` (C#).
+##     That is every language that `highlite <highlite.html>`_ supports.
+##     They turn on appropriate syntax highlighting in inline code.
 ##
-##   .. Note:: default role for Nim files is ``:nim:``,
-##             for ``*.rst`` it's currently ``:literal:``.
+##     .. Note:: default role for Nim files is ``:nim:``,
+##               for ``*.rst`` it's currently ``:literal:``.
 ##
+##   - generic command line highlighting roles:
+##     - ``:cmd:`` for commands and common shell syntax
+##     - ``:program:`` for executable names [cmp:Sphinx]_
+##       (one can just use ``:cmd:`` on a single word)
+##     - ``:option:`` for command line options [cmp:Sphinx]_
+##   - ``:tok:``, a role for highlighting of programming language tokens
 ## * ***triple emphasis*** (bold and italic) using \*\*\*
 ## * ``:idx:`` role for \`interpreted text\` to include the link to this
 ##   text into an index (example: `Nim index`_).
@@ -95,11 +102,11 @@
 ##     //compile   compile the project
 ##     //doc       generate documentation
 ##
-##   Here the dummy `//` will disappear, while options ``compile``
-##   and ``doc`` will be left in the final document.
+##   Here the dummy `//` will disappear, while options `compile`:option:
+##   and `doc`:option: will be left in the final document.
 ##
 ## .. [cmp:Sphinx] similar but different from the directives of
-##    Python `Sphinx directives`_ extensions
+##    Python `Sphinx directives`_ and `Sphinx roles`_ extensions
 ##
 ## .. _`extra features`:
 ##
@@ -144,7 +151,7 @@
 ## -----
 ##
 ## See `Nim DocGen Tools Guide <docgen.html>`_ for the details about
-## ``nim doc``, ``nim rst2html`` and ``nim rst2tex`` commands.
+## `nim doc`:cmd:, `nim rst2html`:cmd: and `nim rst2tex`:cmd: commands.
 ##
 ## See `packages/docutils/rstgen module <rstgen.html>`_ to know how to
 ## generate HTML or Latex strings to embed them into your documents.
@@ -156,6 +163,7 @@
 ## .. _RST roles list: https://docutils.sourceforge.io/docs/ref/rst/roles.html
 ## .. _Nim index: https://nim-lang.org/docs/theindex.html
 ## .. _Sphinx directives: https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html
+## .. _Sphinx roles: https://www.sphinx-doc.org/en/master/usage/restructuredtext/roles.html

 import
  os, strutils, rstast, std/enumutils, algorithm, lists, sequtils,
@@ -530,7 +538,7 @@ proc defaultRole(options: RstParseOptions): string =

 # mirror highlite.nim sourceLanguageToStr with substitutions c++ cpp, c# csharp
 const supportedLanguages = ["nim", "yaml", "python", "java", "c",
-                            "cpp", "csharp"]
+                            "cpp", "csharp", "cmd"]

 proc whichRoleAux(sym: string): RstNodeKind =
  let r = sym.toLowerAscii
@@ -543,6 +551,7 @@ proc whichRoleAux(sym: string): RstNodeKind =
  of "sup", "superscript": result = rnSup
  # literal and code are the same in our implementation
  of "code": result = rnInlineLiteral
+  of "program", "option", "tok": result = rnCodeFragment
  # c++ currently can be spelled only as cpp, c# only as csharp
  elif r in supportedLanguages:
    result = rnInlineCode
@@ -1113,10 +1122,10 @@ proc toInlineCode(n: PRstNode, language: string): PRstNode =
  lb.add newLeaf(s)
  result.add lb

-proc toUnknownRole(n: PRstNode, roleName: string): PRstNode =
+proc toOtherRole(n: PRstNode, kind: RstNodeKind, roleName: string): PRstNode =
   let newN = newRstNode(rnInner, n.sons)
   let newSons = @[newN, newLeaf(roleName)]
-  result = newRstNode(rnUnknownRole, newSons)
+  result = newRstNode(kind, newSons)  # sons: [inner content, role-name leaf]

 proc parsePostfix(p: var RstParser, n: PRstNode): PRstNode =
  var newKind = n.kind
@@ -1144,8 +1153,8 @@ proc parsePostfix(p: var RstParser, n: PRstNode): PRstNode =
    # a role:
    let (roleName, lastIdx) = getRefname(p, p.idx+1)
    newKind = whichRole(p, roleName)
-    if newKind == rnUnknownRole:
-      result = n.toUnknownRole(roleName)
+    if newKind in {rnUnknownRole, rnCodeFragment}:
+      result = n.toOtherRole(newKind, roleName)
    elif newKind == rnInlineCode:
      result = n.toInlineCode(language=roleName)
    else:
@@ -1417,8 +1426,8 @@ proc parseInline(p: var RstParser, father: PRstNode) =
      if k == rnInlineCode:
        n = n.toInlineCode(language=roleName)
      parseUntil(p, n, "`", false) # bug #17260
-      if k == rnUnknownRole:
-        n = n.toUnknownRole(roleName)
+      if k in {rnUnknownRole, rnCodeFragment}:
+        n = n.toOtherRole(k, roleName)
      father.add(n)
    elif isInlineMarkupStart(p, "`"):
      var n = newRstNode(rnInterpretedText)
--- a/lib/packages/docutils/rstast.nim
+++ b/lib/packages/docutils/rstast.nim
@@ -56,7 +56,9 @@ type
                              #     * `file#id <file#id>'_
    rnSubstitutionDef,        # a definition of a substitution
    # Inline markup:
-    rnInlineCode,
+    rnInlineCode,             # interpreted text with code in a known language
+    rnCodeFragment,           # inline code for highlighting with the specified
+                              # class (which cannot be inferred from context)
    rnUnknownRole,            # interpreted text with an unknown role
    rnSub, rnSup, rnIdx,
    rnEmphasis,               # "*"
--- a/lib/packages/docutils/rstgen.nim
+++ b/lib/packages/docutils/rstgen.nim
@@ -1198,7 +1198,8 @@ proc renderRstToOut(d: PDoc, n: PRstNode, result: var string) =
        "$1", result)
  of rnOptionGroup:
    renderAux(d, n,
-        "<div class=\"option-list-label\">$1</div>",
+        "<div class=\"option-list-label\"><tt><span class=\"option\">" &
+        "$1</span></tt></div>",
        "\\item[$1]", result)
  of rnDescription:
    renderAux(d, n, "<div class=\"option-list-description\">$1</div>",
@@ -1319,13 +1320,22 @@ proc renderRstToOut(d: PDoc, n: PRstNode, result: var string) =
    renderAux(d, n, "|$1|", "|$1|", result)
  of rnDirective:
    renderAux(d, n, "", "", result)
-  of rnUnknownRole:
+  of rnUnknownRole, rnCodeFragment:
    var tmp0 = ""
    var tmp1 = ""
    renderRstToOut(d, n.sons[0], tmp0)
    renderRstToOut(d, n.sons[1], tmp1)
-    dispA(d.target, result, "<span class=\"$2\">$1</span>", "\\span$2{$1}",
-          [tmp0, tmp1])
+    var class = tmp1
+    # don't allow missing role break latex compilation:
+    if d.target == outLatex and n.kind == rnUnknownRole: class = "Other"
+    if n.kind == rnCodeFragment:
+      dispA(d.target, result,
+            "<tt class=\"docutils literal\"><span class=\"pre $2\">" &
+              "$1</span></tt>",
+            "\\texttt{\\span$2{$1}}", [tmp0, class])
+    else:  # rnUnknownRole, not necessarily code/monospace font
+      dispA(d.target, result, "<span class=\"$2\">$1</span>", "\\span$2{$1}",
+            [tmp0, class])
  of rnSub: renderAux(d, n, "<sub>$1</sub>", "\\rstsub{$1}", result)
  of rnSup: renderAux(d, n, "<sup>$1</sup>", "\\rstsup{$1}", result)
  of rnEmphasis: renderAux(d, n, "<em>$1</em>", "\\emph{$1}", result)