changed handling of generalized string literals

2026-02-13 06:43:52 +00:00 · 2011-02-07 00:11:11 +01:00
parent 77d045b3ac
commit 134f24f579
10 changed files with 76 additions and 39 deletions
--- a/doc/grammar.txt
+++ b/doc/grammar.txt
@@ -29,15 +29,18 @@ symbol ::= '`' (KEYWORD | IDENT | operator | '(' ')'
         | IDENT
         
 primaryPrefix ::= (prefixOperator | 'bind') optInd
-primarySuffix ::= '.' optInd symbol
+primarySuffix ::= '.' optInd symbol [generalizedLit]
                | '(' optInd namedExprList optPar ')'
                | '[' optInd [indexExpr (comma indexExpr)* [comma]] optPar ']'
                | '^'
                | pragma

-primary ::= primaryPrefix* (symbol | constructor | castExpr | addrExpr)
+primary ::= primaryPrefix* (symbol [generalizedLit] | 
+                            constructor | castExpr | addrExpr)
            primarySuffix*

+generalizedLit ::= GENERALIZED_STR_LIT | GENERALIZED_TRIPLESTR_LIT
+
 literal ::= INT_LIT | INT8_LIT | INT16_LIT | INT32_LIT | INT64_LIT
          | FLOAT_LIT | FLOAT32_LIT | FLOAT64_LIT
          | STR_LIT | RSTR_LIT | TRIPLESTR_LIT
--- a/doc/manual.txt
+++ b/doc/manual.txt
@@ -178,6 +178,8 @@ the exact spelling of an identifier.
 String literals
 ---------------

+Terminal symbol in the grammar: ``STR_LIT``.
+
 `String literals`:idx: can be delimited by matching double quotes, and can
 contain the following `escape sequences`:idx:\ :

@@ -205,12 +207,14 @@ contain the following `escape sequences`:idx:\ :


 Strings in Nimrod may contain any 8-bit value, even embedded zeros. However 
-some operations may interpret the first binary zero as terminator.
+some operations may interpret the first binary zero as a terminator.


 Triple quoted string literals
 -----------------------------

+Terminal symbol in the grammar: ``TRIPLESTR_LIT``.
+
 String literals can also be delimited by three double quotes
 ``"""`` ... ``"""``.
 Literals in this form may run for several lines, may contain ``"`` and do not
@@ -230,10 +234,12 @@ Produces::
 Raw string literals
 -------------------

-There are also `raw string literals` that are preceded with the letter ``r``
-(or ``R``) and are delimited by matching double quotes (just like ordinary
-string literals) and do not interpret the escape sequences. This is especially
-convenient for regular expressions or Windows paths:
+Terminal symbol in the grammar: ``RSTR_LIT``.
+
+There are also `raw string literals`:idx: that are preceded by the 
+letter ``r`` (or ``R``) and are delimited by matching double quotes (just 
+like ordinary string literals) and do not interpret the escape sequences. 
+This is especially convenient for regular expressions or Windows paths:

 .. code-block:: nimrod

@@ -250,12 +256,17 @@ Produces::
  a"b

 ``r""""`` is not possible with this notation, because the three leading 
-quotes introduce a triple quoted string literal. 
+quotes introduce a triple quoted string literal. ``r"""`` is the same 
+as ``"""`` since triple quoted string literals do not interpret escape 
+sequences either.


 Generalized raw string literals
 -------------------------------

+Terminal symbols in the grammar: ``GENERALIZED_STR_LIT``, 
+``GENERALIZED_TRIPLESTR_LIT``.
+
 The construct ``identifier"string literal"`` (without whitespace between the
 identifier and the opening quotation mark) is a
 `generalized raw string literal`:idx:. It is a shortcut for the construct
--- a/rod/options.nim
+++ b/rod/options.nim
@@ -128,7 +128,7 @@ proc addImplicitMod(filename: string) =
  gImplicitMods[length] = filename

 proc getPrefixDir(): string = 
-  result = SplitPath(getApplicationDir()).head
+  result = SplitPath(getAppDir()).head

 proc shortenDir(dir: string): string = 
  # returns the interesting part of a dir
--- a/rod/pbraces.nim
+++ b/rod/pbraces.nim
@@ -163,12 +163,28 @@ proc parseAddr(p: var TParser): PNode =
  addSon(result, parseExpr(p))
  optPar(p)
  eat(p, tkParRi)
+  
+proc parseGStrLit(p: var TParser, a: PNode): PNode = 
+  case p.tok.tokType
+  of tkGStrLit: 
+    result = newNodeP(nkCallStrLit, p)
+    addSon(result, a)
+    addSon(result, newStrNodeP(nkRStrLit, p.tok.literal, p))
+    getTok(p)
+  of tkGTripleStrLit: 
+    result = newNodeP(nkCallStrLit, p)
+    addSon(result, a)
+    addSon(result, newStrNodeP(nkTripleStrLit, p.tok.literal, p))
+    getTok(p)
+  else:
+    result = a

 proc identOrLiteral(p: var TParser): PNode = 
  case p.tok.tokType
  of tkSymbol: 
    result = newIdentNodeP(p.tok.ident, p)
    getTok(p)
+    result = parseGStrLit(p, result)
  of tkAccent: 
    result = accExpr(p)       # literals
  of tkIntLit: 
@@ -212,16 +228,6 @@ proc identOrLiteral(p: var TParser): PNode =
  of tkTripleStrLit: 
    result = newStrNodeP(nkTripleStrLit, p.tok.literal, p)
    getTok(p)
-  of tkCallRStrLit: 
-    result = newNodeP(nkCallStrLit, p)
-    addSon(result, newIdentNodeP(p.tok.ident, p))
-    addSon(result, newStrNodeP(nkRStrLit, p.tok.literal, p))
-    getTok(p)
-  of tkCallTripleStrLit: 
-    result = newNodeP(nkCallStrLit, p)
-    addSon(result, newIdentNodeP(p.tok.ident, p))
-    addSon(result, newStrNodeP(nkTripleStrLit, p.tok.literal, p))
-    getTok(p)
  of tkCharLit: 
    result = newIntNodeP(nkCharLit, ord(p.tok.literal[0]), p)
    getTok(p)
@@ -279,6 +285,7 @@ proc primary(p: var TParser): PNode =
      getTok(p)               # skip '.'
      optInd(p, result)
      addSon(result, parseSymbol(p))
+      result = parseGStrLit(p, result)
    of tkHat: 
      a = result
      result = newNodeP(nkDerefExpr, p)
--- a/rod/pnimsyn.nim
+++ b/rod/pnimsyn.nim
@@ -394,11 +394,27 @@ proc setBaseFlags(n: PNode, base: TNumericalBase) =
  of base8: incl(n.flags, nfBase8)
  of base16: incl(n.flags, nfBase16)
  
+proc parseGStrLit(p: var TParser, a: PNode): PNode = 
+  case p.tok.tokType
+  of tkGStrLit: 
+    result = newNodeP(nkCallStrLit, p)
+    addSon(result, a)
+    addSon(result, newStrNodeP(nkRStrLit, p.tok.literal, p))
+    getTok(p)
+  of tkGTripleStrLit: 
+    result = newNodeP(nkCallStrLit, p)
+    addSon(result, a)
+    addSon(result, newStrNodeP(nkTripleStrLit, p.tok.literal, p))
+    getTok(p)
+  else:
+    result = a
+  
 proc identOrLiteral(p: var TParser): PNode = 
  case p.tok.tokType
  of tkSymbol: 
    result = newIdentNodeP(p.tok.ident, p)
    getTok(p)
+    result = parseGStrLit(p, result)
  of tkAccent: 
    result = accExpr(p)       # literals
  of tkIntLit: 
@@ -442,16 +458,6 @@ proc identOrLiteral(p: var TParser): PNode =
  of tkTripleStrLit: 
    result = newStrNodeP(nkTripleStrLit, p.tok.literal, p)
    getTok(p)
-  of tkCallRStrLit: 
-    result = newNodeP(nkCallStrLit, p)
-    addSon(result, newIdentNodeP(p.tok.ident, p))
-    addSon(result, newStrNodeP(nkRStrLit, p.tok.literal, p))
-    getTok(p)
-  of tkCallTripleStrLit: 
-    result = newNodeP(nkCallStrLit, p)
-    addSon(result, newIdentNodeP(p.tok.ident, p))
-    addSon(result, newStrNodeP(nkTripleStrLit, p.tok.literal, p))
-    getTok(p)
  of tkCharLit: 
    result = newIntNodeP(nkCharLit, ord(p.tok.literal[0]), p)
    getTok(p)
@@ -509,6 +515,7 @@ proc primary(p: var TParser): PNode =
      getTok(p)               # skip '.'
      optInd(p, result)
      addSon(result, parseSymbol(p))
+      result = parseGStrLit(p, result)
    of tkHat: 
      a = result
      result = newNodeP(nkDerefExpr, p)
--- a/rod/rnimsyn.nim
+++ b/rod/rnimsyn.nim
@@ -684,7 +684,7 @@ proc gsub(g: var TSrcGen, n: PNode, c: TContext) =
    if n.sons[1].kind == nkRStrLit: 
      put(g, tkRStrLit, '\"' & replace(n[1].strVal, "\"", "\"\"") & '\"')
    else: 
-      gsub(g, n.sons[0])
+      gsub(g, n.sons[1])
  of nkHiddenStdConv, nkHiddenSubConv, nkHiddenCallConv: gsub(g, n.sons[0])
  of nkCast: 
    put(g, tkCast, "cast")
--- a/rod/scanner.nim
+++ b/rod/scanner.nim
@@ -58,7 +58,7 @@ type
    tkYield, #[[[end]]]
    tkIntLit, tkInt8Lit, tkInt16Lit, tkInt32Lit, tkInt64Lit, tkFloatLit, 
    tkFloat32Lit, tkFloat64Lit, tkStrLit, tkRStrLit, tkTripleStrLit, 
-    tkCallRStrLit, tkCallTripleStrLit, tkCharLit, tkParLe, tkParRi, tkBracketLe, 
+    tkGStrLit, tkGTripleStrLit, tkCharLit, tkParLe, tkParRi, tkBracketLe, 
    tkBracketRi, tkCurlyLe, tkCurlyRi, 
    tkBracketDotLe, tkBracketDotRi, # [. and  .]
    tkCurlyDotLe, tkCurlyDotRi, # {.  and  .}
@@ -91,7 +91,7 @@ const
    "yield", #[[[end]]]
    "tkIntLit", "tkInt8Lit", "tkInt16Lit", "tkInt32Lit", "tkInt64Lit", 
    "tkFloatLit", "tkFloat32Lit", "tkFloat64Lit", "tkStrLit", "tkRStrLit", 
-    "tkTripleStrLit", "tkCallRStrLit", "tkCallTripleStrLit", "tkCharLit", "(", 
+    "tkTripleStrLit", "tkGStrLit", "tkGTripleStrLit", "tkCharLit", "(", 
    ")", "[", "]", "{", "}", "[.", ".]", "{.", ".}", "(.", ".)", ",", ";", ":", 
    "=", ".", "..", "^", "tkOpr", "tkComment", "`", "[new indentation]", 
    "[same indentation]", "[dedentation]", "tkSpaces", "tkInfixOpr", 
@@ -587,10 +587,11 @@ proc getSymbol(L: var TLexer, tok: var TToken) =
    tok.tokType = tkSymbol
  else: 
    tok.tokType = TTokType(tok.ident.id + ord(tkSymbol))
-  if buf[pos] == '\"': 
-    getString(L, tok, true)
-    if tok.tokType == tkRStrLit: tok.tokType = tkCallRStrLit
-    else: tok.tokType = tkCallTripleStrLit
+  when false:
+    if buf[pos] == '\"': 
+      getString(L, tok, true)
+      if tok.tokType == tkRStrLit: tok.tokType = tkCallRStrLit
+      else: tok.tokType = tkCallTripleStrLit
  
 proc getOperator(L: var TLexer, tok: var TToken) = 
  var pos = L.bufpos
@@ -770,7 +771,13 @@ proc rawGetTok(L: var TLexer, tok: var TToken) =
      tok.tokType = tkAccent
      Inc(L.bufpos)
    of '\"': 
-      getString(L, tok, false)
+      # check for generalized raw string literal:
+      var rawMode = L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars
+      getString(L, tok, rawMode)
+      if rawMode:
+        # tkRStrLit -> tkGStrLit
+        # tkTripleStrLit -> tkGTripleStrLit
+        inc(tok.tokType, 2)
    of '\'':
      tok.tokType = tkCharLit
      getCharacter(L, tok)
--- a/tests/accept/run/tregex.nim
+++ b/tests/accept/run/tregex.nim
@@ -6,7 +6,8 @@ import

 if "keyA = valueA" =~ re"\s*(\w+)\s*\=\s*(\w+)":
  write(stdout, "key: ", matches[0])
-elif "# comment!" =~ re"\s*(\#.*)":
+elif "# comment!" =~ re.re"\s*(\#.*)": 
+  # test re.re"" syntax
  echo("comment: ", matches[0])
 else: 
  echo("Bug!")
--- a/todo.txt
+++ b/todo.txt
@@ -1,7 +1,6 @@
 - thread support: threadvar on Windows seems broken; 
  add --deadlock_prevention:on|off switch
 - built-in serialization
- change how generalized raw string literals work

 - we need a way to disable tests
 - deprecate ^ and make it available as operator
--- a/web/news.txt
+++ b/web/news.txt
@@ -32,6 +32,8 @@ Changes affecting backwards compatibility
  instead.
 - Deprecated ``os.getApplicationDir``: Use ``os.getAppDir`` 
  instead.
+- Changed and documented how generalized string literals work: The syntax
+  ``module.re"abc"`` is now supported.


 Additions