Peg captures get stack-like behavior (#18369)

* Implements reverse capture indexing. * Now works for modified backrefs too. * Changed reverse indexing syntax prefix for back-references to '$^'.
2026-02-12 06:18:51 +00:00 · 2021-06-28 12:33:20 +02:00
parent 908b2cc2e4
commit e720bbdd76
3 changed files with 94 additions and 34 deletions
--- a/doc/pegdocs.txt
+++ b/doc/pegdocs.txt
@@ -27,7 +27,10 @@ notation           meaning
 ``{E}``            Capture: Apply expression `E` and store the substring
                   that matched `E` into a *capture* that can be accessed
                   after the matching process.
-``$i``             Back reference to the ``i``th capture. ``i`` counts from 1.
+``{}``             Empty capture: Delete the last capture. No character
+                   is consumed.
+``$i``             Back reference to the ``i``th capture. ``i`` counts forwards
+                   from 1 or backwards (last capture to first) from ^1.
 ``$``              Anchor: Matches at the end of the input. No character
                   is consumed. Same as ``!.``.
 ``^``              Anchor: Matches at the start of the input. No character
@@ -149,14 +152,15 @@ The PEG parser implements this grammar (written in PEG syntax)::
  rule <- identifier \s* "<-" expr ig
  identNoArrow <- identifier !(\s* "<-")
  prefixOpr <- ig '&' / ig '!' / ig '@' / ig '{@}' / ig '@@'
-  literal <- ig identifier? '$' [0-9]+ / '$' / '^' /
+  literal <- ig identifier? '$' '^'? [0-9]+ / '$' / '^' /
             ig identNoArrow /
             ig charset /
             ig stringlit /
             ig builtin /
             ig '.' /
             ig '_' /
-             (ig "(" expr ig ")")
+             (ig "(" expr ig ")") /
+             (ig "{" expr? ig "}")
  postfixOpr <- ig '?' / ig '*' / ig '+'
  primary <- prefixOpr* (literal postfixOpr*)

--- a/lib/pure/pegs.nim
+++ b/lib/pure/pegs.nim
@@ -83,7 +83,7 @@ type
    of pkChar, pkGreedyRepChar: ch: char
    of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char]
    of pkNonTerminal: nt: NonTerminal
-    of pkBackRef..pkBackRefIgnoreStyle: index: range[0..MaxSubpatterns]
+    of pkBackRef..pkBackRefIgnoreStyle: index: range[-MaxSubpatterns..MaxSubpatterns-1]
    else: sons: seq[Peg]
  NonTerminal* = ref NonTerminalObj

@@ -106,7 +106,7 @@ proc nt*(p: Peg): NonTerminal = p.nt
  ## Returns the *NonTerminal* object of a given *Peg* variant object
  ## where present.

-proc index*(p: Peg): range[0..MaxSubpatterns] = p.index
+proc index*(p: Peg): range[-MaxSubpatterns..MaxSubpatterns-1] = p.index
  ## Returns the back-reference index of a captured sub-pattern in the
  ## *Captures* object for a given *Peg* variant object where present.

@@ -304,34 +304,37 @@ proc endAnchor*: Peg {.inline.} =
  ## constructs the PEG ``$`` which matches the end of the input.
  result = !any()

-proc capture*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsCapture".} =
+proc capture*(a: Peg = Peg(kind: pkEmpty)): Peg {.noSideEffect, rtl, extern: "npegsCapture".} =
  ## constructs a capture with the PEG `a`
  result = Peg(kind: pkCapture, sons: @[a])

-proc backref*(index: range[1..MaxSubpatterns]): Peg {.
+proc backref*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
  noSideEffect, rtl, extern: "npegs$1".} =
  ## constructs a back reference of the given `index`. `index` starts counting
-  ## from 1.
-  result = Peg(kind: pkBackRef, index: index-1)
+  ## from 1. `reverse` specifies wether indexing starts from the end of the
+  ## capture list.
+  result = Peg(kind: pkBackRef, index: (if reverse: -index else: index - 1))

-proc backrefIgnoreCase*(index: range[1..MaxSubpatterns]): Peg {.
+proc backrefIgnoreCase*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
  noSideEffect, rtl, extern: "npegs$1".} =
  ## constructs a back reference of the given `index`. `index` starts counting
-  ## from 1. Ignores case for matching.
-  result = Peg(kind: pkBackRefIgnoreCase, index: index-1)
+  ## from 1. `reverse` specifies wether indexing starts from the end of the
+  ## capture list. Ignores case for matching.
+  result = Peg(kind: pkBackRefIgnoreCase, index: (if reverse: -index else: index - 1))

-proc backrefIgnoreStyle*(index: range[1..MaxSubpatterns]): Peg {.
+proc backrefIgnoreStyle*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
  noSideEffect, rtl, extern: "npegs$1".} =
  ## constructs a back reference of the given `index`. `index` starts counting
-  ## from 1. Ignores style for matching.
-  result = Peg(kind: pkBackRefIgnoreStyle, index: index-1)
+  ## from 1. `reverse` specifies wether indexing starts from the end of the
+  ## capture list. Ignores style for matching.
+  result = Peg(kind: pkBackRefIgnoreStyle, index: (if reverse: -index else: index - 1))

 proc spaceCost(n: Peg): int =
  case n.kind
  of pkEmpty: discard
  of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar,
     pkGreedyRepChar, pkCharChoice, pkGreedyRepSet,
-     pkAny..pkWhitespace, pkGreedyAny:
+     pkAny..pkWhitespace, pkGreedyAny, pkBackRef..pkBackRefIgnoreStyle:
    result = 1
  of pkNonTerminal:
    # we cannot inline a rule with a non-terminal
@@ -561,8 +564,10 @@ template matchOrParse(mopProc: untyped) =
      # Parse handler code must run in an *of* clause of its own for each
      # *PegKind*, so we encapsulate the identical clause body for
      # *pkBackRef..pkBackRefIgnoreStyle* here.
-      if p.index >= c.ml: return -1
-      var (a, b) = c.matches[p.index]
+      var index = p.index
+      if index < 0: index.inc(c.ml)
+      if index < 0 or index >= c.ml: return -1
+      var (a, b) = c.matches[index]
      var n: Peg
      case p.kind
      of pkBackRef:
@@ -822,15 +827,19 @@ template matchOrParse(mopProc: untyped) =
      leave(pkNotPredicate, s, p, start, result)
    of pkCapture:
      enter(pkCapture, s, p, start)
-      var idx = c.ml # reserve a slot for the subpattern
-      inc(c.ml)
-      result = mopProc(s, p.sons[0], start, c)
-      if result >= 0:
-        if idx < MaxSubpatterns:
-          c.matches[idx] = (start, start+result-1)
-        #else: silently ignore the capture
+      if p.sons.len == 0 or p.sons[0].kind == pkEmpty:
+        # empty capture removes last match
+        dec(c.ml)
+        c.matches[c.ml] = (0, 0)
+        result = 0 # match of length 0
      else:
-        c.ml = idx
+        var idx = c.ml # reserve a slot for the subpattern
+        result = mopProc(s, p.sons[0], start, c)
+        if result >= 0:
+          inc(c.ml)
+          if idx < MaxSubpatterns:
+            c.matches[idx] = (start, start+result-1)
+          #else: silently ignore the capture
      leave(pkCapture, s, p, start, result)
    of pkBackRef:
      enter(pkBackRef, s, p, start)
@@ -1395,6 +1404,7 @@ type
    tkCurlyLe,    ## '{'
    tkCurlyRi,    ## '}'
    tkCurlyAt,    ## '{@}'
+    tkEmptyCurl,  ## '{}'
    tkArrow,      ## '<-'
    tkBar,        ## '/'
    tkStar,       ## '*'
@@ -1427,7 +1437,7 @@ type
 const
  tokKindToStr: array[TokKind, string] = [
    "invalid", "[EOF]", ".", "_", "identifier", "string literal",
-    "character set", "(", ")", "{", "}", "{@}",
+    "character set", "(", ")", "{", "}", "{@}", "{}",
    "<-", "/", "*", "+", "&", "!", "?",
    "@", "built-in", "escaped", "$", "$", "^"
  ]
@@ -1564,13 +1574,21 @@ proc getString(c: var PegLexer, tok: var Token) =

 proc getDollar(c: var PegLexer, tok: var Token) =
  var pos = c.bufpos + 1
+  var neg = false
+  if pos < c.buf.len and c.buf[pos] == '^':
+    neg = true
+    inc(pos)
  if pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
    tok.kind = tkBackref
    tok.index = 0
    while pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
      tok.index = tok.index * 10 + ord(c.buf[pos]) - ord('0')
      inc(pos)
+    if neg:
+      tok.index = -tok.index
  else:
+    if neg:
+      dec(pos)
    tok.kind = tkDollar
  c.bufpos = pos

@@ -1670,6 +1688,10 @@ proc getTok(c: var PegLexer, tok: var Token) =
      tok.kind = tkCurlyAt
      inc(c.bufpos, 2)
      add(tok.literal, "{@}")
+    elif c.buf[c.bufpos] == '}' and c.bufpos < c.buf.len:
+      tok.kind = tkEmptyCurl
+      inc(c.bufpos)
+      add(tok.literal, "{}")
    else:
      tok.kind = tkCurlyLe
      add(tok.literal, '{')
@@ -1705,7 +1727,7 @@ proc getTok(c: var PegLexer, tok: var Token) =
      return
    if c.buf[c.bufpos] in {'\'', '"'} or
        c.buf[c.bufpos] == '$' and c.bufpos+1 < c.buf.len and
-        c.buf[c.bufpos+1] in {'0'..'9'}:
+        c.buf[c.bufpos+1] in {'^', '0'..'9'}:
      case tok.literal
      of "i": tok.modifier = modIgnoreCase
      of "y": tok.modifier = modIgnoreStyle
@@ -1819,10 +1841,13 @@ proc modifiedTerm(s: string, m: Modifier): Peg =
  of modIgnoreStyle: result = termIgnoreStyle(s)

 proc modifiedBackref(s: int, m: Modifier): Peg =
+  var
+    reverse = s < 0
+    index = if reverse: -s else: s
  case m
-  of modNone, modVerbatim: result = backref(s)
-  of modIgnoreCase: result = backrefIgnoreCase(s)
-  of modIgnoreStyle: result = backrefIgnoreStyle(s)
+  of modNone, modVerbatim: result = backref(index, reverse)
+  of modIgnoreCase: result = backrefIgnoreCase(index, reverse)
+  of modIgnoreStyle: result = backrefIgnoreStyle(index, reverse)

 proc builtin(p: var PegParser): Peg =
  # do not use "y", "skip" or "i" as these would be ambiguous
@@ -1896,6 +1921,9 @@ proc primary(p: var PegParser): Peg =
    result = capture(parseExpr(p)).token(p)
    eat(p, tkCurlyRi)
    inc(p.captures)
+  of tkEmptyCurl:
+    result = capture()
+    getTok(p)
  of tkAny:
    result = any().token(p)
    getTok(p)
@@ -1915,11 +1943,11 @@ proc primary(p: var PegParser): Peg =
    result = startAnchor()
    getTok(p)
  of tkBackref:
+    if abs(p.tok.index) > p.captures or p.tok.index == 0:
+      pegError(p, "invalid back reference index: " & $p.tok.index)
    var m = p.tok.modifier
    if m == modNone: m = p.modifier
    result = modifiedBackref(p.tok.index, m).token(p)
-    if p.tok.index < 0 or p.tok.index > p.captures:
-      pegError(p, "invalid back reference index: " & $p.tok.index)
    getTok(p)
  else:
    pegError(p, "expression expected, but found: " & p.tok.literal)
@@ -1943,7 +1971,7 @@ proc seqExpr(p: var PegParser): Peg =
    case p.tok.kind
    of tkAmp, tkNot, tkAt, tkStringLit, tkCharSet, tkParLe, tkCurlyLe,
       tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref,
-       tkHat, tkCurlyAt:
+       tkHat, tkCurlyAt, tkEmptyCurl:
      result = sequence(result, primary(p))
    of tkIdentifier:
      if not arrowIsNextTok(p):
--- a/tests/stdlib/tpegs.nim
+++ b/tests/stdlib/tpegs.nim
@@ -293,6 +293,34 @@ block:

    doAssert "test1".match(peg"""{@}$""")
    doAssert "test2".match(peg"""{(!$ .)*} $""")
+
+    doAssert "abbb".match(peg"{a} {b} $2 $^1")
+    doAssert "abBA".match(peg"{a} {b} i$2 i$^2")
+
+    doAssert "abba".match(peg"{a} {b} $^1 {} $^1")
+
+    block:
+      let grammar = peg"""
+program <- {''} stmt* $
+stmt <- call / block
+call <- 'call()' EOL
+EOL <- \n / $
+block <- 'block:' \n indBody
+indBody <- {$^1 ' '+} stmt ($^1 stmt)* {}
+"""
+      let program = """
+call()
+block:
+  block:
+    call()
+    call()
+  call()
+call()
+"""
+      var c: Captures
+      doAssert program.len == program.rawMatch(grammar, 0, c)
+      doAssert c.ml == 1
+
  pegsTest()
  static:
    pegsTest()