pegs: captured search loop

2026-06-06 11:54:11 +00:00 · 2010-11-07 23:52:41 +01:00
parent 7659739caf
commit 8ee63f9836
3 changed files with 55 additions and 5 deletions
--- a/doc/pegdocs.txt
+++ b/doc/pegdocs.txt
@@ -66,6 +66,10 @@ notation           meaning
                   failure.
 ``@E``             Search: Shorthand for ``(!E .)* E``. (Search loop for the
                   pattern `E`.)
+``{@} E``          Captured Search: Shorthand for ``{(!E .)*} E``. (Search 
+                   loop for the pattern `E`.) Everything up to but excluding
+                   `E` is captured.
+``@@ E``           Same as ``{@} E``.
 ``A <- E``         Rule: Bind the expression `E` to the *nonterminal symbol*
                   `A`. **Left recursive rules are not possible and crash the
                   matching engine.**
@@ -131,7 +135,7 @@ The PEG parser implements this grammar (written in PEG syntax)::
  
  rule <- identifier \s* "<-" expr ig
  identNoArrow <- identifier !(\s* "<-")
-  prefixOpr <- ig '&' / ig '!' / ig '@'
+  prefixOpr <- ig '&' / ig '!' / ig '@' / ig '{@}' / ig '@@'
  literal <- ig identifier? '$' [0-9]+
             ig identNoArrow / 
             ig charset / 
--- a/lib/pure/pegs.nim
+++ b/lib/pure/pegs.nim
@@ -58,6 +58,7 @@ type
    pkBackRefIgnoreCase,
    pkBackRefIgnoreStyle,
    pkSearch,           ## @a     --> Internal DSL: @a
+    pkCapturedSearch,   ## {@} a  --> Internal DSL: @@a
    pkRule,             ## a <- b
    pkList              ## a, b
  TNonTerminalFlag = enum
@@ -192,6 +193,11 @@ proc `@`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsSearch".} =
  ## constructs a "search" for the PEG `a`
  result.kind = pkSearch
  result.sons = @[a]
+
+proc `@@`*(a: TPeg): TPeg {.noSideEffect, rtl, 
+                            extern: "npgegsCapturedSearch".} =
+  result.kind = pkCapturedSearch
+  result.sons = @[a]
  
 when false:
  proc contains(a: TPeg, k: TPegKind): bool =
@@ -421,6 +427,9 @@ proc toStrAux(r: TPeg, res: var string) =
  of pkSearch:
    add(res, '@')
    toStrAux(r.sons[0], res)
+  of pkCapturedSearch:
+    add(res, "{@}")
+    toStrAux(r.sons[0], res)
  of pkCapture:
    add(res, '{')
    toStrAux(r.sons[0], res)    
@@ -558,6 +567,21 @@ proc m(s: string, p: TPeg, start: int, c: var TMatchClosure): int =
      inc(result)
    result = -1
    c.ml = oldMl
+  of pkCapturedSearch:
+    var idx = c.ml # reserve a slot for the subpattern
+    inc(c.ml)
+    result = 0
+    while start+result < s.len:
+      var x = m(s, p.sons[0], start+result, c)
+      if x >= 0:
+        if idx < maxSubpatterns:
+          c.matches[idx] = (start, start+result-1)
+        #else: silently ignore the capture
+        inc(result, x)
+        return
+      inc(result)
+    result = -1
+    c.ml = idx
  of pkGreedyRep:
    result = 0
    while true:
@@ -850,6 +874,7 @@ type
    tkParRi,            ## ')'
    tkCurlyLe,          ## '{'
    tkCurlyRi,          ## '}'
+    tkCurlyAt,          ## '{@}'
    tkArrow,            ## '<-'
    tkBar,              ## '/'
    tkStar,             ## '*'
@@ -880,7 +905,8 @@ type
 const
  tokKindToStr: array[TTokKind, string] = [
    "invalid", "[EOF]", ".", "_", "identifier", "string literal",
-    "character set", "(", ")", "{", "}", "<-", "/", "*", "+", "&", "!", "?",
+    "character set", "(", ")", "{", "}", "{@}",
+    "<-", "/", "*", "+", "&", "!", "?",
    "@", "built-in", "escaped", "$"
  ]

@@ -1112,9 +1138,14 @@ proc getTok(c: var TPegLexer, tok: var TToken) =
  skip(c)
  case c.buf[c.bufpos]
  of '{':
-    tok.kind = tkCurlyLe
    inc(c.bufpos)
-    add(tok.literal, '{')
+    if c.buf[c.bufpos] == '@' and c.buf[c.bufpos+1] == '}':
+      tok.kind = tkCurlyAt
+      inc(c.bufpos, 2)
+      add(tok.literal, "{@}")
+    else:
+      tok.kind = tkCurlyLe
+      add(tok.literal, '{')
  of '}': 
    tok.kind = tkCurlyRi
    inc(c.bufpos)
@@ -1193,6 +1224,10 @@ proc getTok(c: var TPegLexer, tok: var TToken) =
    tok.kind = tkAt
    inc(c.bufpos)
    add(tok.literal, '@')
+    if c.buf[c.bufpos] == '@': 
+      tok.kind = tkCurlyAt
+      inc(c.bufpos)
+      add(tok.literal, '@')
  else:
    add(tok.literal, c.buf[c.bufpos])
    inc(c.bufpos)
@@ -1261,6 +1296,9 @@ proc primary(p: var TPegParser): TPeg =
  of tkAt:
    getTok(p)
    return @primary(p)
+  of tkCurlyAt:
+    getTok(p)
+    return @@primary(p)
  else: nil
  case p.tok.kind
  of tkIdentifier:
@@ -1346,7 +1384,7 @@ proc seqExpr(p: var TPegParser): TPeg =
  while true:
    case p.tok.kind
    of tkAmp, tkNot, tkAt, tkStringLit, tkCharset, tkParLe, tkCurlyLe,
-       tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar:
+       tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkCurlyAt:
      result = sequence(result, primary(p))
    of tkIdentifier:
      if not arrowIsNextTok(p):
@@ -1514,4 +1552,11 @@ when isMainModule:

  for x in findAll("abcdef", peg"{.}", 3):
    echo x
+    
+  if "f(a, b)" =~ peg"{[0-9]+} / ({\ident} '(' {@} ')')":
+    assert matches[0] == "f"
+    assert matches[1] == "a, b"
+  else:
+    assert false
+  

--- a/web/news.txt
+++ b/web/news.txt
@@ -21,6 +21,7 @@ Additions

 - Added ``re.findAll``, ``pegs.findAll``.
 - Added ``os.findExe``.
+- The Pegs module supports a *captured search loop operator* ``{@}``.


 2010-10-20 Version 0.8.10 released