mirror of
https://github.com/nim-lang/Nim.git
synced 2025-12-28 17:04:41 +00:00
Peg captures get stack-like behavior (#18369)
* Implements reverse capture indexing. * Now works for modified backrefs too. * Changed reverse indexing syntax prefix for back-references to '$^'.
This commit is contained in:
@@ -27,7 +27,10 @@ notation meaning
|
||||
``{E}`` Capture: Apply expression `E` and store the substring
|
||||
that matched `E` into a *capture* that can be accessed
|
||||
after the matching process.
|
||||
``$i`` Back reference to the ``i``th capture. ``i`` counts from 1.
|
||||
``{}`` Empty capture: Delete the last capture. No character
|
||||
is consumed.
|
||||
``$i`` Back reference to the ``i``th capture. ``i`` counts forwards
|
||||
from 1 or backwards (last capture to first) from ^1.
|
||||
``$`` Anchor: Matches at the end of the input. No character
|
||||
is consumed. Same as ``!.``.
|
||||
``^`` Anchor: Matches at the start of the input. No character
|
||||
@@ -149,14 +152,15 @@ The PEG parser implements this grammar (written in PEG syntax)::
|
||||
rule <- identifier \s* "<-" expr ig
|
||||
identNoArrow <- identifier !(\s* "<-")
|
||||
prefixOpr <- ig '&' / ig '!' / ig '@' / ig '{@}' / ig '@@'
|
||||
literal <- ig identifier? '$' [0-9]+ / '$' / '^' /
|
||||
literal <- ig identifier? '$' '^'? [0-9]+ / '$' / '^' /
|
||||
ig identNoArrow /
|
||||
ig charset /
|
||||
ig stringlit /
|
||||
ig builtin /
|
||||
ig '.' /
|
||||
ig '_' /
|
||||
(ig "(" expr ig ")")
|
||||
(ig "(" expr ig ")") /
|
||||
(ig "{" expr? ig "}")
|
||||
postfixOpr <- ig '?' / ig '*' / ig '+'
|
||||
primary <- prefixOpr* (literal postfixOpr*)
|
||||
|
||||
|
||||
@@ -83,7 +83,7 @@ type
|
||||
of pkChar, pkGreedyRepChar: ch: char
|
||||
of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char]
|
||||
of pkNonTerminal: nt: NonTerminal
|
||||
of pkBackRef..pkBackRefIgnoreStyle: index: range[0..MaxSubpatterns]
|
||||
of pkBackRef..pkBackRefIgnoreStyle: index: range[-MaxSubpatterns..MaxSubpatterns-1]
|
||||
else: sons: seq[Peg]
|
||||
NonTerminal* = ref NonTerminalObj
|
||||
|
||||
@@ -106,7 +106,7 @@ proc nt*(p: Peg): NonTerminal = p.nt
|
||||
## Returns the *NonTerminal* object of a given *Peg* variant object
|
||||
## where present.
|
||||
|
||||
proc index*(p: Peg): range[0..MaxSubpatterns] = p.index
|
||||
proc index*(p: Peg): range[-MaxSubpatterns..MaxSubpatterns-1] = p.index
|
||||
## Returns the back-reference index of a captured sub-pattern in the
|
||||
## *Captures* object for a given *Peg* variant object where present.
|
||||
|
||||
@@ -304,34 +304,37 @@ proc endAnchor*: Peg {.inline.} =
|
||||
## constructs the PEG ``$`` which matches the end of the input.
|
||||
result = !any()
|
||||
|
||||
proc capture*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsCapture".} =
|
||||
proc capture*(a: Peg = Peg(kind: pkEmpty)): Peg {.noSideEffect, rtl, extern: "npegsCapture".} =
|
||||
## constructs a capture with the PEG `a`
|
||||
result = Peg(kind: pkCapture, sons: @[a])
|
||||
|
||||
proc backref*(index: range[1..MaxSubpatterns]): Peg {.
|
||||
proc backref*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
|
||||
noSideEffect, rtl, extern: "npegs$1".} =
|
||||
## constructs a back reference of the given `index`. `index` starts counting
|
||||
## from 1.
|
||||
result = Peg(kind: pkBackRef, index: index-1)
|
||||
## from 1. `reverse` specifies wether indexing starts from the end of the
|
||||
## capture list.
|
||||
result = Peg(kind: pkBackRef, index: (if reverse: -index else: index - 1))
|
||||
|
||||
proc backrefIgnoreCase*(index: range[1..MaxSubpatterns]): Peg {.
|
||||
proc backrefIgnoreCase*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
|
||||
noSideEffect, rtl, extern: "npegs$1".} =
|
||||
## constructs a back reference of the given `index`. `index` starts counting
|
||||
## from 1. Ignores case for matching.
|
||||
result = Peg(kind: pkBackRefIgnoreCase, index: index-1)
|
||||
## from 1. `reverse` specifies wether indexing starts from the end of the
|
||||
## capture list. Ignores case for matching.
|
||||
result = Peg(kind: pkBackRefIgnoreCase, index: (if reverse: -index else: index - 1))
|
||||
|
||||
proc backrefIgnoreStyle*(index: range[1..MaxSubpatterns]): Peg {.
|
||||
proc backrefIgnoreStyle*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
|
||||
noSideEffect, rtl, extern: "npegs$1".} =
|
||||
## constructs a back reference of the given `index`. `index` starts counting
|
||||
## from 1. Ignores style for matching.
|
||||
result = Peg(kind: pkBackRefIgnoreStyle, index: index-1)
|
||||
## from 1. `reverse` specifies wether indexing starts from the end of the
|
||||
## capture list. Ignores style for matching.
|
||||
result = Peg(kind: pkBackRefIgnoreStyle, index: (if reverse: -index else: index - 1))
|
||||
|
||||
proc spaceCost(n: Peg): int =
|
||||
case n.kind
|
||||
of pkEmpty: discard
|
||||
of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar,
|
||||
pkGreedyRepChar, pkCharChoice, pkGreedyRepSet,
|
||||
pkAny..pkWhitespace, pkGreedyAny:
|
||||
pkAny..pkWhitespace, pkGreedyAny, pkBackRef..pkBackRefIgnoreStyle:
|
||||
result = 1
|
||||
of pkNonTerminal:
|
||||
# we cannot inline a rule with a non-terminal
|
||||
@@ -561,8 +564,10 @@ template matchOrParse(mopProc: untyped) =
|
||||
# Parse handler code must run in an *of* clause of its own for each
|
||||
# *PegKind*, so we encapsulate the identical clause body for
|
||||
# *pkBackRef..pkBackRefIgnoreStyle* here.
|
||||
if p.index >= c.ml: return -1
|
||||
var (a, b) = c.matches[p.index]
|
||||
var index = p.index
|
||||
if index < 0: index.inc(c.ml)
|
||||
if index < 0 or index >= c.ml: return -1
|
||||
var (a, b) = c.matches[index]
|
||||
var n: Peg
|
||||
case p.kind
|
||||
of pkBackRef:
|
||||
@@ -822,15 +827,19 @@ template matchOrParse(mopProc: untyped) =
|
||||
leave(pkNotPredicate, s, p, start, result)
|
||||
of pkCapture:
|
||||
enter(pkCapture, s, p, start)
|
||||
var idx = c.ml # reserve a slot for the subpattern
|
||||
inc(c.ml)
|
||||
result = mopProc(s, p.sons[0], start, c)
|
||||
if result >= 0:
|
||||
if idx < MaxSubpatterns:
|
||||
c.matches[idx] = (start, start+result-1)
|
||||
#else: silently ignore the capture
|
||||
if p.sons.len == 0 or p.sons[0].kind == pkEmpty:
|
||||
# empty capture removes last match
|
||||
dec(c.ml)
|
||||
c.matches[c.ml] = (0, 0)
|
||||
result = 0 # match of length 0
|
||||
else:
|
||||
c.ml = idx
|
||||
var idx = c.ml # reserve a slot for the subpattern
|
||||
result = mopProc(s, p.sons[0], start, c)
|
||||
if result >= 0:
|
||||
inc(c.ml)
|
||||
if idx < MaxSubpatterns:
|
||||
c.matches[idx] = (start, start+result-1)
|
||||
#else: silently ignore the capture
|
||||
leave(pkCapture, s, p, start, result)
|
||||
of pkBackRef:
|
||||
enter(pkBackRef, s, p, start)
|
||||
@@ -1395,6 +1404,7 @@ type
|
||||
tkCurlyLe, ## '{'
|
||||
tkCurlyRi, ## '}'
|
||||
tkCurlyAt, ## '{@}'
|
||||
tkEmptyCurl, ## '{}'
|
||||
tkArrow, ## '<-'
|
||||
tkBar, ## '/'
|
||||
tkStar, ## '*'
|
||||
@@ -1427,7 +1437,7 @@ type
|
||||
const
|
||||
tokKindToStr: array[TokKind, string] = [
|
||||
"invalid", "[EOF]", ".", "_", "identifier", "string literal",
|
||||
"character set", "(", ")", "{", "}", "{@}",
|
||||
"character set", "(", ")", "{", "}", "{@}", "{}",
|
||||
"<-", "/", "*", "+", "&", "!", "?",
|
||||
"@", "built-in", "escaped", "$", "$", "^"
|
||||
]
|
||||
@@ -1564,13 +1574,21 @@ proc getString(c: var PegLexer, tok: var Token) =
|
||||
|
||||
proc getDollar(c: var PegLexer, tok: var Token) =
|
||||
var pos = c.bufpos + 1
|
||||
var neg = false
|
||||
if pos < c.buf.len and c.buf[pos] == '^':
|
||||
neg = true
|
||||
inc(pos)
|
||||
if pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
|
||||
tok.kind = tkBackref
|
||||
tok.index = 0
|
||||
while pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
|
||||
tok.index = tok.index * 10 + ord(c.buf[pos]) - ord('0')
|
||||
inc(pos)
|
||||
if neg:
|
||||
tok.index = -tok.index
|
||||
else:
|
||||
if neg:
|
||||
dec(pos)
|
||||
tok.kind = tkDollar
|
||||
c.bufpos = pos
|
||||
|
||||
@@ -1670,6 +1688,10 @@ proc getTok(c: var PegLexer, tok: var Token) =
|
||||
tok.kind = tkCurlyAt
|
||||
inc(c.bufpos, 2)
|
||||
add(tok.literal, "{@}")
|
||||
elif c.buf[c.bufpos] == '}' and c.bufpos < c.buf.len:
|
||||
tok.kind = tkEmptyCurl
|
||||
inc(c.bufpos)
|
||||
add(tok.literal, "{}")
|
||||
else:
|
||||
tok.kind = tkCurlyLe
|
||||
add(tok.literal, '{')
|
||||
@@ -1705,7 +1727,7 @@ proc getTok(c: var PegLexer, tok: var Token) =
|
||||
return
|
||||
if c.buf[c.bufpos] in {'\'', '"'} or
|
||||
c.buf[c.bufpos] == '$' and c.bufpos+1 < c.buf.len and
|
||||
c.buf[c.bufpos+1] in {'0'..'9'}:
|
||||
c.buf[c.bufpos+1] in {'^', '0'..'9'}:
|
||||
case tok.literal
|
||||
of "i": tok.modifier = modIgnoreCase
|
||||
of "y": tok.modifier = modIgnoreStyle
|
||||
@@ -1819,10 +1841,13 @@ proc modifiedTerm(s: string, m: Modifier): Peg =
|
||||
of modIgnoreStyle: result = termIgnoreStyle(s)
|
||||
|
||||
proc modifiedBackref(s: int, m: Modifier): Peg =
|
||||
var
|
||||
reverse = s < 0
|
||||
index = if reverse: -s else: s
|
||||
case m
|
||||
of modNone, modVerbatim: result = backref(s)
|
||||
of modIgnoreCase: result = backrefIgnoreCase(s)
|
||||
of modIgnoreStyle: result = backrefIgnoreStyle(s)
|
||||
of modNone, modVerbatim: result = backref(index, reverse)
|
||||
of modIgnoreCase: result = backrefIgnoreCase(index, reverse)
|
||||
of modIgnoreStyle: result = backrefIgnoreStyle(index, reverse)
|
||||
|
||||
proc builtin(p: var PegParser): Peg =
|
||||
# do not use "y", "skip" or "i" as these would be ambiguous
|
||||
@@ -1896,6 +1921,9 @@ proc primary(p: var PegParser): Peg =
|
||||
result = capture(parseExpr(p)).token(p)
|
||||
eat(p, tkCurlyRi)
|
||||
inc(p.captures)
|
||||
of tkEmptyCurl:
|
||||
result = capture()
|
||||
getTok(p)
|
||||
of tkAny:
|
||||
result = any().token(p)
|
||||
getTok(p)
|
||||
@@ -1915,11 +1943,11 @@ proc primary(p: var PegParser): Peg =
|
||||
result = startAnchor()
|
||||
getTok(p)
|
||||
of tkBackref:
|
||||
if abs(p.tok.index) > p.captures or p.tok.index == 0:
|
||||
pegError(p, "invalid back reference index: " & $p.tok.index)
|
||||
var m = p.tok.modifier
|
||||
if m == modNone: m = p.modifier
|
||||
result = modifiedBackref(p.tok.index, m).token(p)
|
||||
if p.tok.index < 0 or p.tok.index > p.captures:
|
||||
pegError(p, "invalid back reference index: " & $p.tok.index)
|
||||
getTok(p)
|
||||
else:
|
||||
pegError(p, "expression expected, but found: " & p.tok.literal)
|
||||
@@ -1943,7 +1971,7 @@ proc seqExpr(p: var PegParser): Peg =
|
||||
case p.tok.kind
|
||||
of tkAmp, tkNot, tkAt, tkStringLit, tkCharSet, tkParLe, tkCurlyLe,
|
||||
tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref,
|
||||
tkHat, tkCurlyAt:
|
||||
tkHat, tkCurlyAt, tkEmptyCurl:
|
||||
result = sequence(result, primary(p))
|
||||
of tkIdentifier:
|
||||
if not arrowIsNextTok(p):
|
||||
|
||||
@@ -293,6 +293,34 @@ block:
|
||||
|
||||
doAssert "test1".match(peg"""{@}$""")
|
||||
doAssert "test2".match(peg"""{(!$ .)*} $""")
|
||||
|
||||
doAssert "abbb".match(peg"{a} {b} $2 $^1")
|
||||
doAssert "abBA".match(peg"{a} {b} i$2 i$^2")
|
||||
|
||||
doAssert "abba".match(peg"{a} {b} $^1 {} $^1")
|
||||
|
||||
block:
|
||||
let grammar = peg"""
|
||||
program <- {''} stmt* $
|
||||
stmt <- call / block
|
||||
call <- 'call()' EOL
|
||||
EOL <- \n / $
|
||||
block <- 'block:' \n indBody
|
||||
indBody <- {$^1 ' '+} stmt ($^1 stmt)* {}
|
||||
"""
|
||||
let program = """
|
||||
call()
|
||||
block:
|
||||
block:
|
||||
call()
|
||||
call()
|
||||
call()
|
||||
call()
|
||||
"""
|
||||
var c: Captures
|
||||
doAssert program.len == program.rawMatch(grammar, 0, c)
|
||||
doAssert c.ml == 1
|
||||
|
||||
pegsTest()
|
||||
static:
|
||||
pegsTest()
|
||||
|
||||
Reference in New Issue
Block a user