Files
Nim/lib/pure/pegs.nim
metagn 770f8d5513 opensym for templates + move behavior of opensymchoice to itself (#24007)
fixes #15314, fixes #24002

The OpenSym behavior first added to generics in #23091 now also applies
to templates, since templates can also capture symbols that are meant to
be replaced by local symbols if the context imports symbols with the
same name, as in the issue #24002. The experimental switch
`templateOpenSym` is added to enable this behavior for templates only,
and the experimental switch `openSym` is added to enable it for both
templates and generics, and the documentation now mainly mentions this
switch.

Additionally the logic for `nkOpenSymChoice` nodes that were previously
wrapped in `nkOpenSym` now apply to all `nkOpenSymChoice` nodes, and so
these nodes aren't wrapped in `nkOpenSym` anymore. This means
`nkOpenSym` can only have children of kind `nkSym` again, so it is more
in line with the structure of symchoice nodes. As for why they aren't
merged with `nkOpenSymChoice` nodes yet, we need some way to signal that
the node shouldn't become ambiguous if other options exist at
instantiation time, we already captured a symbol at the beginning and
another symbol can only replace it if it's closer in scope and
unambiguous.
2024-08-28 20:51:13 +02:00

2089 lines
65 KiB
Nim

#
#
# Nim's Runtime Library
# (c) Copyright 2012 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## Simple PEG (Parsing expression grammar) matching. Uses no memorization, but
## uses superoperators and symbol inlining to improve performance. Note:
## Matching performance is hopefully competitive with optimized regular
## expression engines.
##
## .. include:: ../../doc/pegdocs.txt
##
include "system/inclrtl"
when defined(nimPreviewSlimSystem):
import std/[syncio, assertions]
const
useUnicode = true ## change this to deactivate proper UTF-8 support
import std/[strutils, macros]
import std/private/decode_helpers
when useUnicode:
import std/unicode
export unicode.`==`
const
InlineThreshold = 5 ## number of leaves; -1 to disable inlining
MaxSubpatterns* = 20 ## defines the maximum number of subpatterns that
## can be captured. More subpatterns cannot be captured!
type
PegKind* = enum
pkEmpty,
pkAny, ## any character (.)
pkAnyRune, ## any Unicode character (_)
pkNewLine, ## CR-LF, LF, CR
pkLetter, ## Unicode letter
pkLower, ## Unicode lower case letter
pkUpper, ## Unicode upper case letter
pkTitle, ## Unicode title character
pkWhitespace, ## Unicode whitespace character
pkTerminal,
pkTerminalIgnoreCase,
pkTerminalIgnoreStyle,
pkChar, ## single character to match
pkCharChoice,
pkNonTerminal,
pkSequence, ## a b c ... --> Internal DSL: peg(a, b, c)
pkOrderedChoice, ## a / b / ... --> Internal DSL: a / b or /[a, b, c]
pkGreedyRep, ## a* --> Internal DSL: *a
## a+ --> (a a*)
pkGreedyRepChar, ## x* where x is a single character (superop)
pkGreedyRepSet, ## [set]* (superop)
pkGreedyAny, ## .* or _* (superop)
pkOption, ## a? --> Internal DSL: ?a
pkAndPredicate, ## &a --> Internal DSL: &a
pkNotPredicate, ## !a --> Internal DSL: !a
pkCapture, ## {a} --> Internal DSL: capture(a)
pkBackRef, ## $i --> Internal DSL: backref(i)
pkBackRefIgnoreCase,
pkBackRefIgnoreStyle,
pkSearch, ## @a --> Internal DSL: !*a
pkCapturedSearch, ## {@} a --> Internal DSL: !*\a
pkRule, ## a <- b
pkList, ## a, b
pkStartAnchor ## ^ --> Internal DSL: startAnchor()
NonTerminalFlag* = enum
ntDeclared, ntUsed
NonTerminalObj = object ## represents a non terminal symbol
name: string ## the name of the symbol
line: int ## line the symbol has been declared/used in
col: int ## column the symbol has been declared/used in
flags: set[NonTerminalFlag] ## the nonterminal's flags
rule: Peg ## the rule that the symbol refers to
Peg* {.shallow.} = object ## type that represents a PEG
case kind: PegKind
of pkEmpty..pkWhitespace: nil
of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle: term: string
of pkChar, pkGreedyRepChar: ch: char
of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char]
of pkNonTerminal: nt: NonTerminal
of pkBackRef..pkBackRefIgnoreStyle: index: range[-MaxSubpatterns..MaxSubpatterns-1]
else: sons: seq[Peg]
NonTerminal* = ref NonTerminalObj
func kind*(p: Peg): PegKind = p.kind
## Returns the *PegKind* of a given *Peg* object.
func term*(p: Peg): string = p.term
## Returns the *string* representation of a given *Peg* variant object
## where present.
func ch*(p: Peg): char = p.ch
## Returns the *char* representation of a given *Peg* variant object
## where present.
func charChoice*(p: Peg): ref set[char] = p.charChoice
## Returns the *charChoice* field of a given *Peg* variant object
## where present.
func nt*(p: Peg): NonTerminal = p.nt
## Returns the *NonTerminal* object of a given *Peg* variant object
## where present.
func index*(p: Peg): range[-MaxSubpatterns..MaxSubpatterns-1] = p.index
## Returns the back-reference index of a captured sub-pattern in the
## *Captures* object for a given *Peg* variant object where present.
iterator items*(p: Peg): Peg {.inline.} =
## Yields the child nodes of a *Peg* variant object where present.
for s in p.sons:
yield s
iterator pairs*(p: Peg): (int, Peg) {.inline.} =
## Yields the indices and child nodes of a *Peg* variant object where present.
for i in 0 ..< p.sons.len:
yield (i, p.sons[i])
func name*(nt: NonTerminal): string = nt.name
## Gets the name of the symbol represented by the parent *Peg* object variant
## of a given *NonTerminal*.
func line*(nt: NonTerminal): int = nt.line
## Gets the line number of the definition of the parent *Peg* object variant
## of a given *NonTerminal*.
func col*(nt: NonTerminal): int = nt.col
## Gets the column number of the definition of the parent *Peg* object variant
## of a given *NonTerminal*.
func flags*(nt: NonTerminal): set[NonTerminalFlag] = nt.flags
## Gets the *NonTerminalFlag*-typed flags field of the parent *Peg* variant
## object of a given *NonTerminal*.
func rule*(nt: NonTerminal): Peg = nt.rule
## Gets the *Peg* object representing the rule definition of the parent *Peg*
## object variant of a given *NonTerminal*.
func term*(t: string): Peg {.rtl, extern: "npegs$1Str".} =
## constructs a PEG from a terminal string
if t.len != 1:
result = Peg(kind: pkTerminal, term: t)
else:
result = Peg(kind: pkChar, ch: t[0])
func termIgnoreCase*(t: string): Peg {.
rtl, extern: "npegs$1".} =
## constructs a PEG from a terminal string; ignore case for matching
result = Peg(kind: pkTerminalIgnoreCase, term: t)
func termIgnoreStyle*(t: string): Peg {.
rtl, extern: "npegs$1".} =
## constructs a PEG from a terminal string; ignore style for matching
result = Peg(kind: pkTerminalIgnoreStyle, term: t)
func term*(t: char): Peg {.rtl, extern: "npegs$1Char".} =
## constructs a PEG from a terminal char
assert t != '\0'
result = Peg(kind: pkChar, ch: t)
func charSet*(s: set[char]): Peg {.rtl, extern: "npegs$1".} =
## constructs a PEG from a character set `s`
assert '\0' notin s
result = Peg(kind: pkCharChoice)
{.cast(noSideEffect).}:
new(result.charChoice)
result.charChoice[] = s
func len(a: Peg): int {.inline.} = return a.sons.len
func add(d: var Peg, s: Peg) {.inline.} = add(d.sons, s)
func addChoice(dest: var Peg, elem: Peg) =
var L = dest.len-1
if L >= 0 and dest.sons[L].kind == pkCharChoice:
# caution! Do not introduce false aliasing here!
case elem.kind
of pkCharChoice:
dest.sons[L] = charSet(dest.sons[L].charChoice[] + elem.charChoice[])
of pkChar:
dest.sons[L] = charSet(dest.sons[L].charChoice[] + {elem.ch})
else: add(dest, elem)
else: add(dest, elem)
template multipleOp(k: PegKind, localOpt: untyped) =
result = Peg(kind: k, sons: @[])
for x in items(a):
if x.kind == k:
for y in items(x.sons):
localOpt(result, y)
else:
localOpt(result, x)
if result.len == 1:
result = result.sons[0]
func `/`*(a: varargs[Peg]): Peg {.
rtl, extern: "npegsOrderedChoice".} =
## constructs an ordered choice with the PEGs in `a`
multipleOp(pkOrderedChoice, addChoice)
func addSequence(dest: var Peg, elem: Peg) =
var L = dest.len-1
if L >= 0 and dest.sons[L].kind == pkTerminal:
# caution! Do not introduce false aliasing here!
case elem.kind
of pkTerminal:
dest.sons[L] = term(dest.sons[L].term & elem.term)
of pkChar:
dest.sons[L] = term(dest.sons[L].term & elem.ch)
else: add(dest, elem)
else: add(dest, elem)
func sequence*(a: varargs[Peg]): Peg {.
rtl, extern: "npegs$1".} =
## constructs a sequence with all the PEGs from `a`
multipleOp(pkSequence, addSequence)
func `?`*(a: Peg): Peg {.rtl, extern: "npegsOptional".} =
## constructs an optional for the PEG `a`
if a.kind in {pkOption, pkGreedyRep, pkGreedyAny, pkGreedyRepChar,
pkGreedyRepSet}:
# a* ? --> a*
# a? ? --> a?
result = a
else:
result = Peg(kind: pkOption, sons: @[a])
func `*`*(a: Peg): Peg {.rtl, extern: "npegsGreedyRep".} =
## constructs a "greedy repetition" for the PEG `a`
case a.kind
of pkGreedyRep, pkGreedyRepChar, pkGreedyRepSet, pkGreedyAny, pkOption:
assert false
# produces endless loop!
of pkChar:
result = Peg(kind: pkGreedyRepChar, ch: a.ch)
of pkCharChoice:
result = Peg(kind: pkGreedyRepSet, charChoice: a.charChoice)
of pkAny, pkAnyRune:
result = Peg(kind: pkGreedyAny)
else:
result = Peg(kind: pkGreedyRep, sons: @[a])
func `!*`*(a: Peg): Peg {.rtl, extern: "npegsSearch".} =
## constructs a "search" for the PEG `a`
result = Peg(kind: pkSearch, sons: @[a])
func `!*\`*(a: Peg): Peg {.rtl,
extern: "npgegsCapturedSearch".} =
## constructs a "captured search" for the PEG `a`
result = Peg(kind: pkCapturedSearch, sons: @[a])
func `+`*(a: Peg): Peg {.rtl, extern: "npegsGreedyPosRep".} =
## constructs a "greedy positive repetition" with the PEG `a`
return sequence(a, *a)
func `&`*(a: Peg): Peg {.rtl, extern: "npegsAndPredicate".} =
## constructs an "and predicate" with the PEG `a`
result = Peg(kind: pkAndPredicate, sons: @[a])
func `!`*(a: Peg): Peg {.rtl, extern: "npegsNotPredicate".} =
## constructs a "not predicate" with the PEG `a`
result = Peg(kind: pkNotPredicate, sons: @[a])
func any*: Peg {.inline.} =
## constructs the PEG `any character`:idx: (``.``)
result = Peg(kind: pkAny)
func anyRune*: Peg {.inline.} =
## constructs the PEG `any rune`:idx: (``_``)
result = Peg(kind: pkAnyRune)
func newLine*: Peg {.inline.} =
## constructs the PEG `newline`:idx: (``\n``)
result = Peg(kind: pkNewLine)
func unicodeLetter*: Peg {.inline.} =
## constructs the PEG ``\letter`` which matches any Unicode letter.
result = Peg(kind: pkLetter)
func unicodeLower*: Peg {.inline.} =
## constructs the PEG ``\lower`` which matches any Unicode lowercase letter.
result = Peg(kind: pkLower)
func unicodeUpper*: Peg {.inline.} =
## constructs the PEG ``\upper`` which matches any Unicode uppercase letter.
result = Peg(kind: pkUpper)
func unicodeTitle*: Peg {.inline.} =
## constructs the PEG ``\title`` which matches any Unicode title letter.
result = Peg(kind: pkTitle)
func unicodeWhitespace*: Peg {.inline.} =
## constructs the PEG ``\white`` which matches any Unicode
## whitespace character.
result = Peg(kind: pkWhitespace)
func startAnchor*: Peg {.inline.} =
## constructs the PEG ``^`` which matches the start of the input.
result = Peg(kind: pkStartAnchor)
func endAnchor*: Peg {.inline.} =
## constructs the PEG ``$`` which matches the end of the input.
result = !any()
func capture*(a: Peg = Peg(kind: pkEmpty)): Peg {.rtl, extern: "npegsCapture".} =
## constructs a capture with the PEG `a`
result = Peg(kind: pkCapture, sons: @[a])
func backref*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
rtl, extern: "npegs$1".} =
## constructs a back reference of the given `index`. `index` starts counting
## from 1. `reverse` specifies whether indexing starts from the end of the
## capture list.
result = Peg(kind: pkBackRef, index: (if reverse: -index else: index - 1))
func backrefIgnoreCase*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
rtl, extern: "npegs$1".} =
## constructs a back reference of the given `index`. `index` starts counting
## from 1. `reverse` specifies whether indexing starts from the end of the
## capture list. Ignores case for matching.
result = Peg(kind: pkBackRefIgnoreCase, index: (if reverse: -index else: index - 1))
func backrefIgnoreStyle*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
rtl, extern: "npegs$1".} =
## constructs a back reference of the given `index`. `index` starts counting
## from 1. `reverse` specifies whether indexing starts from the end of the
## capture list. Ignores style for matching.
result = Peg(kind: pkBackRefIgnoreStyle, index: (if reverse: -index else: index - 1))
func spaceCost(n: Peg): int =
case n.kind
of pkEmpty: discard
of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar,
pkGreedyRepChar, pkCharChoice, pkGreedyRepSet,
pkAny..pkWhitespace, pkGreedyAny, pkBackRef..pkBackRefIgnoreStyle:
result = 1
of pkNonTerminal:
# we cannot inline a rule with a non-terminal
result = InlineThreshold+1
else:
for i in 0..n.len-1:
inc(result, spaceCost(n.sons[i]))
if result >= InlineThreshold: break
func nonterminal*(n: NonTerminal): Peg {.
rtl, extern: "npegs$1".} =
## constructs a PEG that consists of the nonterminal symbol
assert n != nil
if ntDeclared in n.flags and spaceCost(n.rule) < InlineThreshold:
when false: echo "inlining symbol: ", n.name
result = n.rule # inlining of rule enables better optimizations
else:
result = Peg(kind: pkNonTerminal, nt: n)
func newNonTerminal*(name: string, line, column: int): NonTerminal {.
rtl, extern: "npegs$1".} =
## constructs a nonterminal symbol
result = NonTerminal(name: name, line: line, col: column)
template letters*: Peg =
## expands to ``charset({'A'..'Z', 'a'..'z'})``
charSet({'A'..'Z', 'a'..'z'})
template digits*: Peg =
## expands to ``charset({'0'..'9'})``
charSet({'0'..'9'})
template whitespace*: Peg =
## expands to ``charset({' ', '\9'..'\13'})``
charSet({' ', '\9'..'\13'})
template identChars*: Peg =
## expands to ``charset({'a'..'z', 'A'..'Z', '0'..'9', '_'})``
charSet({'a'..'z', 'A'..'Z', '0'..'9', '_'})
template identStartChars*: Peg =
## expands to ``charset({'A'..'Z', 'a'..'z', '_'})``
charSet({'a'..'z', 'A'..'Z', '_'})
template ident*: Peg =
## same as ``[a-zA-Z_][a-zA-z_0-9]*``; standard identifier
sequence(charSet({'a'..'z', 'A'..'Z', '_'}),
*charSet({'a'..'z', 'A'..'Z', '0'..'9', '_'}))
template natural*: Peg =
## same as ``\d+``
+digits
# ------------------------- debugging -----------------------------------------
func esc(c: char, reserved = {'\0'..'\255'}): string =
case c
of '\b': result = "\\b"
of '\t': result = "\\t"
of '\c': result = "\\c"
of '\L': result = "\\l"
of '\v': result = "\\v"
of '\f': result = "\\f"
of '\e': result = "\\e"
of '\a': result = "\\a"
of '\\': result = "\\\\"
of 'a'..'z', 'A'..'Z', '0'..'9', '_': result = $c
elif c < ' ' or c >= '\127': result = '\\' & $ord(c)
elif c in reserved: result = '\\' & c
else: result = $c
func singleQuoteEsc(c: char): string = return "'" & esc(c, {'\''}) & "'"
func singleQuoteEsc(str: string): string =
result = "'"
for c in items(str): add result, esc(c, {'\''})
add result, '\''
func charSetEscAux(cc: set[char]): string =
const reserved = {'^', '-', ']'}
result = ""
var c1 = 0
while c1 <= 0xff:
if chr(c1) in cc:
var c2 = c1
while c2 < 0xff and chr(succ(c2)) in cc: inc(c2)
if c1 == c2:
add result, esc(chr(c1), reserved)
elif c2 == succ(c1):
add result, esc(chr(c1), reserved) & esc(chr(c2), reserved)
else:
add result, esc(chr(c1), reserved) & '-' & esc(chr(c2), reserved)
c1 = c2
inc(c1)
func charSetEsc(cc: set[char]): string =
if card(cc) >= 128+64:
result = "[^" & charSetEscAux({'\1'..'\xFF'} - cc) & ']'
else:
result = '[' & charSetEscAux(cc) & ']'
func toStrAux(r: Peg, res: var string) =
case r.kind
of pkEmpty: add(res, "()")
of pkAny: add(res, '.')
of pkAnyRune: add(res, '_')
of pkLetter: add(res, "\\letter")
of pkLower: add(res, "\\lower")
of pkUpper: add(res, "\\upper")
of pkTitle: add(res, "\\title")
of pkWhitespace: add(res, "\\white")
of pkNewLine: add(res, "\\n")
of pkTerminal: add(res, singleQuoteEsc(r.term))
of pkTerminalIgnoreCase:
add(res, 'i')
add(res, singleQuoteEsc(r.term))
of pkTerminalIgnoreStyle:
add(res, 'y')
add(res, singleQuoteEsc(r.term))
of pkChar: add(res, singleQuoteEsc(r.ch))
of pkCharChoice: add(res, charSetEsc(r.charChoice[]))
of pkNonTerminal: add(res, r.nt.name)
of pkSequence:
add(res, '(')
toStrAux(r.sons[0], res)
for i in 1 .. high(r.sons):
add(res, ' ')
toStrAux(r.sons[i], res)
add(res, ')')
of pkOrderedChoice:
add(res, '(')
toStrAux(r.sons[0], res)
for i in 1 .. high(r.sons):
add(res, " / ")
toStrAux(r.sons[i], res)
add(res, ')')
of pkGreedyRep:
toStrAux(r.sons[0], res)
add(res, '*')
of pkGreedyRepChar:
add(res, singleQuoteEsc(r.ch))
add(res, '*')
of pkGreedyRepSet:
add(res, charSetEsc(r.charChoice[]))
add(res, '*')
of pkGreedyAny:
add(res, ".*")
of pkOption:
toStrAux(r.sons[0], res)
add(res, '?')
of pkAndPredicate:
add(res, '&')
toStrAux(r.sons[0], res)
of pkNotPredicate:
add(res, '!')
toStrAux(r.sons[0], res)
of pkSearch:
add(res, '@')
toStrAux(r.sons[0], res)
of pkCapturedSearch:
add(res, "{@}")
toStrAux(r.sons[0], res)
of pkCapture:
add(res, '{')
toStrAux(r.sons[0], res)
add(res, '}')
of pkBackRef:
add(res, '$')
add(res, $r.index)
of pkBackRefIgnoreCase:
add(res, "i$")
add(res, $r.index)
of pkBackRefIgnoreStyle:
add(res, "y$")
add(res, $r.index)
of pkRule:
toStrAux(r.sons[0], res)
add(res, " <- ")
toStrAux(r.sons[1], res)
of pkList:
for i in 0 .. high(r.sons):
toStrAux(r.sons[i], res)
add(res, "\n")
of pkStartAnchor:
add(res, '^')
func `$` *(r: Peg): string {.rtl, extern: "npegsToString".} =
## converts a PEG to its string representation
result = ""
toStrAux(r, result)
# --------------------- core engine -------------------------------------------
type
Captures* = object ## contains the captured substrings.
matches: array[0..MaxSubpatterns-1, tuple[first, last: int]]
ml: int
origStart: int
func bounds*(c: Captures,
i: range[0..MaxSubpatterns-1]): tuple[first, last: int] =
## returns the bounds ``[first..last]`` of the `i`'th capture.
result = c.matches[i]
when not useUnicode:
type
Rune = char
template fastRuneAt(s, i, ch) =
ch = s[i]
inc(i)
template runeLenAt(s, i): untyped = 1
func isAlpha(a: char): bool {.inline.} = return a in {'a'..'z', 'A'..'Z'}
func isUpper(a: char): bool {.inline.} = return a in {'A'..'Z'}
func isLower(a: char): bool {.inline.} = return a in {'a'..'z'}
func isTitle(a: char): bool {.inline.} = return false
func isWhiteSpace(a: char): bool {.inline.} = return a in {' ', '\9'..'\13'}
template matchOrParse(mopProc: untyped) =
# Used to make the main matcher proc *rawMatch* as well as event parser
# procs. For the former, *enter* and *leave* event handler code generators
# are provided which just return *discard*.
proc mopProc(s: string, p: Peg, start: int, c: var Captures): int {.gcsafe, raises: [].} =
proc matchBackRef(s: string, p: Peg, start: int, c: var Captures): int =
# Parse handler code must run in an *of* clause of its own for each
# *PegKind*, so we encapsulate the identical clause body for
# *pkBackRef..pkBackRefIgnoreStyle* here.
var index = p.index
if index < 0: index.inc(c.ml)
if index < 0 or index >= c.ml: return -1
var (a, b) = c.matches[index]
var n: Peg
case p.kind
of pkBackRef:
n = Peg(kind: pkTerminal, term: s.substr(a, b))
of pkBackRefIgnoreStyle:
n = Peg(kind: pkTerminalIgnoreStyle, term: s.substr(a, b))
of pkBackRefIgnoreCase:
n = Peg(kind: pkTerminalIgnoreCase, term: s.substr(a, b))
else: assert(false, "impossible case")
mopProc(s, n, start, c)
case p.kind
of pkEmpty:
enter(pkEmpty, s, p, start)
result = 0 # match of length 0
leave(pkEmpty, s, p, start, result)
of pkAny:
enter(pkAny, s, p, start)
if start < s.len: result = 1
else: result = -1
leave(pkAny, s, p, start, result)
of pkAnyRune:
enter(pkAnyRune, s, p, start)
if start < s.len:
result = runeLenAt(s, start)
else:
result = -1
leave(pkAnyRune, s, p, start, result)
of pkLetter:
enter(pkLetter, s, p, start)
if start < s.len:
var a: Rune
result = start
fastRuneAt(s, result, a)
if isAlpha(a): dec(result, start)
else: result = -1
else:
result = -1
leave(pkLetter, s, p, start, result)
of pkLower:
enter(pkLower, s, p, start)
if start < s.len:
var a: Rune
result = start
fastRuneAt(s, result, a)
if isLower(a): dec(result, start)
else: result = -1
else:
result = -1
leave(pkLower, s, p, start, result)
of pkUpper:
enter(pkUpper, s, p, start)
if start < s.len:
var a: Rune
result = start
fastRuneAt(s, result, a)
if isUpper(a): dec(result, start)
else: result = -1
else:
result = -1
leave(pkUpper, s, p, start, result)
of pkTitle:
enter(pkTitle, s, p, start)
if start < s.len:
var a: Rune
result = start
fastRuneAt(s, result, a)
if isTitle(a): dec(result, start)
else: result = -1
else:
result = -1
leave(pkTitle, s, p, start, result)
of pkWhitespace:
enter(pkWhitespace, s, p, start)
if start < s.len:
var a: Rune
result = start
fastRuneAt(s, result, a)
if isWhiteSpace(a): dec(result, start)
else: result = -1
else:
result = -1
leave(pkWhitespace, s, p, start, result)
of pkGreedyAny:
enter(pkGreedyAny, s, p, start)
result = len(s) - start
leave(pkGreedyAny, s, p, start, result)
of pkNewLine:
enter(pkNewLine, s, p, start)
if start < s.len and s[start] == '\L': result = 1
elif start < s.len and s[start] == '\C':
if start+1 < s.len and s[start+1] == '\L': result = 2
else: result = 1
else: result = -1
leave(pkNewLine, s, p, start, result)
of pkTerminal:
enter(pkTerminal, s, p, start)
result = len(p.term)
for i in 0..result-1:
if start+i >= s.len or p.term[i] != s[start+i]:
result = -1
break
leave(pkTerminal, s, p, start, result)
of pkTerminalIgnoreCase:
enter(pkTerminalIgnoreCase, s, p, start)
var
i = 0
a, b: Rune
result = start
while i < len(p.term):
if result >= s.len:
result = -1
break
fastRuneAt(p.term, i, a)
fastRuneAt(s, result, b)
if toLower(a) != toLower(b):
result = -1
break
dec(result, start)
leave(pkTerminalIgnoreCase, s, p, start, result)
of pkTerminalIgnoreStyle:
enter(pkTerminalIgnoreStyle, s, p, start)
var
i = 0
a, b: Rune
result = start
while i < len(p.term):
while i < len(p.term):
fastRuneAt(p.term, i, a)
if a != Rune('_'): break
while result < s.len:
fastRuneAt(s, result, b)
if b != Rune('_'): break
if result >= s.len:
if i >= p.term.len: break
else:
result = -1
break
elif toLower(a) != toLower(b):
result = -1
break
dec(result, start)
leave(pkTerminalIgnoreStyle, s, p, start, result)
of pkChar:
enter(pkChar, s, p, start)
if start < s.len and p.ch == s[start]: result = 1
else: result = -1
leave(pkChar, s, p, start, result)
of pkCharChoice:
enter(pkCharChoice, s, p, start)
if start < s.len and contains(p.charChoice[], s[start]): result = 1
else: result = -1
leave(pkCharChoice, s, p, start, result)
of pkNonTerminal:
enter(pkNonTerminal, s, p, start)
var oldMl = c.ml
when false: echo "enter: ", p.nt.name
result = mopProc(s, p.nt.rule, start, c)
when false: echo "leave: ", p.nt.name
if result < 0: c.ml = oldMl
leave(pkNonTerminal, s, p, start, result)
of pkSequence:
enter(pkSequence, s, p, start)
var oldMl = c.ml
result = 0
for i in 0..high(p.sons):
var x = mopProc(s, p.sons[i], start+result, c)
if x < 0:
c.ml = oldMl
result = -1
break
else: inc(result, x)
leave(pkSequence, s, p, start, result)
of pkOrderedChoice:
enter(pkOrderedChoice, s, p, start)
var oldMl = c.ml
for i in 0..high(p.sons):
result = mopProc(s, p.sons[i], start, c)
if result >= 0: break
c.ml = oldMl
leave(pkOrderedChoice, s, p, start, result)
of pkSearch:
enter(pkSearch, s, p, start)
var oldMl = c.ml
result = 0
while start+result <= s.len:
var x = mopProc(s, p.sons[0], start+result, c)
if x >= 0:
inc(result, x)
leave(pkSearch, s, p, start, result)
return
inc(result)
result = -1
c.ml = oldMl
leave(pkSearch, s, p, start, result)
of pkCapturedSearch:
enter(pkCapturedSearch, s, p, start)
var idx = c.ml # reserve a slot for the subpattern
inc(c.ml)
result = 0
while start+result <= s.len:
var x = mopProc(s, p.sons[0], start+result, c)
if x >= 0:
if idx < MaxSubpatterns:
c.matches[idx] = (start, start+result-1)
#else: silently ignore the capture
inc(result, x)
leave(pkCapturedSearch, s, p, start, result)
return
inc(result)
result = -1
c.ml = idx
leave(pkCapturedSearch, s, p, start, result)
of pkGreedyRep:
enter(pkGreedyRep, s, p, start)
result = 0
while true:
var x = mopProc(s, p.sons[0], start+result, c)
# if x == 0, we have an endless loop; so the correct behaviour would be
# not to break. But endless loops can be easily introduced:
# ``(comment / \w*)*`` is such an example. Breaking for x == 0 does the
# expected thing in this case.
if x <= 0: break
inc(result, x)
leave(pkGreedyRep, s, p, start, result)
of pkGreedyRepChar:
enter(pkGreedyRepChar, s, p, start)
result = 0
var ch = p.ch
while start+result < s.len and ch == s[start+result]: inc(result)
leave(pkGreedyRepChar, s, p, start, result)
of pkGreedyRepSet:
enter(pkGreedyRepSet, s, p, start)
result = 0
while start+result < s.len and contains(p.charChoice[], s[start+result]):
inc(result)
leave(pkGreedyRepSet, s, p, start, result)
of pkOption:
enter(pkOption, s, p, start)
result = max(0, mopProc(s, p.sons[0], start, c))
leave(pkOption, s, p, start, result)
of pkAndPredicate:
enter(pkAndPredicate, s, p, start)
var oldMl = c.ml
result = mopProc(s, p.sons[0], start, c)
if result >= 0: result = 0 # do not consume anything
else: c.ml = oldMl
leave(pkAndPredicate, s, p, start, result)
of pkNotPredicate:
enter(pkNotPredicate, s, p, start)
var oldMl = c.ml
result = mopProc(s, p.sons[0], start, c)
if result < 0: result = 0
else:
c.ml = oldMl
result = -1
leave(pkNotPredicate, s, p, start, result)
of pkCapture:
enter(pkCapture, s, p, start)
if p.sons.len == 0 or p.sons[0].kind == pkEmpty:
# empty capture removes last match
dec(c.ml)
c.matches[c.ml] = (0, 0)
result = 0 # match of length 0
else:
var idx = c.ml # reserve a slot for the subpattern
result = mopProc(s, p.sons[0], start, c)
if result >= 0:
if idx < MaxSubpatterns:
if idx != c.ml:
for i in countdown(c.ml, idx):
c.matches[i+1] = c.matches[i]
c.matches[idx] = (start, start+result-1)
#else: silently ignore the capture
inc(c.ml)
leave(pkCapture, s, p, start, result)
of pkBackRef:
enter(pkBackRef, s, p, start)
result = matchBackRef(s, p, start, c)
leave(pkBackRef, s, p, start, result)
of pkBackRefIgnoreCase:
enter(pkBackRefIgnoreCase, s, p, start)
result = matchBackRef(s, p, start, c)
leave(pkBackRefIgnoreCase, s, p, start, result)
of pkBackRefIgnoreStyle:
enter(pkBackRefIgnoreStyle, s, p, start)
result = matchBackRef(s, p, start, c)
leave(pkBackRefIgnoreStyle, s, p, start, result)
of pkStartAnchor:
enter(pkStartAnchor, s, p, start)
if c.origStart == start: result = 0
else: result = -1
leave(pkStartAnchor, s, p, start, result)
of pkRule, pkList: assert false
func rawMatch*(s: string, p: Peg, start: int, c: var Captures): int
{.rtl, extern: "npegs$1".} =
## low-level matching proc that implements the PEG interpreter. Use this
## for maximum efficiency (every other PEG operation ends up calling this
## proc).
## Returns -1 if it does not match, else the length of the match
# Set the handler generators to produce do-nothing handlers.
template enter(pk, s, p, start) =
discard
template leave(pk, s, p, start, length) =
discard
matchOrParse(matchIt)
{.cast(noSideEffect).}:
# This cast is allowed because the `matchOrParse` template is used for
# both matching and parsing, but side effects are only possible when it's
# used by `eventParser`.
result = matchIt(s, p, start, c)
macro mkHandlerTplts(handlers: untyped): untyped =
# Transforms the handler spec in *handlers* into handler templates.
# The AST structure of *handlers[0]*:
#
# ```
# StmtList
# Call
# Ident "pkNonTerminal"
# StmtList
# Call
# Ident "enter"
# StmtList
# <handler code block>
# Call
# Ident "leave"
# StmtList
# <handler code block>
# Call
# Ident "pkChar"
# StmtList
# Call
# Ident "leave"
# StmtList
# <handler code block>
# ...
# ```
func mkEnter(hdName, body: NimNode): NimNode =
template helper(hdName, body) {.dirty.} =
template hdName(s, p, start) =
let s {.inject.} = s
let p {.inject.} = p
let start {.inject.} = start
body
result = getAst(helper(hdName, body))
template mkLeave(hdPostf, body) {.dirty.} =
# this has to be dirty to be able to capture *result* as *length* in
# *leaveXX* calls.
template `leave hdPostf`(s, p, start, length) =
body
result = newStmtList()
for topCall in handlers[0]:
if topCall.kind notin nnkCallKinds:
error("Call syntax expected.", topCall)
let pegKind = topCall[0]
if pegKind.kind notin {nnkIdent, nnkSym}:
error("PegKind expected.", pegKind)
if 2 == topCall.len:
for hdDef in topCall[1]:
if hdDef.kind notin nnkCallKinds:
error("Call syntax expected.", hdDef)
if hdDef[0].kind notin {nnkIdent, nnkSym}:
error("Handler identifier expected.", hdDef[0])
if 2 == hdDef.len:
let hdPostf = substr(pegKind.strVal, 2)
case hdDef[0].strVal
of "enter":
result.add mkEnter(newIdentNode("enter" & hdPostf), hdDef[1])
of "leave":
result.add getAst(mkLeave(ident(hdPostf), hdDef[1]))
else:
error(
"Unsupported handler identifier, expected 'enter' or 'leave'.",
hdDef[0]
)
template eventParser*(pegAst, handlers: untyped): (proc(s: string): int) =
## Generates an interpreting event parser *proc* according to the specified
## PEG AST and handler code blocks. The *proc* can be called with a string
## to be parsed and will execute the handler code blocks whenever their
## associated grammar element is matched. It returns -1 if the string does not
## match, else the length of the total match. The following example code
## evaluates an arithmetic expression defined by a simple PEG:
##
## ```nim
## import std/[strutils, pegs]
##
## let
## pegAst = """
## Expr <- Sum
## Sum <- Product (('+' / '-')Product)*
## Product <- Value (('*' / '/')Value)*
## Value <- [0-9]+ / '(' Expr ')'
## """.peg
## txt = "(5+3)/2-7*22"
##
## var
## pStack: seq[string] = @[]
## valStack: seq[float] = @[]
## opStack = ""
## let
## parseArithExpr = pegAst.eventParser:
## pkNonTerminal:
## enter:
## pStack.add p.nt.name
## leave:
## pStack.setLen pStack.high
## if length > 0:
## let matchStr = s.substr(start, start+length-1)
## case p.nt.name
## of "Value":
## try:
## valStack.add matchStr.parseFloat
## echo valStack
## except ValueError:
## discard
## of "Sum", "Product":
## try:
## let val = matchStr.parseFloat
## except ValueError:
## if valStack.len > 1 and opStack.len > 0:
## valStack[^2] = case opStack[^1]
## of '+': valStack[^2] + valStack[^1]
## of '-': valStack[^2] - valStack[^1]
## of '*': valStack[^2] * valStack[^1]
## else: valStack[^2] / valStack[^1]
## valStack.setLen valStack.high
## echo valStack
## opStack.setLen opStack.high
## echo opStack
## pkChar:
## leave:
## if length == 1 and "Value" != pStack[^1]:
## let matchChar = s[start]
## opStack.add matchChar
## echo opStack
##
## let pLen = parseArithExpr(txt)
## ```
##
## The *handlers* parameter consists of code blocks for *PegKinds*,
## which define the grammar elements of interest. Each block can contain
## handler code to be executed when the parser enters and leaves text
## matching the grammar element. An *enter* handler can access the specific
## PEG AST node being matched as *p*, the entire parsed string as *s*
## and the position of the matched text segment in *s* as *start*. A *leave*
## handler can access *p*, *s*, *start* and also the length of the matched
## text segment as *length*. For an unsuccessful match, the *enter* and
## *leave* handlers will be executed, with *length* set to -1.
##
## Symbols declared in an *enter* handler can be made visible in the
## corresponding *leave* handler by annotating them with an *inject* pragma.
proc rawParse(s: string, p: Peg, start: int, c: var Captures): int
{.gensym.} =
# binding from *macros*
bind strVal
mkHandlerTplts:
handlers
macro enter(pegKind, s, pegNode, start: untyped): untyped =
# This is called by the matcher code in *matchOrParse* at the
# start of the code for a grammar element of kind *pegKind*.
# Expands to a call to the handler template if one was generated
# by *mkHandlerTplts*.
template mkDoEnter(hdPostf, s, pegNode, start) =
when declared(`enter hdPostf`):
`enter hdPostf`(s, pegNode, start)
else:
discard
let hdPostf = ident(substr($pegKind, 2))
getAst(mkDoEnter(hdPostf, s, pegNode, start))
macro leave(pegKind, s, pegNode, start, length: untyped): untyped =
# Like *enter*, but called at the end of the matcher code for
# a grammar element of kind *pegKind*.
template mkDoLeave(hdPostf, s, pegNode, start, length) =
when declared(`leave hdPostf`):
`leave hdPostf`(s, pegNode, start, length)
else:
discard
let hdPostf = ident(substr($pegKind, 2))
getAst(mkDoLeave(hdPostf, s, pegNode, start, length))
matchOrParse(parseIt)
parseIt(s, p, start, c)
proc parser(s: string): int {.gensym.} =
# the proc to be returned
var
ms: array[MaxSubpatterns, (int, int)]
cs = Captures(matches: ms, ml: 0, origStart: 0)
rawParse(s, pegAst, 0, cs)
parser
template fillMatches(s, caps, c) =
for k in 0..c.ml-1:
let startIdx = c.matches[k][0]
let endIdx = c.matches[k][1]
if startIdx != -1:
caps[k] = substr(s, startIdx, endIdx)
else:
caps[k] = ""
func matchLen*(s: string, pattern: Peg, matches: var openArray[string],
start = 0): int {.rtl, extern: "npegs$1Capture".} =
## the same as ``match``, but it returns the length of the match,
## if there is no match, -1 is returned. Note that a match length
## of zero can happen. It's possible that a suffix of `s` remains
## that does not belong to the match.
var c: Captures
c.origStart = start
result = rawMatch(s, pattern, start, c)
if result >= 0: fillMatches(s, matches, c)
func matchLen*(s: string, pattern: Peg,
start = 0): int {.rtl, extern: "npegs$1".} =
## the same as ``match``, but it returns the length of the match,
## if there is no match, -1 is returned. Note that a match length
## of zero can happen. It's possible that a suffix of `s` remains
## that does not belong to the match.
var c: Captures
c.origStart = start
result = rawMatch(s, pattern, start, c)
func match*(s: string, pattern: Peg, matches: var openArray[string],
start = 0): bool {.rtl, extern: "npegs$1Capture".} =
## returns ``true`` if ``s[start..]`` matches the ``pattern`` and
## the captured substrings in the array ``matches``. If it does not
## match, nothing is written into ``matches`` and ``false`` is
## returned.
result = matchLen(s, pattern, matches, start) != -1
func match*(s: string, pattern: Peg,
start = 0): bool {.rtl, extern: "npegs$1".} =
## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.
result = matchLen(s, pattern, start) != -1
func find*(s: string, pattern: Peg, matches: var openArray[string],
start = 0): int {.rtl, extern: "npegs$1Capture".} =
## returns the starting position of ``pattern`` in ``s`` and the captured
## substrings in the array ``matches``. If it does not match, nothing
## is written into ``matches`` and -1 is returned.
var c: Captures
c.origStart = start
for i in start .. s.len-1:
c.ml = 0
if rawMatch(s, pattern, i, c) >= 0:
fillMatches(s, matches, c)
return i
return -1
# could also use the pattern here: (!P .)* P
func findBounds*(s: string, pattern: Peg, matches: var openArray[string],
start = 0): tuple[first, last: int] {.
rtl, extern: "npegs$1Capture".} =
## returns the starting position and end position of ``pattern`` in ``s``
## and the captured
## substrings in the array ``matches``. If it does not match, nothing
## is written into ``matches`` and (-1,0) is returned.
var c: Captures
c.origStart = start
for i in start .. s.len-1:
c.ml = 0
var L = rawMatch(s, pattern, i, c)
if L >= 0:
fillMatches(s, matches, c)
return (i, i+L-1)
return (-1, 0)
func find*(s: string, pattern: Peg,
start = 0): int {.rtl, extern: "npegs$1".} =
## returns the starting position of ``pattern`` in ``s``. If it does not
## match, -1 is returned.
var c: Captures
c.origStart = start
for i in start .. s.len-1:
if rawMatch(s, pattern, i, c) >= 0: return i
return -1
iterator findAll*(s: string, pattern: Peg, start = 0): string =
## yields all matching *substrings* of `s` that match `pattern`.
var c: Captures
c.origStart = start
var i = start
while i < s.len:
c.ml = 0
var L = rawMatch(s, pattern, i, c)
if L < 0:
inc(i, 1)
else:
yield substr(s, i, i+L-1)
inc(i, L)
func findAll*(s: string, pattern: Peg, start = 0): seq[string] {.
rtl, extern: "npegs$1".} =
## returns all matching *substrings* of `s` that match `pattern`.
## If it does not match, `@[]` is returned.
result = @[]
for it in findAll(s, pattern, start): result.add it
template `=~`*(s: string, pattern: Peg): bool =
## This calls ``match`` with an implicit declared ``matches`` array that
## can be used in the scope of the ``=~`` call:
##
## ```nim
## if line =~ peg"\s* {\w+} \s* '=' \s* {\w+}":
## # matches a key=value pair:
## echo("Key: ", matches[0])
## echo("Value: ", matches[1])
## elif line =~ peg"\s*{'#'.*}":
## # matches a comment
## # note that the implicit ``matches`` array is different from the
## # ``matches`` array of the first branch
## echo("comment: ", matches[0])
## else:
## echo("syntax error")
## ```
bind MaxSubpatterns
when not declaredInScope(matches):
var matches {.inject.} = default(array[0..MaxSubpatterns-1, string])
match(s, pattern, matches)
# ------------------------- more string handling ------------------------------
func contains*(s: string, pattern: Peg, start = 0): bool {.
rtl, extern: "npegs$1".} =
## same as ``find(s, pattern, start) >= 0``
return find(s, pattern, start) >= 0
func contains*(s: string, pattern: Peg, matches: var openArray[string],
start = 0): bool {.rtl, extern: "npegs$1Capture".} =
## same as ``find(s, pattern, matches, start) >= 0``
return find(s, pattern, matches, start) >= 0
func startsWith*(s: string, prefix: Peg, start = 0): bool {.
rtl, extern: "npegs$1".} =
## returns true if `s` starts with the pattern `prefix`
result = matchLen(s, prefix, start) >= 0
func endsWith*(s: string, suffix: Peg, start = 0): bool {.
rtl, extern: "npegs$1".} =
## returns true if `s` ends with the pattern `suffix`
var c: Captures
c.origStart = start
for i in start .. s.len-1:
if rawMatch(s, suffix, i, c) == s.len - i: return true
func replacef*(s: string, sub: Peg, by: string): string {.
rtl, extern: "npegs$1".} =
## Replaces `sub` in `s` by the string `by`. Captures can be accessed in `by`
## with the notation ``$i`` and ``$#`` (see strutils.`%`). Examples:
##
## ```nim
## "var1=key; var2=key2".replacef(peg"{\ident}'='{\ident}", "$1<-$2$2")
## ```
##
## Results in:
##
## ```nim
## "var1<-keykey; val2<-key2key2"
## ```
result = ""
var i = 0
var caps: array[0..MaxSubpatterns-1, string]
var c: Captures
while i < s.len:
c.ml = 0
var x = rawMatch(s, sub, i, c)
if x <= 0:
add(result, s[i])
inc(i)
else:
fillMatches(s, caps, c)
addf(result, by, caps)
inc(i, x)
add(result, substr(s, i))
func replace*(s: string, sub: Peg, by = ""): string {.
rtl, extern: "npegs$1".} =
## Replaces `sub` in `s` by the string `by`. Captures cannot be accessed
## in `by`.
result = ""
var i = 0
var c: Captures
while i < s.len:
var x = rawMatch(s, sub, i, c)
if x <= 0:
add(result, s[i])
inc(i)
else:
add(result, by)
inc(i, x)
add(result, substr(s, i))
func parallelReplace*(s: string, subs: varargs[
tuple[pattern: Peg, repl: string]]): string {.
rtl, extern: "npegs$1".} =
## Returns a modified copy of `s` with the substitutions in `subs`
## applied in parallel.
result = ""
var i = 0
var c: Captures
var caps: array[0..MaxSubpatterns-1, string]
while i < s.len:
block searchSubs:
for j in 0..high(subs):
c.ml = 0
var x = rawMatch(s, subs[j][0], i, c)
if x > 0:
fillMatches(s, caps, c)
addf(result, subs[j][1], caps)
inc(i, x)
break searchSubs
add(result, s[i])
inc(i)
# copy the rest:
add(result, substr(s, i))
when not defined(nimHasEffectsOf):
{.pragma: effectsOf.}
func replace*(s: string, sub: Peg, cb: proc(
match: int, cnt: int, caps: openArray[string]): string): string {.
rtl, extern: "npegs$1cb", effectsOf: cb.} =
## Replaces `sub` in `s` by the resulting strings from the callback.
## The callback proc receives the index of the current match (starting with 0),
## the count of captures and an open array with the captures of each match. Examples:
##
## ```nim
## func handleMatches*(m: int, n: int, c: openArray[string]): string =
## result = ""
## if m > 0:
## result.add ", "
## result.add case n:
## of 2: c[0].toLower & ": '" & c[1] & "'"
## of 1: c[0].toLower & ": ''"
## else: ""
##
## let s = "Var1=key1;var2=Key2; VAR3"
## echo s.replace(peg"{\ident}('='{\ident})* ';'* \s*", handleMatches)
## ```
##
## Results in:
##
## ```nim
## "var1: 'key1', var2: 'Key2', var3: ''"
## ```
result = ""
var i = 0
var caps: array[0..MaxSubpatterns-1, string]
var c: Captures
var m = 0
while i < s.len:
c.ml = 0
var x = rawMatch(s, sub, i, c)
if x <= 0:
add(result, s[i])
inc(i)
else:
fillMatches(s, caps, c)
add(result, cb(m, c.ml, caps))
inc(i, x)
inc(m)
add(result, substr(s, i))
when not defined(js):
proc transformFile*(infile, outfile: string,
subs: varargs[tuple[pattern: Peg, repl: string]]) {.
rtl, extern: "npegs$1".} =
## reads in the file `infile`, performs a parallel replacement (calls
## `parallelReplace`) and writes back to `outfile`. Raises ``IOError`` if an
## error occurs. This is supposed to be used for quick scripting.
##
## **Note**: this proc does not exist while using the JS backend.
var x = readFile(infile)
writeFile(outfile, x.parallelReplace(subs))
iterator split*(s: string, sep: Peg): string =
## Splits the string `s` into substrings.
##
## Substrings are separated by the PEG `sep`.
## Examples:
##
## ```nim
## for word in split("00232this02939is39an22example111", peg"\d+"):
## writeLine(stdout, word)
## ```
##
## Results in:
##
## ```nim
## "this"
## "is"
## "an"
## "example"
## ```
var c: Captures
var
first = 0
last = 0
while last < len(s):
c.ml = 0
var x = rawMatch(s, sep, last, c)
if x > 0: inc(last, x)
first = last
while last < len(s):
inc(last)
c.ml = 0
x = rawMatch(s, sep, last, c)
if x > 0: break
if first < last:
yield substr(s, first, last-1)
func split*(s: string, sep: Peg): seq[string] {.
rtl, extern: "npegs$1".} =
## Splits the string `s` into substrings.
result = @[]
for it in split(s, sep): result.add it
# ------------------- scanner -------------------------------------------------
type
Modifier = enum
modNone,
modVerbatim,
modIgnoreCase,
modIgnoreStyle
TokKind = enum ## enumeration of all tokens
tkInvalid, ## invalid token
tkEof, ## end of file reached
tkAny, ## .
tkAnyRune, ## _
tkIdentifier, ## abc
tkStringLit, ## "abc" or 'abc'
tkCharSet, ## [^A-Z]
tkParLe, ## '('
tkParRi, ## ')'
tkCurlyLe, ## '{'
tkCurlyRi, ## '}'
tkCurlyAt, ## '{@}'
tkEmptyCurl, ## '{}'
tkArrow, ## '<-'
tkBar, ## '/'
tkStar, ## '*'
tkPlus, ## '+'
tkAmp, ## '&'
tkNot, ## '!'
tkOption, ## '?'
tkAt, ## '@'
tkBuiltin, ## \identifier
tkEscaped, ## \\
tkBackref, ## '$'
tkDollar, ## '$'
tkHat ## '^'
Token {.final.} = object ## a token
kind: TokKind ## the type of the token
modifier: Modifier
literal: string ## the parsed (string) literal
charset: set[char] ## if kind == tkCharSet
index: int ## if kind == tkBackref
PegLexer {.inheritable.} = object ## the lexer object.
bufpos: int ## the current position within the buffer
buf: string ## the buffer itself
lineNumber: int ## the current line number
lineStart: int ## index of last line start in buffer
colOffset: int ## column to add
filename: string
const
tokKindToStr: array[TokKind, string] = [
"invalid", "[EOF]", ".", "_", "identifier", "string literal",
"character set", "(", ")", "{", "}", "{@}", "{}",
"<-", "/", "*", "+", "&", "!", "?",
"@", "built-in", "escaped", "$", "$", "^"
]
func handleCR(L: var PegLexer, pos: int): int =
assert(L.buf[pos] == '\c')
inc(L.lineNumber)
result = pos+1
if result < L.buf.len and L.buf[result] == '\L': inc(result)
L.lineStart = result
func handleLF(L: var PegLexer, pos: int): int =
assert(L.buf[pos] == '\L')
inc(L.lineNumber)
result = pos+1
L.lineStart = result
func init(L: var PegLexer, input, filename: string, line = 1, col = 0) =
L.buf = input
L.bufpos = 0
L.lineNumber = line
L.colOffset = col
L.lineStart = 0
L.filename = filename
func getColumn(L: PegLexer): int {.inline.} =
result = abs(L.bufpos - L.lineStart) + L.colOffset
func getLine(L: PegLexer): int {.inline.} =
result = L.lineNumber
func errorStr(L: PegLexer, msg: string, line = -1, col = -1): string =
var line = if line < 0: getLine(L) else: line
var col = if col < 0: getColumn(L) else: col
result = "$1($2, $3) Error: $4" % [L.filename, $line, $col, msg]
func getEscapedChar(c: var PegLexer, tok: var Token) =
inc(c.bufpos)
if c.bufpos >= len(c.buf):
tok.kind = tkInvalid
return
case c.buf[c.bufpos]
of 'r', 'R', 'c', 'C':
add(tok.literal, '\c')
inc(c.bufpos)
of 'l', 'L':
add(tok.literal, '\L')
inc(c.bufpos)
of 'f', 'F':
add(tok.literal, '\f')
inc(c.bufpos)
of 'e', 'E':
add(tok.literal, '\e')
inc(c.bufpos)
of 'a', 'A':
add(tok.literal, '\a')
inc(c.bufpos)
of 'b', 'B':
add(tok.literal, '\b')
inc(c.bufpos)
of 'v', 'V':
add(tok.literal, '\v')
inc(c.bufpos)
of 't', 'T':
add(tok.literal, '\t')
inc(c.bufpos)
of 'x', 'X':
inc(c.bufpos)
if c.bufpos >= len(c.buf):
tok.kind = tkInvalid
return
var xi = 0
if handleHexChar(c.buf[c.bufpos], xi):
inc(c.bufpos)
if handleHexChar(c.buf[c.bufpos], xi):
inc(c.bufpos)
if xi == 0: tok.kind = tkInvalid
else: add(tok.literal, chr(xi))
of '0'..'9':
var val = ord(c.buf[c.bufpos]) - ord('0')
inc(c.bufpos)
var i = 1
while (c.bufpos < len(c.buf)) and (i <= 3) and (c.buf[c.bufpos] in {'0'..'9'}):
val = val * 10 + ord(c.buf[c.bufpos]) - ord('0')
inc(c.bufpos)
inc(i)
if val > 0 and val <= 255: add(tok.literal, chr(val))
else: tok.kind = tkInvalid
of '\0'..'\31':
tok.kind = tkInvalid
elif c.buf[c.bufpos] in strutils.Letters:
tok.kind = tkInvalid
else:
add(tok.literal, c.buf[c.bufpos])
inc(c.bufpos)
func skip(c: var PegLexer) =
var pos = c.bufpos
while pos < c.buf.len:
case c.buf[pos]
of ' ', '\t':
inc(pos)
of '#':
while (pos < c.buf.len) and
not (c.buf[pos] in {'\c', '\L', '\0'}): inc(pos)
of '\c':
pos = handleCR(c, pos)
of '\L':
pos = handleLF(c, pos)
else:
break # EndOfFile also leaves the loop
c.bufpos = pos
func getString(c: var PegLexer, tok: var Token) =
tok.kind = tkStringLit
var pos = c.bufpos + 1
var quote = c.buf[pos-1]
while pos < c.buf.len:
case c.buf[pos]
of '\\':
c.bufpos = pos
getEscapedChar(c, tok)
pos = c.bufpos
of '\c', '\L', '\0':
tok.kind = tkInvalid
break
elif c.buf[pos] == quote:
inc(pos)
break
else:
add(tok.literal, c.buf[pos])
inc(pos)
c.bufpos = pos
func getDollar(c: var PegLexer, tok: var Token) =
var pos = c.bufpos + 1
var neg = false
if pos < c.buf.len and c.buf[pos] == '^':
neg = true
inc(pos)
if pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
tok.kind = tkBackref
tok.index = 0
while pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
tok.index = tok.index * 10 + ord(c.buf[pos]) - ord('0')
inc(pos)
if neg:
tok.index = -tok.index
else:
if neg:
dec(pos)
tok.kind = tkDollar
c.bufpos = pos
func getCharSet(c: var PegLexer, tok: var Token) =
tok.kind = tkCharSet
tok.charset = {}
var pos = c.bufpos + 1
var caret = false
if pos < c.buf.len:
if c.buf[pos] == '^':
inc(pos)
caret = true
while pos < c.buf.len:
var ch: char
case c.buf[pos]
of ']':
if pos < c.buf.len: inc(pos)
break
of '\\':
c.bufpos = pos
getEscapedChar(c, tok)
pos = c.bufpos
ch = tok.literal[tok.literal.len-1]
of '\C', '\L', '\0':
tok.kind = tkInvalid
break
else:
ch = c.buf[pos]
inc(pos)
incl(tok.charset, ch)
if c.buf[pos] == '-':
if pos+1 < c.buf.len and c.buf[pos+1] == ']':
incl(tok.charset, '-')
inc(pos)
else:
if pos+1 < c.buf.len:
inc(pos)
else:
break
var ch2: char
case c.buf[pos]
of '\\':
c.bufpos = pos
getEscapedChar(c, tok)
pos = c.bufpos
ch2 = tok.literal[tok.literal.len-1]
of '\C', '\L', '\0':
tok.kind = tkInvalid
break
else:
if pos+1 < c.buf.len:
ch2 = c.buf[pos]
inc(pos)
else:
break
for i in ord(ch)+1 .. ord(ch2):
incl(tok.charset, chr(i))
c.bufpos = pos
if caret: tok.charset = {'\1'..'\xFF'} - tok.charset
func getSymbol(c: var PegLexer, tok: var Token) =
var pos = c.bufpos
while pos < c.buf.len:
add(tok.literal, c.buf[pos])
inc(pos)
if pos < c.buf.len and c.buf[pos] notin strutils.IdentChars: break
c.bufpos = pos
tok.kind = tkIdentifier
func getBuiltin(c: var PegLexer, tok: var Token) =
if c.bufpos+1 < c.buf.len and c.buf[c.bufpos+1] in strutils.Letters:
inc(c.bufpos)
getSymbol(c, tok)
tok.kind = tkBuiltin
else:
tok.kind = tkEscaped
getEscapedChar(c, tok) # may set tok.kind to tkInvalid
func getTok(c: var PegLexer, tok: var Token) =
tok.kind = tkInvalid
tok.modifier = modNone
setLen(tok.literal, 0)
skip(c)
if c.bufpos >= c.buf.len:
tok.kind = tkEof
tok.literal = "[EOF]"
add(tok.literal, '\0')
inc(c.bufpos)
return
case c.buf[c.bufpos]
of '{':
inc(c.bufpos)
if c.buf[c.bufpos] == '@' and c.bufpos+2 < c.buf.len and
c.buf[c.bufpos+1] == '}':
tok.kind = tkCurlyAt
inc(c.bufpos, 2)
add(tok.literal, "{@}")
elif c.buf[c.bufpos] == '}' and c.bufpos < c.buf.len:
tok.kind = tkEmptyCurl
inc(c.bufpos)
add(tok.literal, "{}")
else:
tok.kind = tkCurlyLe
add(tok.literal, '{')
of '}':
tok.kind = tkCurlyRi
inc(c.bufpos)
add(tok.literal, '}')
of '[':
getCharSet(c, tok)
of '(':
tok.kind = tkParLe
inc(c.bufpos)
add(tok.literal, '(')
of ')':
tok.kind = tkParRi
inc(c.bufpos)
add(tok.literal, ')')
of '.':
tok.kind = tkAny
inc(c.bufpos)
add(tok.literal, '.')
of '_':
tok.kind = tkAnyRune
inc(c.bufpos)
add(tok.literal, '_')
of '\\':
getBuiltin(c, tok)
of '\'', '"': getString(c, tok)
of '$': getDollar(c, tok)
of 'a'..'z', 'A'..'Z', '\128'..'\255':
getSymbol(c, tok)
if c.bufpos >= c.buf.len:
return
if c.buf[c.bufpos] in {'\'', '"'} or
c.buf[c.bufpos] == '$' and c.bufpos+1 < c.buf.len and
c.buf[c.bufpos+1] in {'^', '0'..'9'}:
case tok.literal
of "i": tok.modifier = modIgnoreCase
of "y": tok.modifier = modIgnoreStyle
of "v": tok.modifier = modVerbatim
else: discard
setLen(tok.literal, 0)
if c.buf[c.bufpos] == '$':
getDollar(c, tok)
else:
getString(c, tok)
if tok.modifier == modNone: tok.kind = tkInvalid
of '+':
tok.kind = tkPlus
inc(c.bufpos)
add(tok.literal, '+')
of '*':
tok.kind = tkStar
inc(c.bufpos)
add(tok.literal, '+')
of '<':
if c.bufpos+2 < c.buf.len and c.buf[c.bufpos+1] == '-':
inc(c.bufpos, 2)
tok.kind = tkArrow
add(tok.literal, "<-")
else:
add(tok.literal, '<')
of '/':
tok.kind = tkBar
inc(c.bufpos)
add(tok.literal, '/')
of '?':
tok.kind = tkOption
inc(c.bufpos)
add(tok.literal, '?')
of '!':
tok.kind = tkNot
inc(c.bufpos)
add(tok.literal, '!')
of '&':
tok.kind = tkAmp
inc(c.bufpos)
add(tok.literal, '!')
of '@':
tok.kind = tkAt
inc(c.bufpos)
add(tok.literal, '@')
if c.buf[c.bufpos] == '@':
tok.kind = tkCurlyAt
inc(c.bufpos)
add(tok.literal, '@')
of '^':
tok.kind = tkHat
inc(c.bufpos)
add(tok.literal, '^')
else:
if c.bufpos >= c.buf.len:
tok.kind = tkEof
tok.literal = "[EOF]"
add(tok.literal, c.buf[c.bufpos])
inc(c.bufpos)
func arrowIsNextTok(c: PegLexer): bool =
# the only look ahead we need
var pos = c.bufpos
while pos < c.buf.len and c.buf[pos] in {'\t', ' '}: inc(pos)
if pos+1 >= c.buf.len:
return
result = c.buf[pos] == '<' and c.buf[pos+1] == '-'
# ----------------------------- parser ----------------------------------------
type
EInvalidPeg* = object of ValueError ## raised if an invalid
## PEG has been detected
PegParser = object of PegLexer ## the PEG parser object
tok: Token
nonterms: seq[NonTerminal]
modifier: Modifier
captures: int
identIsVerbatim: bool
skip: Peg
func pegError(p: PegParser, msg: string, line = -1, col = -1) =
var e = (ref EInvalidPeg)(msg: errorStr(p, msg, line, col))
raise e
func getTok(p: var PegParser) =
getTok(p, p.tok)
if p.tok.kind == tkInvalid: pegError(p, "'" & p.tok.literal & "' is invalid token")
func eat(p: var PegParser, kind: TokKind) =
if p.tok.kind == kind: getTok(p)
else: pegError(p, tokKindToStr[kind] & " expected")
func parseExpr(p: var PegParser): Peg {.gcsafe.}
func getNonTerminal(p: var PegParser, name: string): NonTerminal =
for i in 0..high(p.nonterms):
result = p.nonterms[i]
if cmpIgnoreStyle(result.name, name) == 0: return
# forward reference:
result = newNonTerminal(name, getLine(p), getColumn(p))
add(p.nonterms, result)
func modifiedTerm(s: string, m: Modifier): Peg =
case m
of modNone, modVerbatim: result = term(s)
of modIgnoreCase: result = termIgnoreCase(s)
of modIgnoreStyle: result = termIgnoreStyle(s)
func modifiedBackref(s: int, m: Modifier): Peg =
var
reverse = s < 0
index = if reverse: -s else: s
case m
of modNone, modVerbatim: result = backref(index, reverse)
of modIgnoreCase: result = backrefIgnoreCase(index, reverse)
of modIgnoreStyle: result = backrefIgnoreStyle(index, reverse)
func builtin(p: var PegParser): Peg =
# do not use "y", "skip" or "i" as these would be ambiguous
case p.tok.literal
of "n": result = newLine()
of "d": result = charSet({'0'..'9'})
of "D": result = charSet({'\1'..'\xff'} - {'0'..'9'})
of "s": result = charSet({' ', '\9'..'\13'})
of "S": result = charSet({'\1'..'\xff'} - {' ', '\9'..'\13'})
of "w": result = charSet({'a'..'z', 'A'..'Z', '_', '0'..'9'})
of "W": result = charSet({'\1'..'\xff'} - {'a'..'z', 'A'..'Z', '_', '0'..'9'})
of "a": result = charSet({'a'..'z', 'A'..'Z'})
of "A": result = charSet({'\1'..'\xff'} - {'a'..'z', 'A'..'Z'})
of "ident": result = pegs.ident
of "letter": result = unicodeLetter()
of "upper": result = unicodeUpper()
of "lower": result = unicodeLower()
of "title": result = unicodeTitle()
of "white": result = unicodeWhitespace()
else: pegError(p, "unknown built-in: " & p.tok.literal)
func token(terminal: Peg, p: PegParser): Peg =
if p.skip.kind == pkEmpty: result = terminal
else: result = sequence(p.skip, terminal)
func primary(p: var PegParser): Peg =
case p.tok.kind
of tkAmp:
getTok(p)
return &primary(p)
of tkNot:
getTok(p)
return !primary(p)
of tkAt:
getTok(p)
return !*primary(p)
of tkCurlyAt:
getTok(p)
return !*\primary(p).token(p)
else: discard
case p.tok.kind
of tkIdentifier:
if p.identIsVerbatim:
var m = p.tok.modifier
if m == modNone: m = p.modifier
result = modifiedTerm(p.tok.literal, m).token(p)
getTok(p)
elif not arrowIsNextTok(p):
var nt = getNonTerminal(p, p.tok.literal)
{.cast(noSideEffect).}:
incl(nt.flags, ntUsed)
result = nonterminal(nt).token(p)
getTok(p)
else:
pegError(p, "expression expected, but found: " & p.tok.literal)
of tkStringLit:
var m = p.tok.modifier
if m == modNone: m = p.modifier
result = modifiedTerm(p.tok.literal, m).token(p)
getTok(p)
of tkCharSet:
if '\0' in p.tok.charset:
pegError(p, "binary zero ('\\0') not allowed in character class")
result = charSet(p.tok.charset).token(p)
getTok(p)
of tkParLe:
getTok(p)
result = parseExpr(p)
eat(p, tkParRi)
of tkCurlyLe:
getTok(p)
result = capture(parseExpr(p)).token(p)
eat(p, tkCurlyRi)
inc(p.captures)
of tkEmptyCurl:
result = capture()
getTok(p)
of tkAny:
result = any().token(p)
getTok(p)
of tkAnyRune:
result = anyRune().token(p)
getTok(p)
of tkBuiltin:
result = builtin(p).token(p)
getTok(p)
of tkEscaped:
result = term(p.tok.literal[0]).token(p)
getTok(p)
of tkDollar:
result = endAnchor()
getTok(p)
of tkHat:
result = startAnchor()
getTok(p)
of tkBackref:
if abs(p.tok.index) > p.captures or p.tok.index == 0:
pegError(p, "invalid back reference index: " & $p.tok.index)
var m = p.tok.modifier
if m == modNone: m = p.modifier
result = modifiedBackref(p.tok.index, m).token(p)
getTok(p)
else:
pegError(p, "expression expected, but found: " & p.tok.literal)
getTok(p) # we must consume a token here to prevent endless loops!
while true:
case p.tok.kind
of tkOption:
result = ?result
getTok(p)
of tkStar:
result = *result
getTok(p)
of tkPlus:
result = +result
getTok(p)
else: break
func seqExpr(p: var PegParser): Peg =
result = primary(p)
while true:
case p.tok.kind
of tkAmp, tkNot, tkAt, tkStringLit, tkCharSet, tkParLe, tkCurlyLe,
tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref,
tkHat, tkCurlyAt, tkEmptyCurl:
result = sequence(result, primary(p))
of tkIdentifier:
if not arrowIsNextTok(p):
result = sequence(result, primary(p))
else: break
else: break
func parseExpr(p: var PegParser): Peg =
result = seqExpr(p)
while p.tok.kind == tkBar:
getTok(p)
result = result / seqExpr(p)
func parseRule(p: var PegParser): NonTerminal =
if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
result = getNonTerminal(p, p.tok.literal)
if ntDeclared in result.flags:
pegError(p, "attempt to redefine: " & result.name)
{.cast(noSideEffect).}:
result.line = getLine(p)
result.col = getColumn(p)
getTok(p)
eat(p, tkArrow)
{.cast(noSideEffect).}:
result.rule = parseExpr(p)
incl(result.flags, ntDeclared) # NOW inlining may be attempted
else:
pegError(p, "rule expected, but found: " & p.tok.literal)
func rawParse(p: var PegParser): Peg =
## parses a rule or a PEG expression
while p.tok.kind == tkBuiltin:
case p.tok.literal
of "i":
p.modifier = modIgnoreCase
getTok(p)
of "y":
p.modifier = modIgnoreStyle
getTok(p)
of "skip":
getTok(p)
p.skip = ?primary(p)
else: break
if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
result = parseRule(p).rule
while p.tok.kind != tkEof:
discard parseRule(p)
else:
p.identIsVerbatim = true
result = parseExpr(p)
if p.tok.kind != tkEof:
pegError(p, "EOF expected, but found: " & p.tok.literal)
for i in 0..high(p.nonterms):
var nt = p.nonterms[i]
if ntDeclared notin nt.flags:
pegError(p, "undeclared identifier: " & nt.name, nt.line, nt.col)
elif ntUsed notin nt.flags and i > 0:
pegError(p, "unused rule: " & nt.name, nt.line, nt.col)
func parsePeg*(pattern: string, filename = "pattern", line = 1, col = 0): Peg =
## constructs a Peg object from `pattern`. `filename`, `line`, `col` are
## used for error messages, but they only provide start offsets. `parsePeg`
## keeps track of line and column numbers within `pattern`.
var p: PegParser
init(PegLexer(p), pattern, filename, line, col)
p.tok.kind = tkInvalid
p.tok.modifier = modNone
p.tok.literal = ""
p.tok.charset = {}
p.nonterms = @[]
p.identIsVerbatim = false
getTok(p)
result = rawParse(p)
func peg*(pattern: string): Peg =
## constructs a Peg object from the `pattern`. The short name has been
## chosen to encourage its use as a raw string modifier:
##
## peg"{\ident} \s* '=' \s* {.*}"
result = parsePeg(pattern, "pattern")
func escapePeg*(s: string): string =
## escapes `s` so that it is matched verbatim when used as a peg.
result = ""
var inQuote = false
for c in items(s):
case c
of '\0'..'\31', '\'', '"', '\\':
if inQuote:
result.add('\'')
inQuote = false
result.add("\\x")
result.add(toHex(ord(c), 2))
else:
if not inQuote:
result.add('\'')
inQuote = true
result.add(c)
if inQuote: result.add('\'')