Files
Nim/lib/pure/pegs.nim
Andreas Rumpf 3812d91390 alternative, much simpler algorithm for strict func checking (#21066)
* alternative, much simpler algorithm for strict func checking

* forgot to git add new compiler module

* new spec is incredibly simple to describe

* fixes bigints regression

* typos

* closes #16305; closes #17387; closes #20863
2022-12-11 16:58:50 +01:00

2086 lines
64 KiB
Nim

#
#
# Nim's Runtime Library
# (c) Copyright 2012 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## Simple PEG (Parsing expression grammar) matching. Uses no memorization, but
## uses superoperators and symbol inlining to improve performance. Note:
## Matching performance is hopefully competitive with optimized regular
## expression engines.
##
## .. include:: ../../doc/pegdocs.txt
##
include "system/inclrtl"
when defined(nimPreviewSlimSystem):
import std/[syncio, assertions]
const
useUnicode = true ## change this to deactivate proper UTF-8 support
import strutils, macros
import std/private/decode_helpers
when useUnicode:
import unicode
export unicode.`==`
const
InlineThreshold = 5 ## number of leaves; -1 to disable inlining
MaxSubpatterns* = 20 ## defines the maximum number of subpatterns that
## can be captured. More subpatterns cannot be captured!
type
PegKind* = enum
pkEmpty,
pkAny, ## any character (.)
pkAnyRune, ## any Unicode character (_)
pkNewLine, ## CR-LF, LF, CR
pkLetter, ## Unicode letter
pkLower, ## Unicode lower case letter
pkUpper, ## Unicode upper case letter
pkTitle, ## Unicode title character
pkWhitespace, ## Unicode whitespace character
pkTerminal,
pkTerminalIgnoreCase,
pkTerminalIgnoreStyle,
pkChar, ## single character to match
pkCharChoice,
pkNonTerminal,
pkSequence, ## a b c ... --> Internal DSL: peg(a, b, c)
pkOrderedChoice, ## a / b / ... --> Internal DSL: a / b or /[a, b, c]
pkGreedyRep, ## a* --> Internal DSL: *a
## a+ --> (a a*)
pkGreedyRepChar, ## x* where x is a single character (superop)
pkGreedyRepSet, ## [set]* (superop)
pkGreedyAny, ## .* or _* (superop)
pkOption, ## a? --> Internal DSL: ?a
pkAndPredicate, ## &a --> Internal DSL: &a
pkNotPredicate, ## !a --> Internal DSL: !a
pkCapture, ## {a} --> Internal DSL: capture(a)
pkBackRef, ## $i --> Internal DSL: backref(i)
pkBackRefIgnoreCase,
pkBackRefIgnoreStyle,
pkSearch, ## @a --> Internal DSL: !*a
pkCapturedSearch, ## {@} a --> Internal DSL: !*\a
pkRule, ## a <- b
pkList, ## a, b
pkStartAnchor ## ^ --> Internal DSL: startAnchor()
NonTerminalFlag* = enum
ntDeclared, ntUsed
NonTerminalObj = object ## represents a non terminal symbol
name: string ## the name of the symbol
line: int ## line the symbol has been declared/used in
col: int ## column the symbol has been declared/used in
flags: set[NonTerminalFlag] ## the nonterminal's flags
rule: Peg ## the rule that the symbol refers to
Peg* {.shallow.} = object ## type that represents a PEG
case kind: PegKind
of pkEmpty..pkWhitespace: nil
of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle: term: string
of pkChar, pkGreedyRepChar: ch: char
of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char]
of pkNonTerminal: nt: NonTerminal
of pkBackRef..pkBackRefIgnoreStyle: index: range[-MaxSubpatterns..MaxSubpatterns-1]
else: sons: seq[Peg]
NonTerminal* = ref NonTerminalObj
func kind*(p: Peg): PegKind = p.kind
## Returns the *PegKind* of a given *Peg* object.
func term*(p: Peg): string = p.term
## Returns the *string* representation of a given *Peg* variant object
## where present.
func ch*(p: Peg): char = p.ch
## Returns the *char* representation of a given *Peg* variant object
## where present.
func charChoice*(p: Peg): ref set[char] = p.charChoice
## Returns the *charChoice* field of a given *Peg* variant object
## where present.
func nt*(p: Peg): NonTerminal = p.nt
## Returns the *NonTerminal* object of a given *Peg* variant object
## where present.
func index*(p: Peg): range[-MaxSubpatterns..MaxSubpatterns-1] = p.index
## Returns the back-reference index of a captured sub-pattern in the
## *Captures* object for a given *Peg* variant object where present.
iterator items*(p: Peg): Peg {.inline.} =
## Yields the child nodes of a *Peg* variant object where present.
for s in p.sons:
yield s
iterator pairs*(p: Peg): (int, Peg) {.inline.} =
## Yields the indices and child nodes of a *Peg* variant object where present.
for i in 0 ..< p.sons.len:
yield (i, p.sons[i])
func name*(nt: NonTerminal): string = nt.name
## Gets the name of the symbol represented by the parent *Peg* object variant
## of a given *NonTerminal*.
func line*(nt: NonTerminal): int = nt.line
## Gets the line number of the definition of the parent *Peg* object variant
## of a given *NonTerminal*.
func col*(nt: NonTerminal): int = nt.col
## Gets the column number of the definition of the parent *Peg* object variant
## of a given *NonTerminal*.
func flags*(nt: NonTerminal): set[NonTerminalFlag] = nt.flags
## Gets the *NonTerminalFlag*-typed flags field of the parent *Peg* variant
## object of a given *NonTerminal*.
func rule*(nt: NonTerminal): Peg = nt.rule
## Gets the *Peg* object representing the rule definition of the parent *Peg*
## object variant of a given *NonTerminal*.
func term*(t: string): Peg {.rtl, extern: "npegs$1Str".} =
## constructs a PEG from a terminal string
if t.len != 1:
result = Peg(kind: pkTerminal, term: t)
else:
result = Peg(kind: pkChar, ch: t[0])
func termIgnoreCase*(t: string): Peg {.
rtl, extern: "npegs$1".} =
## constructs a PEG from a terminal string; ignore case for matching
result = Peg(kind: pkTerminalIgnoreCase, term: t)
func termIgnoreStyle*(t: string): Peg {.
rtl, extern: "npegs$1".} =
## constructs a PEG from a terminal string; ignore style for matching
result = Peg(kind: pkTerminalIgnoreStyle, term: t)
func term*(t: char): Peg {.rtl, extern: "npegs$1Char".} =
## constructs a PEG from a terminal char
assert t != '\0'
result = Peg(kind: pkChar, ch: t)
func charSet*(s: set[char]): Peg {.rtl, extern: "npegs$1".} =
## constructs a PEG from a character set `s`
assert '\0' notin s
result = Peg(kind: pkCharChoice)
{.cast(noSideEffect).}:
new(result.charChoice)
result.charChoice[] = s
func len(a: Peg): int {.inline.} = return a.sons.len
func add(d: var Peg, s: Peg) {.inline.} = add(d.sons, s)
func addChoice(dest: var Peg, elem: Peg) =
var L = dest.len-1
if L >= 0 and dest.sons[L].kind == pkCharChoice:
# caution! Do not introduce false aliasing here!
case elem.kind
of pkCharChoice:
dest.sons[L] = charSet(dest.sons[L].charChoice[] + elem.charChoice[])
of pkChar:
dest.sons[L] = charSet(dest.sons[L].charChoice[] + {elem.ch})
else: add(dest, elem)
else: add(dest, elem)
template multipleOp(k: PegKind, localOpt: untyped) =
result = Peg(kind: k, sons: @[])
for x in items(a):
if x.kind == k:
for y in items(x.sons):
localOpt(result, y)
else:
localOpt(result, x)
if result.len == 1:
result = result.sons[0]
func `/`*(a: varargs[Peg]): Peg {.
rtl, extern: "npegsOrderedChoice".} =
## constructs an ordered choice with the PEGs in `a`
multipleOp(pkOrderedChoice, addChoice)
func addSequence(dest: var Peg, elem: Peg) =
var L = dest.len-1
if L >= 0 and dest.sons[L].kind == pkTerminal:
# caution! Do not introduce false aliasing here!
case elem.kind
of pkTerminal:
dest.sons[L] = term(dest.sons[L].term & elem.term)
of pkChar:
dest.sons[L] = term(dest.sons[L].term & elem.ch)
else: add(dest, elem)
else: add(dest, elem)
func sequence*(a: varargs[Peg]): Peg {.
rtl, extern: "npegs$1".} =
## constructs a sequence with all the PEGs from `a`
multipleOp(pkSequence, addSequence)
func `?`*(a: Peg): Peg {.rtl, extern: "npegsOptional".} =
## constructs an optional for the PEG `a`
if a.kind in {pkOption, pkGreedyRep, pkGreedyAny, pkGreedyRepChar,
pkGreedyRepSet}:
# a* ? --> a*
# a? ? --> a?
result = a
else:
result = Peg(kind: pkOption, sons: @[a])
func `*`*(a: Peg): Peg {.rtl, extern: "npegsGreedyRep".} =
## constructs a "greedy repetition" for the PEG `a`
case a.kind
of pkGreedyRep, pkGreedyRepChar, pkGreedyRepSet, pkGreedyAny, pkOption:
assert false
# produces endless loop!
of pkChar:
result = Peg(kind: pkGreedyRepChar, ch: a.ch)
of pkCharChoice:
result = Peg(kind: pkGreedyRepSet, charChoice: a.charChoice)
of pkAny, pkAnyRune:
result = Peg(kind: pkGreedyAny)
else:
result = Peg(kind: pkGreedyRep, sons: @[a])
func `!*`*(a: Peg): Peg {.rtl, extern: "npegsSearch".} =
## constructs a "search" for the PEG `a`
result = Peg(kind: pkSearch, sons: @[a])
func `!*\`*(a: Peg): Peg {.rtl,
extern: "npgegsCapturedSearch".} =
## constructs a "captured search" for the PEG `a`
result = Peg(kind: pkCapturedSearch, sons: @[a])
func `+`*(a: Peg): Peg {.rtl, extern: "npegsGreedyPosRep".} =
## constructs a "greedy positive repetition" with the PEG `a`
return sequence(a, *a)
func `&`*(a: Peg): Peg {.rtl, extern: "npegsAndPredicate".} =
## constructs an "and predicate" with the PEG `a`
result = Peg(kind: pkAndPredicate, sons: @[a])
func `!`*(a: Peg): Peg {.rtl, extern: "npegsNotPredicate".} =
## constructs a "not predicate" with the PEG `a`
result = Peg(kind: pkNotPredicate, sons: @[a])
func any*: Peg {.inline.} =
## constructs the PEG `any character`:idx: (``.``)
result = Peg(kind: pkAny)
func anyRune*: Peg {.inline.} =
## constructs the PEG `any rune`:idx: (``_``)
result = Peg(kind: pkAnyRune)
func newLine*: Peg {.inline.} =
## constructs the PEG `newline`:idx: (``\n``)
result = Peg(kind: pkNewLine)
func unicodeLetter*: Peg {.inline.} =
## constructs the PEG ``\letter`` which matches any Unicode letter.
result = Peg(kind: pkLetter)
func unicodeLower*: Peg {.inline.} =
## constructs the PEG ``\lower`` which matches any Unicode lowercase letter.
result = Peg(kind: pkLower)
func unicodeUpper*: Peg {.inline.} =
## constructs the PEG ``\upper`` which matches any Unicode uppercase letter.
result = Peg(kind: pkUpper)
func unicodeTitle*: Peg {.inline.} =
## constructs the PEG ``\title`` which matches any Unicode title letter.
result = Peg(kind: pkTitle)
func unicodeWhitespace*: Peg {.inline.} =
## constructs the PEG ``\white`` which matches any Unicode
## whitespace character.
result = Peg(kind: pkWhitespace)
func startAnchor*: Peg {.inline.} =
## constructs the PEG ``^`` which matches the start of the input.
result = Peg(kind: pkStartAnchor)
func endAnchor*: Peg {.inline.} =
## constructs the PEG ``$`` which matches the end of the input.
result = !any()
func capture*(a: Peg = Peg(kind: pkEmpty)): Peg {.rtl, extern: "npegsCapture".} =
## constructs a capture with the PEG `a`
result = Peg(kind: pkCapture, sons: @[a])
func backref*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
rtl, extern: "npegs$1".} =
## constructs a back reference of the given `index`. `index` starts counting
## from 1. `reverse` specifies wether indexing starts from the end of the
## capture list.
result = Peg(kind: pkBackRef, index: (if reverse: -index else: index - 1))
func backrefIgnoreCase*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
rtl, extern: "npegs$1".} =
## constructs a back reference of the given `index`. `index` starts counting
## from 1. `reverse` specifies wether indexing starts from the end of the
## capture list. Ignores case for matching.
result = Peg(kind: pkBackRefIgnoreCase, index: (if reverse: -index else: index - 1))
func backrefIgnoreStyle*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {.
rtl, extern: "npegs$1".} =
## constructs a back reference of the given `index`. `index` starts counting
## from 1. `reverse` specifies wether indexing starts from the end of the
## capture list. Ignores style for matching.
result = Peg(kind: pkBackRefIgnoreStyle, index: (if reverse: -index else: index - 1))
func spaceCost(n: Peg): int =
case n.kind
of pkEmpty: discard
of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar,
pkGreedyRepChar, pkCharChoice, pkGreedyRepSet,
pkAny..pkWhitespace, pkGreedyAny, pkBackRef..pkBackRefIgnoreStyle:
result = 1
of pkNonTerminal:
# we cannot inline a rule with a non-terminal
result = InlineThreshold+1
else:
for i in 0..n.len-1:
inc(result, spaceCost(n.sons[i]))
if result >= InlineThreshold: break
func nonterminal*(n: NonTerminal): Peg {.
rtl, extern: "npegs$1".} =
## constructs a PEG that consists of the nonterminal symbol
assert n != nil
if ntDeclared in n.flags and spaceCost(n.rule) < InlineThreshold:
when false: echo "inlining symbol: ", n.name
result = n.rule # inlining of rule enables better optimizations
else:
result = Peg(kind: pkNonTerminal, nt: n)
func newNonTerminal*(name: string, line, column: int): NonTerminal {.
rtl, extern: "npegs$1".} =
## constructs a nonterminal symbol
result = NonTerminal(name: name, line: line, col: column)
template letters*: Peg =
## expands to ``charset({'A'..'Z', 'a'..'z'})``
charSet({'A'..'Z', 'a'..'z'})
template digits*: Peg =
## expands to ``charset({'0'..'9'})``
charSet({'0'..'9'})
template whitespace*: Peg =
## expands to ``charset({' ', '\9'..'\13'})``
charSet({' ', '\9'..'\13'})
template identChars*: Peg =
## expands to ``charset({'a'..'z', 'A'..'Z', '0'..'9', '_'})``
charSet({'a'..'z', 'A'..'Z', '0'..'9', '_'})
template identStartChars*: Peg =
## expands to ``charset({'A'..'Z', 'a'..'z', '_'})``
charSet({'a'..'z', 'A'..'Z', '_'})
template ident*: Peg =
## same as ``[a-zA-Z_][a-zA-z_0-9]*``; standard identifier
sequence(charSet({'a'..'z', 'A'..'Z', '_'}),
*charSet({'a'..'z', 'A'..'Z', '0'..'9', '_'}))
template natural*: Peg =
## same as ``\d+``
+digits
# ------------------------- debugging -----------------------------------------
func esc(c: char, reserved = {'\0'..'\255'}): string =
case c
of '\b': result = "\\b"
of '\t': result = "\\t"
of '\c': result = "\\c"
of '\L': result = "\\l"
of '\v': result = "\\v"
of '\f': result = "\\f"
of '\e': result = "\\e"
of '\a': result = "\\a"
of '\\': result = "\\\\"
of 'a'..'z', 'A'..'Z', '0'..'9', '_': result = $c
elif c < ' ' or c >= '\127': result = '\\' & $ord(c)
elif c in reserved: result = '\\' & c
else: result = $c
func singleQuoteEsc(c: char): string = return "'" & esc(c, {'\''}) & "'"
func singleQuoteEsc(str: string): string =
result = "'"
for c in items(str): add result, esc(c, {'\''})
add result, '\''
func charSetEscAux(cc: set[char]): string =
const reserved = {'^', '-', ']'}
result = ""
var c1 = 0
while c1 <= 0xff:
if chr(c1) in cc:
var c2 = c1
while c2 < 0xff and chr(succ(c2)) in cc: inc(c2)
if c1 == c2:
add result, esc(chr(c1), reserved)
elif c2 == succ(c1):
add result, esc(chr(c1), reserved) & esc(chr(c2), reserved)
else:
add result, esc(chr(c1), reserved) & '-' & esc(chr(c2), reserved)
c1 = c2
inc(c1)
func charSetEsc(cc: set[char]): string =
if card(cc) >= 128+64:
result = "[^" & charSetEscAux({'\1'..'\xFF'} - cc) & ']'
else:
result = '[' & charSetEscAux(cc) & ']'
func toStrAux(r: Peg, res: var string) =
case r.kind
of pkEmpty: add(res, "()")
of pkAny: add(res, '.')
of pkAnyRune: add(res, '_')
of pkLetter: add(res, "\\letter")
of pkLower: add(res, "\\lower")
of pkUpper: add(res, "\\upper")
of pkTitle: add(res, "\\title")
of pkWhitespace: add(res, "\\white")
of pkNewLine: add(res, "\\n")
of pkTerminal: add(res, singleQuoteEsc(r.term))
of pkTerminalIgnoreCase:
add(res, 'i')
add(res, singleQuoteEsc(r.term))
of pkTerminalIgnoreStyle:
add(res, 'y')
add(res, singleQuoteEsc(r.term))
of pkChar: add(res, singleQuoteEsc(r.ch))
of pkCharChoice: add(res, charSetEsc(r.charChoice[]))
of pkNonTerminal: add(res, r.nt.name)
of pkSequence:
add(res, '(')
toStrAux(r.sons[0], res)
for i in 1 .. high(r.sons):
add(res, ' ')
toStrAux(r.sons[i], res)
add(res, ')')
of pkOrderedChoice:
add(res, '(')
toStrAux(r.sons[0], res)
for i in 1 .. high(r.sons):
add(res, " / ")
toStrAux(r.sons[i], res)
add(res, ')')
of pkGreedyRep:
toStrAux(r.sons[0], res)
add(res, '*')
of pkGreedyRepChar:
add(res, singleQuoteEsc(r.ch))
add(res, '*')
of pkGreedyRepSet:
add(res, charSetEsc(r.charChoice[]))
add(res, '*')
of pkGreedyAny:
add(res, ".*")
of pkOption:
toStrAux(r.sons[0], res)
add(res, '?')
of pkAndPredicate:
add(res, '&')
toStrAux(r.sons[0], res)
of pkNotPredicate:
add(res, '!')
toStrAux(r.sons[0], res)
of pkSearch:
add(res, '@')
toStrAux(r.sons[0], res)
of pkCapturedSearch:
add(res, "{@}")
toStrAux(r.sons[0], res)
of pkCapture:
add(res, '{')
toStrAux(r.sons[0], res)
add(res, '}')
of pkBackRef:
add(res, '$')
add(res, $r.index)
of pkBackRefIgnoreCase:
add(res, "i$")
add(res, $r.index)
of pkBackRefIgnoreStyle:
add(res, "y$")
add(res, $r.index)
of pkRule:
toStrAux(r.sons[0], res)
add(res, " <- ")
toStrAux(r.sons[1], res)
of pkList:
for i in 0 .. high(r.sons):
toStrAux(r.sons[i], res)
add(res, "\n")
of pkStartAnchor:
add(res, '^')
func `$` *(r: Peg): string {.rtl, extern: "npegsToString".} =
## converts a PEG to its string representation
result = ""
toStrAux(r, result)
# --------------------- core engine -------------------------------------------
type
Captures* = object ## contains the captured substrings.
matches: array[0..MaxSubpatterns-1, tuple[first, last: int]]
ml: int
origStart: int
func bounds*(c: Captures,
i: range[0..MaxSubpatterns-1]): tuple[first, last: int] =
## returns the bounds ``[first..last]`` of the `i`'th capture.
result = c.matches[i]
when not useUnicode:
type
Rune = char
template fastRuneAt(s, i, ch) =
ch = s[i]
inc(i)
template runeLenAt(s, i): untyped = 1
func isAlpha(a: char): bool {.inline.} = return a in {'a'..'z', 'A'..'Z'}
func isUpper(a: char): bool {.inline.} = return a in {'A'..'Z'}
func isLower(a: char): bool {.inline.} = return a in {'a'..'z'}
func isTitle(a: char): bool {.inline.} = return false
func isWhiteSpace(a: char): bool {.inline.} = return a in {' ', '\9'..'\13'}
template matchOrParse(mopProc: untyped) =
# Used to make the main matcher proc *rawMatch* as well as event parser
# procs. For the former, *enter* and *leave* event handler code generators
# are provided which just return *discard*.
proc mopProc(s: string, p: Peg, start: int, c: var Captures): int {.gcsafe.} =
proc matchBackRef(s: string, p: Peg, start: int, c: var Captures): int =
# Parse handler code must run in an *of* clause of its own for each
# *PegKind*, so we encapsulate the identical clause body for
# *pkBackRef..pkBackRefIgnoreStyle* here.
var index = p.index
if index < 0: index.inc(c.ml)
if index < 0 or index >= c.ml: return -1
var (a, b) = c.matches[index]
var n: Peg
case p.kind
of pkBackRef:
n = Peg(kind: pkTerminal, term: s.substr(a, b))
of pkBackRefIgnoreStyle:
n = Peg(kind: pkTerminalIgnoreStyle, term: s.substr(a, b))
of pkBackRefIgnoreCase:
n = Peg(kind: pkTerminalIgnoreCase, term: s.substr(a, b))
else: assert(false, "impossible case")
mopProc(s, n, start, c)
case p.kind
of pkEmpty:
enter(pkEmpty, s, p, start)
result = 0 # match of length 0
leave(pkEmpty, s, p, start, result)
of pkAny:
enter(pkAny, s, p, start)
if start < s.len: result = 1
else: result = -1
leave(pkAny, s, p, start, result)
of pkAnyRune:
enter(pkAnyRune, s, p, start)
if start < s.len:
result = runeLenAt(s, start)
else:
result = -1
leave(pkAnyRune, s, p, start, result)
of pkLetter:
enter(pkLetter, s, p, start)
if start < s.len:
var a: Rune
result = start
fastRuneAt(s, result, a)
if isAlpha(a): dec(result, start)
else: result = -1
else:
result = -1
leave(pkLetter, s, p, start, result)
of pkLower:
enter(pkLower, s, p, start)
if start < s.len:
var a: Rune
result = start
fastRuneAt(s, result, a)
if isLower(a): dec(result, start)
else: result = -1
else:
result = -1
leave(pkLower, s, p, start, result)
of pkUpper:
enter(pkUpper, s, p, start)
if start < s.len:
var a: Rune
result = start
fastRuneAt(s, result, a)
if isUpper(a): dec(result, start)
else: result = -1
else:
result = -1
leave(pkUpper, s, p, start, result)
of pkTitle:
enter(pkTitle, s, p, start)
if start < s.len:
var a: Rune
result = start
fastRuneAt(s, result, a)
if isTitle(a): dec(result, start)
else: result = -1
else:
result = -1
leave(pkTitle, s, p, start, result)
of pkWhitespace:
enter(pkWhitespace, s, p, start)
if start < s.len:
var a: Rune
result = start
fastRuneAt(s, result, a)
if isWhiteSpace(a): dec(result, start)
else: result = -1
else:
result = -1
leave(pkWhitespace, s, p, start, result)
of pkGreedyAny:
enter(pkGreedyAny, s, p, start)
result = len(s) - start
leave(pkGreedyAny, s, p, start, result)
of pkNewLine:
enter(pkNewLine, s, p, start)
if start < s.len and s[start] == '\L': result = 1
elif start < s.len and s[start] == '\C':
if start+1 < s.len and s[start+1] == '\L': result = 2
else: result = 1
else: result = -1
leave(pkNewLine, s, p, start, result)
of pkTerminal:
enter(pkTerminal, s, p, start)
result = len(p.term)
for i in 0..result-1:
if start+i >= s.len or p.term[i] != s[start+i]:
result = -1
break
leave(pkTerminal, s, p, start, result)
of pkTerminalIgnoreCase:
enter(pkTerminalIgnoreCase, s, p, start)
var
i = 0
a, b: Rune
result = start
while i < len(p.term):
if result >= s.len:
result = -1
break
fastRuneAt(p.term, i, a)
fastRuneAt(s, result, b)
if toLower(a) != toLower(b):
result = -1
break
dec(result, start)
leave(pkTerminalIgnoreCase, s, p, start, result)
of pkTerminalIgnoreStyle:
enter(pkTerminalIgnoreStyle, s, p, start)
var
i = 0
a, b: Rune
result = start
while i < len(p.term):
while i < len(p.term):
fastRuneAt(p.term, i, a)
if a != Rune('_'): break
while result < s.len:
fastRuneAt(s, result, b)
if b != Rune('_'): break
if result >= s.len:
if i >= p.term.len: break
else:
result = -1
break
elif toLower(a) != toLower(b):
result = -1
break
dec(result, start)
leave(pkTerminalIgnoreStyle, s, p, start, result)
of pkChar:
enter(pkChar, s, p, start)
if start < s.len and p.ch == s[start]: result = 1
else: result = -1
leave(pkChar, s, p, start, result)
of pkCharChoice:
enter(pkCharChoice, s, p, start)
if start < s.len and contains(p.charChoice[], s[start]): result = 1
else: result = -1
leave(pkCharChoice, s, p, start, result)
of pkNonTerminal:
enter(pkNonTerminal, s, p, start)
var oldMl = c.ml
when false: echo "enter: ", p.nt.name
result = mopProc(s, p.nt.rule, start, c)
when false: echo "leave: ", p.nt.name
if result < 0: c.ml = oldMl
leave(pkNonTerminal, s, p, start, result)
of pkSequence:
enter(pkSequence, s, p, start)
var oldMl = c.ml
result = 0
for i in 0..high(p.sons):
var x = mopProc(s, p.sons[i], start+result, c)
if x < 0:
c.ml = oldMl
result = -1
break
else: inc(result, x)
leave(pkSequence, s, p, start, result)
of pkOrderedChoice:
enter(pkOrderedChoice, s, p, start)
var oldMl = c.ml
for i in 0..high(p.sons):
result = mopProc(s, p.sons[i], start, c)
if result >= 0: break
c.ml = oldMl
leave(pkOrderedChoice, s, p, start, result)
of pkSearch:
enter(pkSearch, s, p, start)
var oldMl = c.ml
result = 0
while start+result <= s.len:
var x = mopProc(s, p.sons[0], start+result, c)
if x >= 0:
inc(result, x)
leave(pkSearch, s, p, start, result)
return
inc(result)
result = -1
c.ml = oldMl
leave(pkSearch, s, p, start, result)
of pkCapturedSearch:
enter(pkCapturedSearch, s, p, start)
var idx = c.ml # reserve a slot for the subpattern
inc(c.ml)
result = 0
while start+result <= s.len:
var x = mopProc(s, p.sons[0], start+result, c)
if x >= 0:
if idx < MaxSubpatterns:
c.matches[idx] = (start, start+result-1)
#else: silently ignore the capture
inc(result, x)
leave(pkCapturedSearch, s, p, start, result)
return
inc(result)
result = -1
c.ml = idx
leave(pkCapturedSearch, s, p, start, result)
of pkGreedyRep:
enter(pkGreedyRep, s, p, start)
result = 0
while true:
var x = mopProc(s, p.sons[0], start+result, c)
# if x == 0, we have an endless loop; so the correct behaviour would be
# not to break. But endless loops can be easily introduced:
# ``(comment / \w*)*`` is such an example. Breaking for x == 0 does the
# expected thing in this case.
if x <= 0: break
inc(result, x)
leave(pkGreedyRep, s, p, start, result)
of pkGreedyRepChar:
enter(pkGreedyRepChar, s, p, start)
result = 0
var ch = p.ch
while start+result < s.len and ch == s[start+result]: inc(result)
leave(pkGreedyRepChar, s, p, start, result)
of pkGreedyRepSet:
enter(pkGreedyRepSet, s, p, start)
result = 0
while start+result < s.len and contains(p.charChoice[], s[start+result]):
inc(result)
leave(pkGreedyRepSet, s, p, start, result)
of pkOption:
enter(pkOption, s, p, start)
result = max(0, mopProc(s, p.sons[0], start, c))
leave(pkOption, s, p, start, result)
of pkAndPredicate:
enter(pkAndPredicate, s, p, start)
var oldMl = c.ml
result = mopProc(s, p.sons[0], start, c)
if result >= 0: result = 0 # do not consume anything
else: c.ml = oldMl
leave(pkAndPredicate, s, p, start, result)
of pkNotPredicate:
enter(pkNotPredicate, s, p, start)
var oldMl = c.ml
result = mopProc(s, p.sons[0], start, c)
if result < 0: result = 0
else:
c.ml = oldMl
result = -1
leave(pkNotPredicate, s, p, start, result)
of pkCapture:
enter(pkCapture, s, p, start)
if p.sons.len == 0 or p.sons[0].kind == pkEmpty:
# empty capture removes last match
dec(c.ml)
c.matches[c.ml] = (0, 0)
result = 0 # match of length 0
else:
var idx = c.ml # reserve a slot for the subpattern
result = mopProc(s, p.sons[0], start, c)
if result >= 0:
if idx < MaxSubpatterns:
if idx != c.ml:
for i in countdown(c.ml, idx):
c.matches[i+1] = c.matches[i]
c.matches[idx] = (start, start+result-1)
#else: silently ignore the capture
inc(c.ml)
leave(pkCapture, s, p, start, result)
of pkBackRef:
enter(pkBackRef, s, p, start)
result = matchBackRef(s, p, start, c)
leave(pkBackRef, s, p, start, result)
of pkBackRefIgnoreCase:
enter(pkBackRefIgnoreCase, s, p, start)
result = matchBackRef(s, p, start, c)
leave(pkBackRefIgnoreCase, s, p, start, result)
of pkBackRefIgnoreStyle:
enter(pkBackRefIgnoreStyle, s, p, start)
result = matchBackRef(s, p, start, c)
leave(pkBackRefIgnoreStyle, s, p, start, result)
of pkStartAnchor:
enter(pkStartAnchor, s, p, start)
if c.origStart == start: result = 0
else: result = -1
leave(pkStartAnchor, s, p, start, result)
of pkRule, pkList: assert false
func rawMatch*(s: string, p: Peg, start: int, c: var Captures): int
{.rtl, extern: "npegs$1".} =
## low-level matching proc that implements the PEG interpreter. Use this
## for maximum efficiency (every other PEG operation ends up calling this
## proc).
## Returns -1 if it does not match, else the length of the match
# Set the handler generators to produce do-nothing handlers.
template enter(pk, s, p, start) =
discard
template leave(pk, s, p, start, length) =
discard
matchOrParse(matchIt)
{.cast(noSideEffect).}:
# This cast is allowed because the `matchOrParse` template is used for
# both matching and parsing, but side effects are only possible when it's
# used by `eventParser`.
result = matchIt(s, p, start, c)
macro mkHandlerTplts(handlers: untyped): untyped =
# Transforms the handler spec in *handlers* into handler templates.
# The AST structure of *handlers[0]*:
#
# .. code-block::
# StmtList
# Call
# Ident "pkNonTerminal"
# StmtList
# Call
# Ident "enter"
# StmtList
# <handler code block>
# Call
# Ident "leave"
# StmtList
# <handler code block>
# Call
# Ident "pkChar"
# StmtList
# Call
# Ident "leave"
# StmtList
# <handler code block>
# ...
func mkEnter(hdName, body: NimNode): NimNode =
template helper(hdName, body) {.dirty.} =
template hdName(s, p, start) =
let s {.inject.} = s
let p {.inject.} = p
let start {.inject.} = start
body
result = getAst(helper(hdName, body))
template mkLeave(hdPostf, body) {.dirty.} =
# this has to be dirty to be able to capture *result* as *length* in
# *leaveXX* calls.
template `leave hdPostf`(s, p, start, length) =
body
result = newStmtList()
for topCall in handlers[0]:
if topCall.kind notin nnkCallKinds:
error("Call syntax expected.", topCall)
let pegKind = topCall[0]
if pegKind.kind notin {nnkIdent, nnkSym}:
error("PegKind expected.", pegKind)
if 2 == topCall.len:
for hdDef in topCall[1]:
if hdDef.kind notin nnkCallKinds:
error("Call syntax expected.", hdDef)
if hdDef[0].kind notin {nnkIdent, nnkSym}:
error("Handler identifier expected.", hdDef[0])
if 2 == hdDef.len:
let hdPostf = substr(pegKind.strVal, 2)
case hdDef[0].strVal
of "enter":
result.add mkEnter(newIdentNode("enter" & hdPostf), hdDef[1])
of "leave":
result.add getAst(mkLeave(ident(hdPostf), hdDef[1]))
else:
error(
"Unsupported handler identifier, expected 'enter' or 'leave'.",
hdDef[0]
)
template eventParser*(pegAst, handlers: untyped): (proc(s: string): int) =
## Generates an interpreting event parser *proc* according to the specified
## PEG AST and handler code blocks. The *proc* can be called with a string
## to be parsed and will execute the handler code blocks whenever their
## associated grammar element is matched. It returns -1 if the string does not
## match, else the length of the total match. The following example code
## evaluates an arithmetic expression defined by a simple PEG:
##
## .. code-block:: nim
## import std/[strutils, pegs]
##
## let
## pegAst = """
## Expr <- Sum
## Sum <- Product (('+' / '-')Product)*
## Product <- Value (('*' / '/')Value)*
## Value <- [0-9]+ / '(' Expr ')'
## """.peg
## txt = "(5+3)/2-7*22"
##
## var
## pStack: seq[string] = @[]
## valStack: seq[float] = @[]
## opStack = ""
## let
## parseArithExpr = pegAst.eventParser:
## pkNonTerminal:
## enter:
## pStack.add p.nt.name
## leave:
## pStack.setLen pStack.high
## if length > 0:
## let matchStr = s.substr(start, start+length-1)
## case p.nt.name
## of "Value":
## try:
## valStack.add matchStr.parseFloat
## echo valStack
## except ValueError:
## discard
## of "Sum", "Product":
## try:
## let val = matchStr.parseFloat
## except ValueError:
## if valStack.len > 1 and opStack.len > 0:
## valStack[^2] = case opStack[^1]
## of '+': valStack[^2] + valStack[^1]
## of '-': valStack[^2] - valStack[^1]
## of '*': valStack[^2] * valStack[^1]
## else: valStack[^2] / valStack[^1]
## valStack.setLen valStack.high
## echo valStack
## opStack.setLen opStack.high
## echo opStack
## pkChar:
## leave:
## if length == 1 and "Value" != pStack[^1]:
## let matchChar = s[start]
## opStack.add matchChar
## echo opStack
##
## let pLen = parseArithExpr(txt)
##
## The *handlers* parameter consists of code blocks for *PegKinds*,
## which define the grammar elements of interest. Each block can contain
## handler code to be executed when the parser enters and leaves text
## matching the grammar element. An *enter* handler can access the specific
## PEG AST node being matched as *p*, the entire parsed string as *s*
## and the position of the matched text segment in *s* as *start*. A *leave*
## handler can access *p*, *s*, *start* and also the length of the matched
## text segment as *length*. For an unsuccessful match, the *enter* and
## *leave* handlers will be executed, with *length* set to -1.
##
## Symbols declared in an *enter* handler can be made visible in the
## corresponding *leave* handler by annotating them with an *inject* pragma.
proc rawParse(s: string, p: Peg, start: int, c: var Captures): int
{.gensym.} =
# binding from *macros*
bind strVal
mkHandlerTplts:
handlers
macro enter(pegKind, s, pegNode, start: untyped): untyped =
# This is called by the matcher code in *matchOrParse* at the
# start of the code for a grammar element of kind *pegKind*.
# Expands to a call to the handler template if one was generated
# by *mkHandlerTplts*.
template mkDoEnter(hdPostf, s, pegNode, start) =
when declared(`enter hdPostf`):
`enter hdPostf`(s, pegNode, start):
else:
discard
let hdPostf = ident(substr(strVal(pegKind), 2))
getAst(mkDoEnter(hdPostf, s, pegNode, start))
macro leave(pegKind, s, pegNode, start, length: untyped): untyped =
# Like *enter*, but called at the end of the matcher code for
# a grammar element of kind *pegKind*.
template mkDoLeave(hdPostf, s, pegNode, start, length) =
when declared(`leave hdPostf`):
`leave hdPostf`(s, pegNode, start, length):
else:
discard
let hdPostf = ident(substr(strVal(pegKind), 2))
getAst(mkDoLeave(hdPostf, s, pegNode, start, length))
matchOrParse(parseIt)
parseIt(s, p, start, c)
proc parser(s: string): int {.gensym.} =
# the proc to be returned
var
ms: array[MaxSubpatterns, (int, int)]
cs = Captures(matches: ms, ml: 0, origStart: 0)
rawParse(s, pegAst, 0, cs)
parser
template fillMatches(s, caps, c) =
for k in 0..c.ml-1:
let startIdx = c.matches[k][0]
let endIdx = c.matches[k][1]
if startIdx != -1:
caps[k] = substr(s, startIdx, endIdx)
else:
caps[k] = ""
func matchLen*(s: string, pattern: Peg, matches: var openArray[string],
start = 0): int {.rtl, extern: "npegs$1Capture".} =
## the same as ``match``, but it returns the length of the match,
## if there is no match, -1 is returned. Note that a match length
## of zero can happen. It's possible that a suffix of `s` remains
## that does not belong to the match.
var c: Captures
c.origStart = start
result = rawMatch(s, pattern, start, c)
if result >= 0: fillMatches(s, matches, c)
func matchLen*(s: string, pattern: Peg,
start = 0): int {.rtl, extern: "npegs$1".} =
## the same as ``match``, but it returns the length of the match,
## if there is no match, -1 is returned. Note that a match length
## of zero can happen. It's possible that a suffix of `s` remains
## that does not belong to the match.
var c: Captures
c.origStart = start
result = rawMatch(s, pattern, start, c)
func match*(s: string, pattern: Peg, matches: var openArray[string],
start = 0): bool {.rtl, extern: "npegs$1Capture".} =
## returns ``true`` if ``s[start..]`` matches the ``pattern`` and
## the captured substrings in the array ``matches``. If it does not
## match, nothing is written into ``matches`` and ``false`` is
## returned.
result = matchLen(s, pattern, matches, start) != -1
func match*(s: string, pattern: Peg,
start = 0): bool {.rtl, extern: "npegs$1".} =
## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.
result = matchLen(s, pattern, start) != -1
func find*(s: string, pattern: Peg, matches: var openArray[string],
start = 0): int {.rtl, extern: "npegs$1Capture".} =
## returns the starting position of ``pattern`` in ``s`` and the captured
## substrings in the array ``matches``. If it does not match, nothing
## is written into ``matches`` and -1 is returned.
var c: Captures
c.origStart = start
for i in start .. s.len-1:
c.ml = 0
if rawMatch(s, pattern, i, c) >= 0:
fillMatches(s, matches, c)
return i
return -1
# could also use the pattern here: (!P .)* P
func findBounds*(s: string, pattern: Peg, matches: var openArray[string],
start = 0): tuple[first, last: int] {.
rtl, extern: "npegs$1Capture".} =
## returns the starting position and end position of ``pattern`` in ``s``
## and the captured
## substrings in the array ``matches``. If it does not match, nothing
## is written into ``matches`` and (-1,0) is returned.
var c: Captures
c.origStart = start
for i in start .. s.len-1:
c.ml = 0
var L = rawMatch(s, pattern, i, c)
if L >= 0:
fillMatches(s, matches, c)
return (i, i+L-1)
return (-1, 0)
func find*(s: string, pattern: Peg,
start = 0): int {.rtl, extern: "npegs$1".} =
## returns the starting position of ``pattern`` in ``s``. If it does not
## match, -1 is returned.
var c: Captures
c.origStart = start
for i in start .. s.len-1:
if rawMatch(s, pattern, i, c) >= 0: return i
return -1
iterator findAll*(s: string, pattern: Peg, start = 0): string =
## yields all matching *substrings* of `s` that match `pattern`.
var c: Captures
c.origStart = start
var i = start
while i < s.len:
c.ml = 0
var L = rawMatch(s, pattern, i, c)
if L < 0:
inc(i, 1)
else:
yield substr(s, i, i+L-1)
inc(i, L)
func findAll*(s: string, pattern: Peg, start = 0): seq[string] {.
rtl, extern: "npegs$1".} =
## returns all matching *substrings* of `s` that match `pattern`.
## If it does not match, `@[]` is returned.
result = @[]
for it in findAll(s, pattern, start): result.add it
template `=~`*(s: string, pattern: Peg): bool =
## This calls ``match`` with an implicit declared ``matches`` array that
## can be used in the scope of the ``=~`` call:
##
## .. code-block:: nim
##
## if line =~ peg"\s* {\w+} \s* '=' \s* {\w+}":
## # matches a key=value pair:
## echo("Key: ", matches[0])
## echo("Value: ", matches[1])
## elif line =~ peg"\s*{'#'.*}":
## # matches a comment
## # note that the implicit ``matches`` array is different from the
## # ``matches`` array of the first branch
## echo("comment: ", matches[0])
## else:
## echo("syntax error")
##
bind MaxSubpatterns
when not declaredInScope(matches):
var matches {.inject.}: array[0..MaxSubpatterns-1, string]
match(s, pattern, matches)
# ------------------------- more string handling ------------------------------
func contains*(s: string, pattern: Peg, start = 0): bool {.
rtl, extern: "npegs$1".} =
## same as ``find(s, pattern, start) >= 0``
return find(s, pattern, start) >= 0
func contains*(s: string, pattern: Peg, matches: var openArray[string],
start = 0): bool {.rtl, extern: "npegs$1Capture".} =
## same as ``find(s, pattern, matches, start) >= 0``
return find(s, pattern, matches, start) >= 0
func startsWith*(s: string, prefix: Peg, start = 0): bool {.
rtl, extern: "npegs$1".} =
## returns true if `s` starts with the pattern `prefix`
result = matchLen(s, prefix, start) >= 0
func endsWith*(s: string, suffix: Peg, start = 0): bool {.
rtl, extern: "npegs$1".} =
## returns true if `s` ends with the pattern `suffix`
var c: Captures
c.origStart = start
for i in start .. s.len-1:
if rawMatch(s, suffix, i, c) == s.len - i: return true
func replacef*(s: string, sub: Peg, by: string): string {.
rtl, extern: "npegs$1".} =
## Replaces `sub` in `s` by the string `by`. Captures can be accessed in `by`
## with the notation ``$i`` and ``$#`` (see strutils.`%`). Examples:
##
## .. code-block:: nim
## "var1=key; var2=key2".replacef(peg"{\ident}'='{\ident}", "$1<-$2$2")
##
## Results in:
##
## .. code-block:: nim
##
## "var1<-keykey; val2<-key2key2"
result = ""
var i = 0
var caps: array[0..MaxSubpatterns-1, string]
var c: Captures
while i < s.len:
c.ml = 0
var x = rawMatch(s, sub, i, c)
if x <= 0:
add(result, s[i])
inc(i)
else:
fillMatches(s, caps, c)
addf(result, by, caps)
inc(i, x)
add(result, substr(s, i))
func replace*(s: string, sub: Peg, by = ""): string {.
rtl, extern: "npegs$1".} =
## Replaces `sub` in `s` by the string `by`. Captures cannot be accessed
## in `by`.
result = ""
var i = 0
var c: Captures
while i < s.len:
var x = rawMatch(s, sub, i, c)
if x <= 0:
add(result, s[i])
inc(i)
else:
add(result, by)
inc(i, x)
add(result, substr(s, i))
func parallelReplace*(s: string, subs: varargs[
tuple[pattern: Peg, repl: string]]): string {.
rtl, extern: "npegs$1".} =
## Returns a modified copy of `s` with the substitutions in `subs`
## applied in parallel.
result = ""
var i = 0
var c: Captures
var caps: array[0..MaxSubpatterns-1, string]
while i < s.len:
block searchSubs:
for j in 0..high(subs):
c.ml = 0
var x = rawMatch(s, subs[j][0], i, c)
if x > 0:
fillMatches(s, caps, c)
addf(result, subs[j][1], caps)
inc(i, x)
break searchSubs
add(result, s[i])
inc(i)
# copy the rest:
add(result, substr(s, i))
when not defined(nimHasEffectsOf):
{.pragma: effectsOf.}
func replace*(s: string, sub: Peg, cb: proc(
match: int, cnt: int, caps: openArray[string]): string): string {.
rtl, extern: "npegs$1cb", effectsOf: cb.} =
## Replaces `sub` in `s` by the resulting strings from the callback.
## The callback proc receives the index of the current match (starting with 0),
## the count of captures and an open array with the captures of each match. Examples:
##
## .. code-block:: nim
##
## func handleMatches*(m: int, n: int, c: openArray[string]): string =
## result = ""
## if m > 0:
## result.add ", "
## result.add case n:
## of 2: c[0].toLower & ": '" & c[1] & "'"
## of 1: c[0].toLower & ": ''"
## else: ""
##
## let s = "Var1=key1;var2=Key2; VAR3"
## echo s.replace(peg"{\ident}('='{\ident})* ';'* \s*", handleMatches)
##
## Results in:
##
## .. code-block:: nim
##
## "var1: 'key1', var2: 'Key2', var3: ''"
result = ""
var i = 0
var caps: array[0..MaxSubpatterns-1, string]
var c: Captures
var m = 0
while i < s.len:
c.ml = 0
var x = rawMatch(s, sub, i, c)
if x <= 0:
add(result, s[i])
inc(i)
else:
fillMatches(s, caps, c)
add(result, cb(m, c.ml, caps))
inc(i, x)
inc(m)
add(result, substr(s, i))
when not defined(js):
proc transformFile*(infile, outfile: string,
subs: varargs[tuple[pattern: Peg, repl: string]]) {.
rtl, extern: "npegs$1".} =
## reads in the file `infile`, performs a parallel replacement (calls
## `parallelReplace`) and writes back to `outfile`. Raises ``IOError`` if an
## error occurs. This is supposed to be used for quick scripting.
##
## **Note**: this proc does not exist while using the JS backend.
var x = readFile(infile)
writeFile(outfile, x.parallelReplace(subs))
iterator split*(s: string, sep: Peg): string =
## Splits the string `s` into substrings.
##
## Substrings are separated by the PEG `sep`.
## Examples:
##
## .. code-block:: nim
## for word in split("00232this02939is39an22example111", peg"\d+"):
## writeLine(stdout, word)
##
## Results in:
##
## .. code-block:: nim
## "this"
## "is"
## "an"
## "example"
##
var c: Captures
var
first = 0
last = 0
while last < len(s):
c.ml = 0
var x = rawMatch(s, sep, last, c)
if x > 0: inc(last, x)
first = last
while last < len(s):
inc(last)
c.ml = 0
x = rawMatch(s, sep, last, c)
if x > 0: break
if first < last:
yield substr(s, first, last-1)
func split*(s: string, sep: Peg): seq[string] {.
rtl, extern: "npegs$1".} =
## Splits the string `s` into substrings.
result = @[]
for it in split(s, sep): result.add it
# ------------------- scanner -------------------------------------------------
type
Modifier = enum
modNone,
modVerbatim,
modIgnoreCase,
modIgnoreStyle
TokKind = enum ## enumeration of all tokens
tkInvalid, ## invalid token
tkEof, ## end of file reached
tkAny, ## .
tkAnyRune, ## _
tkIdentifier, ## abc
tkStringLit, ## "abc" or 'abc'
tkCharSet, ## [^A-Z]
tkParLe, ## '('
tkParRi, ## ')'
tkCurlyLe, ## '{'
tkCurlyRi, ## '}'
tkCurlyAt, ## '{@}'
tkEmptyCurl, ## '{}'
tkArrow, ## '<-'
tkBar, ## '/'
tkStar, ## '*'
tkPlus, ## '+'
tkAmp, ## '&'
tkNot, ## '!'
tkOption, ## '?'
tkAt, ## '@'
tkBuiltin, ## \identifier
tkEscaped, ## \\
tkBackref, ## '$'
tkDollar, ## '$'
tkHat ## '^'
Token {.final.} = object ## a token
kind: TokKind ## the type of the token
modifier: Modifier
literal: string ## the parsed (string) literal
charset: set[char] ## if kind == tkCharSet
index: int ## if kind == tkBackref
PegLexer {.inheritable.} = object ## the lexer object.
bufpos: int ## the current position within the buffer
buf: string ## the buffer itself
lineNumber: int ## the current line number
lineStart: int ## index of last line start in buffer
colOffset: int ## column to add
filename: string
const
tokKindToStr: array[TokKind, string] = [
"invalid", "[EOF]", ".", "_", "identifier", "string literal",
"character set", "(", ")", "{", "}", "{@}", "{}",
"<-", "/", "*", "+", "&", "!", "?",
"@", "built-in", "escaped", "$", "$", "^"
]
func handleCR(L: var PegLexer, pos: int): int =
assert(L.buf[pos] == '\c')
inc(L.lineNumber)
result = pos+1
if result < L.buf.len and L.buf[result] == '\L': inc(result)
L.lineStart = result
func handleLF(L: var PegLexer, pos: int): int =
assert(L.buf[pos] == '\L')
inc(L.lineNumber)
result = pos+1
L.lineStart = result
func init(L: var PegLexer, input, filename: string, line = 1, col = 0) =
L.buf = input
L.bufpos = 0
L.lineNumber = line
L.colOffset = col
L.lineStart = 0
L.filename = filename
func getColumn(L: PegLexer): int {.inline.} =
result = abs(L.bufpos - L.lineStart) + L.colOffset
func getLine(L: PegLexer): int {.inline.} =
result = L.lineNumber
func errorStr(L: PegLexer, msg: string, line = -1, col = -1): string =
var line = if line < 0: getLine(L) else: line
var col = if col < 0: getColumn(L) else: col
result = "$1($2, $3) Error: $4" % [L.filename, $line, $col, msg]
func getEscapedChar(c: var PegLexer, tok: var Token) =
inc(c.bufpos)
if c.bufpos >= len(c.buf):
tok.kind = tkInvalid
return
case c.buf[c.bufpos]
of 'r', 'R', 'c', 'C':
add(tok.literal, '\c')
inc(c.bufpos)
of 'l', 'L':
add(tok.literal, '\L')
inc(c.bufpos)
of 'f', 'F':
add(tok.literal, '\f')
inc(c.bufpos)
of 'e', 'E':
add(tok.literal, '\e')
inc(c.bufpos)
of 'a', 'A':
add(tok.literal, '\a')
inc(c.bufpos)
of 'b', 'B':
add(tok.literal, '\b')
inc(c.bufpos)
of 'v', 'V':
add(tok.literal, '\v')
inc(c.bufpos)
of 't', 'T':
add(tok.literal, '\t')
inc(c.bufpos)
of 'x', 'X':
inc(c.bufpos)
if c.bufpos >= len(c.buf):
tok.kind = tkInvalid
return
var xi = 0
if handleHexChar(c.buf[c.bufpos], xi):
inc(c.bufpos)
if handleHexChar(c.buf[c.bufpos], xi):
inc(c.bufpos)
if xi == 0: tok.kind = tkInvalid
else: add(tok.literal, chr(xi))
of '0'..'9':
var val = ord(c.buf[c.bufpos]) - ord('0')
inc(c.bufpos)
var i = 1
while (c.bufpos < len(c.buf)) and (i <= 3) and (c.buf[c.bufpos] in {'0'..'9'}):
val = val * 10 + ord(c.buf[c.bufpos]) - ord('0')
inc(c.bufpos)
inc(i)
if val > 0 and val <= 255: add(tok.literal, chr(val))
else: tok.kind = tkInvalid
of '\0'..'\31':
tok.kind = tkInvalid
elif c.buf[c.bufpos] in strutils.Letters:
tok.kind = tkInvalid
else:
add(tok.literal, c.buf[c.bufpos])
inc(c.bufpos)
func skip(c: var PegLexer) =
var pos = c.bufpos
while pos < c.buf.len:
case c.buf[pos]
of ' ', '\t':
inc(pos)
of '#':
while (pos < c.buf.len) and
not (c.buf[pos] in {'\c', '\L', '\0'}): inc(pos)
of '\c':
pos = handleCR(c, pos)
of '\L':
pos = handleLF(c, pos)
else:
break # EndOfFile also leaves the loop
c.bufpos = pos
func getString(c: var PegLexer, tok: var Token) =
tok.kind = tkStringLit
var pos = c.bufpos + 1
var quote = c.buf[pos-1]
while pos < c.buf.len:
case c.buf[pos]
of '\\':
c.bufpos = pos
getEscapedChar(c, tok)
pos = c.bufpos
of '\c', '\L', '\0':
tok.kind = tkInvalid
break
elif c.buf[pos] == quote:
inc(pos)
break
else:
add(tok.literal, c.buf[pos])
inc(pos)
c.bufpos = pos
func getDollar(c: var PegLexer, tok: var Token) =
var pos = c.bufpos + 1
var neg = false
if pos < c.buf.len and c.buf[pos] == '^':
neg = true
inc(pos)
if pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
tok.kind = tkBackref
tok.index = 0
while pos < c.buf.len and c.buf[pos] in {'0'..'9'}:
tok.index = tok.index * 10 + ord(c.buf[pos]) - ord('0')
inc(pos)
if neg:
tok.index = -tok.index
else:
if neg:
dec(pos)
tok.kind = tkDollar
c.bufpos = pos
func getCharSet(c: var PegLexer, tok: var Token) =
tok.kind = tkCharSet
tok.charset = {}
var pos = c.bufpos + 1
var caret = false
if pos < c.buf.len:
if c.buf[pos] == '^':
inc(pos)
caret = true
while pos < c.buf.len:
var ch: char
case c.buf[pos]
of ']':
if pos < c.buf.len: inc(pos)
break
of '\\':
c.bufpos = pos
getEscapedChar(c, tok)
pos = c.bufpos
ch = tok.literal[tok.literal.len-1]
of '\C', '\L', '\0':
tok.kind = tkInvalid
break
else:
ch = c.buf[pos]
inc(pos)
incl(tok.charset, ch)
if c.buf[pos] == '-':
if pos+1 < c.buf.len and c.buf[pos+1] == ']':
incl(tok.charset, '-')
inc(pos)
else:
if pos+1 < c.buf.len:
inc(pos)
else:
break
var ch2: char
case c.buf[pos]
of '\\':
c.bufpos = pos
getEscapedChar(c, tok)
pos = c.bufpos
ch2 = tok.literal[tok.literal.len-1]
of '\C', '\L', '\0':
tok.kind = tkInvalid
break
else:
if pos+1 < c.buf.len:
ch2 = c.buf[pos]
inc(pos)
else:
break
for i in ord(ch)+1 .. ord(ch2):
incl(tok.charset, chr(i))
c.bufpos = pos
if caret: tok.charset = {'\1'..'\xFF'} - tok.charset
func getSymbol(c: var PegLexer, tok: var Token) =
var pos = c.bufpos
while pos < c.buf.len:
add(tok.literal, c.buf[pos])
inc(pos)
if pos < c.buf.len and c.buf[pos] notin strutils.IdentChars: break
c.bufpos = pos
tok.kind = tkIdentifier
func getBuiltin(c: var PegLexer, tok: var Token) =
if c.bufpos+1 < c.buf.len and c.buf[c.bufpos+1] in strutils.Letters:
inc(c.bufpos)
getSymbol(c, tok)
tok.kind = tkBuiltin
else:
tok.kind = tkEscaped
getEscapedChar(c, tok) # may set tok.kind to tkInvalid
func getTok(c: var PegLexer, tok: var Token) =
tok.kind = tkInvalid
tok.modifier = modNone
setLen(tok.literal, 0)
skip(c)
if c.bufpos >= c.buf.len:
tok.kind = tkEof
tok.literal = "[EOF]"
add(tok.literal, '\0')
inc(c.bufpos)
return
case c.buf[c.bufpos]
of '{':
inc(c.bufpos)
if c.buf[c.bufpos] == '@' and c.bufpos+2 < c.buf.len and
c.buf[c.bufpos+1] == '}':
tok.kind = tkCurlyAt
inc(c.bufpos, 2)
add(tok.literal, "{@}")
elif c.buf[c.bufpos] == '}' and c.bufpos < c.buf.len:
tok.kind = tkEmptyCurl
inc(c.bufpos)
add(tok.literal, "{}")
else:
tok.kind = tkCurlyLe
add(tok.literal, '{')
of '}':
tok.kind = tkCurlyRi
inc(c.bufpos)
add(tok.literal, '}')
of '[':
getCharSet(c, tok)
of '(':
tok.kind = tkParLe
inc(c.bufpos)
add(tok.literal, '(')
of ')':
tok.kind = tkParRi
inc(c.bufpos)
add(tok.literal, ')')
of '.':
tok.kind = tkAny
inc(c.bufpos)
add(tok.literal, '.')
of '_':
tok.kind = tkAnyRune
inc(c.bufpos)
add(tok.literal, '_')
of '\\':
getBuiltin(c, tok)
of '\'', '"': getString(c, tok)
of '$': getDollar(c, tok)
of 'a'..'z', 'A'..'Z', '\128'..'\255':
getSymbol(c, tok)
if c.bufpos >= c.buf.len:
return
if c.buf[c.bufpos] in {'\'', '"'} or
c.buf[c.bufpos] == '$' and c.bufpos+1 < c.buf.len and
c.buf[c.bufpos+1] in {'^', '0'..'9'}:
case tok.literal
of "i": tok.modifier = modIgnoreCase
of "y": tok.modifier = modIgnoreStyle
of "v": tok.modifier = modVerbatim
else: discard
setLen(tok.literal, 0)
if c.buf[c.bufpos] == '$':
getDollar(c, tok)
else:
getString(c, tok)
if tok.modifier == modNone: tok.kind = tkInvalid
of '+':
tok.kind = tkPlus
inc(c.bufpos)
add(tok.literal, '+')
of '*':
tok.kind = tkStar
inc(c.bufpos)
add(tok.literal, '+')
of '<':
if c.bufpos+2 < c.buf.len and c.buf[c.bufpos+1] == '-':
inc(c.bufpos, 2)
tok.kind = tkArrow
add(tok.literal, "<-")
else:
add(tok.literal, '<')
of '/':
tok.kind = tkBar
inc(c.bufpos)
add(tok.literal, '/')
of '?':
tok.kind = tkOption
inc(c.bufpos)
add(tok.literal, '?')
of '!':
tok.kind = tkNot
inc(c.bufpos)
add(tok.literal, '!')
of '&':
tok.kind = tkAmp
inc(c.bufpos)
add(tok.literal, '!')
of '@':
tok.kind = tkAt
inc(c.bufpos)
add(tok.literal, '@')
if c.buf[c.bufpos] == '@':
tok.kind = tkCurlyAt
inc(c.bufpos)
add(tok.literal, '@')
of '^':
tok.kind = tkHat
inc(c.bufpos)
add(tok.literal, '^')
else:
if c.bufpos >= c.buf.len:
tok.kind = tkEof
tok.literal = "[EOF]"
add(tok.literal, c.buf[c.bufpos])
inc(c.bufpos)
func arrowIsNextTok(c: PegLexer): bool =
# the only look ahead we need
var pos = c.bufpos
while pos < c.buf.len and c.buf[pos] in {'\t', ' '}: inc(pos)
if pos+1 >= c.buf.len:
return
result = c.buf[pos] == '<' and c.buf[pos+1] == '-'
# ----------------------------- parser ----------------------------------------
type
EInvalidPeg* = object of ValueError ## raised if an invalid
## PEG has been detected
PegParser = object of PegLexer ## the PEG parser object
tok: Token
nonterms: seq[NonTerminal]
modifier: Modifier
captures: int
identIsVerbatim: bool
skip: Peg
func pegError(p: PegParser, msg: string, line = -1, col = -1) =
var e = (ref EInvalidPeg)(msg: errorStr(p, msg, line, col))
raise e
func getTok(p: var PegParser) =
getTok(p, p.tok)
if p.tok.kind == tkInvalid: pegError(p, "'" & p.tok.literal & "' is invalid token")
func eat(p: var PegParser, kind: TokKind) =
if p.tok.kind == kind: getTok(p)
else: pegError(p, tokKindToStr[kind] & " expected")
func parseExpr(p: var PegParser): Peg {.gcsafe.}
func getNonTerminal(p: var PegParser, name: string): NonTerminal =
for i in 0..high(p.nonterms):
result = p.nonterms[i]
if cmpIgnoreStyle(result.name, name) == 0: return
# forward reference:
result = newNonTerminal(name, getLine(p), getColumn(p))
add(p.nonterms, result)
func modifiedTerm(s: string, m: Modifier): Peg =
case m
of modNone, modVerbatim: result = term(s)
of modIgnoreCase: result = termIgnoreCase(s)
of modIgnoreStyle: result = termIgnoreStyle(s)
func modifiedBackref(s: int, m: Modifier): Peg =
var
reverse = s < 0
index = if reverse: -s else: s
case m
of modNone, modVerbatim: result = backref(index, reverse)
of modIgnoreCase: result = backrefIgnoreCase(index, reverse)
of modIgnoreStyle: result = backrefIgnoreStyle(index, reverse)
func builtin(p: var PegParser): Peg =
# do not use "y", "skip" or "i" as these would be ambiguous
case p.tok.literal
of "n": result = newLine()
of "d": result = charSet({'0'..'9'})
of "D": result = charSet({'\1'..'\xff'} - {'0'..'9'})
of "s": result = charSet({' ', '\9'..'\13'})
of "S": result = charSet({'\1'..'\xff'} - {' ', '\9'..'\13'})
of "w": result = charSet({'a'..'z', 'A'..'Z', '_', '0'..'9'})
of "W": result = charSet({'\1'..'\xff'} - {'a'..'z', 'A'..'Z', '_', '0'..'9'})
of "a": result = charSet({'a'..'z', 'A'..'Z'})
of "A": result = charSet({'\1'..'\xff'} - {'a'..'z', 'A'..'Z'})
of "ident": result = pegs.ident
of "letter": result = unicodeLetter()
of "upper": result = unicodeUpper()
of "lower": result = unicodeLower()
of "title": result = unicodeTitle()
of "white": result = unicodeWhitespace()
else: pegError(p, "unknown built-in: " & p.tok.literal)
func token(terminal: Peg, p: PegParser): Peg =
if p.skip.kind == pkEmpty: result = terminal
else: result = sequence(p.skip, terminal)
func primary(p: var PegParser): Peg =
case p.tok.kind
of tkAmp:
getTok(p)
return &primary(p)
of tkNot:
getTok(p)
return !primary(p)
of tkAt:
getTok(p)
return !*primary(p)
of tkCurlyAt:
getTok(p)
return !*\primary(p).token(p)
else: discard
case p.tok.kind
of tkIdentifier:
if p.identIsVerbatim:
var m = p.tok.modifier
if m == modNone: m = p.modifier
result = modifiedTerm(p.tok.literal, m).token(p)
getTok(p)
elif not arrowIsNextTok(p):
var nt = getNonTerminal(p, p.tok.literal)
{.cast(noSideEffect).}:
incl(nt.flags, ntUsed)
result = nonterminal(nt).token(p)
getTok(p)
else:
pegError(p, "expression expected, but found: " & p.tok.literal)
of tkStringLit:
var m = p.tok.modifier
if m == modNone: m = p.modifier
result = modifiedTerm(p.tok.literal, m).token(p)
getTok(p)
of tkCharSet:
if '\0' in p.tok.charset:
pegError(p, "binary zero ('\\0') not allowed in character class")
result = charSet(p.tok.charset).token(p)
getTok(p)
of tkParLe:
getTok(p)
result = parseExpr(p)
eat(p, tkParRi)
of tkCurlyLe:
getTok(p)
result = capture(parseExpr(p)).token(p)
eat(p, tkCurlyRi)
inc(p.captures)
of tkEmptyCurl:
result = capture()
getTok(p)
of tkAny:
result = any().token(p)
getTok(p)
of tkAnyRune:
result = anyRune().token(p)
getTok(p)
of tkBuiltin:
result = builtin(p).token(p)
getTok(p)
of tkEscaped:
result = term(p.tok.literal[0]).token(p)
getTok(p)
of tkDollar:
result = endAnchor()
getTok(p)
of tkHat:
result = startAnchor()
getTok(p)
of tkBackref:
if abs(p.tok.index) > p.captures or p.tok.index == 0:
pegError(p, "invalid back reference index: " & $p.tok.index)
var m = p.tok.modifier
if m == modNone: m = p.modifier
result = modifiedBackref(p.tok.index, m).token(p)
getTok(p)
else:
pegError(p, "expression expected, but found: " & p.tok.literal)
getTok(p) # we must consume a token here to prevent endless loops!
while true:
case p.tok.kind
of tkOption:
result = ?result
getTok(p)
of tkStar:
result = *result
getTok(p)
of tkPlus:
result = +result
getTok(p)
else: break
func seqExpr(p: var PegParser): Peg =
result = primary(p)
while true:
case p.tok.kind
of tkAmp, tkNot, tkAt, tkStringLit, tkCharSet, tkParLe, tkCurlyLe,
tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref,
tkHat, tkCurlyAt, tkEmptyCurl:
result = sequence(result, primary(p))
of tkIdentifier:
if not arrowIsNextTok(p):
result = sequence(result, primary(p))
else: break
else: break
func parseExpr(p: var PegParser): Peg =
result = seqExpr(p)
while p.tok.kind == tkBar:
getTok(p)
result = result / seqExpr(p)
func parseRule(p: var PegParser): NonTerminal =
if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
result = getNonTerminal(p, p.tok.literal)
if ntDeclared in result.flags:
pegError(p, "attempt to redefine: " & result.name)
{.cast(noSideEffect).}:
result.line = getLine(p)
result.col = getColumn(p)
getTok(p)
eat(p, tkArrow)
{.cast(noSideEffect).}:
result.rule = parseExpr(p)
incl(result.flags, ntDeclared) # NOW inlining may be attempted
else:
pegError(p, "rule expected, but found: " & p.tok.literal)
func rawParse(p: var PegParser): Peg =
## parses a rule or a PEG expression
while p.tok.kind == tkBuiltin:
case p.tok.literal
of "i":
p.modifier = modIgnoreCase
getTok(p)
of "y":
p.modifier = modIgnoreStyle
getTok(p)
of "skip":
getTok(p)
p.skip = ?primary(p)
else: break
if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
result = parseRule(p).rule
while p.tok.kind != tkEof:
discard parseRule(p)
else:
p.identIsVerbatim = true
result = parseExpr(p)
if p.tok.kind != tkEof:
pegError(p, "EOF expected, but found: " & p.tok.literal)
for i in 0..high(p.nonterms):
var nt = p.nonterms[i]
if ntDeclared notin nt.flags:
pegError(p, "undeclared identifier: " & nt.name, nt.line, nt.col)
elif ntUsed notin nt.flags and i > 0:
pegError(p, "unused rule: " & nt.name, nt.line, nt.col)
func parsePeg*(pattern: string, filename = "pattern", line = 1, col = 0): Peg =
## constructs a Peg object from `pattern`. `filename`, `line`, `col` are
## used for error messages, but they only provide start offsets. `parsePeg`
## keeps track of line and column numbers within `pattern`.
var p: PegParser
init(PegLexer(p), pattern, filename, line, col)
p.tok.kind = tkInvalid
p.tok.modifier = modNone
p.tok.literal = ""
p.tok.charset = {}
p.nonterms = @[]
p.identIsVerbatim = false
getTok(p)
result = rawParse(p)
func peg*(pattern: string): Peg =
## constructs a Peg object from the `pattern`. The short name has been
## chosen to encourage its use as a raw string modifier:
##
## peg"{\ident} \s* '=' \s* {.*}"
result = parsePeg(pattern, "pattern")
func escapePeg*(s: string): string =
## escapes `s` so that it is matched verbatim when used as a peg.
result = ""
var inQuote = false
for c in items(s):
case c
of '\0'..'\31', '\'', '"', '\\':
if inQuote:
result.add('\'')
inQuote = false
result.add("\\x")
result.add(toHex(ord(c), 2))
else:
if not inQuote:
result.add('\'')
inQuote = true
result.add(c)
if inQuote: result.add('\'')