mirror of
https://github.com/nim-lang/Nim.git
synced 2026-01-04 04:02:41 +00:00
nimgrep: first working version
This commit is contained in:
@@ -27,7 +27,11 @@ notation meaning
|
||||
``{E}`` Capture: Apply expression `E` and store the substring
|
||||
that matched `E` into a *capture* that can be accessed
|
||||
after the matching process.
|
||||
``$i`` back reference to the ``i``th capture. ``i`` counts from 1.
|
||||
``$i`` Back reference to the ``i``th capture. ``i`` counts from 1.
|
||||
``$`` Anchor: Matches at the end of the input. No character
|
||||
is consumed. Same as ``!.``.
|
||||
``^`` Anchor: Matches at the start of the input. No character
|
||||
is consumed.
|
||||
``&E`` And predicate: Indicate success if expression `E` matches
|
||||
the text ahead; otherwise indicate failure. Do not consume
|
||||
any text.
|
||||
@@ -145,7 +149,7 @@ The PEG parser implements this grammar (written in PEG syntax)::
|
||||
rule <- identifier \s* "<-" expr ig
|
||||
identNoArrow <- identifier !(\s* "<-")
|
||||
prefixOpr <- ig '&' / ig '!' / ig '@' / ig '{@}' / ig '@@'
|
||||
literal <- ig identifier? '$' [0-9]+
|
||||
literal <- ig identifier? '$' [0-9]+ / '$' / '^' /
|
||||
ig identNoArrow /
|
||||
ig charset /
|
||||
ig stringlit /
|
||||
|
||||
@@ -80,6 +80,24 @@ proc matchOrFind(s: string, pattern: TRegEx, matches: var openarray[string],
|
||||
if a >= 0'i32: matches[i-1] = copy(s, int(a), int(b)-1)
|
||||
else: matches[i-1] = ""
|
||||
return rawMatches[1] - rawMatches[0]
|
||||
|
||||
proc findBounds*(s: string, pattern: TRegEx, matches: var openarray[string],
                 start = 0): tuple[first, last: int] =
  ## searches ``s`` for ``pattern``, beginning at index ``start``, and returns
  ## the starting position and the end position of the match. The captured
  ## substrings are written into ``matches``; a capture whose recorded start
  ## offset is negative is stored as ``""``. If the pattern does not match,
  ## nothing is written into ``matches`` and ``(-1, 0)`` is returned.
  var
    ovector: array[0..maxSubpatterns * 3 - 1, cint]
    groups = pcre.Exec(pattern.h, nil, s, len(s), start, 0'i32,
                       cast[ptr cint](addr(ovector)), maxSubpatterns * 3)
  if groups >= 0'i32:
    # slots 0 and 1 describe the whole match; capture ``k`` occupies the
    # slots ``2*k`` and ``2*k + 1``
    for k in 1 .. int(groups)-1:
      var lo = ovector[k * 2]
      var hi = ovector[k * 2 + 1]
      if lo < 0'i32:
        matches[k-1] = ""
      else:
        matches[k-1] = copy(s, int(lo), int(hi)-1)
    result = (ovector[0].int, ovector[1].int - 1)
  else:
    result = (-1, 0)
|
||||
|
||||
proc matchOrFind(s: string, pattern: TRegEx, start, flags: cint): cint =
|
||||
var rawMatches: array [0..maxSubpatterns * 3 - 1, cint]
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#
|
||||
#
|
||||
# Nimrod's Runtime Library
|
||||
# (c) Copyright 2010 Andreas Rumpf
|
||||
# (c) Copyright 2011 Andreas Rumpf
|
||||
#
|
||||
# See the file "copying.txt", included in this
|
||||
# distribution, for details about the copyright.
|
||||
@@ -65,7 +65,8 @@ type
|
||||
pkSearch, ## @a --> Internal DSL: @a
|
||||
pkCapturedSearch, ## {@} a --> Internal DSL: @@a
|
||||
pkRule, ## a <- b
|
||||
pkList ## a, b
|
||||
pkList, ## a, b
|
||||
pkStartAnchor ## ^ --> Internal DSL: startAnchor()
|
||||
TNonTerminalFlag = enum
|
||||
ntDeclared, ntUsed
|
||||
TNonTerminal {.final.} = object ## represents a non terminal symbol
|
||||
@@ -264,6 +265,14 @@ proc UnicodeWhitespace*: TPeg {.inline.} =
|
||||
## whitespace character.
|
||||
result.kind = pkWhitespace
|
||||
|
||||
proc startAnchor*: TPeg {.inline.} =
  ## constructs the PEG ``^``, the anchor that matches at the start of the
  ## input. The returned node is of kind ``pkStartAnchor``.
  result.kind = pkStartAnchor
|
||||
|
||||
proc endAnchor*: TPeg {.inline.} =
  ## constructs the PEG ``$``, the anchor that matches at the end of the
  ## input. It is built as ``!any()``, i.e. it is the same as ``!.``.
  result = !any()
|
||||
|
||||
proc capture*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsCapture".} =
|
||||
## constructs a capture with the PEG `a`
|
||||
result.kind = pkCapture
|
||||
@@ -484,6 +493,8 @@ proc toStrAux(r: TPeg, res: var string) =
|
||||
for i in 0 .. high(r.sons):
|
||||
toStrAux(r.sons[i], res)
|
||||
add(res, "\n")
|
||||
of pkStartAnchor:
|
||||
add(res, '^')
|
||||
|
||||
proc `$` *(r: TPeg): string {.nosideEffect, rtl, extern: "npegsToString".} =
|
||||
## converts a PEG to its string representation
|
||||
@@ -496,6 +507,7 @@ type
|
||||
TCaptures* {.final.} = object ## contains the captured substrings.
|
||||
matches: array[0..maxSubpatterns-1, tuple[first, last: int]]
|
||||
ml: int
|
||||
origStart: int
|
||||
|
||||
proc bounds*(c: TCaptures,
|
||||
i: range[0..maxSubpatterns-1]): tuple[first, last: int] =
|
||||
@@ -721,6 +733,9 @@ proc rawMatch*(s: string, p: TPeg, start: int, c: var TCaptures): int {.
|
||||
n.kind = succ(pkTerminal, ord(p.kind)-ord(pkBackRef))
|
||||
n.term = s.copy(a, b)
|
||||
result = rawMatch(s, n, start, c)
|
||||
of pkStartAnchor:
|
||||
if c.origStart == start: result = 0
|
||||
else: result = -1
|
||||
of pkRule, pkList: assert false
|
||||
|
||||
proc match*(s: string, pattern: TPeg, matches: var openarray[string],
|
||||
@@ -730,6 +745,7 @@ proc match*(s: string, pattern: TPeg, matches: var openarray[string],
|
||||
## match, nothing is written into ``matches`` and ``false`` is
|
||||
## returned.
|
||||
var c: TCaptures
|
||||
c.origStart = start
|
||||
result = rawMatch(s, pattern, start, c) == len(s) -start
|
||||
if result:
|
||||
for i in 0..c.ml-1:
|
||||
@@ -739,6 +755,7 @@ proc match*(s: string, pattern: TPeg,
|
||||
start = 0): bool {.nosideEffect, rtl, extern: "npegs$1".} =
|
||||
## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.
|
||||
var c: TCaptures
|
||||
c.origStart = start
|
||||
result = rawMatch(s, pattern, start, c) == len(s)-start
|
||||
|
||||
proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string],
|
||||
@@ -748,6 +765,7 @@ proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string],
|
||||
## of zero can happen. It's possible that a suffix of `s` remains
|
||||
## that does not belong to the match.
|
||||
var c: TCaptures
|
||||
c.origStart = start
|
||||
result = rawMatch(s, pattern, start, c)
|
||||
if result >= 0:
|
||||
for i in 0..c.ml-1:
|
||||
@@ -760,6 +778,7 @@ proc matchLen*(s: string, pattern: TPeg,
|
||||
## of zero can happen. It's possible that a suffix of `s` remains
|
||||
## that does not belong to the match.
|
||||
var c: TCaptures
|
||||
c.origStart = start
|
||||
result = rawMatch(s, pattern, start, c)
|
||||
|
||||
proc find*(s: string, pattern: TPeg, matches: var openarray[string],
|
||||
@@ -988,14 +1007,16 @@ type
|
||||
tkAt, ## '@'
|
||||
tkBuiltin, ## \identifier
|
||||
tkEscaped, ## \\
|
||||
tkDollar ## '$'
|
||||
tkBackref, ## '$'
|
||||
tkDollar, ## '$'
|
||||
tkHat ## '^'
|
||||
|
||||
TToken {.final.} = object ## a token
|
||||
kind: TTokKind ## the type of the token
|
||||
modifier: TModifier
|
||||
literal: string ## the parsed (string) literal
|
||||
charset: set[char] ## if kind == tkCharSet
|
||||
index: int ## if kind == tkDollar
|
||||
index: int ## if kind == tkBackref
|
||||
|
||||
TPegLexer = object ## the lexer object.
|
||||
bufpos: int ## the current position within the buffer
|
||||
@@ -1010,7 +1031,7 @@ const
|
||||
"invalid", "[EOF]", ".", "_", "identifier", "string literal",
|
||||
"character set", "(", ")", "{", "}", "{@}",
|
||||
"<-", "/", "*", "+", "&", "!", "?",
|
||||
"@", "built-in", "escaped", "$"
|
||||
"@", "built-in", "escaped", "$", "$", "^"
|
||||
]
|
||||
|
||||
proc HandleCR(L: var TPegLexer, pos: int): int =
|
||||
@@ -1155,13 +1176,13 @@ proc getDollar(c: var TPegLexer, tok: var TToken) =
|
||||
var pos = c.bufPos + 1
|
||||
var buf = c.buf
|
||||
if buf[pos] in {'0'..'9'}:
|
||||
tok.kind = tkDollar
|
||||
tok.kind = tkBackref
|
||||
tok.index = 0
|
||||
while buf[pos] in {'0'..'9'}:
|
||||
tok.index = tok.index * 10 + ord(buf[pos]) - ord('0')
|
||||
inc(pos)
|
||||
else:
|
||||
tok.kind = tkInvalid
|
||||
tok.kind = tkDollar
|
||||
c.bufpos = pos
|
||||
|
||||
proc getCharSet(c: var TPegLexer, tok: var TToken) =
|
||||
@@ -1280,7 +1301,8 @@ proc getTok(c: var TPegLexer, tok: var TToken) =
|
||||
tok.literal = "[EOF]"
|
||||
of 'a'..'z', 'A'..'Z', '\128'..'\255':
|
||||
getSymbol(c, tok)
|
||||
if c.buf[c.bufpos] in {'\'', '"', '$'}:
|
||||
if c.buf[c.bufpos] in {'\'', '"'} or
|
||||
c.buf[c.bufpos] == '$' and c.buf[c.bufpos+1] in {'0'..'9'}:
|
||||
case tok.literal
|
||||
of "i": tok.modifier = modIgnoreCase
|
||||
of "y": tok.modifier = modIgnoreStyle
|
||||
@@ -1331,6 +1353,10 @@ proc getTok(c: var TPegLexer, tok: var TToken) =
|
||||
tok.kind = tkCurlyAt
|
||||
inc(c.bufpos)
|
||||
add(tok.literal, '@')
|
||||
of '^':
|
||||
tok.kind = tkHat
|
||||
inc(c.bufpos)
|
||||
add(tok.literal, '^')
|
||||
else:
|
||||
add(tok.literal, c.buf[c.bufpos])
|
||||
inc(c.bufpos)
|
||||
@@ -1474,7 +1500,13 @@ proc primary(p: var TPegParser): TPeg =
|
||||
of tkEscaped:
|
||||
result = term(p.tok.literal[0]).token(p)
|
||||
getTok(p)
|
||||
of tkDollar:
|
||||
of tkDollar:
|
||||
result = endAnchor()
|
||||
getTok(p)
|
||||
of tkHat:
|
||||
result = startAnchor()
|
||||
getTok(p)
|
||||
of tkBackref:
|
||||
var m = p.tok.modifier
|
||||
if m == modNone: m = p.modifier
|
||||
result = modifiedBackRef(p.tok.index, m).token(p)
|
||||
@@ -1502,7 +1534,8 @@ proc seqExpr(p: var TPegParser): TPeg =
|
||||
while true:
|
||||
case p.tok.kind
|
||||
of tkAmp, tkNot, tkAt, tkStringLit, tkCharset, tkParLe, tkCurlyLe,
|
||||
tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkCurlyAt:
|
||||
tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref,
|
||||
tkHat, tkCurlyAt:
|
||||
result = sequence(result, primary(p))
|
||||
of tkIdentifier:
|
||||
if not arrowIsNextTok(p):
|
||||
@@ -1693,3 +1726,5 @@ when isMainModule:
|
||||
peg"\skip(\s*) {\ident}'='{\ident}", "$1<-$2$2") ==
|
||||
"var1<-keykey;var2<-key2key2")
|
||||
|
||||
assert match("prefix/start", peg"^start$", 7)
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#
|
||||
#
|
||||
# Nimrod's Runtime Library
|
||||
# (c) Copyright 2010 Andreas Rumpf
|
||||
# (c) Copyright 2011 Andreas Rumpf
|
||||
#
|
||||
# See the file "copying.txt", included in this
|
||||
# distribution, for details about the copyright.
|
||||
@@ -42,6 +42,8 @@ const
|
||||
IdentStartChars* = {'a'..'z', 'A'..'Z', '_'}
|
||||
## the set of characters an identifier can start with
|
||||
|
||||
NewLines* = {'\13', '\10'}
|
||||
## the set of characters a newline terminator can start with
|
||||
|
||||
proc toLower*(c: Char): Char {.noSideEffect, procvar,
|
||||
rtl, extern: "nsuToLowerChar".} =
|
||||
|
||||
@@ -734,16 +734,16 @@ proc compileOption*(option: string): bool {.
|
||||
## can be used to determine an on|off compile-time option. Example:
|
||||
##
|
||||
## .. code-block:: nimrod
|
||||
## when compileOption("floatchecks"):
|
||||
## echo "compiled with floating point NaN and Inf checks"
|
||||
## when compileOption("floatchecks"):
|
||||
## echo "compiled with floating point NaN and Inf checks"
|
||||
|
||||
proc compileOption*(option, arg: string): bool {.
|
||||
magic: "CompileOptionArg", noSideEffect.}
|
||||
## can be used to determine an enum compile-time option. Example:
|
||||
##
|
||||
## .. code-block:: nimrod
|
||||
## when compileOption("opt", "size") and compileOption("gc", "boehm"):
|
||||
## echo "compiled with optimization for size and uses Boehm's GC"
|
||||
## when compileOption("opt", "size") and compileOption("gc", "boehm"):
|
||||
## echo "compiled with optimization for size and uses Boehm's GC"
|
||||
|
||||
include "system/inclrtl"
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#
|
||||
#
|
||||
# Nimrod Grep Utility
|
||||
# (c) Copyright 2010 Andreas Rumpf
|
||||
# (c) Copyright 2011 Andreas Rumpf
|
||||
#
|
||||
# See the file "copying.txt", included in this
|
||||
# distribution, for details about the copyright.
|
||||
@@ -11,21 +11,28 @@ import
|
||||
os, strutils, parseopt, pegs, re, terminal
|
||||
|
||||
const
|
||||
Usage = """
|
||||
Usage: nimgrep [options] [pattern] [files/directory]
|
||||
Version = "0.7"
|
||||
Usage = "nimgrep - Nimrod Grep Utility Version " & version & """
|
||||
|
||||
(c) 2011 Andreas Rumpf
|
||||
Usage:
|
||||
nimgrep [options] [pattern] [files/directory]
|
||||
Options:
|
||||
--find, -f find the pattern (default)
|
||||
--replace, -r replace the pattern
|
||||
--peg pattern is a peg (default)
|
||||
--re pattern is a regular expression
|
||||
--re pattern is a regular expression; extended syntax for
|
||||
the regular expression is always turned on
|
||||
--recursive process directories recursively
|
||||
--confirm confirm each occurence/replacement; there is a chance
|
||||
to abort any time without touching the file(s)
|
||||
to abort any time without touching the file
|
||||
--stdin read pattern from stdin (to avoid the shell's confusing
|
||||
quoting rules)
|
||||
--word, -w the pattern should have word boundaries
|
||||
--ignore_case, -i be case insensitive
|
||||
--ignore_style, -y be style insensitive
|
||||
--help, -h shows this help
|
||||
--version, -v shows the version
|
||||
"""
|
||||
|
||||
type
|
||||
@@ -48,7 +55,7 @@ proc ask(msg: string): string =
|
||||
|
||||
proc Confirm: TConfirmEnum =
|
||||
while true:
|
||||
case normalize(ask("[a]bort; [y]es, a[l]l, [n]o, non[e]: "))
|
||||
case normalize(ask(" [a]bort; [y]es, a[l]l, [n]o, non[e]: "))
|
||||
of "a", "abort": return ceAbort
|
||||
of "y", "yes": return ceYes
|
||||
of "l", "all": return ceAll
|
||||
@@ -56,12 +63,7 @@ proc Confirm: TConfirmEnum =
|
||||
of "e", "none": return ceNone
|
||||
else: nil
|
||||
|
||||
proc highlight(a, b, c: string) =
|
||||
stdout.write(a)
|
||||
terminal.WriteStyled(b)
|
||||
stdout.writeln(c)
|
||||
|
||||
proc countLines(s: string, first = 0, last = s.high): int =
|
||||
proc countLines(s: string, first, last: int): int =
|
||||
var i = first
|
||||
while i <= last:
|
||||
if s[i] == '\13':
|
||||
@@ -71,6 +73,37 @@ proc countLines(s: string, first = 0, last = s.high): int =
|
||||
inc result
|
||||
inc i
|
||||
|
||||
proc beforePattern(s: string, first: int): int =
  ## returns the index of the first character of the line that contains
  ## position ``first``: the index just after the closest newline character
  ## in front of ``first``, or 0 if no newline character precedes it.
  # walk backwards until a newline character (or the start of ``s``) is hit
  var pos = first - 1
  while pos >= 0 and s[pos] notin NewLines:
    dec(pos)
  # ``pos`` now rests on the newline character (or is -1); step past it
  result = pos + 1
|
||||
|
||||
proc afterPattern(s: string, last: int): int =
  ## returns the index of the last character of the line that contains
  ## position ``last``: the index just in front of the closest newline
  ## character after ``last``, or ``s.len-1`` if no newline character follows.
  # walk forwards until a newline character (or the end of ``s``) is hit
  var pos = last + 1
  while pos < s.len and s[pos] notin NewLines:
    inc(pos)
  # ``pos`` now rests on the newline character (or is ``s.len``); step back
  result = pos - 1
|
||||
|
||||
proc highlight(s, match, repl: string, t: tuple[first, last: int],
               line: int, showRepl: bool) =
  ## writes the line(s) of ``s`` surrounding the match ``t`` to stdout,
  ## prefixed with the right-aligned line number ``line``; the text ``match``
  ## is written with the underscore and bright styles. If ``showRepl`` is
  ## true, a second row marked with ``->`` shows the same surrounding text
  ## with ``repl`` in place of the match.
  const alignment = 6
  # bounds of the text around the match that is echoed unstyled
  var lineStart = beforePattern(s, t.first)
  var lineEnd = afterPattern(s, t.last)

  # row 1: "<line>: <prefix><match><suffix>"
  stdout.write(align($line, alignment))
  stdout.write(": ")
  for k in lineStart .. t.first-1:
    stdout.write(s[k])
  terminal.WriteStyled(match, {styleUnderscore, styleBright})
  for k in t.last+1 .. lineEnd:
    stdout.write(s[k])
  stdout.write("\n")

  if not showRepl: return

  # row 2: padded so that the text lines up with row 1
  stdout.write(repeatChar(alignment-1))
  stdout.write("-> ")
  for k in lineStart .. t.first-1:
    stdout.write(s[k])
  terminal.WriteStyled(repl, {styleUnderscore, styleBright})
  for k in t.last+1 .. lineEnd:
    stdout.write(s[k])
  stdout.write("\n")
|
||||
|
||||
proc processFile(filename: string) =
|
||||
var buffer = system.readFile(filename)
|
||||
if isNil(buffer): quit("cannot open file: " & filename)
|
||||
@@ -92,53 +125,76 @@ proc processFile(filename: string) =
|
||||
|
||||
var line = 1
|
||||
var i = 0
|
||||
var matches: array[0..re.MaxSubpatterns-1. string]
|
||||
var matches: array[0..re.MaxSubpatterns-1, string]
|
||||
for j in 0..high(matches): matches[j] = ""
|
||||
var reallyReplace = true
|
||||
while i < buffer.len:
|
||||
var t: tuple[first, last: int]
|
||||
if optRegex in options:
|
||||
quit "to implement"
|
||||
else:
|
||||
if optRegex notin options:
|
||||
t = findBounds(buffer, pegp, matches, i)
|
||||
|
||||
else:
|
||||
t = findBounds(buffer, rep, matches, i)
|
||||
# The regex ``findBounds`` reports "no match" as ``(-1, 0)``; a match may
# legitimately begin at index 0, so only a negative start ends the search.
if t.first < 0: break
|
||||
inc(line, countLines(buffer, i, t.first-1))
|
||||
|
||||
var wholeMatch = buffer.copy(t.first, t.last)
|
||||
echo "line ", line, ": ", wholeMatch
|
||||
|
||||
if optReplace in options:
|
||||
var r = replace(wholeMatch, pegp, replacement)
|
||||
|
||||
if optReplace notin options:
|
||||
highlight(buffer, wholeMatch, "", t, line, showRepl=false)
|
||||
else:
|
||||
var r: string
|
||||
if optRegex notin options:
|
||||
r = replace(wholeMatch, pegp, replacement % matches)
|
||||
else:
|
||||
r = replace(wholeMatch, rep, replacement % matches)
|
||||
if optConfirm in options:
|
||||
highlight(buffer, wholeMatch, r, t, line, showRepl=true)
|
||||
case Confirm()
|
||||
of ceAbort:
|
||||
of ceYes:
|
||||
of ceAbort: quit(0)
|
||||
of ceYes: reallyReplace = true
|
||||
of ceAll:
|
||||
reallyReplace = true
|
||||
options.excl(optConfirm)
|
||||
of ceNo:
|
||||
reallyReplace = false
|
||||
of ceNone:
|
||||
reallyReplace = false
|
||||
options.excl(optConfirm)
|
||||
else:
|
||||
highlight(buffer, wholeMatch, r, t, line, showRepl=reallyReplace)
|
||||
if reallyReplace:
|
||||
|
||||
result.add(buffer.copy(i, t.first-1))
|
||||
result.add(r)
|
||||
else:
|
||||
result.add(buffer.copy(i, t.last))
|
||||
|
||||
inc(line, countLines(buffer, t.first, t.last))
|
||||
|
||||
i = t.last+1
|
||||
|
||||
if optReplace in options:
|
||||
result.add(copy(buffer, i))
|
||||
var f: TFile
|
||||
if open(f, filename, fmWrite):
|
||||
f.write(result)
|
||||
f.close()
|
||||
else:
|
||||
quit "cannot open file for overwriting: " & filename
|
||||
|
||||
|
||||
proc walker(dir: string) =
|
||||
var isDir = false
|
||||
for kind, path in walkDir(dir):
|
||||
isDir = true
|
||||
case kind
|
||||
of pcFile: processFile(path)
|
||||
of pcDirectory:
|
||||
of pcFile:
|
||||
processFile(path)
|
||||
of pcDir:
|
||||
if optRecursive in options:
|
||||
walker(path)
|
||||
else: nil
|
||||
if not isDir: processFile(dir)
|
||||
|
||||
proc writeHelp() =
  ## terminates the program via ``quit``, passing it the ``Usage`` text.
  quit(Usage)
|
||||
proc writeVersion() = quit("1.0")
|
||||
proc writeVersion() = quit(Version)
|
||||
|
||||
proc checkOptions(subset: TOptions, a, b: string) =
|
||||
if subset <= options:
|
||||
@@ -187,5 +243,17 @@ if pattern.len == 0:
|
||||
writeHelp()
|
||||
else:
|
||||
if filename.len == 0: filename = os.getCurrentDir()
|
||||
if optRegex notin options:
  # PEG pattern: apply the insensitivity options by prefixing the pattern
  if optIgnoreStyle in options:
    pattern = "\\y " & pattern
  elif optIgnoreCase in options:
    pattern = "\\i " & pattern
  if optWord in options:
    pattern = r"(&\letter? / ^ )(" & pattern & r") !\letter"
else:
  if optIgnoreStyle in options:
    quit "ignorestyle not supported for regular expressions"
  if optWord in options:
    # ``(?:`` opens a non-capturing group, so the user's own capture groups
    # keep their numbers. The previous ``(:?`` was a capturing group that
    # additionally matched an optional ``:`` in front of the pattern.
    pattern = r"\b (?:" & pattern & r") \b"
|
||||
walker(filename)
|
||||
|
||||
|
||||
@@ -42,6 +42,7 @@ Additions
|
||||
- Pegs support new built-ins: ``\letter``, ``\upper``, ``\lower``,
|
||||
``\title``, ``\white``.
|
||||
- Pegs support the new built-in ``\skip`` operation.
|
||||
- Pegs support the ``$`` and ``^`` anchors.
|
||||
- Source code filters are now documented.
|
||||
- Added ``emit`` pragma for direct code generator control.
|
||||
- Additional operations were added to the ``complex`` module.
|
||||
|
||||
Reference in New Issue
Block a user