nimgrep: first working version

2026-04-22 07:15:22 +00:00 · 2011-02-02 00:05:55 +01:00
parent 9387913b73
commit 437eb2c37e
7 changed files with 174 additions and 46 deletions
--- a/lib/impure/re.nim
+++ b/lib/impure/re.nim
@@ -80,6 +80,24 @@ proc matchOrFind(s: string, pattern: TRegEx, matches: var openarray[string],
    if a >= 0'i32: matches[i-1] = copy(s, int(a), int(b)-1)
    else: matches[i-1] = ""
  return rawMatches[1] - rawMatches[0]
+  
+proc findBounds*(s: string, pattern: TRegEx, matches: var openarray[string],
+                 start = 0): tuple[first, last: int] =
+  ## returns the starting position and end position of ``pattern`` in ``s`` 
+  ## and the captured
+  ## substrings in the array ``matches``. If it does not match, nothing
+  ## is written into ``matches`` and (-1,0) is returned.
+  var
+    rawMatches: array[0..maxSubpatterns * 3 - 1, cint]
+    res = pcre.Exec(pattern.h, nil, s, len(s), start, 0'i32,
+      cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
+  if res < 0'i32: return (-1, 0)
+  for i in 1..int(res)-1:
+    var a = rawMatches[i * 2]
+    var b = rawMatches[i * 2 + 1]
+    if a >= 0'i32: matches[i-1] = copy(s, int(a), int(b)-1)
+    else: matches[i-1] = ""
+  return (rawMatches[0].int, rawMatches[1].int - 1)

 proc matchOrFind(s: string, pattern: TRegEx, start, flags: cint): cint =
  var rawMatches: array [0..maxSubpatterns * 3 - 1, cint]
--- a/lib/pure/pegs.nim
+++ b/lib/pure/pegs.nim
@@ -1,7 +1,7 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2010 Andreas Rumpf
+#        (c) Copyright 2011 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -65,7 +65,8 @@ type
    pkSearch,           ## @a     --> Internal DSL: @a
    pkCapturedSearch,   ## {@} a  --> Internal DSL: @@a
    pkRule,             ## a <- b
-    pkList              ## a, b
+    pkList,             ## a, b
+    pkStartAnchor       ## ^      --> Internal DSL: startAnchor()
  TNonTerminalFlag = enum
    ntDeclared, ntUsed
  TNonTerminal {.final.} = object ## represents a non terminal symbol
@@ -264,6 +265,14 @@ proc UnicodeWhitespace*: TPeg {.inline.} =
  ## whitespace character.
  result.kind = pkWhitespace

+proc startAnchor*: TPeg {.inline.} = 
+  ## constructs the PEG ``^`` which matches the start of the input.  
+  result.kind = pkStartAnchor
+
+proc endAnchor*: TPeg {.inline.} = 
+  ## constructs the PEG ``$`` which matches the end of the input.  
+  result = !any()
+
 proc capture*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsCapture".} =
  ## constructs a capture with the PEG `a`
  result.kind = pkCapture
@@ -484,6 +493,8 @@ proc toStrAux(r: TPeg, res: var string) =
    for i in 0 .. high(r.sons):
      toStrAux(r.sons[i], res)
      add(res, "\n")  
+  of pkStartAnchor:
+    add(res, '^')

 proc `$` *(r: TPeg): string {.nosideEffect, rtl, extern: "npegsToString".} =
  ## converts a PEG to its string representation
@@ -496,6 +507,7 @@ type
  TCaptures* {.final.} = object ## contains the captured substrings.
    matches: array[0..maxSubpatterns-1, tuple[first, last: int]]
    ml: int
+    origStart: int

 proc bounds*(c: TCaptures, 
             i: range[0..maxSubpatterns-1]): tuple[first, last: int] = 
@@ -721,6 +733,9 @@ proc rawMatch*(s: string, p: TPeg, start: int, c: var TCaptures): int {.
    n.kind = succ(pkTerminal, ord(p.kind)-ord(pkBackRef)) 
    n.term = s.copy(a, b)
    result = rawMatch(s, n, start, c)
+  of pkStartAnchor:
+    if c.origStart == start: result = 0
+    else: result = -1
  of pkRule, pkList: assert false

 proc match*(s: string, pattern: TPeg, matches: var openarray[string],
@@ -730,6 +745,7 @@ proc match*(s: string, pattern: TPeg, matches: var openarray[string],
  ## match, nothing is written into ``matches`` and ``false`` is
  ## returned.
  var c: TCaptures
+  c.origStart = start
  result = rawMatch(s, pattern, start, c) == len(s) -start
  if result:
    for i in 0..c.ml-1:
@@ -739,6 +755,7 @@ proc match*(s: string, pattern: TPeg,
            start = 0): bool {.nosideEffect, rtl, extern: "npegs$1".} =
  ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.
  var c: TCaptures
+  c.origStart = start
  result = rawMatch(s, pattern, start, c) == len(s)-start

 proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string],
@@ -748,6 +765,7 @@ proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string],
  ## of zero can happen. It's possible that a suffix of `s` remains
  ## that does not belong to the match.
  var c: TCaptures
+  c.origStart = start
  result = rawMatch(s, pattern, start, c)
  if result >= 0:
    for i in 0..c.ml-1:
@@ -760,6 +778,7 @@ proc matchLen*(s: string, pattern: TPeg,
  ## of zero can happen. It's possible that a suffix of `s` remains
  ## that does not belong to the match.
  var c: TCaptures
+  c.origStart = start
  result = rawMatch(s, pattern, start, c)

 proc find*(s: string, pattern: TPeg, matches: var openarray[string],
@@ -988,14 +1007,16 @@ type
    tkAt,               ## '@'
    tkBuiltin,          ## \identifier
    tkEscaped,          ## \\
-    tkDollar            ## '$'
+    tkBackref,          ## '$'
+    tkDollar,           ## '$'
+    tkHat               ## '^'
  
  TToken {.final.} = object  ## a token
    kind: TTokKind           ## the type of the token
    modifier: TModifier
    literal: string          ## the parsed (string) literal
    charset: set[char]       ## if kind == tkCharSet
-    index: int               ## if kind == tkDollar
+    index: int               ## if kind == tkBackref
  
  TPegLexer = object          ## the lexer object.
    bufpos: int               ## the current position within the buffer
@@ -1010,7 +1031,7 @@ const
    "invalid", "[EOF]", ".", "_", "identifier", "string literal",
    "character set", "(", ")", "{", "}", "{@}",
    "<-", "/", "*", "+", "&", "!", "?",
-    "@", "built-in", "escaped", "$"
+    "@", "built-in", "escaped", "$", "$", "^"
  ]

 proc HandleCR(L: var TPegLexer, pos: int): int =
@@ -1155,13 +1176,13 @@ proc getDollar(c: var TPegLexer, tok: var TToken) =
  var pos = c.bufPos + 1
  var buf = c.buf
  if buf[pos] in {'0'..'9'}:
-    tok.kind = tkDollar
+    tok.kind = tkBackref
    tok.index = 0
    while buf[pos] in {'0'..'9'}:
      tok.index = tok.index * 10 + ord(buf[pos]) - ord('0')
      inc(pos)
  else:
-    tok.kind = tkInvalid
+    tok.kind = tkDollar
  c.bufpos = pos
  
 proc getCharSet(c: var TPegLexer, tok: var TToken) = 
@@ -1280,7 +1301,8 @@ proc getTok(c: var TPegLexer, tok: var TToken) =
    tok.literal = "[EOF]"
  of 'a'..'z', 'A'..'Z', '\128'..'\255':
    getSymbol(c, tok)
-    if c.buf[c.bufpos] in {'\'', '"', '$'}:
+    if c.buf[c.bufpos] in {'\'', '"'} or 
+        c.buf[c.bufpos] == '$' and c.buf[c.bufpos+1] in {'0'..'9'}:
      case tok.literal
      of "i": tok.modifier = modIgnoreCase
      of "y": tok.modifier = modIgnoreStyle
@@ -1331,6 +1353,10 @@ proc getTok(c: var TPegLexer, tok: var TToken) =
      tok.kind = tkCurlyAt
      inc(c.bufpos)
      add(tok.literal, '@')
+  of '^':
+    tok.kind = tkHat
+    inc(c.bufpos)
+    add(tok.literal, '^')
  else:
    add(tok.literal, c.buf[c.bufpos])
    inc(c.bufpos)
@@ -1474,7 +1500,13 @@ proc primary(p: var TPegParser): TPeg =
  of tkEscaped:
    result = term(p.tok.literal[0]).token(p)
    getTok(p)
-  of tkDollar:
+  of tkDollar: 
+    result = endAnchor()
+    getTok(p)
+  of tkHat: 
+    result = startAnchor()
+    getTok(p)
+  of tkBackref:
    var m = p.tok.modifier
    if m == modNone: m = p.modifier
    result = modifiedBackRef(p.tok.index, m).token(p)
@@ -1502,7 +1534,8 @@ proc seqExpr(p: var TPegParser): TPeg =
  while true:
    case p.tok.kind
    of tkAmp, tkNot, tkAt, tkStringLit, tkCharset, tkParLe, tkCurlyLe,
-       tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkCurlyAt:
+       tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref, 
+       tkHat, tkCurlyAt:
      result = sequence(result, primary(p))
    of tkIdentifier:
      if not arrowIsNextTok(p):
@@ -1693,3 +1726,5 @@ when isMainModule:
    peg"\skip(\s*) {\ident}'='{\ident}", "$1<-$2$2") ==
         "var1<-keykey;var2<-key2key2")

+  assert match("prefix/start", peg"^start$", 7)
+
--- a/lib/pure/strutils.nim
+++ b/lib/pure/strutils.nim
@@ -1,7 +1,7 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2010 Andreas Rumpf
+#        (c) Copyright 2011 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -42,6 +42,8 @@ const
  IdentStartChars* = {'a'..'z', 'A'..'Z', '_'}
    ## the set of characters an identifier can start with

+  NewLines* = {'\13', '\10'}
+    ## the set of characters a newline terminator can start with

 proc toLower*(c: Char): Char {.noSideEffect, procvar,
  rtl, extern: "nsuToLowerChar".} =
--- a/lib/system.nim
+++ b/lib/system.nim
@@ -734,16 +734,16 @@ proc compileOption*(option: string): bool {.
  ## can be used to determine an on|off compile-time option. Example:
  ##
  ## .. code-block:: nimrod
-  ## when compileOption("floatchecks"): 
-  ##   echo "compiled with floating point NaN and Inf checks"
+  ##   when compileOption("floatchecks"): 
+  ##     echo "compiled with floating point NaN and Inf checks"
  
 proc compileOption*(option, arg: string): bool {.
  magic: "CompileOptionArg", noSideEffect.}
  ## can be used to determine an enum compile-time option. Example:
  ##
  ## .. code-block:: nimrod
-  ## when compileOption("opt", "size") and compileOption("gc", "boehm"): 
-  ##   echo "compiled with optimization for size and uses Boehm's GC"
+  ##   when compileOption("opt", "size") and compileOption("gc", "boehm"): 
+  ##     echo "compiled with optimization for size and uses Boehm's GC"
  
 include "system/inclrtl"