bugfixes: re; pegs module

2026-04-19 22:10:33 +00:00 · 2010-02-28 18:14:27 +01:00
parent 230567588b
commit ddb7185482
13 changed files with 97 additions and 72 deletions
--- a/config/nimrod.cfg
+++ b/config/nimrod.cfg
@@ -1,5 +1,5 @@
 # Configuration file for the Nimrod Compiler.
-# (c) 2009 Andreas Rumpf
+# (c) 2010 Andreas Rumpf

 # Feel free to edit the default values as you need.

--- a/doc/lib.txt
+++ b/doc/lib.txt
@@ -55,7 +55,7 @@ String handling
 * `unicode <unicode.html>`_ 
  This module provides support to handle the Unicode UTF-8 encoding.

-* `regexprs <regexprs.html>`_
+* `re <re.html>`_
  This module contains procedures and operators for handling regular
  expressions. Consider using `pegs` instead.

@@ -229,6 +229,10 @@ Database support
 * `db_mysql <db_mysql.html>`_
  A higher level mySQL database wrapper. The same interface is implemented
  for other databases too.
+ 
+* `db_sqlite <db_sqlite.html>`_
+  A higher level SQLite database wrapper. The same interface is implemented
+  for other databases too.



--- a/doc/pegdocs.txt
+++ b/doc/pegdocs.txt
@@ -86,8 +86,8 @@ macro              meaning
 ``\s``             any whitespace character: ``[ \9-\13]``
 ``\S``             any character that is not a whitespace character:
                   ``[^ \9-\13]``
-``\w``             any "word" character: ``[a-zA-Z_]``
-``\W``             any "non-word" character: ``[^a-zA-Z_]``
+``\w``             any "word" character: ``[a-zA-Z0-9_]``
+``\W``             any "non-word" character: ``[^a-zA-Z0-9_]``
 ``\n``             any newline combination: ``\10 / \13\10 / \13``
 ``\i``             ignore case for matching; use this at the start of the PEG
 ``\y``             ignore style for matching; use this at the start of the PEG
--- a/lib/pure/pegs.nim
+++ b/lib/pure/pegs.nim
@@ -604,7 +604,7 @@ proc match*(s: string, pattern: TPeg, matches: var openarray[string],
  ## match, nothing is written into ``matches`` and ``false`` is
  ## returned.
  var c: TMatchClosure
-  result = m(s, pattern, start, c) == len(s)
+  result = m(s, pattern, start, c) == len(s) -start
  if result:
    for i in 0..c.ml-1:
      matches[i] = copy(s, c.matches[i][0], c.matches[i][1])
@@ -612,7 +612,7 @@ proc match*(s: string, pattern: TPeg, matches: var openarray[string],
 proc match*(s: string, pattern: TPeg, start = 0): bool =
  ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.
  var c: TMatchClosure
-  result = m(s, pattern, start, c) == len(s)
+  result = m(s, pattern, start, c) == len(s)-start

 proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string],
               start = 0): int =
@@ -1263,8 +1263,8 @@ proc primary(p: var TPegParser): TPeg =
    of "D": result = charset({'\1'..'\xff'} - {'0'..'9'})
    of "s": result = charset({' ', '\9'..'\13'})
    of "S": result = charset({'\1'..'\xff'} - {' ', '\9'..'\13'})
-    of "w": result = charset({'a'..'z', 'A'..'Z', '_'})
-    of "W": result = charset({'\1'..'\xff'} - {'a'..'z', 'A'..'Z', '_'})
+    of "w": result = charset({'a'..'z', 'A'..'Z', '_', '0'..'9'})
+    of "W": result = charset({'\1'..'\xff'} - {'a'..'z','A'..'Z','_','0'..'9'})
    of "ident": result = pegs.ident
    else: pegError(p, "unknown built-in: " & p.tok.literal)
    getTok(p)
@@ -1435,3 +1435,11 @@ when isMainModule:
    assert matches[0] == "a"
  else:
    assert false
+    
+  var matches: array[0..5, string]
+  if match("abcdefg", peg"'c' {'d'} 'ef' {'g'}", matches, 2): 
+    assert matches[0] == "d"
+    assert matches[1] == "g"
+  else:
+    assert false
+
--- a/lib/pure/re.nim
+++ b/lib/pure/re.nim
@@ -75,12 +75,14 @@ proc matchOrFind(s: string, pattern: TRegEx, matches: var openarray[string],
    var b = rawMatches[i * 2 + 1]
    if a >= 0'i32: matches[i-1] = copy(s, int(a), int(b)-1)
    else: matches[i-1] = ""
-  return res
+  return rawMatches[1] - rawMatches[0]

 proc matchOrFind(s: string, pattern: TRegEx, start, flags: cint): cint =
  var rawMatches: array [0..maxSubpatterns * 3 - 1, cint]
-  return pcreExec(pattern.h, nil, s, len(s), start, flags,
-                  cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
+  result = pcreExec(pattern.h, nil, s, len(s), start, flags,
+                    cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
+  if result >= 0'i32:
+    result = rawMatches[1] - rawMatches[0]

 proc match*(s: string, pattern: TRegEx, matches: var openarray[string],
           start = 0): bool =
@@ -88,11 +90,12 @@ proc match*(s: string, pattern: TRegEx, matches: var openarray[string],
  ## the captured substrings in the array ``matches``. If it does not
  ## match, nothing is written into ``matches`` and ``false`` is
  ## returned.
-  return matchOrFind(s, pattern, matches, start, PCRE_ANCHORED) >= 0'i32
+  return matchOrFind(s, pattern, matches, start, 
+                     PCRE_ANCHORED) == cint(s.len - start)

 proc match*(s: string, pattern: TRegEx, start = 0): bool =
  ## returns ``true`` if ``s[start..]`` matches the ``pattern``.
-  return matchOrFind(s, pattern, start, PCRE_ANCHORED) >= 0'i32
+  return matchOrFind(s, pattern, start, PCRE_ANCHORED) == cint(s.len - start)

 proc matchLen*(s: string, pattern: TRegEx, matches: var openarray[string],
              start = 0): int =
@@ -112,12 +115,23 @@ proc find*(s: string, pattern: TRegEx, matches: var openarray[string],
  ## returns the starting position of ``pattern`` in ``s`` and the captured
  ## substrings in the array ``matches``. If it does not match, nothing
  ## is written into ``matches`` and -1 is returned.
-  return matchOrFind(s, pattern, matches, start, 0'i32)
+  var
+    rawMatches: array[0..maxSubpatterns * 3 - 1, cint]
+    res = pcreExec(pattern.h, nil, s, len(s), start, 0'i32,
+      cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
+  if res < 0'i32: return res
+  for i in 1..int(res)-1:
+    var a = rawMatches[i * 2]
+    var b = rawMatches[i * 2 + 1]
+    if a >= 0'i32: matches[i-1] = copy(s, int(a), int(b)-1)
+    else: matches[i-1] = ""
+  return rawMatches[0]

 proc find*(s: string, pattern: TRegEx, start = 0): int =
  ## returns the starting position of ``pattern`` in ``s``. If it does not
  ## match, -1 is returned.
-  return matchOrFind(s, pattern, start, 0'i32)
+  var matches: array[0..maxSubpatterns-1, string]
+  result = find(s, pattern, matches, start)

 template `=~` *(s: string, pattern: TRegEx): expr = 
  ## This calls ``match`` with an implicit declared ``matches`` array that 
@@ -279,57 +293,36 @@ const ## common regular expressions
    ## describes an URL

 when isMainModule:
-  assert match("(a b c)", re"'(' @ ')'")
-  assert match("WHiLe", re(r"while", {reIgnoreCase}))
+  assert match("(a b c)", re"\( .* \)")
+  assert match("WHiLe", re("while", {reIgnoreCase}))
  
  assert "0158787".match(re"\d+")
  assert "ABC 0232".match(re"\w+\s+\d+")
-  assert "ABC".match(re"\d+ / \w+")
+  assert "ABC".match(re"\d+ | \w+")
+
+  assert matchLen("key", re(reIdentifier)) == 3
+
+  var pattern = re"[a-z0-9]+\s*=\s*[a-z0-9]+"
+  assert matchLen("key1=  cal9", pattern) == 11
+  
+  assert find("_____abc_______", re"abc") == 5
+  
+  var matches: array[0..5, string]
+  if match("abcdefg", re"c(d)ef(g)", matches, 2): 
+    assert matches[0] == "d"
+    assert matches[1] == "g"
+  else:
+    assert false
+  
+  if "abc" =~ re"(a)bcxyz|(\w+)":
+    assert matches[1] == "abc"
+  else:
+    assert false
+    
+  assert "var1=key; var2=key2".endsWith(re"\w+=\w+")
+  assert("var1=key; var2=key2".replace(re"(\w+)=(\w+)", "$1<-$2$2") ==
+         "var1<-keykey; var2<-key2key2")

  for word in split("00232this02939is39an22example111", re"\d+"):
    writeln(stdout, word)

-  assert matchLen("key", re(reIdentifier)) == 3
-
-  var pattern = re"[a-z0-9]+\s*=\s*[a-z0-9]+")
-  assert matchLen("key1=  cal9", pattern) == 11
-  
-  var c: TMatchClosure
-  var s = "a+b +  c +d+e+f"
-  assert m(s, expr.rule, 0, c) == len(s)
-  var a = ""
-  for i in 0..c.ml-1:
-    a.add(copy(s, c.matches[i][0], c.matches[i][1]))
-  assert a == "abcdef"
-  #echo expr.rule
-
-  #const filename = "lib/devel/peg/grammar.txt"
-  #var grammar = parsePeg(newFileStream(filename, fmRead), filename)
-  #echo "a <- [abc]*?".match(grammar)
-  assert find("_____abc_______", term("abc")) == 5
-  assert match("_______ana", peg"A <- 'ana' / . A")
-  assert match("abcs%%%", peg"A <- ..A / .A / '%'")
-
-  if "abc" =~ peg"{'a'}'bc' 'xyz' / {\ident}":
-    assert matches[0] == "abc"
-  else:
-    assert false
-  
-  var g2 = peg"""S <- A B / C D
-                 A <- 'a'+
-                 B <- 'b'+
-                 C <- 'c'+
-                 D <- 'd'+
-              """
-  assert($g2 == "((A B) / (C D))")
-  assert match("cccccdddddd", g2)
-  assert("var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2") ==
-         "var1<-keykey; var2<-key2key2")
-  assert "var1=key; var2=key2".endsWith(peg"{\ident}'='{\ident}")
-
-  if "aaaaaa" =~ peg"'aa' !. / ({'a'})+":
-    assert matches[0] == "a"
-  else:
-    assert false
-
-
--- a/lib/pure/xmldom.nim
+++ b/lib/pure/xmldom.nim
@@ -1044,10 +1044,20 @@ proc target*(PI: PProcessingInstruction): string =
    
 # --Other stuff--
 # Writer
+proc addEscaped(s: string): string = 
+  result = ""
+  for c in items(s):
+    case c
+    of '<': result.add("&lt;")
+    of '>': result.add("&gt;")
+    of '&': result.add("&amp;")
+    of '"': result.add("&quot;")
+    else: result.add(c)
+
 proc nodeToXml(n: PNode, indent: int = 0): string =
  result = repeatChar(indent, ' ') & "<" & n.nodeName
  for i in items(n.Attributes):
-    result.add(" " & i.name & "=\"" & i.value & "\"")
+    result.add(" " & i.name & "=\"" & addEscaped(i.value) & "\"")
  
  if n.childNodes.len() == 0:
    result.add("/>") # No idea why this doesn't need a \n :O
@@ -1060,7 +1070,7 @@ proc nodeToXml(n: PNode, indent: int = 0): string =
        result.add(nodeToXml(i, indent + 2))
      of TextNode:
        result.add(repeatChar(indent * 2, ' '))
-        result.add(i.nodeValue)
+        result.add(addEscaped(i.nodeValue))
      of CDataSectionNode:
        result.add(repeatChar(indent * 2, ' '))
        result.add("<![CDATA[" & i.nodeValue & "]]>")
@@ -1080,4 +1090,4 @@ proc nodeToXml(n: PNode, indent: int = 0): string =
 proc `$`*(doc: PDocument): string =
  ## Converts a PDocument object into a string representation of it's XML
  result = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
-  result.add(nodeToXml(doc.documentElement))
+  result.add(nodeToXml(doc.documentElement))
--- a/lib/pure/xmldomparser.nim
+++ b/lib/pure/xmldomparser.nim
@@ -165,4 +165,4 @@ when isMainModule:
      echo(i.nodeName, "=", i.namespaceURI)

    
-  echo($xml)
+  echo($xml)
--- a/lib/pure/xmltree.nim
+++ b/lib/pure/xmltree.nim
@@ -223,9 +223,8 @@ proc newXmlTree*(tag: string, children: openArray[PXmlNode],
  result.fAttr = attributes
  
 proc xmlConstructor(e: PNimrodNode): PNimrodNode {.compileTime.} =
-  ## use this procedure to define a new XML tag
-  expectLen(e, 1)
-  var a = e[0]
+  expectLen(e, 2)
+  var a = e[1]
  if a.kind == nnkCall:
    result = newCall("newXmlTree", toStrLit(a[0]))
    var attrs = newCall("newStringTable", [])
--- a/tests/accept/run/spec.csv
+++ b/tests/accept/run/spec.csv
@@ -59,3 +59,4 @@ tstrutil.nim;ha/home/a1xyz/usr/bin
 tvardecl.nim;44
 tvarnums.nim;Success!
 tvartup.nim;2 3
+txmltree.nim;true
--- a/tests/accept/run/txmltree.nim
+++ b/tests/accept/run/txmltree.nim
@@ -0,0 +1,7 @@
+
+import xmltree
+
+var x = <>a(href="nimrod.de", "www.nimrod-test.de")
+
+echo x == "<a href=\"nimrod.de\">www.nimrod-test.de"
+
--- a/web/index.txt
+++ b/web/index.txt
@@ -19,7 +19,7 @@ Welcome to Nimrod
  
  .. code-block:: nimrod
    # Filter key=value pairs
-    import regexprs
+    import re

    for x in lines("myfile.txt"):
      if x =~ r"(\w+)=(.*)":
@@ -70,7 +70,7 @@ Nimrod is expressive
  generics, etc.
 * User-defineable operators; code with new operators is often easier to read
  than code which overloads built-in operators. In the code snippet, the 
-  ``=~`` operator is defined in the ``regexprs`` module.
+  ``=~`` operator is defined in the ``re`` module.
 * Macros can modify the abstract syntax tree at compile time.


--- a/web/news.txt
+++ b/web/news.txt
@@ -24,6 +24,7 @@ Bugfixes
 - Fixed ``unicode.toUTF8``. 
 - The compiler now rejects ``'\n'``. 
 - ``times.getStartMilsecs()`` now works on Mac OS X.
+- Fixed a bug in ``pegs.match`` concerning start offsets.


 Additions
@@ -45,6 +46,7 @@ Additions
 - Added ``xmltree`` module.
 - Added ``xmlparser`` module.
 - Added ``htmlparser`` module.
+- Added ``re`` module.
 - Many wrappers now do not contain redundant name prefixes (like ``GTK_``,
  ``lua``). The new wrappers are available in ``lib/newwrap``. Change
  your configuration file to use these.
@@ -72,6 +74,7 @@ Changes affecting backwards compatibility
  named arguments only, because the parameter order will change the next
  version!
 - ``atomic`` and ``let`` are now keywords.
+- The ``\w`` character class for pegs now includes the digits ``'0'..'9'``.


 2009-12-21 Version 0.8.6 released
--- a/web/snippets/snippet1.nim
+++ b/web/snippets/snippet1.nim
@@ -1,4 +1,4 @@
 import strutils
 echo "Give a list of integers (separated by spaces): ", 
-     stdin.readLine.splitSeq.each(parseInt).max,
+     stdin.readLine.split.each(parseInt).max,
     " is the maximum!"