mirror of
https://github.com/nim-lang/Nim.git
synced 2026-04-19 22:10:33 +00:00
bugfixes: re; pegs module
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
# Configuration file for the Nimrod Compiler.
|
||||
# (c) 2009 Andreas Rumpf
|
||||
# (c) 2010 Andreas Rumpf
|
||||
|
||||
# Feel free to edit the default values as you need.
|
||||
|
||||
|
||||
@@ -55,7 +55,7 @@ String handling
|
||||
* `unicode <unicode.html>`_
|
||||
This module provides support to handle the Unicode UTF-8 encoding.
|
||||
|
||||
* `regexprs <regexprs.html>`_
|
||||
* `re <re.html>`_
|
||||
This module contains procedures and operators for handling regular
|
||||
expressions. Consider using `pegs` instead.
|
||||
|
||||
@@ -229,6 +229,10 @@ Database support
|
||||
* `db_mysql <db_mysql.html>`_
|
||||
A higher level mySQL database wrapper. The same interface is implemented
|
||||
for other databases too.
|
||||
|
||||
* `db_sqlite <db_sqlite.html>`_
|
||||
A higher level SQLite database wrapper. The same interface is implemented
|
||||
for other databases too.
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -86,8 +86,8 @@ macro meaning
|
||||
``\s`` any whitespace character: ``[ \9-\13]``
|
||||
``\S`` any character that is not a whitespace character:
|
||||
``[^ \9-\13]``
|
||||
``\w`` any "word" character: ``[a-zA-Z_]``
|
||||
``\W`` any "non-word" character: ``[^a-zA-Z_]``
|
||||
``\w`` any "word" character: ``[a-zA-Z0-9_]``
|
||||
``\W`` any "non-word" character: ``[^a-zA-Z0-9_]``
|
||||
``\n`` any newline combination: ``\10 / \13\10 / \13``
|
||||
``\i`` ignore case for matching; use this at the start of the PEG
|
||||
``\y`` ignore style for matching; use this at the start of the PEG
|
||||
|
||||
@@ -604,7 +604,7 @@ proc match*(s: string, pattern: TPeg, matches: var openarray[string],
|
||||
## match, nothing is written into ``matches`` and ``false`` is
|
||||
## returned.
|
||||
var c: TMatchClosure
|
||||
result = m(s, pattern, start, c) == len(s)
|
||||
result = m(s, pattern, start, c) == len(s) -start
|
||||
if result:
|
||||
for i in 0..c.ml-1:
|
||||
matches[i] = copy(s, c.matches[i][0], c.matches[i][1])
|
||||
@@ -612,7 +612,7 @@ proc match*(s: string, pattern: TPeg, matches: var openarray[string],
|
||||
proc match*(s: string, pattern: TPeg, start = 0): bool =
|
||||
## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.
|
||||
var c: TMatchClosure
|
||||
result = m(s, pattern, start, c) == len(s)
|
||||
result = m(s, pattern, start, c) == len(s)-start
|
||||
|
||||
proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string],
|
||||
start = 0): int =
|
||||
@@ -1263,8 +1263,8 @@ proc primary(p: var TPegParser): TPeg =
|
||||
of "D": result = charset({'\1'..'\xff'} - {'0'..'9'})
|
||||
of "s": result = charset({' ', '\9'..'\13'})
|
||||
of "S": result = charset({'\1'..'\xff'} - {' ', '\9'..'\13'})
|
||||
of "w": result = charset({'a'..'z', 'A'..'Z', '_'})
|
||||
of "W": result = charset({'\1'..'\xff'} - {'a'..'z', 'A'..'Z', '_'})
|
||||
of "w": result = charset({'a'..'z', 'A'..'Z', '_', '0'..'9'})
|
||||
of "W": result = charset({'\1'..'\xff'} - {'a'..'z','A'..'Z','_','0'..'9'})
|
||||
of "ident": result = pegs.ident
|
||||
else: pegError(p, "unknown built-in: " & p.tok.literal)
|
||||
getTok(p)
|
||||
@@ -1435,3 +1435,11 @@ when isMainModule:
|
||||
assert matches[0] == "a"
|
||||
else:
|
||||
assert false
|
||||
|
||||
var matches: array[0..5, string]
|
||||
if match("abcdefg", peg"'c' {'d'} 'ef' {'g'}", matches, 2):
|
||||
assert matches[0] == "d"
|
||||
assert matches[1] == "g"
|
||||
else:
|
||||
assert false
|
||||
|
||||
|
||||
101
lib/pure/re.nim
101
lib/pure/re.nim
@@ -75,12 +75,14 @@ proc matchOrFind(s: string, pattern: TRegEx, matches: var openarray[string],
|
||||
var b = rawMatches[i * 2 + 1]
|
||||
if a >= 0'i32: matches[i-1] = copy(s, int(a), int(b)-1)
|
||||
else: matches[i-1] = ""
|
||||
return res
|
||||
return rawMatches[1] - rawMatches[0]
|
||||
|
||||
proc matchOrFind(s: string, pattern: TRegEx, start, flags: cint): cint =
|
||||
var rawMatches: array [0..maxSubpatterns * 3 - 1, cint]
|
||||
return pcreExec(pattern.h, nil, s, len(s), start, flags,
|
||||
cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
|
||||
result = pcreExec(pattern.h, nil, s, len(s), start, flags,
|
||||
cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
|
||||
if result >= 0'i32:
|
||||
result = rawMatches[1] - rawMatches[0]
|
||||
|
||||
proc match*(s: string, pattern: TRegEx, matches: var openarray[string],
|
||||
start = 0): bool =
|
||||
@@ -88,11 +90,12 @@ proc match*(s: string, pattern: TRegEx, matches: var openarray[string],
|
||||
## the captured substrings in the array ``matches``. If it does not
|
||||
## match, nothing is written into ``matches`` and ``false`` is
|
||||
## returned.
|
||||
return matchOrFind(s, pattern, matches, start, PCRE_ANCHORED) >= 0'i32
|
||||
return matchOrFind(s, pattern, matches, start,
|
||||
PCRE_ANCHORED) == cint(s.len - start)
|
||||
|
||||
proc match*(s: string, pattern: TRegEx, start = 0): bool =
|
||||
## returns ``true`` if ``s[start..]`` matches the ``pattern``.
|
||||
return matchOrFind(s, pattern, start, PCRE_ANCHORED) >= 0'i32
|
||||
return matchOrFind(s, pattern, start, PCRE_ANCHORED) == cint(s.len - start)
|
||||
|
||||
proc matchLen*(s: string, pattern: TRegEx, matches: var openarray[string],
|
||||
start = 0): int =
|
||||
@@ -112,12 +115,23 @@ proc find*(s: string, pattern: TRegEx, matches: var openarray[string],
|
||||
## returns the starting position of ``pattern`` in ``s`` and the captured
|
||||
## substrings in the array ``matches``. If it does not match, nothing
|
||||
## is written into ``matches`` and -1 is returned.
|
||||
return matchOrFind(s, pattern, matches, start, 0'i32)
|
||||
var
|
||||
rawMatches: array[0..maxSubpatterns * 3 - 1, cint]
|
||||
res = pcreExec(pattern.h, nil, s, len(s), start, 0'i32,
|
||||
cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3)
|
||||
if res < 0'i32: return res
|
||||
for i in 1..int(res)-1:
|
||||
var a = rawMatches[i * 2]
|
||||
var b = rawMatches[i * 2 + 1]
|
||||
if a >= 0'i32: matches[i-1] = copy(s, int(a), int(b)-1)
|
||||
else: matches[i-1] = ""
|
||||
return rawMatches[0]
|
||||
|
||||
proc find*(s: string, pattern: TRegEx, start = 0): int =
|
||||
## returns the starting position of ``pattern`` in ``s``. If it does not
|
||||
## match, -1 is returned.
|
||||
return matchOrFind(s, pattern, start, 0'i32)
|
||||
var matches: array[0..maxSubpatterns-1, string]
|
||||
result = find(s, pattern, matches, start)
|
||||
|
||||
template `=~` *(s: string, pattern: TRegEx): expr =
|
||||
## This calls ``match`` with an implicit declared ``matches`` array that
|
||||
@@ -279,57 +293,36 @@ const ## common regular expressions
|
||||
## describes an URL
|
||||
|
||||
when isMainModule:
|
||||
assert match("(a b c)", re"'(' @ ')'")
|
||||
assert match("WHiLe", re(r"while", {reIgnoreCase}))
|
||||
assert match("(a b c)", re"\( .* \)")
|
||||
assert match("WHiLe", re("while", {reIgnoreCase}))
|
||||
|
||||
assert "0158787".match(re"\d+")
|
||||
assert "ABC 0232".match(re"\w+\s+\d+")
|
||||
assert "ABC".match(re"\d+ / \w+")
|
||||
assert "ABC".match(re"\d+ | \w+")
|
||||
|
||||
assert matchLen("key", re(reIdentifier)) == 3
|
||||
|
||||
var pattern = re"[a-z0-9]+\s*=\s*[a-z0-9]+"
|
||||
assert matchLen("key1= cal9", pattern) == 11
|
||||
|
||||
assert find("_____abc_______", re"abc") == 5
|
||||
|
||||
var matches: array[0..5, string]
|
||||
if match("abcdefg", re"c(d)ef(g)", matches, 2):
|
||||
assert matches[0] == "d"
|
||||
assert matches[1] == "g"
|
||||
else:
|
||||
assert false
|
||||
|
||||
if "abc" =~ re"(a)bcxyz|(\w+)":
|
||||
assert matches[1] == "abc"
|
||||
else:
|
||||
assert false
|
||||
|
||||
assert "var1=key; var2=key2".endsWith(re"\w+=\w+")
|
||||
assert("var1=key; var2=key2".replace(re"(\w+)=(\w+)", "$1<-$2$2") ==
|
||||
"var1<-keykey; var2<-key2key2")
|
||||
|
||||
for word in split("00232this02939is39an22example111", re"\d+"):
|
||||
writeln(stdout, word)
|
||||
|
||||
assert matchLen("key", re(reIdentifier)) == 3
|
||||
|
||||
var pattern = re"[a-z0-9]+\s*=\s*[a-z0-9]+")
|
||||
assert matchLen("key1= cal9", pattern) == 11
|
||||
|
||||
var c: TMatchClosure
|
||||
var s = "a+b + c +d+e+f"
|
||||
assert m(s, expr.rule, 0, c) == len(s)
|
||||
var a = ""
|
||||
for i in 0..c.ml-1:
|
||||
a.add(copy(s, c.matches[i][0], c.matches[i][1]))
|
||||
assert a == "abcdef"
|
||||
#echo expr.rule
|
||||
|
||||
#const filename = "lib/devel/peg/grammar.txt"
|
||||
#var grammar = parsePeg(newFileStream(filename, fmRead), filename)
|
||||
#echo "a <- [abc]*?".match(grammar)
|
||||
assert find("_____abc_______", term("abc")) == 5
|
||||
assert match("_______ana", peg"A <- 'ana' / . A")
|
||||
assert match("abcs%%%", peg"A <- ..A / .A / '%'")
|
||||
|
||||
if "abc" =~ peg"{'a'}'bc' 'xyz' / {\ident}":
|
||||
assert matches[0] == "abc"
|
||||
else:
|
||||
assert false
|
||||
|
||||
var g2 = peg"""S <- A B / C D
|
||||
A <- 'a'+
|
||||
B <- 'b'+
|
||||
C <- 'c'+
|
||||
D <- 'd'+
|
||||
"""
|
||||
assert($g2 == "((A B) / (C D))")
|
||||
assert match("cccccdddddd", g2)
|
||||
assert("var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2") ==
|
||||
"var1<-keykey; var2<-key2key2")
|
||||
assert "var1=key; var2=key2".endsWith(peg"{\ident}'='{\ident}")
|
||||
|
||||
if "aaaaaa" =~ peg"'aa' !. / ({'a'})+":
|
||||
assert matches[0] == "a"
|
||||
else:
|
||||
assert false
|
||||
|
||||
|
||||
|
||||
@@ -1044,10 +1044,20 @@ proc target*(PI: PProcessingInstruction): string =
|
||||
|
||||
# --Other stuff--
|
||||
# Writer
|
||||
proc addEscaped(s: string): string =
  ## Returns a copy of ``s`` in which the XML special characters ``<``, ``>``,
  ## ``&`` and ``"`` are replaced by the predefined entity references
  ## ``&lt;``, ``&gt;``, ``&amp;`` and ``&quot;``. All other characters are
  ## copied unchanged. An empty input yields an empty string.
  result = ""
  for c in items(s):
    case c
    of '<': result.add("&lt;")
    of '>': result.add("&gt;")
    of '&': result.add("&amp;")
    # The double quote must be escaped because callers wrap attribute
    # values in double quotes.
    of '"': result.add("&quot;")
    else: result.add(c)
|
||||
|
||||
proc nodeToXml(n: PNode, indent: int = 0): string =
|
||||
result = repeatChar(indent, ' ') & "<" & n.nodeName
|
||||
for i in items(n.Attributes):
|
||||
result.add(" " & i.name & "=\"" & i.value & "\"")
|
||||
result.add(" " & i.name & "=\"" & addEscaped(i.value) & "\"")
|
||||
|
||||
if n.childNodes.len() == 0:
|
||||
result.add("/>") # No idea why this doesn't need a \n :O
|
||||
@@ -1060,7 +1070,7 @@ proc nodeToXml(n: PNode, indent: int = 0): string =
|
||||
result.add(nodeToXml(i, indent + 2))
|
||||
of TextNode:
|
||||
result.add(repeatChar(indent * 2, ' '))
|
||||
result.add(i.nodeValue)
|
||||
result.add(addEscaped(i.nodeValue))
|
||||
of CDataSectionNode:
|
||||
result.add(repeatChar(indent * 2, ' '))
|
||||
result.add("<![CDATA[" & i.nodeValue & "]]>")
|
||||
@@ -1080,4 +1090,4 @@ proc nodeToXml(n: PNode, indent: int = 0): string =
|
||||
proc `$`*(doc: PDocument): string =
|
||||
## Converts a PDocument object into a string representation of its XML
|
||||
result = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
|
||||
result.add(nodeToXml(doc.documentElement))
|
||||
result.add(nodeToXml(doc.documentElement))
|
||||
|
||||
@@ -165,4 +165,4 @@ when isMainModule:
|
||||
echo(i.nodeName, "=", i.namespaceURI)
|
||||
|
||||
|
||||
echo($xml)
|
||||
echo($xml)
|
||||
|
||||
@@ -223,9 +223,8 @@ proc newXmlTree*(tag: string, children: openArray[PXmlNode],
|
||||
result.fAttr = attributes
|
||||
|
||||
proc xmlConstructor(e: PNimrodNode): PNimrodNode {.compileTime.} =
|
||||
## use this procedure to define a new XML tag
|
||||
expectLen(e, 1)
|
||||
var a = e[0]
|
||||
expectLen(e, 2)
|
||||
var a = e[1]
|
||||
if a.kind == nnkCall:
|
||||
result = newCall("newXmlTree", toStrLit(a[0]))
|
||||
var attrs = newCall("newStringTable", [])
|
||||
|
||||
@@ -59,3 +59,4 @@ tstrutil.nim;ha/home/a1xyz/usr/bin
|
||||
tvardecl.nim;44
|
||||
tvarnums.nim;Success!
|
||||
tvartup.nim;2 3
|
||||
txmltree.nim;true
|
||||
|
||||
|
Can't render this file because it contains an unexpected character in line 57 and column 15.
|
7
tests/accept/run/txmltree.nim
Normal file
7
tests/accept/run/txmltree.nim
Normal file
@@ -0,0 +1,7 @@
|
||||
|
||||
import xmltree
|
||||
|
||||
var x = <>a(href="nimrod.de", "www.nimrod-test.de")
|
||||
|
||||
echo x == "<a href=\"nimrod.de\">www.nimrod-test.de"
|
||||
|
||||
@@ -19,7 +19,7 @@ Welcome to Nimrod
|
||||
|
||||
.. code-block:: nimrod
|
||||
# Filter key=value pairs
|
||||
import regexprs
|
||||
import re
|
||||
|
||||
for x in lines("myfile.txt"):
|
||||
if x =~ r"(\w+)=(.*)":
|
||||
@@ -70,7 +70,7 @@ Nimrod is expressive
|
||||
generics, etc.
|
||||
* User-definable operators; code with new operators is often easier to read
|
||||
than code which overloads built-in operators. In the code snippet, the
|
||||
``=~`` operator is defined in the ``regexprs`` module.
|
||||
``=~`` operator is defined in the ``re`` module.
|
||||
* Macros can modify the abstract syntax tree at compile time.
|
||||
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ Bugfixes
|
||||
- Fixed ``unicode.toUTF8``.
|
||||
- The compiler now rejects ``'\n'``.
|
||||
- ``times.getStartMilsecs()`` now works on Mac OS X.
|
||||
- Fixed a bug in ``pegs.match`` concerning start offsets.
|
||||
|
||||
|
||||
Additions
|
||||
@@ -45,6 +46,7 @@ Additions
|
||||
- Added ``xmltree`` module.
|
||||
- Added ``xmlparser`` module.
|
||||
- Added ``htmlparser`` module.
|
||||
- Added ``re`` module.
|
||||
- Many wrappers now do not contain redundant name prefixes (like ``GTK_``,
|
||||
``lua``). The new wrappers are available in ``lib/newwrap``. Change
|
||||
your configuration file to use these.
|
||||
@@ -72,6 +74,7 @@ Changes affecting backwards compatibility
|
||||
named arguments only, because the parameter order will change the next
|
||||
version!
|
||||
- ``atomic`` and ``let`` are now keywords.
|
||||
- The ``\w`` character class for pegs now includes the digits ``'0'..'9'``.
|
||||
|
||||
|
||||
2009-12-21 Version 0.8.6 released
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import strutils
|
||||
echo "Give a list of integers (separated by spaces): ",
|
||||
stdin.readLine.splitSeq.each(parseInt).max,
|
||||
stdin.readLine.split.each(parseInt).max,
|
||||
" is the maximum!"
|
||||
|
||||
Reference in New Issue
Block a user