Merge pull request #14 from BlaXpirit/inline-options

Inline options
This commit is contained in:
Flaviu Tamas
2015-05-08 13:56:25 -04:00
5 changed files with 119 additions and 123 deletions

View File

@@ -47,14 +47,7 @@ from unicode import runeLenAt
type
Regex* = ref object
## Represents the pattern that things are matched against, constructed with
## ``re(string, string)``. Examples: ``re"foo"``, ``re(r"foo # comment",
## "x<anycrlf>")``, ``re"(?x)(*ANYCRLF)foo # comment"``. For more details
## on the leading option groups, see the `Option
## Setting <http://man7.org/linux/man-pages/man3/pcresyntax.3.html#OPTION_SETTING>`__
## and the `Newline
## Convention <http://man7.org/linux/man-pages/man3/pcresyntax.3.html#NEWLINE_CONVENTION>`__
## sections of the `PCRE syntax
## manual <http://man7.org/linux/man-pages/man3/pcresyntax.3.html>`__.
## ``re(string)``. Examples: ``re"foo"``, ``re(r"(*ANYCRLF)(?x)foo # comment")``.
##
## ``pattern: string``
## the string that was used to create the pattern.
@@ -66,33 +59,36 @@ type
## a table from the capture names to their numeric id.
##
##
## Flags
## .....
## Options
## .......
##
## - ``8``, ``u``, ``<utf8>`` - treat both the pattern and subject as UTF8
## - ``9``, ``<no_utf8>`` - prevents the pattern from being interpreted as UTF, no matter
## what
## - ``A``, ``<anchored>`` - as if the pattern had a ``^`` at the beginning
## - ``E``, ``<dollar_endonly>`` - DOLLAR\_ENDONLY
## - ``f``, ``<firstline>`` - fails if there is not a match on the first line
## - ``i``, ``<case_insensitive>`` - case insensitive
## - ``m``, ``<multiline>`` - multi-line, ``^`` and ``$`` match the beginning and end of
## The following options may appear anywhere in the pattern, and they affect
## the rest of it.
##
## - ``(?i)`` - case insensitive
## - ``(?m)`` - multi-line: ``^`` and ``$`` match the beginning and end of
## lines, not of the subject string
## - ``N``, ``<no_auto_capture>`` - turn off auto-capture, ``(?foo)`` is necessary to capture.
## - ``s``, ``<dotall>`` - ``.`` matches newline
## - ``U``, ``<ungreedy>`` - expressions are not greedy by default. ``?`` can be added to
## a qualifier to make it greedy.
## - ``W``, ``<ucp>`` - Unicode character properties; ``\w`` matches ``к``.
## - ``X``, ``<extra>`` - "Extra", character escapes without special meaning (``\w``
## vs. ``\a``) are errors
## - ``x``, ``<extended>`` - extended, comments (``#``) and newlines are ignored
## (extended)
## - ``Y``, ``<no_start_optimize>`` - pcre.NO\_START\_OPTIMIZE,
## - ``<cr>`` - newlines are separated by ``\r``
## - ``<crlf>`` - newlines are separated by ``\r\n`` (Windows default)
## - ``<lf>`` - newlines are separated by ``\n`` (UNIX default)
## - ``<anycrlf>`` - newlines are separated by any of the above
## - ``<any>`` - newlines are separated by any of the above and Unicode
## - ``(?s)`` - ``.`` also matches newline (*dotall*)
## - ``(?U)`` - expressions are not greedy by default. ``?`` can be added
## to a qualifier to make it greedy
## - ``(?x)`` - whitespace and comments (``#``) are ignored (*extended*)
## - ``(?X)`` - character escapes without special meaning (``\w`` vs.
## ``\a``) are errors (*extra*)
##
## One or a combination of these options may appear only at the beginning
## of the pattern:
##
## - ``(*UTF8)`` - treat both the pattern and subject as UTF-8
## - ``(*UCP)`` - Unicode character properties; ``\w`` matches ``я``
## - ``(*U)`` - a combination of the two options above
## - ``(*FIRSTLINE)`` - fails if there is not a match on the first line
## - ``(*NO_AUTO_CAPTURE)`` - turn off auto-capture for groups;
## ``(?<name>...)`` can be used to capture
## - ``(*CR)`` - newlines are separated by ``\r``
## - ``(*LF)`` - newlines are separated by ``\n`` (UNIX default)
## - ``(*CRLF)`` - newlines are separated by ``\r\n`` (Windows default)
## - ``(*ANYCRLF)`` - newlines are separated by any of the above
## - ``(*ANY)`` - newlines are separated by any of the above and Unicode
## newlines:
##
## single characters VT (vertical tab, U+000B), FF (form feed, U+000C),
@@ -101,10 +97,15 @@ type
## are recognized only in UTF-8 mode.
## — man pcre
##
## - ``<bsr_anycrlf>`` - ``\R`` matches CR, LF, or CRLF
## - ``<bsr_unicode>`` - ``\R`` matches any unicode newline
## - ``<js>`` - Javascript compatibility
## - ``<no_study>`` - turn off studying; study is enabled by deafault
## - ``(*JAVASCRIPT_COMPAT)`` - JavaScript compatibility
## - ``(*NO_STUDY)`` - turn off studying; study is enabled by default
##
## For more details on the leading option groups, see the `Option
## Setting <http://man7.org/linux/man-pages/man3/pcresyntax.3.html#OPTION_SETTING>`__
## and the `Newline
## Convention <http://man7.org/linux/man-pages/man3/pcresyntax.3.html#NEWLINE_CONVENTION>`__
## sections of the `PCRE syntax
## manual <http://man7.org/linux/man-pages/man3/pcresyntax.3.html>`__.
pattern*: string ## not nil
pcreObj: ptr pcre.Pcre ## not nil
pcreExtra: ptr pcre.ExtraData ## nil
@@ -316,71 +317,61 @@ proc `==`*(a, b: RegexMatch): bool =
# Creation & Destruction {{{
# PCRE Options {{{
let Options: Table[string, int] = {
"8" : pcre.UTF8,
"utf8" : pcre.UTF8,
"9" : pcre.NEVER_UTF,
"no_utf8" : pcre.NEVER_UTF,
"A" : pcre.ANCHORED,
"anchored" : pcre.ANCHORED,
# "C" : pcre.AUTO_CALLOUT, unsuported XXX
"E" : pcre.DOLLAR_ENDONLY,
"dollar_endonly" : pcre.DOLLAR_ENDONLY,
"f" : pcre.FIRSTLINE,
"firstline" : pcre.FIRSTLINE,
"i" : pcre.CASELESS,
"case_insensitive" : pcre.CASELESS,
"m" : pcre.MULTILINE,
"multiline" : pcre.MULTILINE,
"N" : pcre.NO_AUTO_CAPTURE,
"no_auto_capture" : pcre.NO_AUTO_CAPTURE,
"s" : pcre.DOTALL,
"dotall" : pcre.DOTALL,
"U" : pcre.UNGREEDY,
"ungreedy" : pcre.UNGREEDY,
"u" : pcre.UTF8,
"W" : pcre.UCP,
"ucp" : pcre.UCP,
"X" : pcre.EXTRA,
"extra" : pcre.EXTRA,
"x" : pcre.EXTENDED,
"extended" : pcre.EXTENDED,
"Y" : pcre.NO_START_OPTIMIZE,
"no_start_optimize" : pcre.NO_START_OPTIMIZE,
"any" : pcre.NEWLINE_ANY,
"anycrlf" : pcre.NEWLINE_ANYCRLF,
"cr" : pcre.NEWLINE_CR,
"crlf" : pcre.NEWLINE_CRLF,
"lf" : pcre.NEWLINE_LF,
"bsr_anycrlf" : pcre.BSR_ANYCRLF,
"bsr_unicode" : pcre.BSR_UNICODE,
"js" : pcre.JAVASCRIPT_COMPAT,
# Names accepted in a leading ``(*NAME)`` group, mapped to the PCRE flag bits
# that ``extractOptions`` ORs into its ``flags`` result when it meets them.
const PcreOptions = {
"NEVER_UTF": pcre.NEVER_UTF,
"ANCHORED": pcre.ANCHORED,
"DOLLAR_ENDONLY": pcre.DOLLAR_ENDONLY,
"FIRSTLINE": pcre.FIRSTLINE,
"NO_AUTO_CAPTURE": pcre.NO_AUTO_CAPTURE,
"JAVASCRIPT_COMPAT": pcre.JAVASCRIPT_COMPAT,
"U": pcre.UTF8 or pcre.UCP  # shorthand: sets both the UTF8 and UCP bits
}.toTable
proc tokenizeOptions(opts: string): tuple[flags: int, study: bool] =
result = (0, true)
# Options that are supported inside regular expressions themselves.
# ``extractOptions`` does not turn these into flags: a leading ``(*NAME)``
# group whose name is listed here (or which contains ``=``) is copied into the
# returned pattern unchanged. Entries ending in ``=`` are the names that are
# allowed to be followed by ``=`` and a value.
const SkipOptions = [
"LIMIT_MATCH=", "LIMIT_RECURSION=", "NO_AUTO_POSSESS", "NO_START_OPT",
"UTF8", "UTF16", "UTF32", "UTF", "UCP",
"CR", "LF", "CRLF", "ANYCRLF", "ANY", "BSR_ANYCRLF", "BSR_UNICODE"
]
var longOpt: string = nil
for i, c in opts:
# Handle long options {{{
if c == '<':
longOpt = ""
continue
proc extractOptions(pattern: string): tuple[pattern: string, flags: int, study: bool] =
result = ("", 0, true)
if longOpt != nil:
if c == '>':
if longOpt == "no_study":
result.study = false
else:
result.flags = result.flags or Options.fget(longOpt)
longOpt = nil
var optionStart = 0
var equals = false
for i, c in pattern:
if optionStart == i:
if c != '(':
break
optionStart = i
elif optionStart == i-1:
if c != '*':
break
elif c == ')':
let name = pattern[optionStart+2 .. i-1]
if equals or name in SkipOptions:
result.pattern.add pattern[optionStart .. i]
elif PcreOptions.hasKey name:
result.flags = result.flags or PcreOptions[name]
elif name == "NO_STUDY":
result.study = false
else:
longOpt.add(c.toLower)
continue
# }}}
break
optionStart = i+1
equals = false
elif not equals:
if c == '=':
equals = true
if pattern[optionStart+2 .. i] notin SkipOptions:
break
elif c notin {'A'..'Z', '0'..'9', '_'}:
break
result.pattern.add pattern[optionStart .. pattern.high]
result.flags = result.flags or Options.fget($c)
# }}}
type UncheckedArray {.unchecked.}[T] = array[0 .. 0, T]
@@ -411,24 +402,22 @@ proc getNameToNumberTable(pattern: Regex): Table[string, int] =
result[name] = num
proc initRegex(pattern: string, options: string): Regex =
proc initRegex(pattern: string, flags: int, study = true): Regex =
new(result, destroyRegex)
result.pattern = pattern
var errorMsg: cstring
var errOffset: cint
let opts = tokenizeOptions(options)
result.pcreObj = pcre.compile(cstring(pattern),
# better hope int is at least 4 bytes..
cint(opts.flags), addr errorMsg,
cint(flags), addr errorMsg,
addr errOffset, nil)
if result.pcreObj == nil:
# failed to compile
raise SyntaxError(msg: $errorMsg, pos: errOffset, pattern: pattern)
if opts.study:
if study:
# XXX investigate JIT
result.pcreExtra = pcre.study(result.pcreObj, 0x0, addr errorMsg)
if errorMsg != nil:
@@ -436,7 +425,9 @@ proc initRegex(pattern: string, options: string): Regex =
result.captureNameToId = result.getNameToNumberTable()
proc re*(pattern: string, options = ""): Regex = initRegex(pattern, options)
proc re*(pattern: string): Regex =
## Builds a ``Regex`` from ``pattern``. ``extractOptions`` first splits the
## input into the pattern text, the compile flags and the study switch; those
## three values are then passed to ``initRegex``.
let (pattern, flags, study) = extractOptions(pattern)
initRegex(pattern, flags, study)
# }}}
# Operations {{{

View File

@@ -21,5 +21,5 @@ suite "find":
check("".findAll(re"") == @[""])
check("abc".findAll(re"") == @["", "", "", ""])
check("word word".findAll(re"\b") == @["", "", "", ""])
check("word\r\lword".findAll(re(r"$", "m<anycrlf>")) == @["", ""])
check("слово слово".findAll(re(r"\b", "uW")) == @["", "", "", ""])
check("word\r\lword".findAll(re"(*ANYCRLF)(?m)$") == @["", ""])
check("слово слово".findAll(re"(*U)\b") == @["", "", "", ""])

View File

@@ -1,24 +1,30 @@
import unittest
import nre
import unittest, private/pcre
include nre
suite "Test NRE initialization":
test "correct intialization":
check(re("[0-9]+") != nil)
check(re("[0-9]+", "i") != nil)
check(re("(?i)[0-9]+") != nil)
test "correct options":
expect(SyntaxError): # ValueError would be bad
discard re("[0-9]+",
"89AEfimNsUWXxY<any><anycrlf><cr><crlf><lf><bsr_anycrlf><bsr_unicode><js><no_study>")
expect(SyntaxError):
discard re("[0-9]+",
"<utf8><no_utf8><anchored><dollar_endonly><firstline>" &
"<case_insensitive><multiline><no_auto_capture><dotall><ungreedy>" &
"<ucp><extra><extended><no_start_optimize>")
test "options":
check(extractOptions("(*NEVER_UTF)") ==
("", pcre.NEVER_UTF, true))
check(extractOptions("(*UTF8)(*ANCHORED)(*UCP)z") ==
("(*UTF8)(*UCP)z", pcre.ANCHORED, true))
check(extractOptions("(*ANCHORED)(*UTF8)(*JAVASCRIPT_COMPAT)z") ==
("(*UTF8)z", pcre.ANCHORED or pcre.JAVASCRIPT_COMPAT, true))
check(extractOptions("(*NO_STUDY)(") == ("(", 0, false))
check(extractOptions("(*LIMIT_MATCH=6)(*ANCHORED)z") ==
("(*LIMIT_MATCH=6)z", pcre.ANCHORED, true))
test "incorrect options":
expect(KeyError): discard re("[0-9]+", "a")
expect(KeyError): discard re("[0-9]+", "<does_not_exist>")
for s in ["CR", "(CR", "(*CR", "(*abc)", "(*abc)CR",
"(?i)",
"(*LIMIT_MATCH=5", "(*NO_AUTO_POSSESS=5)"]:
let ss = s & "(*NEVER_UTF)"
check(extractOptions(ss) == (ss, 0, true))
test "invalid regex":
expect(SyntaxError): discard re("[0-9")
@@ -28,4 +34,3 @@ suite "Test NRE initialization":
let ex = SyntaxError(getCurrentException())
check(ex.pos == 4)
check(ex.pattern == "[0-9")

View File

@@ -2,8 +2,8 @@ import unittest, nre, strutils, optional_t.nonstrict
suite "Misc tests":
test "unicode":
check("".find(re("", "8")).match == "")
check("перевірка".replace(re(r"\w", "uW"), "") == "")
check("".find(re"(*UTF8)").match == "")
check("перевірка".replace(re"(*U)\w", "") == "")
test "empty or non-empty match":
check("abc".findall(re"|.").join(":") == ":a::b::c:")

View File

@@ -21,8 +21,8 @@ suite "string splitting":
check("12345".split(re("")) == @["1", "2", "3", "4", "5"])
check("".split(re"") == newSeq[string]())
check("word word".split(re"\b") == @["word", " ", "word"])
check("word\r\lword".split(re(r"$", "m<anycrlf>")) == @["word", "\r\lword"])
check("слово слово".split(re(r"(\b)", "uW")) == @["", "слово", "", " ", "", "слово", ""])
check("word\r\lword".split(re"(*ANYCRLF)(?m)$") == @["word", "\r\lword"])
check("слово слово".split(re"(*U)(\b)") == @["", "слово", "", " ", "", "слово", ""])
test "perl split tests":
check("forty-two" .split(re"") .join(",") == "f,o,r,t,y,-,t,w,o")