Implement match, find, split

This commit is contained in:
Flaviu Tamas
2015-01-12 20:09:19 -05:00
parent 6fe0de0639
commit 2474758ed5
5 changed files with 145 additions and 8 deletions

View File

@@ -8,4 +8,4 @@ srcDir = "src"
[Deps]
Requires: "nim >= 0.10.0"
Requires: "optional_t >= 1.0"
Requires: "optional_t >= 1.1.0"

View File

@@ -6,6 +6,7 @@ from future import lc, `[]`
from strutils import toLower, `%`
from math import ceil
import optional_t
from unicode import runeLenAt
# Type definitions {{{
type
@@ -54,6 +55,27 @@ proc captureNameId*(self: Regex): Table[string, int] =
## Returns a map from named capture groups to their numerical
## identifier
return self.captureNameToId
proc matchesCrLf(self: Regex): bool =
let flags = getinfo[cint](self, pcre.INFO_OPTIONS)
let newlineFlags = flags and (pcre.NEWLINE_CRLF or
pcre.NEWLINE_ANY or
pcre.NEWLINE_ANYCRLF)
if newLineFlags > 0:
return true
# get flags from build config
var confFlags: cint
if pcre.config(pcre.CONFIG_NEWLINE, addr confFlags) != 0:
assert(false, "CONFIG_NEWLINE apparently got screwed up")
case confFlags
of 13: return false
of 10: return false
of (13 shl 8) or 10: return true
of -2: return true
of -1: return true
else: return false
# }}}
# Capture accessors {{{
@@ -255,11 +277,7 @@ proc initRegex*(pattern: string, options = "Sx"): Regex =
result.captureNameToId = result.getNameToNumberTable()
# }}}
proc match*(self: Regex, str: string, start = 0, endpos = -1): Option[RegexMatch] =
## Returns Some if there is a match between `start` and `endpos`, otherwise
## it returns None.
##
## if `endpos == -1`, then `endpos = str.len`
proc matchImpl*(self: Regex, str: string, start, endpos: int, flags: int): Option[RegexMatch] =
var result: RegexMatch
new(result)
result.pattern = self
@@ -277,11 +295,105 @@ proc match*(self: Regex, str: string, start = 0, endpos = -1): Option[RegexMatch
cstring(str),
cint(max(str.len, endpos)),
cint(start),
cint(0),
cast[ptr cint](addr result.pcreMatchBounds[0]), cint(vecsize))
cint(flags),
cast[ptr cint](addr result.pcreMatchBounds[0]),
cint(vecsize))
if execRet >= 0:
return Some(result)
elif execRet == pcre.ERROR_NOMATCH:
return None[RegexMatch]()
else:
raise newException(AssertionError, "Internal error: errno " & $execRet)
proc match*(self: Regex, str: string, start = 0, endpos = -1): Option[RegexMatch] =
## Returns Some if there is a match between `start` and `endpos`, otherwise
## it returns None.
##
## if `endpos == -1`, then `endpos = str.len`
return matchImpl(self, str, start, endpos, 0)
iterator findIter*(self: Regex, str: string, start = 0, endpos = -1): RegexMatch =
# see pcredemo for explaination
let matchesCrLf = self.matchesCrLf()
let unicode = bool(getinfo[cint](self, pcre.INFO_OPTIONS) and pcre.UTF8)
let endpos = if endpos == -1: str.len else: endpos
var offset = start
var previousMatch: RegexMatch
while offset != endpos:
if offset > endpos:
# eos occurs in the middle of a unicode char? die.
raise newException(AssertionError, "Input string has malformed unicode")
var flags = 0
if previousMatch != nil and
previousMatch.matchBounds.a == previousMatch.matchBounds.b:
# 0-len match
flags = pcre.NOTEMPTY_ATSTART or pcre.ANCHORED
let currentMatch = self.matchImpl(str, offset, endpos, flags)
previousMatch = currentMatch.get(nil)
if currentMatch.isNone:
# either the end of the input or the string
# cannot be split here
offset += 1
if matchesCrLf and offset < (str.len - 1) and
str[offset] == '\r' and str[offset + 1] == '\l':
# if PCRE treats CrLf as newline, skip both at the same time
offset += 1
elif unicode:
# XXX what about invalid unicode?
offset += str.runeLenAt(offset)
else:
let currentMatch = currentMatch.get
offset = currentMatch.matchBounds.b
yield currentMatch
proc find*(self: Regex, str: string, start = 0, endpos = -1): Option[RegexMatch] =
for match in self.findIter(str, start, endpos):
return Some(match)
return None[RegexMatch]()
proc findAll*(self: Regex, str: string, start = 0, endpos = -1): seq[RegexMatch] =
accumulateResult(self.findIter(str, start, endpos))
proc renderBounds(str: string, bounds: Slice[int]): string =
result = " " & str & "\n"
for i in -1 .. <bounds.a:
result.add(" ")
for i in bounds.a .. bounds.b:
result.add("^")
proc split*(self: Regex, str: string): seq[string] =
result = @[]
var lastIdx = 0
for match in self.findIter(str):
# upper bound is exclusive, lower is inclusive:
#
# 0123456
# ^^^
# (1, 4)
var bounds = match.matchBounds
if lastIdx == 0 and
lastIdx == bounds.a and
bounds.a == bounds.b:
# "12".split("") would be @["", "1", "2"], but
# if we skip an empty first match, it's the correct
# @["1", "2"]
discard
else:
result.add(str.substr(lastIdx, bounds.a - 1))
lastIdx = bounds.b
# last match: Each match takes the previous substring,
# but "1 2".split(/ /) needs to return @["1", "2"].
# This handles "2"
result.add(str.substr(lastIdx, str.len - 1))

14
test/find.nim Normal file
View File

@@ -0,0 +1,14 @@
import unittest
include nre
suite "find":
test "find text":
check(initRegex(r"[a-z]").find("3213a").get.match == "a")
check(initRegex(r" ", "S").findAll("1 2 3 4 5 6 7 8 ").map(
proc (a: RegexMatch): string = a.match
) == @[" ", " ", " ", " ", " ", " ", " ", " "])
test "find bounds":
check(initRegex(r" ", "S").findAll("1 2 3 4 5 ").map(
proc (a: RegexMatch): Slice[int] = a.matchBounds
) == @[1..2, 3..4, 5..6, 7..8, 9..10])

9
test/split.nim Normal file
View File

@@ -0,0 +1,9 @@
import unittest
include nre
suite "string splitting":
test "splitting strings":
check(initRegex("").split("12345") == @["1", "2", "3", "4", "5"])
check(initRegex(" ", "S").split("1 2 3 4 5 6 ") == @["1", "2", "3", "4", "5", "6", ""])
check(initRegex(" ", "S").split("1 2 ") == @["1", "", "2", "", ""])
check(initRegex(" ", "S").split("1 2") == @["1", "2"])

View File

@@ -1,3 +1,5 @@
import nre
import init
import captures
import find
import split