mirror of
https://github.com/nim-lang/Nim.git
synced 2026-01-01 19:02:18 +00:00
Implement match, find, split
This commit is contained in:
@@ -8,4 +8,4 @@ srcDir = "src"
|
||||
|
||||
[Deps]
|
||||
Requires: "nim >= 0.10.0"
|
||||
Requires: "optional_t >= 1.0"
|
||||
Requires: "optional_t >= 1.1.0"
|
||||
|
||||
126
src/nre.nim
126
src/nre.nim
@@ -6,6 +6,7 @@ from future import lc, `[]`
|
||||
from strutils import toLower, `%`
|
||||
from math import ceil
|
||||
import optional_t
|
||||
from unicode import runeLenAt
|
||||
|
||||
# Type definitions {{{
|
||||
type
|
||||
@@ -54,6 +55,27 @@ proc captureNameId*(self: Regex): Table[string, int] =
|
||||
## Returns a map from named capture groups to their numerical
|
||||
## identifier
|
||||
return self.captureNameToId
|
||||
|
||||
proc matchesCrLf(self: Regex): bool =
|
||||
let flags = getinfo[cint](self, pcre.INFO_OPTIONS)
|
||||
let newlineFlags = flags and (pcre.NEWLINE_CRLF or
|
||||
pcre.NEWLINE_ANY or
|
||||
pcre.NEWLINE_ANYCRLF)
|
||||
if newLineFlags > 0:
|
||||
return true
|
||||
|
||||
# get flags from build config
|
||||
var confFlags: cint
|
||||
if pcre.config(pcre.CONFIG_NEWLINE, addr confFlags) != 0:
|
||||
assert(false, "CONFIG_NEWLINE apparently got screwed up")
|
||||
|
||||
case confFlags
|
||||
of 13: return false
|
||||
of 10: return false
|
||||
of (13 shl 8) or 10: return true
|
||||
of -2: return true
|
||||
of -1: return true
|
||||
else: return false
|
||||
# }}}
|
||||
|
||||
# Capture accessors {{{
|
||||
@@ -255,11 +277,7 @@ proc initRegex*(pattern: string, options = "Sx"): Regex =
|
||||
result.captureNameToId = result.getNameToNumberTable()
|
||||
# }}}
|
||||
|
||||
proc match*(self: Regex, str: string, start = 0, endpos = -1): Option[RegexMatch] =
|
||||
## Returns Some if there is a match between `start` and `endpos`, otherwise
|
||||
## it returns None.
|
||||
##
|
||||
## if `endpos == -1`, then `endpos = str.len`
|
||||
proc matchImpl*(self: Regex, str: string, start, endpos: int, flags: int): Option[RegexMatch] =
|
||||
var result: RegexMatch
|
||||
new(result)
|
||||
result.pattern = self
|
||||
@@ -277,11 +295,105 @@ proc match*(self: Regex, str: string, start = 0, endpos = -1): Option[RegexMatch
|
||||
cstring(str),
|
||||
cint(max(str.len, endpos)),
|
||||
cint(start),
|
||||
cint(0),
|
||||
cast[ptr cint](addr result.pcreMatchBounds[0]), cint(vecsize))
|
||||
cint(flags),
|
||||
cast[ptr cint](addr result.pcreMatchBounds[0]),
|
||||
cint(vecsize))
|
||||
if execRet >= 0:
|
||||
return Some(result)
|
||||
elif execRet == pcre.ERROR_NOMATCH:
|
||||
return None[RegexMatch]()
|
||||
else:
|
||||
raise newException(AssertionError, "Internal error: errno " & $execRet)
|
||||
|
||||
proc match*(self: Regex, str: string, start = 0, endpos = -1): Option[RegexMatch] =
|
||||
## Returns Some if there is a match between `start` and `endpos`, otherwise
|
||||
## it returns None.
|
||||
##
|
||||
## if `endpos == -1`, then `endpos = str.len`
|
||||
return matchImpl(self, str, start, endpos, 0)
|
||||
|
||||
iterator findIter*(self: Regex, str: string, start = 0, endpos = -1): RegexMatch =
|
||||
# see pcredemo for explaination
|
||||
let matchesCrLf = self.matchesCrLf()
|
||||
let unicode = bool(getinfo[cint](self, pcre.INFO_OPTIONS) and pcre.UTF8)
|
||||
let endpos = if endpos == -1: str.len else: endpos
|
||||
|
||||
var offset = start
|
||||
var previousMatch: RegexMatch
|
||||
while offset != endpos:
|
||||
if offset > endpos:
|
||||
# eos occurs in the middle of a unicode char? die.
|
||||
raise newException(AssertionError, "Input string has malformed unicode")
|
||||
|
||||
var flags = 0
|
||||
|
||||
if previousMatch != nil and
|
||||
previousMatch.matchBounds.a == previousMatch.matchBounds.b:
|
||||
# 0-len match
|
||||
flags = pcre.NOTEMPTY_ATSTART or pcre.ANCHORED
|
||||
|
||||
let currentMatch = self.matchImpl(str, offset, endpos, flags)
|
||||
previousMatch = currentMatch.get(nil)
|
||||
|
||||
if currentMatch.isNone:
|
||||
# either the end of the input or the string
|
||||
# cannot be split here
|
||||
offset += 1
|
||||
|
||||
if matchesCrLf and offset < (str.len - 1) and
|
||||
str[offset] == '\r' and str[offset + 1] == '\l':
|
||||
# if PCRE treats CrLf as newline, skip both at the same time
|
||||
offset += 1
|
||||
elif unicode:
|
||||
# XXX what about invalid unicode?
|
||||
offset += str.runeLenAt(offset)
|
||||
else:
|
||||
let currentMatch = currentMatch.get
|
||||
offset = currentMatch.matchBounds.b
|
||||
|
||||
yield currentMatch
|
||||
|
||||
proc find*(self: Regex, str: string, start = 0, endpos = -1): Option[RegexMatch] =
|
||||
for match in self.findIter(str, start, endpos):
|
||||
return Some(match)
|
||||
|
||||
return None[RegexMatch]()
|
||||
|
||||
proc findAll*(self: Regex, str: string, start = 0, endpos = -1): seq[RegexMatch] =
|
||||
accumulateResult(self.findIter(str, start, endpos))
|
||||
|
||||
proc renderBounds(str: string, bounds: Slice[int]): string =
|
||||
result = " " & str & "⫞\n"
|
||||
for i in -1 .. <bounds.a:
|
||||
result.add(" ")
|
||||
for i in bounds.a .. bounds.b:
|
||||
result.add("^")
|
||||
|
||||
proc split*(self: Regex, str: string): seq[string] =
|
||||
result = @[]
|
||||
var lastIdx = 0
|
||||
|
||||
for match in self.findIter(str):
|
||||
# upper bound is exclusive, lower is inclusive:
|
||||
#
|
||||
# 0123456
|
||||
# ^^^
|
||||
# (1, 4)
|
||||
var bounds = match.matchBounds
|
||||
|
||||
if lastIdx == 0 and
|
||||
lastIdx == bounds.a and
|
||||
bounds.a == bounds.b:
|
||||
# "12".split("") would be @["", "1", "2"], but
|
||||
# if we skip an empty first match, it's the correct
|
||||
# @["1", "2"]
|
||||
discard
|
||||
else:
|
||||
result.add(str.substr(lastIdx, bounds.a - 1))
|
||||
|
||||
lastIdx = bounds.b
|
||||
|
||||
# last match: Each match takes the previous substring,
|
||||
# but "1 2".split(/ /) needs to return @["1", "2"].
|
||||
# This handles "2"
|
||||
result.add(str.substr(lastIdx, str.len - 1))
|
||||
|
||||
14
test/find.nim
Normal file
14
test/find.nim
Normal file
@@ -0,0 +1,14 @@
|
||||
import unittest
|
||||
include nre
|
||||
|
||||
suite "find":
|
||||
test "find text":
|
||||
check(initRegex(r"[a-z]").find("3213a").get.match == "a")
|
||||
check(initRegex(r" ", "S").findAll("1 2 3 4 5 6 7 8 ").map(
|
||||
proc (a: RegexMatch): string = a.match
|
||||
) == @[" ", " ", " ", " ", " ", " ", " ", " "])
|
||||
|
||||
test "find bounds":
|
||||
check(initRegex(r" ", "S").findAll("1 2 3 4 5 ").map(
|
||||
proc (a: RegexMatch): Slice[int] = a.matchBounds
|
||||
) == @[1..2, 3..4, 5..6, 7..8, 9..10])
|
||||
9
test/split.nim
Normal file
9
test/split.nim
Normal file
@@ -0,0 +1,9 @@
|
||||
import unittest
|
||||
include nre
|
||||
|
||||
suite "string splitting":
|
||||
test "splitting strings":
|
||||
check(initRegex("").split("12345") == @["1", "2", "3", "4", "5"])
|
||||
check(initRegex(" ", "S").split("1 2 3 4 5 6 ") == @["1", "2", "3", "4", "5", "6", ""])
|
||||
check(initRegex(" ", "S").split("1 2 ") == @["1", "", "2", "", ""])
|
||||
check(initRegex(" ", "S").split("1 2") == @["1", "2"])
|
||||
@@ -1,3 +1,5 @@
|
||||
import nre
|
||||
import init
|
||||
import captures
|
||||
import find
|
||||
import split
|
||||
|
||||
Reference in New Issue
Block a user