diff --git a/README.asciidoc b/README.asciidoc index 0584a60514..26a9c66168 100644 --- a/README.asciidoc +++ b/README.asciidoc @@ -29,24 +29,24 @@ provides in its standard library is inadequate: === Operations [[proc-find]] -==== find(string, Regex, start = 0, endpos = -1): RegexMatch +==== find(string, Regex, start = 0, endpos = int.high): RegexMatch Finds the given pattern in the string between the end and start positions. `start` :: The start point at which to start matching. `|abc` is `0`; `a|bc` is `1` -`endpos` :: The maximum index for a match; `-1` means the end of the string, - otherwise it's an exclusive upper bound. +`endpos` :: The maximum index for a match; `int.high` means the end of the + string, otherwise it's an inclusive upper bound. [[proc-match]] -==== match(string, Regex, start = 0, endpos = -1): RegexMatch +==== match(string, Regex, start = 0, endpos = int.high): RegexMatch Like link:#proc-find[`find(...)`], but anchored to the start of the string. This means that `"foo".match(re"f") == true`, but `"foo".match(re"o") == false`. [[iter-find]] -==== iterator findIter(string, Regex, start = 0, endpos = -1): RegexMatch +==== iterator findIter(string, Regex, start = 0, endpos = int.high): RegexMatch Works the same as link:#proc-find[`find(...)`], but finds every non-overlapping match. `"2222".find(re"22")` is `"22", "22"`, not `"22", "22", "22"`. @@ -118,11 +118,10 @@ at that id. If the value is invalid, then behavior is undefined. If the id is - `"abc".match(re"(\w)\w").captures[-1] == "ab"` `captureBounds[]: Option[Slice[int]]` :: gets the bounds of the given capture according to the same rules as the above. If the capture is not -filled, then `None` is returned. The upper bound is exclusive, the lower bound -is inclusive. - - `"abc".match(re"(\w)").captureBounds[0] == 0..1` - - `"abc".match(re"").captureBounds[-1] == 0..0` - - `"abc".match(re"abc").captureBounds[-1] == 0..3` +filled, then `None` is returned. The bounds are both inclusive. + - `"abc".match(re"(\w)").captureBounds[0] == 0 .. 0` + - `"abc".match(re"").captureBounds[-1] == 0 .. -1` + - `"abc".match(re"abc").captureBounds[-1] == 0 .. 2` `match: string` :: the full text of the match. `matchBounds: Slice[int]` :: the bounds of the match, as in `captureBounds[]` `(captureBounds|captures).toTable` :: returns a table with each named capture diff --git a/src/nre.nim b/src/nre.nim index 83b0d9db1d..af5dab785f 100644 --- a/src/nre.nim +++ b/src/nre.nim @@ -97,7 +97,7 @@ proc `[]`*(pattern: CaptureBounds, i: int): Option[Slice[int]] = let pattern = RegexMatch(pattern) if pattern.pcreMatchBounds[i + 1].a != -1: let bounds = pattern.pcreMatchBounds[i + 1] - return Some(int(bounds.a) .. int(bounds.b)) + return Some(int(bounds.a) .. int(bounds.b-1)) else: return None[Slice[int]]() @@ -111,7 +111,7 @@ proc `[]`*(pattern: Captures, i: int): string = if bounds: let bounds = bounds.get - return pattern.str.substr(bounds.a, bounds.b-1) + return pattern.str.substr(bounds.a, bounds.b) else: return nil @@ -311,7 +311,7 @@ proc matchImpl(str: string, pattern: Regex, start, endpos: int, flags: int): Opt result.pcreMatchBounds = newSeq[Slice[cint]](ceil(vecsize / 2).int) result.pcreMatchBounds.setLen(vecsize div 3) - let strlen = if endpos == -1: str.len else: endpos + let strlen = if endpos == int.high: str.len else: endpos+1 let execRet = pcre.exec(pattern.pcreObj, pattern.pcreExtra, @@ -328,14 +328,14 @@ proc matchImpl(str: string, pattern: Regex, start, endpos: int, flags: int): Opt else: raise newException(AssertionError, "Internal error: errno " & $execRet) -proc match*(str: string, pattern: Regex, start = 0, endpos = -1): Option[RegexMatch] = +proc match*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] = return str.matchImpl(pattern, start, endpos, pcre.ANCHORED) -iterator findIter*(str: string, pattern: Regex, start = 0, endpos = -1): RegexMatch = +iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): RegexMatch = # see pcredemo for explaination let matchesCrLf = pattern.matchesCrLf() let unicode = (getinfo[cint](pattern, pcre.INFO_OPTIONS) and pcre.UTF8) > 0 - let endpos = if endpos == -1: str.len else: endpos + let strlen = if endpos == int.high: str.len else: endpos+1 var offset = start var match: Option[RegexMatch] @@ -343,7 +343,7 @@ iterator findIter*(str: string, pattern: Regex, start = 0, endpos = -1): RegexMa var flags = 0 if match and - match.get.matchBounds.a == match.get.matchBounds.b: + match.get.matchBounds.a > match.get.matchBounds.b: # 0-len match flags = pcre.NOTEMPTY_ATSTART or pcre.ANCHORED @@ -361,24 +361,24 @@ iterator findIter*(str: string, pattern: Regex, start = 0, endpos = -1): RegexMa elif unicode: # XXX what about invalid unicode? offset += str.runeLenAt(offset) - assert(offset <= endpos) + assert(offset <= strlen) else: - offset = match.get.matchBounds.b + offset = match.get.matchBounds.b + 1 yield match.get - if offset >= endpos: + if offset >= strlen: # do while break -proc find*(str: string, pattern: Regex, start = 0, endpos = -1): Option[RegexMatch] = +proc find*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] = ## Returns a `RegexMatch` if there is a match between `start` and `endpos`, otherwise ## it returns nil. ## - ## if `endpos == -1`, then `endpos = str.len` + ## if `endpos == int.high`, then `endpos = str.len` return str.matchImpl(pattern, start, endpos, 0) -proc findAll*(str: string, pattern: Regex, start = 0, endpos = -1): seq[string] = +proc findAll*(str: string, pattern: Regex, start = 0, endpos = int.high): seq[string] = result = @[] for match in str.findIter(pattern, start, endpos): result.add(match.match) @@ -387,24 +387,24 @@ proc split*(str: string, pattern: Regex, maxSplit = -1, start = 0): seq[string] result = @[] var lastIdx = start var splits = 0 - var bounds: Slice[int] + var bounds = 0 .. -1 for match in str.findIter(pattern, start = start): - # upper bound is exclusive, lower is inclusive: + # bounds are inclusive: # # 0123456 # ^^^ - # (1, 4) + # (1, 3) bounds = match.matchBounds # "12".split("") would be @["", "1", "2"], but # if we skip an empty first match, it's the correct # @["1", "2"] - if bounds.a < bounds.b or bounds.a > start: + if bounds.a <= bounds.b or bounds.a > start: result.add(str.substr(lastIdx, bounds.a - 1)) splits += 1 - lastIdx = bounds.b + lastIdx = bounds.b + 1 for cap in match.captures: # if there are captures, include them in the result @@ -416,11 +416,11 @@ proc split*(str: string, pattern: Regex, maxSplit = -1, start = 0): seq[string] # "12".split("\b") would be @["1", "2", ""], but # if we skip an empty last match, it's the correct # @["1", "2"] - if bounds.a < bounds.b or bounds.b < str.len: + if bounds.a <= bounds.b or bounds.b < str.high: # last match: Each match takes the previous substring, # but "1 2".split(/ /) needs to return @["1", "2"]. # This handles "2" - result.add(str.substr(bounds.b, str.len - 1)) + result.add(str.substr(bounds.b + 1, str.high)) template replaceImpl(str: string, pattern: Regex, replacement: expr): stmt {.immediate, dirty.} = @@ -435,7 +435,7 @@ template replaceImpl(str: string, pattern: Regex, assert(nextVal != nil) result.add(nextVal) - lastIdx = bounds.b + lastIdx = bounds.b + 1 result.add(str.substr(lastIdx, str.len - 1)) return result diff --git a/test/captures.nim b/test/captures.nim index 4bf3a81ce2..24ba021f59 100644 --- a/test/captures.nim +++ b/test/captures.nim @@ -8,17 +8,17 @@ suite "captures": test "capture bounds are correct": let ex1 = re("([0-9])") - check("1 23".find(ex1).matchBounds == 0 .. 1) - check("1 23".find(ex1).captureBounds[0].get == 0 .. 1) - check("1 23".find(ex1, 1).matchBounds == 2 .. 3) - check("1 23".find(ex1, 3).matchBounds == 3 .. 4) + check("1 23".find(ex1).matchBounds == 0 .. 0) + check("1 23".find(ex1).captureBounds[0].get == 0 .. 0) + check("1 23".find(ex1, 1).matchBounds == 2 .. 2) + check("1 23".find(ex1, 3).matchBounds == 3 .. 3) let ex2 = re("()()()()()()()()()()([0-9])") - check("824".find(ex2).captureBounds[0].get == 0 .. 0) - check("824".find(ex2).captureBounds[10].get == 0 .. 1) + check("824".find(ex2).captureBounds[0].get == 0 .. -1) + check("824".find(ex2).captureBounds[10].get == 0 .. 0) let ex3 = re("([0-9]+)") - check("824".find(ex3).captureBounds[0].get == 0 .. 3) + check("824".find(ex3).captureBounds[0].get == 0 .. 2) test "named captures": let ex1 = "foobar".find(re("(?foo)(?bar)")) @@ -31,7 +31,7 @@ suite "captures": test "named capture bounds": let ex1 = "foo".find(re("(?foo)(?bar)?")) - check(ex1.captureBounds["foo"] == Some(0..3)) + check(ex1.captureBounds["foo"] == Some(0..2)) check(ex1.captureBounds["bar"] == None[Slice[int]]()) test "capture count": @@ -42,7 +42,7 @@ suite "captures": test "named capture table": let ex1 = "foo".find(re("(?foo)(?bar)?")) check(ex1.captures.toTable == {"foo" : "foo", "bar" : nil}.toTable()) - check(ex1.captureBounds.toTable == {"foo" : Some(0..3), "bar" : None[Slice[int]]()}.toTable()) + check(ex1.captureBounds.toTable == {"foo" : Some(0..2), "bar" : None[Slice[int]]()}.toTable()) check(ex1.captures.toTable("") == {"foo" : "foo", "bar" : ""}.toTable()) let ex2 = "foobar".find(re("(?foo)(?bar)?")) @@ -51,7 +51,7 @@ suite "captures": test "capture sequence": let ex1 = "foo".find(re("(?foo)(?bar)?")) check(ex1.captures.toSeq == @["foo", nil]) - check(ex1.captureBounds.toSeq == @[Some(0..3), None[Slice[int]]()]) + check(ex1.captureBounds.toSeq == @[Some(0..2), None[Slice[int]]()]) check(ex1.captures.toSeq("") == @["foo", ""]) let ex2 = "foobar".find(re("(?foo)(?bar)?")) diff --git a/test/find.nim b/test/find.nim index b4be08e88b..8ab70fb420 100644 --- a/test/find.nim +++ b/test/find.nim @@ -10,7 +10,7 @@ suite "find": test "find bounds": check(toSeq(findIter("1 2 3 4 5 ", re" ")).map( proc (a: RegexMatch): Slice[int] = a.matchBounds - ) == @[1..2, 3..4, 5..6, 7..8, 9..10]) + ) == @[1..1, 3..3, 5..5, 7..7, 9..9]) test "overlapping find": check("222".findAll(re"22") == @["22"]) diff --git a/test/match.nim b/test/match.nim index 277e8b3b00..16fb931c0e 100644 --- a/test/match.nim +++ b/test/match.nim @@ -1,17 +1,18 @@ include nre, unittest, optional_t.nonstrict suite "match": - test "upper bound must be exclusive": - check("abc".match(re"abc", endpos = 0) == None[RegexMatch]()) - check("abc".match(re"abc", endpos = 3) != None[RegexMatch]()) + test "upper bound must be inclusive": + check("abc".match(re"abc", endpos = -1) == None[RegexMatch]()) + check("abc".match(re"abc", endpos = 1) == None[RegexMatch]()) + check("abc".match(re"abc", endpos = 2) != None[RegexMatch]()) test "match examples": check("abc".match(re"(\w)").captures[0] == "a") check("abc".match(re"(?\w)").captures["letter"] == "a") check("abc".match(re"(\w)\w").captures[-1] == "ab") - check("abc".match(re"(\w)").captureBounds[0].get == 0..1) - check("abc".match(re"").captureBounds[-1].get == 0..0) - check("abc".match(re"abc").captureBounds[-1].get == 0..3) + check("abc".match(re"(\w)").captureBounds[0].get == 0 .. 0) + check("abc".match(re"").captureBounds[-1].get == 0 .. -1) + check("abc".match(re"abc").captureBounds[-1].get == 0 .. 2) test "match test cases": - check("123".match(re"").matchBounds == 0..0) + check("123".match(re"").matchBounds == 0 .. -1)