Merge pull request #8 from BlaXpirit/incl-indices

Change capture upper bounds to inclusive
2026-02-15 23:54:19 +00:00 · 2015-04-09 17:15:34 -04:00
parent bdd8567f50 2f0375c4c8
commit 6c213802f0
5 changed files with 49 additions and 49 deletions
--- a/README.asciidoc
+++ b/README.asciidoc
@@ -29,24 +29,24 @@ provides in its standard library is inadequate:
 === Operations

 [[proc-find]]
-==== find(string, Regex, start = 0, endpos = -1): RegexMatch
+==== find(string, Regex, start = 0, endpos = int.high): RegexMatch

 Finds the given pattern in the string between the end and start positions.

 `start` :: The start point at which to start matching. `|abc` is `0`; `a|bc`
   is `1`
-`endpos` :: The maximum index for a match; `-1` means the end of the string,
-   otherwise it's an exclusive upper bound.
+`endpos` :: The maximum index for a match; `int.high` means the end of the
+   string, otherwise it's an inclusive upper bound.

 [[proc-match]]
-==== match(string, Regex, start = 0, endpos = -1): RegexMatch
+==== match(string, Regex, start = 0, endpos = int.high): RegexMatch

 Like link:#proc-find[`find(...)`], but anchored to the start of the string.
 This means that `"foo".match(re"f") == true`, but `"foo".match(re"o") ==
 false`.

 [[iter-find]]
-==== iterator findIter(string, Regex, start = 0, endpos = -1): RegexMatch
+==== iterator findIter(string, Regex, start = 0, endpos = int.high): RegexMatch

 Works the same as link:#proc-find[`find(...)`], but finds every non-overlapping
 match. `"2222".find(re"22")` is `"22", "22"`, not `"22", "22", "22"`.
@@ -118,11 +118,10 @@ at that id. If the value is invalid, then behavior is undefined. If the id is
 - `"abc".match(re"(\w)\w").captures[-1] == "ab"`
 `captureBounds[]: Option[Slice[int]]` :: gets the bounds of the
 given capture according to the same rules as the above. If the capture is not
-filled, then `None` is returned. The upper bound is exclusive, the lower bound
-is inclusive.
- - `"abc".match(re"(\w)").captureBounds[0] == 0..1`
- - `"abc".match(re"").captureBounds[-1] == 0..0`
- - `"abc".match(re"abc").captureBounds[-1] == 0..3`
+filled, then `None` is returned. The bounds are both inclusive.
+ - `"abc".match(re"(\w)").captureBounds[0] == 0 .. 0`
+ - `"abc".match(re"").captureBounds[-1] == 0 .. -1`
+ - `"abc".match(re"abc").captureBounds[-1] == 0 .. 2`
 `match: string` :: the full text of the match.
 `matchBounds: Slice[int]` :: the bounds of the match, as in `captureBounds[]`
 `(captureBounds|captures).toTable` :: returns a table with each named capture
--- a/src/nre.nim
+++ b/src/nre.nim
@@ -97,7 +97,7 @@ proc `[]`*(pattern: CaptureBounds, i: int): Option[Slice[int]] =
  let pattern = RegexMatch(pattern)
  if pattern.pcreMatchBounds[i + 1].a != -1:
    let bounds = pattern.pcreMatchBounds[i + 1]
-    return Some(int(bounds.a) .. int(bounds.b))
+    return Some(int(bounds.a) .. int(bounds.b-1))
  else:
    return None[Slice[int]]()

@@ -111,7 +111,7 @@ proc `[]`*(pattern: Captures, i: int): string =

  if bounds:
    let bounds = bounds.get
-    return pattern.str.substr(bounds.a, bounds.b-1)
+    return pattern.str.substr(bounds.a, bounds.b)
  else:
    return nil

@@ -311,7 +311,7 @@ proc matchImpl(str: string, pattern: Regex, start, endpos: int, flags: int): Opt
  result.pcreMatchBounds = newSeq[Slice[cint]](ceil(vecsize / 2).int)
  result.pcreMatchBounds.setLen(vecsize div 3)

-  let strlen = if endpos == -1: str.len else: endpos
+  let strlen = if endpos == int.high: str.len else: endpos+1

  let execRet = pcre.exec(pattern.pcreObj,
                          pattern.pcreExtra,
@@ -328,14 +328,14 @@ proc matchImpl(str: string, pattern: Regex, start, endpos: int, flags: int): Opt
  else:
    raise newException(AssertionError, "Internal error: errno " & $execRet)

-proc match*(str: string, pattern: Regex, start = 0, endpos = -1): Option[RegexMatch] =
+proc match*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] =
  return str.matchImpl(pattern, start, endpos, pcre.ANCHORED)

-iterator findIter*(str: string, pattern: Regex, start = 0, endpos = -1): RegexMatch =
+iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): RegexMatch =
  # see pcredemo for explaination
  let matchesCrLf = pattern.matchesCrLf()
  let unicode = (getinfo[cint](pattern, pcre.INFO_OPTIONS) and pcre.UTF8) > 0
-  let endpos = if endpos == -1: str.len else: endpos
+  let strlen = if endpos == int.high: str.len else: endpos+1

  var offset = start
  var match: Option[RegexMatch]
@@ -343,7 +343,7 @@ iterator findIter*(str: string, pattern: Regex, start = 0, endpos = -1): RegexMa
    var flags = 0

    if match and
-       match.get.matchBounds.a == match.get.matchBounds.b:
+       match.get.matchBounds.a > match.get.matchBounds.b:
      # 0-len match
      flags = pcre.NOTEMPTY_ATSTART or pcre.ANCHORED

@@ -361,24 +361,24 @@ iterator findIter*(str: string, pattern: Regex, start = 0, endpos = -1): RegexMa
      elif unicode:
        # XXX what about invalid unicode?
        offset += str.runeLenAt(offset)
-        assert(offset <= endpos)
+        assert(offset <= strlen)
    else:
-      offset = match.get.matchBounds.b
+      offset = match.get.matchBounds.b + 1

      yield match.get

-    if offset >= endpos:
+    if offset >= strlen:
      # do while
      break

-proc find*(str: string, pattern: Regex, start = 0, endpos = -1): Option[RegexMatch] =
+proc find*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] =
  ## Returns a `RegexMatch` if there is a match between `start` and `endpos`, otherwise
  ## it returns nil.
  ##
-  ## if `endpos == -1`, then `endpos = str.len`
+  ## if `endpos == int.high`, then `endpos = str.high`
  return str.matchImpl(pattern, start, endpos, 0)

-proc findAll*(str: string, pattern: Regex, start = 0, endpos = -1): seq[string] =
+proc findAll*(str: string, pattern: Regex, start = 0, endpos = int.high): seq[string] =
  result = @[]
  for match in str.findIter(pattern, start, endpos):
    result.add(match.match)
@@ -387,24 +387,24 @@ proc split*(str: string, pattern: Regex, maxSplit = -1, start = 0): seq[string]
  result = @[]
  var lastIdx = start
  var splits = 0
-  var bounds: Slice[int]
+  var bounds = 0 .. -1

  for match in str.findIter(pattern, start = start):
-    # upper bound is exclusive, lower is inclusive:
+    # bounds are inclusive:
    #
    # 0123456
    #  ^^^
-    # (1, 4)
+    # (1, 3)
    bounds = match.matchBounds

    # "12".split("") would be @["", "1", "2"], but
    # if we skip an empty first match, it's the correct
    # @["1", "2"]
-    if bounds.a < bounds.b or bounds.a > start:
+    if bounds.a <= bounds.b or bounds.a > start:
      result.add(str.substr(lastIdx, bounds.a - 1))
      splits += 1

-    lastIdx = bounds.b
+    lastIdx = bounds.b + 1

    for cap in match.captures:
      # if there are captures, include them in the result
@@ -416,11 +416,11 @@ proc split*(str: string, pattern: Regex, maxSplit = -1, start = 0): seq[string]
  # "12".split("\b") would be @["1", "2", ""], but
  # if we skip an empty last match, it's the correct
  # @["1", "2"]
-  if bounds.a < bounds.b or bounds.b < str.len:
+  if bounds.a <= bounds.b or bounds.b < str.high:
    # last match: Each match takes the previous substring,
    # but "1 2".split(/ /) needs to return @["1", "2"].
    # This handles "2"
-    result.add(str.substr(bounds.b, str.len - 1))
+    result.add(str.substr(bounds.b + 1, str.high))

 template replaceImpl(str: string, pattern: Regex,
                     replacement: expr): stmt {.immediate, dirty.} =
@@ -435,7 +435,7 @@ template replaceImpl(str: string, pattern: Regex,
    assert(nextVal != nil)
    result.add(nextVal)

-    lastIdx = bounds.b
+    lastIdx = bounds.b + 1

  result.add(str.substr(lastIdx, str.len - 1))
  return result
--- a/test/captures.nim
+++ b/test/captures.nim
@@ -8,17 +8,17 @@ suite "captures":

  test "capture bounds are correct":
    let ex1 = re("([0-9])")
-    check("1 23".find(ex1).matchBounds == 0 .. 1)
-    check("1 23".find(ex1).captureBounds[0].get == 0 .. 1)
-    check("1 23".find(ex1, 1).matchBounds == 2 .. 3)
-    check("1 23".find(ex1, 3).matchBounds == 3 .. 4)
+    check("1 23".find(ex1).matchBounds == 0 .. 0)
+    check("1 23".find(ex1).captureBounds[0].get == 0 .. 0)
+    check("1 23".find(ex1, 1).matchBounds == 2 .. 2)
+    check("1 23".find(ex1, 3).matchBounds == 3 .. 3)

    let ex2 = re("()()()()()()()()()()([0-9])")
-    check("824".find(ex2).captureBounds[0].get == 0 .. 0)
-    check("824".find(ex2).captureBounds[10].get == 0 .. 1)
+    check("824".find(ex2).captureBounds[0].get == 0 .. -1)
+    check("824".find(ex2).captureBounds[10].get == 0 .. 0)

    let ex3 = re("([0-9]+)")
-    check("824".find(ex3).captureBounds[0].get == 0 .. 3)
+    check("824".find(ex3).captureBounds[0].get == 0 .. 2)

  test "named captures":
    let ex1 = "foobar".find(re("(?<foo>foo)(?<bar>bar)"))
@@ -31,7 +31,7 @@ suite "captures":

  test "named capture bounds":
    let ex1 = "foo".find(re("(?<foo>foo)(?<bar>bar)?"))
-    check(ex1.captureBounds["foo"] == Some(0..3))
+    check(ex1.captureBounds["foo"] == Some(0..2))
    check(ex1.captureBounds["bar"] == None[Slice[int]]())

  test "capture count":
@@ -42,7 +42,7 @@ suite "captures":
  test "named capture table":
    let ex1 = "foo".find(re("(?<foo>foo)(?<bar>bar)?"))
    check(ex1.captures.toTable == {"foo" : "foo", "bar" : nil}.toTable())
-    check(ex1.captureBounds.toTable == {"foo" : Some(0..3), "bar" : None[Slice[int]]()}.toTable())
+    check(ex1.captureBounds.toTable == {"foo" : Some(0..2), "bar" : None[Slice[int]]()}.toTable())
    check(ex1.captures.toTable("") == {"foo" : "foo", "bar" : ""}.toTable())

    let ex2 = "foobar".find(re("(?<foo>foo)(?<bar>bar)?"))
@@ -51,7 +51,7 @@ suite "captures":
  test "capture sequence":
    let ex1 = "foo".find(re("(?<foo>foo)(?<bar>bar)?"))
    check(ex1.captures.toSeq == @["foo", nil])
-    check(ex1.captureBounds.toSeq == @[Some(0..3), None[Slice[int]]()])
+    check(ex1.captureBounds.toSeq == @[Some(0..2), None[Slice[int]]()])
    check(ex1.captures.toSeq("") == @["foo", ""])

    let ex2 = "foobar".find(re("(?<foo>foo)(?<bar>bar)?"))
--- a/test/find.nim
+++ b/test/find.nim
@@ -10,7 +10,7 @@ suite "find":
  test "find bounds":
    check(toSeq(findIter("1 2 3 4 5 ", re" ")).map(
      proc (a: RegexMatch): Slice[int] = a.matchBounds
-    ) == @[1..2, 3..4, 5..6, 7..8, 9..10])
+    ) == @[1..1, 3..3, 5..5, 7..7, 9..9])

  test "overlapping find":
    check("222".findAll(re"22") == @["22"])
--- a/test/match.nim
+++ b/test/match.nim
@@ -1,17 +1,18 @@
 include nre, unittest, optional_t.nonstrict

 suite "match":
-  test "upper bound must be exclusive":
-    check("abc".match(re"abc", endpos = 0) == None[RegexMatch]())
-    check("abc".match(re"abc", endpos = 3) != None[RegexMatch]())
+  test "upper bound must be inclusive":
+    check("abc".match(re"abc", endpos = -1) == None[RegexMatch]())
+    check("abc".match(re"abc", endpos = 1) == None[RegexMatch]())
+    check("abc".match(re"abc", endpos = 2) != None[RegexMatch]())

  test "match examples":
    check("abc".match(re"(\w)").captures[0] == "a")
    check("abc".match(re"(?<letter>\w)").captures["letter"] == "a")
    check("abc".match(re"(\w)\w").captures[-1] == "ab")
-    check("abc".match(re"(\w)").captureBounds[0].get == 0..1)
-    check("abc".match(re"").captureBounds[-1].get == 0..0)
-    check("abc".match(re"abc").captureBounds[-1].get == 0..3)
+    check("abc".match(re"(\w)").captureBounds[0].get == 0 .. 0)
+    check("abc".match(re"").captureBounds[-1].get == 0 .. -1)
+    check("abc".match(re"abc").captureBounds[-1].get == 0 .. 2)

  test "match test cases":
-    check("123".match(re"").matchBounds == 0..0)
+    check("123".match(re"").matchBounds == 0 .. -1)