close #14284 document semantics for start for re,nre; improve examples (#14483)

2026-06-07 20:34:21 +00:00 · 2020-05-28 11:43:36 -07:00
parent fe7a2d60f9
commit 17d08ff71c
3 changed files with 175 additions and 196 deletions
--- a/lib/impure/nre.nim
+++ b/lib/impure/nre.nim
@@ -58,6 +58,12 @@ runnableExamples:
    let matchBounds = firstVowel.get().captureBounds[-1]
    doAssert matchBounds.a == 1

+  ## as with module `re`, unless specified otherwise, `start` parameter in each
+  ## proc indicates where the scan starts, but outputs are relative to the start
+  ## of the input string, not to `start`:
+  doAssert find("uxabc", re"(?<=x|y)ab", start = 1).get.captures[-1] == "ab"
+  doAssert find("uxabc", re"ab", start = 3).isNone
+
 from pcre import nil
 import nre/private/util
 import tables
@@ -222,15 +228,15 @@ type
    ## code.

 runnableExamples:
-    # This MUST be kept in sync with the examples in RegexMatch
-    doAssert "abc".match(re"(\w)").get.captures[0] == "a"
-    doAssert "abc".match(re"(?<letter>\w)").get.captures["letter"] == "a"
-    doAssert "abc".match(re"(\w)\w").get.captures[-1] == "ab"
+  # This MUST be kept in sync with the examples in RegexMatch
+  doAssert "abc".match(re"(\w)").get.captures[0] == "a"
+  doAssert "abc".match(re"(?<letter>\w)").get.captures["letter"] == "a"
+  doAssert "abc".match(re"(\w)\w").get.captures[-1] == "ab"

-    doAssert "abc".match(re"(\w)").get.captureBounds[0] == 0 .. 0
-    doAssert 0 in "abc".match(re"(\w)").get.captureBounds == true
-    doAssert "abc".match(re"").get.captureBounds[-1] == 0 .. -1
-    doAssert "abc".match(re"abc").get.captureBounds[-1] == 0 .. 2
+  doAssert "abc".match(re"(\w)").get.captureBounds[0] == 0 .. 0
+  doAssert 0 in "abc".match(re"(\w)").get.captureBounds == true
+  doAssert "abc".match(re"").get.captureBounds[-1] == 0 .. -1
+  doAssert "abc".match(re"abc").get.captureBounds[-1] == 0 .. 2


 proc destroyRegex(pattern: Regex) =
@@ -614,9 +620,9 @@ proc contains*(str: string, pattern: Regex, start = 0, endpos = int.high): bool
  ## This function is equivalent to ``isSome(str.find(pattern, start, endpos))``.
  ##
  runnableExamples:
-    doAssert "abc".contains(re"bc") == true
-    doAssert "abc".contains(re"cd") == false
-    doAssert "abc".contains(re"a", start = 1) == false
+    doAssert "abc".contains(re"bc")
+    doAssert not "abc".contains(re"cd")
+    doAssert not "abc".contains(re"a", start = 1)

  return isSome(str.find(pattern, start, endpos))

--- a/lib/impure/re.nim
+++ b/lib/impure/re.nim
@@ -22,6 +22,15 @@ when defined(js):
 ## .. include:: ../../doc/regexprs.txt
 ##

+runnableExamples:
+  ## Unless specified otherwise, `start` parameter in each proc indicates
+  ## where the scan starts, but outputs are relative to the start of the input
+  ## string, not to `start`:
+  doAssert find("uxabc", re"(?<=x|y)ab", start = 1) == 2 # lookbehind assertion
+  doAssert find("uxabc", re"ab", start = 3) == -1 # we're past `start` => not found
+  doAssert not match("xabc", re"^abc$", start = 1)
+    # can't match start of string since we're starting at 1
+
 import
  pcre, strutils, rtarrays

@@ -209,11 +218,8 @@ proc findBounds*(s: string, pattern: Regex,
  ## If it does not match, ``(-1,0)`` is returned.
  ##
  ## Note: there is a speed improvement if the matches do not need to be captured.
-  ##
-  ## Example:
-  ##
-  ## .. code-block:: nim
-  ##   assert findBounds("01234abc89", re"abc") == (5,7)
+  runnableExamples:
+    assert findBounds("01234abc89", re"abc") == (5,7)
  result = findBounds(cstring(s), pattern, start, s.len)

 proc matchOrFind(buf: cstring, pattern: Regex, start, bufSize: int, flags: cint): cint =
@@ -244,12 +250,10 @@ proc matchLen*(s: string, pattern: Regex, start = 0): int {.inline.} =
  ## if there is no match, ``-1`` is returned. Note that a match length
  ## of zero can happen.
  ##
-  ## Example:
-  ##
-  ## .. code-block:: nim
-  ##   echo matchLen("abcdefg", re"cde", 2)  # =>  3
-  ##   echo matchLen("abcdefg", re"abcde")   # =>  5
-  ##   echo matchLen("abcdefg", re"cde")     # => -1
+  runnableExamples:
+    doAssert matchLen("abcdefg", re"cde", 2) == 3
+    doAssert matchLen("abcdefg", re"abcde") == 5
+    doAssert matchLen("abcdefg", re"cde") == -1
  result = matchOrFind(cstring(s), pattern, start.cint, s.len.cint, pcre.ANCHORED)

 proc matchLen*(buf: cstring, pattern: Regex, start = 0, bufSize: int): int {.inline.} =
@@ -269,13 +273,11 @@ proc match*(s: string, pattern: Regex, matches: var openArray[string],
  ## match, nothing is written into ``matches`` and ``false`` is
  ## returned.
  ##
-  ## Example:
-  ##
-  ## .. code-block:: nim
-  ##   var matches: array[2, string]
-  ##   if match("abcdefg", re"c(d)ef(g)", matches, 2):
-  ##     for s in matches:
-  ##       echo s       # => d g
+  runnableExamples:
+    import sequtils
+    var matches: array[2, string]
+    if match("abcdefg", re"c(d)ef(g)", matches, 2):
+      doAssert toSeq(matches) == @["d", "g"]
  result = matchLen(cstring(s), pattern, matches, start, s.len) != -1

 proc match*(buf: cstring, pattern: Regex, matches: var openArray[string],
@@ -327,14 +329,15 @@ proc find*(buf: cstring, pattern: Regex, start = 0, bufSize: int): int =

 proc find*(s: string, pattern: Regex, start = 0): int {.inline.} =
  ## returns the starting position of ``pattern`` in ``s``. If it does not
-  ## match, ``-1`` is returned.
-  ##
-  ## Example:
-  ##
-  ## .. code-block:: nim
-  ##  echo find("abcdefg", re"cde")  # => 2
-  ##  echo find("abcdefg", re"abc")  # => 0
-  ##  echo find("abcdefg", re"zz")  # => -1
+  ## match, ``-1`` is returned. We start the scan at `start`.
+  runnableExamples:
+    doAssert find("abcdefg", re"cde") == 2
+    doAssert find("abcdefg", re"abc") == 0
+    doAssert find("abcdefg", re"zz") == -1 # not found
+    doAssert find("abcdefg", re"cde", start = 2) == 2 # still 2
+    doAssert find("abcdefg", re"cde", start = 3) == -1 # we're past the start position
+    doAssert find("xabc", re"(?<=x|y)abc", start = 1) == 1
+      # lookbehind assertion `(?<=x|y)` can look behind `start`
  result = find(cstring(s), pattern, start, s.len)

 iterator findAll*(s: string, pattern: Regex, start = 0): string =
@@ -389,21 +392,17 @@ when not defined(nimhygiene):
 template `=~` *(s: string, pattern: Regex): untyped =
  ## This calls ``match`` with an implicit declared ``matches`` array that
  ## can be used in the scope of the ``=~`` call:
-  ##
-  ## .. code-block:: nim
-  ##
-  ##   if line =~ re"\s*(\w+)\s*\=\s*(\w+)":
-  ##     # matches a key=value pair:
-  ##     echo("Key: ", matches[0])
-  ##     echo("Value: ", matches[1])
-  ##   elif line =~ re"\s*(\#.*)":
-  ##     # matches a comment
-  ##     # note that the implicit ``matches`` array is different from the
-  ##     # ``matches`` array of the first branch
-  ##     echo("comment: ", matches[0])
-  ##   else:
-  ##     echo("syntax error")
-  ##
+  runnableExamples:
+    proc parse(line: string): string =
+      if line =~ re"\s*(\w+)\s*\=\s*(\w+)": # matches a key=value pair:
+        result = $(matches[0], matches[1])
+      elif line =~ re"\s*(\#.*)": # matches a comment
+        # note that the implicit ``matches`` array is different from 1st branch
+        result = $(matches[0],)
+      else: doAssert false
+      doAssert not declared(matches)
+    doAssert parse("NAME = LENA") == """("NAME", "LENA")"""
+    doAssert parse("   # comment ... ") == """("# comment ... ",)"""
  bind MaxSubpatterns
  when not declaredInScope(matches):
    var matches {.inject.}: array[MaxSubpatterns, string]
@@ -432,17 +431,9 @@ proc endsWith*(s: string, suffix: Regex): bool {.inline.} =
 proc replace*(s: string, sub: Regex, by = ""): string =
  ## Replaces ``sub`` in ``s`` by the string ``by``. Captures cannot be
  ## accessed in ``by``.
-  ##
-  ## Example:
-  ##
-  ## .. code-block:: nim
-  ##   "var1=key; var2=key2".replace(re"(\w+)=(\w+)")
-  ##
-  ## Results in:
-  ##
-  ## .. code-block:: nim
-  ##
-  ##   "; "
+  runnableExamples:
+    doAssert "var1=key; var2=key2".replace(re"(\w+)=(\w+)") == "; "
+    doAssert "var1=key; var2=key2".replace(re"(\w+)=(\w+)", "?") == "?; ?"
  result = ""
  var prev = 0
  while prev < s.len:
@@ -457,17 +448,9 @@ proc replace*(s: string, sub: Regex, by = ""): string =
 proc replacef*(s: string, sub: Regex, by: string): string =
  ## Replaces ``sub`` in ``s`` by the string ``by``. Captures can be accessed in ``by``
  ## with the notation ``$i`` and ``$#`` (see strutils.\`%\`).
-  ##
-  ## Example:
-  ##
-  ## .. code-block:: nim
-  ##   "var1=key; var2=key2".replacef(re"(\w+)=(\w+)", "$1<-$2$2")
-  ##
-  ## Results in:
-  ##
-  ## .. code-block:: nim
-  ##
-  ## "var1<-keykey; var2<-key2key2"
+  runnableExamples:
+    doAssert "var1=key; var2=key2".replacef(re"(\w+)=(\w+)", "$1<-$2$2") ==
+      "var1<-keykey; var2<-key2key2"
  result = ""
  var caps: array[MaxSubpatterns, string]
  var prev = 0
@@ -520,23 +503,10 @@ iterator split*(s: string, sep: Regex; maxsplit = -1): string =
  ##
  ## Substrings are separated by the regular expression ``sep``
  ## (and the portion matched by ``sep`` is not returned).
-  ##
-  ## Example:
-  ##
-  ## .. code-block:: nim
-  ##   for word in split("00232this02939is39an22example111", re"\d+"):
-  ##     writeLine(stdout, word)
-  ##
-  ## Results in:
-  ##
-  ## .. code-block:: nim
-  ##   ""
-  ##   "this"
-  ##   "is"
-  ##   "an"
-  ##   "example"
-  ##   ""
-  ##
+  runnableExamples:
+    import sequtils
+    doAssert toSeq(split("00232this02939is39an22example111", re"\d+")) ==
+      @["", "this", "is", "an", "example", ""]
  var last = 0
  var splits = maxsplit
  var x: int
@@ -576,108 +546,3 @@ proc escapeRe*(s: string): string =
    else:
      result.add("\\x")
      result.add(toHex(ord(c), 2))
-
-when isMainModule:
-  doAssert match("(a b c)", rex"\( .* \)")
-  doAssert match("WHiLe", re("while", {reIgnoreCase}))
-
-  doAssert "0158787".match(re"\d+")
-  doAssert "ABC 0232".match(re"\w+\s+\d+")
-  doAssert "ABC".match(rex"\d+ | \w+")
-
-  {.push warnings:off.}
-  doAssert matchLen("key", re"\b[a-zA-Z_]+[a-zA-Z_0-9]*\b") == 3
-  {.pop.}
-
-  var pattern = re"[a-z0-9]+\s*=\s*[a-z0-9]+"
-  doAssert matchLen("key1=  cal9", pattern) == 11
-
-  doAssert find("_____abc_______", re"abc") == 5
-  doAssert findBounds("_____abc_______", re"abc") == (5,7)
-
-  var matches: array[6, string]
-  if match("abcdefg", re"c(d)ef(g)", matches, 2):
-    doAssert matches[0] == "d"
-    doAssert matches[1] == "g"
-  else:
-    doAssert false
-
-  if "abc" =~ re"(a)bcxyz|(\w+)":
-    doAssert matches[1] == "abc"
-  else:
-    doAssert false
-
-  if "abc" =~ re"(cba)?.*":
-    doAssert matches[0] == ""
-  else: doAssert false
-
-  if "abc" =~ re"().*":
-    doAssert matches[0] == ""
-  else: doAssert false
-
-  doAssert "var1=key; var2=key2".endsWith(re"\w+=\w+")
-  doAssert("var1=key; var2=key2".replacef(re"(\w+)=(\w+)", "$1<-$2$2") ==
-         "var1<-keykey; var2<-key2key2")
-  doAssert("var1=key; var2=key2".replace(re"(\w+)=(\w+)", "$1<-$2$2") ==
-         "$1<-$2$2; $1<-$2$2")
-
-  var accum: seq[string] = @[]
-  for word in split("00232this02939is39an22example111", re"\d+"):
-    accum.add(word)
-  doAssert(accum == @["", "this", "is", "an", "example", ""])
-
-  accum = @[]
-  for word in split("00232this02939is39an22example111", re"\d+", maxsplit=2):
-    accum.add(word)
-  doAssert(accum == @["", "this", "is39an22example111"])
-
-  accum = @[]
-  for word in split("AAA :   : BBB", re"\s*:\s*"):
-    accum.add(word)
-  doAssert(accum == @["AAA", "", "BBB"])
-
-  doAssert(split("abc", re"") == @["a", "b", "c"])
-  doAssert(split("", re"") == @[])
-
-  doAssert(split("a;b;c", re";") == @["a", "b", "c"])
-  doAssert(split(";a;b;c", re";") == @["", "a", "b", "c"])
-  doAssert(split(";a;b;c;", re";") == @["", "a", "b", "c", ""])
-  doAssert(split("a;b;c;", re";") == @["a", "b", "c", ""])
-  doAssert(split("00232this02939is39an22example111", re"\d+", maxsplit=2) == @["", "this", "is39an22example111"])
-
-
-  for x in findAll("abcdef", re"^{.}", 3):
-    doAssert x == "d"
-  accum = @[]
-  for x in findAll("abcdef", re".", 3):
-    accum.add(x)
-  doAssert(accum == @["d", "e", "f"])
-
-  doAssert("XYZ".find(re"^\d*") == 0)
-  doAssert("XYZ".match(re"^\d*") == true)
-
-  block:
-    var matches: array[16, string]
-    if match("abcdefghijklmnop", re"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)", matches):
-      for i in 0..matches.high:
-        doAssert matches[i] == $chr(i + 'a'.ord)
-    else:
-      doAssert false
-
-  block:   # Buffer based RE
-    var cs: cstring = "_____abc_______"
-    doAssert(cs.find(re"abc", bufSize=15) == 5)
-    doAssert(cs.matchLen(re"_*abc", bufSize=15) == 8)
-    doAssert(cs.matchLen(re"abc", start=5, bufSize=15) == 3)
-    doAssert(cs.matchLen(re"abc", start=5, bufSize=7) == -1)
-    doAssert(cs.matchLen(re"abc_*", start=5, bufSize=10) == 5)
-    var accum: seq[string] = @[]
-    for x in cs.findAll(re"[a-z]", start=3, bufSize=15):
-      accum.add($x)
-    doAssert(accum == @["a","b","c"])
-
-  block:
-    # bug #9306
-    doAssert replace("bar", re"^", "foo") == "foobar"
-    doAssert replace("foo", re"", "-") == "-foo"
-    doAssert replace("foo", re"$", "bar") == "foobar"
--- a/tests/stdlib/tre.nim
+++ b/tests/stdlib/tre.nim
@@ -0,0 +1,108 @@
+import std/re
+
+proc testAll() =
+  doAssert match("(a b c)", rex"\( .* \)")
+  doAssert match("WHiLe", re("while", {reIgnoreCase}))
+
+  doAssert "0158787".match(re"\d+")
+  doAssert "ABC 0232".match(re"\w+\s+\d+")
+  doAssert "ABC".match(rex"\d+ | \w+")
+
+  {.push warnings:off.}
+  doAssert matchLen("key", re"\b[a-zA-Z_]+[a-zA-Z_0-9]*\b") == 3
+  {.pop.}
+
+  var pattern = re"[a-z0-9]+\s*=\s*[a-z0-9]+"
+  doAssert matchLen("key1=  cal9", pattern) == 11
+
+  doAssert find("_____abc_______", re"abc") == 5
+  doAssert findBounds("_____abc_______", re"abc") == (5,7)
+
+  var matches: array[6, string]
+  if match("abcdefg", re"c(d)ef(g)", matches, 2):
+    doAssert matches[0] == "d"
+    doAssert matches[1] == "g"
+  else:
+    doAssert false
+
+  if "abc" =~ re"(a)bcxyz|(\w+)":
+    doAssert matches[1] == "abc"
+  else:
+    doAssert false
+
+  if "abc" =~ re"(cba)?.*":
+    doAssert matches[0] == ""
+  else: doAssert false
+
+  if "abc" =~ re"().*":
+    doAssert matches[0] == ""
+  else: doAssert false
+
+  doAssert "var1=key; var2=key2".endsWith(re"\w+=\w+")
+  doAssert("var1=key; var2=key2".replacef(re"(\w+)=(\w+)", "$1<-$2$2") ==
+         "var1<-keykey; var2<-key2key2")
+  doAssert("var1=key; var2=key2".replace(re"(\w+)=(\w+)", "$1<-$2$2") ==
+         "$1<-$2$2; $1<-$2$2")
+
+  var accum: seq[string] = @[]
+  for word in split("00232this02939is39an22example111", re"\d+"):
+    accum.add(word)
+  doAssert(accum == @["", "this", "is", "an", "example", ""])
+
+  accum = @[]
+  for word in split("00232this02939is39an22example111", re"\d+", maxsplit=2):
+    accum.add(word)
+  doAssert(accum == @["", "this", "is39an22example111"])
+
+  accum = @[]
+  for word in split("AAA :   : BBB", re"\s*:\s*"):
+    accum.add(word)
+  doAssert(accum == @["AAA", "", "BBB"])
+
+  doAssert(split("abc", re"") == @["a", "b", "c"])
+  doAssert(split("", re"") == @[])
+
+  doAssert(split("a;b;c", re";") == @["a", "b", "c"])
+  doAssert(split(";a;b;c", re";") == @["", "a", "b", "c"])
+  doAssert(split(";a;b;c;", re";") == @["", "a", "b", "c", ""])
+  doAssert(split("a;b;c;", re";") == @["a", "b", "c", ""])
+  doAssert(split("00232this02939is39an22example111", re"\d+", maxsplit=2) == @["", "this", "is39an22example111"])
+
+
+  for x in findAll("abcdef", re"^{.}", 3):
+    doAssert x == "d"
+  accum = @[]
+  for x in findAll("abcdef", re".", 3):
+    accum.add(x)
+  doAssert(accum == @["d", "e", "f"])
+
+  doAssert("XYZ".find(re"^\d*") == 0)
+  doAssert("XYZ".match(re"^\d*") == true)
+
+  block:
+    var matches: array[16, string]
+    if match("abcdefghijklmnop", re"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)", matches):
+      for i in 0..matches.high:
+        doAssert matches[i] == $chr(i + 'a'.ord)
+    else:
+      doAssert false
+
+  block:   # Buffer based RE
+    var cs: cstring = "_____abc_______"
+    doAssert(cs.find(re"abc", bufSize=15) == 5)
+    doAssert(cs.matchLen(re"_*abc", bufSize=15) == 8)
+    doAssert(cs.matchLen(re"abc", start=5, bufSize=15) == 3)
+    doAssert(cs.matchLen(re"abc", start=5, bufSize=7) == -1)
+    doAssert(cs.matchLen(re"abc_*", start=5, bufSize=10) == 5)
+    var accum: seq[string] = @[]
+    for x in cs.findAll(re"[a-z]", start=3, bufSize=15):
+      accum.add($x)
+    doAssert(accum == @["a","b","c"])
+
+  block:
+    # bug #9306
+    doAssert replace("bar", re"^", "foo") == "foobar"
+    doAssert replace("foo", re"", "-") == "-foo"
+    doAssert replace("foo", re"$", "bar") == "foobar"
+
+testAll()