fixes #5444 - nre.findIter keeps searching when no match is possible (#5453)

2026-06-03 02:18:00 +00:00 · 2017-03-02 11:48:41 +01:00
parent 32159ee827
commit 34a3d40d18
2 changed files with 25 additions and 6 deletions
--- a/lib/impure/nre.nim
+++ b/lib/impure/nre.nim
@@ -516,23 +516,23 @@ iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): R
  let unicode = uint32(getinfo[culong](pattern, pcre.INFO_OPTIONS) and
    pcre.UTF8) > 0u32
  let strlen = if endpos == int.high: str.len else: endpos+1
-
  var offset = start
  var match: Option[RegexMatch]
+  var neverMatched = true
+
  while true:
    var flags = 0
-
    if match.isSome and
       match.get.matchBounds.a > match.get.matchBounds.b:
      # 0-len match
      flags = pcre.NOTEMPTY_ATSTART
-
    match = str.matchImpl(pattern, offset, endpos, flags)

    if match.isNone:
      # either the end of the input or the string
-      # cannot be split here
-      if offset >= strlen:
+      # cannot be split here - we also need to bail if nothing
+      # has matched yet, i.e. the very first search already failed
+      if offset >= strlen or neverMatched:
        break

      if matchesCrLf and offset < (str.len - 1) and
@@ -546,11 +546,11 @@ iterator findIter*(str: string, pattern: Regex, start = 0, endpos = int.high): R
      else:
        offset += 1
    else:
+      neverMatched = false
      offset = match.get.matchBounds.b + 1

      yield match.get

-
 proc find*(str: string, pattern: Regex, start = 0, endpos = int.high): Option[RegexMatch] =
  ## Finds the given pattern in the string between the end and start
  ## positions.
--- a/tests/stdlib/nre/find.nim
+++ b/tests/stdlib/nre/find.nim
@@ -1,6 +1,7 @@
 import unittest, sequtils
 import nre except toSeq
 import optional_nonstrict
+import times, strutils

 suite "find":
  test "find text":
@@ -25,3 +26,21 @@ suite "find":
    check("word word".findAll(re"\b") == @["", "", "", ""])
    check("word\r\lword".findAll(re"(*ANYCRLF)(?m)$") == @["", ""])
    check("слово слово".findAll(re"(*U)\b") == @["", "", "", ""])
+
+  test "bail early":
+    ## Neither search is expected to find anything, and findAll should bail out early,
+    ## so the CPU time spent on the large input must exceed the time spent on the
+    ## small input by less than `tolerance`.
+    const tolerance = 0.0001  # in cpuTime() units (seconds per the times module docs)
+    var smallData = repeat("url.sequence = \"http://whatever.com/jwhrejrhrjrhrjhrrjhrjrhrjrh\"", 10)  # 10 copies
+    var largeData = repeat("url.sequence = \"http://whatever.com/jwhrejrhrjrhrjhrrjhrjrhrjrh\"", 1000000)  # same line, 1,000,000 copies
+    var start = cpuTime()
+    check(largeData.findAll(re"url.*? = &#39;(.*?)&#39;") == newSeq[string]())  # data holds no &#39;, so no match
+    var stop = cpuTime()
+    var elapsedLarge = stop - start  # CPU time for the large search (regex construction included)
+    start = cpuTime()
+    check(smallData.findAll(re"url.*? = &#39;(.*?)&#39;") == newSeq[string]())  # same pattern, small input
+    stop = cpuTime()
+    var elapsedSmall = stop - start  # CPU time for the small search
+    var difference =  elapsedLarge - elapsedSmall  # signed: negative if the large run was faster
+    check(difference < tolerance)