Fix #4305: Make split proc for set[char] consistent

2026-02-13 14:53:46 +00:00 · 2016-06-16 13:40:56 -06:00
parent d8ee2c2409
commit 76f81d4aa4
1 changed files with 63 additions and 70 deletions
--- a/lib/pure/strutils.nim
+++ b/lib/pure/strutils.nim
@@ -341,17 +341,49 @@ proc isNilOrWhitespace*(s: string): bool {.noSideEffect, procvar, rtl, extern: "
    if not c.isSpace():
      return false

+proc substrEq(s: string, pos: int, substr: string): bool =
+  var i = 0
+  var length = substr.len
+  while i < length and s[pos+i] == substr[i]:
+    inc i
+
+  return i == length
+
+# --------- Private templates for different split separators -----------
+
+template stringHasSep(s: string, index: int, seps: set[char]): bool =
+  s[index] in seps
+
+template stringHasSep(s: string, index: int, sep: char): bool =
+  s[index] == sep
+
+template stringHasSep(s: string, index: int, sep: string): bool =
+  s.substrEq(index, sep)
+
+template splitCommon(s, sep, maxsplit, sepLen) =
+  ## Common code for split procedures
+  var last = 0
+  var splits = maxsplit
+
+  if len(s) > 0:
+    while last <= len(s):
+      var first = last
+      while last < len(s) and not stringHasSep(s, last, sep):
+        inc(last)
+      if splits == 0: last = len(s)
+      yield substr(s, first, last-1)
+      if splits == 0: break
+      dec(splits)
+      inc(last, sepLen)
+
 iterator split*(s: string, seps: set[char] = Whitespace,
                maxsplit: int = -1): string =
  ## Splits the string `s` into substrings using a group of separators.
  ##
-  ## Substrings are separated by a substring containing only `seps`. Note
-  ## that whole sequences of characters found in ``seps`` will be counted as
-  ## a single split point and leading/trailing separators will be ignored.
-  ## The following example:
+  ## Substrings are separated by a substring containing only `seps`.
  ##
  ## .. code-block:: nim
-  ##   for word in split("  this is an  example  "):
+  ##   for word in split("this\lis an\texample"):
  ##     writeLine(stdout, word)
  ##
  ## ...generates this output:
@@ -365,7 +397,7 @@ iterator split*(s: string, seps: set[char] = Whitespace,
  ## And the following code:
  ##
  ## .. code-block:: nim
-  ##   for word in split(";;this;is;an;;example;;;", {';'}):
+  ##   for word in split("this:is;an$example", {';', ':', '$'}):
  ##     writeLine(stdout, word)
  ##
  ## ...produces the same output as the first example. The code:
@@ -386,26 +418,13 @@ iterator split*(s: string, seps: set[char] = Whitespace,
  ##   "08"
  ##   "08.398990"
  ##
-  var last = 0
-  var splits = maxsplit
-  assert(not ('\0' in seps))
-  while last < len(s):
-    while s[last] in seps: inc(last)
-    var first = last
-    while last < len(s) and s[last] notin seps: inc(last) # BUGFIX!
-    if first <= last-1:
-      if splits == 0: last = len(s)
-      yield substr(s, first, last-1)
-      if splits == 0: break
-      dec(splits)
+  splitCommon(s, seps, maxsplit, 1)

 iterator split*(s: string, sep: char, maxsplit: int = -1): string =
  ## Splits the string `s` into substrings using a single separator.
  ##
  ## Substrings are separated by the character `sep`.
-  ## Unlike the version of the iterator which accepts a set of separator
-  ## characters, this proc will not coalesce groups of the
-  ## separator, returning a string for each found character. The code:
+  ## The code:
  ##
  ## .. code-block:: nim
  ##   for word in split(";;this;is;an;;example;;;", ';'):
@@ -425,56 +444,27 @@ iterator split*(s: string, sep: char, maxsplit: int = -1): string =
  ##   ""
  ##   ""
  ##
-  var last = 0
-  var splits = maxsplit
-  assert('\0' != sep)
-  if len(s) > 0:
-    # `<=` is correct here for the edge cases!
-    while last <= len(s):
-      var first = last
-      while last < len(s) and s[last] != sep: inc(last)
-      if splits == 0: last = len(s)
-      yield substr(s, first, last-1)
-      if splits == 0: break
-      dec(splits)
-      inc(last)
-
-proc substrEq(s: string, pos: int, substr: string): bool =
-  var i = 0
-  var length = substr.len
-  while i < length and s[pos+i] == substr[i]:
-    inc i
-
-  return i == length
+  splitCommon(s, sep, maxsplit, 1)

 iterator split*(s: string, sep: string, maxsplit: int = -1): string =
  ## Splits the string `s` into substrings using a string separator.
  ##
  ## Substrings are separated by the string `sep`.
-  var last = 0
-  var splits = maxsplit
+  ## The code:
+  ##
+  ## .. code-block:: nim
+  ##   for word in split("thisDATAisDATAcorrupted", "DATA"):
+  ##     writeLine(stdout, word)
+  ##
+  ## Results in:
+  ##
+  ## .. code-block::
+  ##   "this"
+  ##   "is"
+  ##   "corrupted"
+  ##

-  if len(s) > 0:
-    while last <= len(s):
-      var first = last
-      while last < len(s) and not s.substrEq(last, sep):
-        inc(last)
-      if splits == 0: last = len(s)
-      yield substr(s, first, last-1)
-      if splits == 0: break
-      dec(splits)
-      inc(last, sep.len)
-
-# --------- Private templates for different rsplit separators -----------
-
-template stringHasSep(s: string, index: int, seps: set[char]): bool =
-  s[index] in seps
-
-template stringHasSep(s: string, index: int, sep: char): bool =
-  s[index] == sep
-
-template stringHasSep(s: string, index: int, sep: string): bool =
-  s.substrEq(index, sep)
+  splitCommon(s, sep, maxsplit, sep.len)

 template rsplitCommon(s, sep, maxsplit, sepLen) =
  ## Common code for rsplit functions
@@ -2244,11 +2234,14 @@ bar
    bar
  """.unindent() == "foo\nfoo\nbar\n"

-  let s = " this   is     an example   "
-  doAssert s.split() == @["this", "is", "an", "example"]
-  doAssert s.split(maxsplit=4) == @["this", "is", "an", "example"]
-  doAssert s.split(' ', maxsplit=4) == @["", "this", "", "", "is     an example   "]
-  doAssert s.split(" ", maxsplit=4) == @["", "this", "", "", "is     an example   "]
+  let s = " this is an example  "
+  let s2 = ":this;is;an:example;;"
+
+  doAssert s.split() == @["", "this", "is", "an", "example", "", ""]
+  doAssert s2.split(seps={':', ';'}) == @["", "this", "is", "an", "example", "", ""]
+  doAssert s.split(maxsplit=4) == @["", "this", "is", "an", "example  "]
+  doAssert s.split(' ', maxsplit=1) == @["", "this is an example  "]
+  doAssert s.split(" ", maxsplit=4) == @["", "this", "is", "an", "example  "]

  block: # formatEng tests
    doAssert formatEng(0, 2, trim=false) == "0.00"