Merge pull request #4361 from jyapayne/fix_split

Fix #4305: Make split proc for set[char] consistent
2026-02-25 04:15:09 +00:00 · 2016-07-02 17:55:57 +02:00
parent 97b59506d6 79a8a5ee72
commit b56e5e159a
3 changed files with 95 additions and 71 deletions
--- a/lib/pure/strutils.nim
+++ b/lib/pure/strutils.nim
@@ -26,6 +26,12 @@ include "system/inclrtl"

 {.pop.}

+# Support old split with set[char]
+when defined(nimOldSplit):
+  {.pragma: deprecatedSplit, deprecated.}
+else:
+  {.pragma: deprecatedSplit.}
+
 type
  CharSet* {.deprecated.} = set[char] # for compatibility with Nim
 {.deprecated: [TCharSet: CharSet].}
@@ -472,17 +478,65 @@ proc isNilOrWhitespace*(s: string): bool {.noSideEffect, procvar, rtl, extern: "
    if not c.isSpace():
      return false

+proc substrEq(s: string, pos: int, substr: string): bool =
+  var i = 0
+  var length = substr.len
+  while i < length and s[pos+i] == substr[i]:
+    inc i
+
+  return i == length
+
+# --------- Private templates for different split separators -----------
+
+template stringHasSep(s: string, index: int, seps: set[char]): bool =
+  s[index] in seps
+
+template stringHasSep(s: string, index: int, sep: char): bool =
+  s[index] == sep
+
+template stringHasSep(s: string, index: int, sep: string): bool =
+  s.substrEq(index, sep)
+
+template splitCommon(s, sep, maxsplit, sepLen) =
+  ## Common code for split procedures
+  var last = 0
+  var splits = maxsplit
+
+  if len(s) > 0:
+    while last <= len(s):
+      var first = last
+      while last < len(s) and not stringHasSep(s, last, sep):
+        inc(last)
+      if splits == 0: last = len(s)
+      yield substr(s, first, last-1)
+      if splits == 0: break
+      dec(splits)
+      inc(last, sepLen)
+
+when defined(nimOldSplit):
+  template oldSplit(s, seps, maxsplit) =
+    ## Deprecated split[char] for transition period
+    var last = 0
+    var splits = maxsplit
+    assert(not ('\0' in seps))
+    while last < len(s):
+      while s[last] in seps: inc(last)
+      var first = last
+      while last < len(s) and s[last] notin seps: inc(last) # BUGFIX!
+      if first <= last-1:
+        if splits == 0: last = len(s)
+        yield substr(s, first, last-1)
+        if splits == 0: break
+        dec(splits)
+
 iterator split*(s: string, seps: set[char] = Whitespace,
                maxsplit: int = -1): string =
  ## Splits the string `s` into substrings using a group of separators.
  ##
-  ## Substrings are separated by a substring containing only `seps`. Note
-  ## that whole sequences of characters found in ``seps`` will be counted as
-  ## a single split point and leading/trailing separators will be ignored.
-  ## The following example:
+  ## Substrings are separated by a substring containing only `seps`.
  ##
  ## .. code-block:: nim
-  ##   for word in split("  this is an  example  "):
+  ##   for word in split("this\lis an\texample"):
  ##     writeLine(stdout, word)
  ##
  ## ...generates this output:
@@ -496,7 +550,7 @@ iterator split*(s: string, seps: set[char] = Whitespace,
  ## And the following code:
  ##
  ## .. code-block:: nim
-  ##   for word in split(";;this;is;an;;example;;;", {';'}):
+  ##   for word in split("this:is;an$example", {';', ':', '$'}):
  ##     writeLine(stdout, word)
  ##
  ## ...produces the same output as the first example. The code:
@@ -517,26 +571,16 @@ iterator split*(s: string, seps: set[char] = Whitespace,
  ##   "08"
  ##   "08.398990"
  ##
-  var last = 0
-  var splits = maxsplit
-  assert(not ('\0' in seps))
-  while last < len(s):
-    while s[last] in seps: inc(last)
-    var first = last
-    while last < len(s) and s[last] notin seps: inc(last) # BUGFIX!
-    if first <= last-1:
-      if splits == 0: last = len(s)
-      yield substr(s, first, last-1)
-      if splits == 0: break
-      dec(splits)
+  when defined(nimOldSplit):
+    oldSplit(s, seps, maxsplit)
+  else:
+    splitCommon(s, seps, maxsplit, 1)

 iterator split*(s: string, sep: char, maxsplit: int = -1): string =
  ## Splits the string `s` into substrings using a single separator.
  ##
  ## Substrings are separated by the character `sep`.
-  ## Unlike the version of the iterator which accepts a set of separator
-  ## characters, this proc will not coalesce groups of the
-  ## separator, returning a string for each found character. The code:
+  ## The code:
  ##
  ## .. code-block:: nim
  ##   for word in split(";;this;is;an;;example;;;", ';'):
@@ -556,56 +600,27 @@ iterator split*(s: string, sep: char, maxsplit: int = -1): string =
  ##   ""
  ##   ""
  ##
-  var last = 0
-  var splits = maxsplit
-  assert('\0' != sep)
-  if len(s) > 0:
-    # `<=` is correct here for the edge cases!
-    while last <= len(s):
-      var first = last
-      while last < len(s) and s[last] != sep: inc(last)
-      if splits == 0: last = len(s)
-      yield substr(s, first, last-1)
-      if splits == 0: break
-      dec(splits)
-      inc(last)
-
-proc substrEq(s: string, pos: int, substr: string): bool =
-  var i = 0
-  var length = substr.len
-  while i < length and s[pos+i] == substr[i]:
-    inc i
-
-  return i == length
+  splitCommon(s, sep, maxsplit, 1)

 iterator split*(s: string, sep: string, maxsplit: int = -1): string =
  ## Splits the string `s` into substrings using a string separator.
  ##
  ## Substrings are separated by the string `sep`.
-  var last = 0
-  var splits = maxsplit
+  ## The code:
+  ##
+  ## .. code-block:: nim
+  ##   for word in split("thisDATAisDATAcorrupted", "DATA"):
+  ##     writeLine(stdout, word)
+  ##
+  ## Results in:
+  ##
+  ## .. code-block::
+  ##   "this"
+  ##   "is"
+  ##   "corrupted"
+  ##

-  if len(s) > 0:
-    while last <= len(s):
-      var first = last
-      while last < len(s) and not s.substrEq(last, sep):
-        inc(last)
-      if splits == 0: last = len(s)
-      yield substr(s, first, last-1)
-      if splits == 0: break
-      dec(splits)
-      inc(last, sep.len)
-
-# --------- Private templates for different rsplit separators -----------
-
-template stringHasSep(s: string, index: int, seps: set[char]): bool =
-  s[index] in seps
-
-template stringHasSep(s: string, index: int, sep: char): bool =
-  s[index] == sep
-
-template stringHasSep(s: string, index: int, sep: string): bool =
-  s.substrEq(index, sep)
+  splitCommon(s, sep, maxsplit, sep.len)

 template rsplitCommon(s, sep, maxsplit, sepLen) =
  ## Common code for rsplit functions
@@ -2375,11 +2390,14 @@ bar
    bar
  """.unindent() == "foo\nfoo\nbar\n"

-  let s = " this   is     an example   "
-  doAssert s.split() == @["this", "is", "an", "example"]
-  doAssert s.split(maxsplit=4) == @["this", "is", "an", "example"]
-  doAssert s.split(' ', maxsplit=4) == @["", "this", "", "", "is     an example   "]
-  doAssert s.split(" ", maxsplit=4) == @["", "this", "", "", "is     an example   "]
+  let s = " this is an example  "
+  let s2 = ":this;is;an:example;;"
+
+  doAssert s.split() == @["", "this", "is", "an", "example", "", ""]
+  doAssert s2.split(seps={':', ';'}) == @["", "this", "is", "an", "example", "", ""]
+  doAssert s.split(maxsplit=4) == @["", "this", "is", "an", "example  "]
+  doAssert s.split(' ', maxsplit=1) == @["", "this is an example  "]
+  doAssert s.split(" ", maxsplit=4) == @["", "this", "is", "an", "example  "]

  block: # formatEng tests
    doAssert formatEng(0, 2, trim=false) == "0.00"
--- a/tests/stdlib/tsplit.nim
+++ b/tests/stdlib/tsplit.nim
@@ -9,7 +9,7 @@ for w in split("|abc|xy|z", {'|'}):
  s.add("#")
  s.add(w)

-if s == "#abc#xy#z":
+if s == "##abc#xy#z":
  echo "true"
 else:
  echo "false"
--- a/web/news/version_0_15_released.rst
+++ b/web/news/version_0_15_released.rst
@@ -13,6 +13,12 @@ Changes affecting backwards compatibility
 - De-deprecated ``re.nim`` because we have too much code using it
  and it got the basic API right.

+- ``split`` with ``set[char]`` as a delimiter in ``strutils.nim``
+  no longer strips and splits characters out of the target string
+  by the entire set of characters. Instead, it now behaves in a
+  similar fashion to ``split`` with ``string`` and ``char``
+  delimiters.
+
 Library Additions
 -----------------