make re.split consistent with strutils.split and other programming languages; refs #7278

2026-02-18 17:08:32 +00:00 · 2018-03-05 21:39:13 +01:00
parent 5c8332d871
commit e2094bc6f4
2 changed files with 33 additions and 13 deletions
--- a/changelog.md
+++ b/changelog.md
@@ -4,10 +4,16 @@

 #### Breaking changes in the standard library

+- ``re.split`` for empty regular expressions now yields every character in
+  the string, which is what other programming languages chose to do.
+
 #### Breaking changes in the compiler

 ### Library additions

+- ``re.split`` now also supports the ``maxsplit`` parameter for consistency
+  with ``strutils.split``.
+
 ### Library changes

 ### Language additions
--- a/lib/impure/re.nim
+++ b/lib/impure/re.nim
@@ -498,7 +498,7 @@ proc transformFile*(infile, outfile: string,
  var x = readFile(infile).string
  writeFile(outfile, x.multiReplace(subs))

-iterator split*(s: string, sep: Regex): string =
+iterator split*(s: string, sep: Regex; maxsplit = -1): string =
  ## Splits the string ``s`` into substrings.
  ##
  ## Substrings are separated by the regular expression ``sep``
@@ -520,22 +520,28 @@ iterator split*(s: string, sep: Regex): string =
  ##   "example"
  ##   ""
  ##
-  var
-    first = -1
-    last = -1
-  while last < len(s):
-    var x = matchLen(s, sep, last)
-    if x > 0: inc(last, x)
-    first = last
-    if x == 0: inc(last)
+  var last = 0
+  var splits = maxsplit
+  var x: int
+  while last <= len(s):
+    var first = last
+    var sepLen = 1
    while last < len(s):
      x = matchLen(s, sep, last)
-      if x >= 0: break
+      if x >= 0:
+        sepLen = x
+        break
      inc(last)
-    if first <= last:
-      yield substr(s, first, last-1)
+    if x == 0:
+      if last >= len(s): break
+      inc last
+    if splits == 0: last = len(s)
+    yield substr(s, first, last-1)
+    if splits == 0: break
+    dec(splits)
+    inc(last, sepLen)

-proc split*(s: string, sep: Regex): seq[string] {.inline.} =
+proc split*(s: string, sep: Regex, maxsplit = -1): seq[string] {.inline.} =
  ## Splits the string ``s`` into a seq of substrings.
  ##
  ## The portion matched by ``sep`` is not returned.
@@ -632,6 +638,14 @@ when isMainModule:
    accum.add(word)
  doAssert(accum == @["AAA", "", "BBB"])

+  doAssert(split("abc", re"") == @["a", "b", "c"])
+  doAssert(split("", re"") == @[])
+
+  doAssert(split("a;b;c", re";") == @["a", "b", "c"])
+  doAssert(split(";a;b;c", re";") == @["", "a", "b", "c"])
+  doAssert(split(";a;b;c;", re";") == @["", "a", "b", "c", ""])
+  doAssert(split("a;b;c;", re";") == @["a", "b", "c", ""])
+
  for x in findAll("abcdef", re"^{.}", 3):
    doAssert x == "d"
  accum = @[]