From 5e529b3bfa7b3ca09cde7237134acd47e47ca48c Mon Sep 17 00:00:00 2001 From: Zoom Date: Wed, 21 Jun 2023 06:52:33 +0000 Subject: [PATCH] `strutils.split/rsplit` now return src on an empty sep (#22136) This is a rebase of an earlier rejected PR. Following the discussion around it, this commit provides a valid output for and edge case of an empty separator for `split` and `rsplit` routines. The empty separator is interpreted as "split by no separators" and the initial string is returned. This is consistent with the behaviour of the `set[char]` version of `split`/`rsplit` routines and unifies them all. Compared to a commit merged earlier, this one has a benefit of not using assertions that will be removed in release builds and thus still not preventing possible infinite loops (which was the earlier behaviour for this edge case for separator of type `string`). Co-authored-by: Andreas Rumpf --- changelogs/changelog_2_0_0.md | 3 +- lib/pure/strutils.nim | 60 ++++++++++++++++++++------------ tests/js/tstdlib_various.nim | 7 +--- tests/stdlib/tstdlib_various.nim | 7 +--- tests/stdlib/tstrutils.nim | 16 +++++---- 5 files changed, 50 insertions(+), 43 deletions(-) diff --git a/changelogs/changelog_2_0_0.md b/changelogs/changelog_2_0_0.md index b56a3e71c7..4a0d505c8d 100644 --- a/changelogs/changelog_2_0_0.md +++ b/changelogs/changelog_2_0_0.md @@ -256,8 +256,6 @@ declared when they are not available on the backend. Previously it would call `doAssert false` at runtime despite the condition being compile-time. -- `strutils.split` and `strutils.rsplit` now forbid an empty separator. - - Custom destructors now supports non-var parameters, e.g. `proc =destroy[T: object](x: T)` is valid. `proc =destroy[T: object](x: var T)` is deprecated. - Relative imports will not resolve to searched paths anymore, e.g. `import ./tables` now reports an error properly. @@ -275,6 +273,7 @@ - Changed `mimedb` to use an `OrderedTable` instead of `OrderedTableRef` to support `const` tables. - `strutils.find` now uses and defaults to `last = -1` for whole string searches, making limiting it to just the first char (`last = 0`) valid. +- `strutils.split` and `strutils.rsplit` now return a source string as a single element for an empty separator. - `random.rand` now works with `Ordinal`s. - Undeprecated `os.isvalidfilename`. - `std/oids` now uses `int64` to store time internally (before it was int32). diff --git a/lib/pure/strutils.nim b/lib/pure/strutils.nim index fe9f2a8c3d..6c1e495648 100644 --- a/lib/pure/strutils.nim +++ b/lib/pure/strutils.nim @@ -366,11 +366,14 @@ func cmpIgnoreStyle*(a, b: string): int {.rtl, extern: "nsuCmpIgnoreStyle".} = # --------- Private templates for different split separators ----------- func substrEq(s: string, pos: int, substr: string): bool = - var i = 0 + # Always returns false for empty `substr` var length = substr.len - while i < length and pos+i < s.len and s[pos+i] == substr[i]: - inc i - return i == length + if length > 0: + var i = 0 + while i < length and pos+i < s.len and s[pos+i] == substr[i]: + inc i + i == length + else: false template stringHasSep(s: string, index: int, seps: set[char]): bool = s[index] in seps @@ -487,15 +490,15 @@ iterator split*(s: string, seps: set[char] = Whitespace, ## "22" ## "08" ## "08.398990" - ## - ## .. warning:: `seps` should not be empty. + ## + ## .. note:: Empty separator set results in returning an original string, + ## following the interpretation "split by no element". ## ## See also: ## * `rsplit iterator<#rsplit.i,string,set[char],int>`_ ## * `splitLines iterator<#splitLines.i,string>`_ ## * `splitWhitespace iterator<#splitWhitespace.i,string,int>`_ ## * `split func<#split,string,set[char],int>`_ - assert seps.card > 0, "Empty separator" splitCommon(s, seps, maxsplit, 1) iterator split*(s: string, sep: string, maxsplit: int = -1): string = @@ -515,15 +518,17 @@ iterator split*(s: string, sep: string, maxsplit: int = -1): string = ## "is" ## "corrupted" ## - ## .. warning:: `sep` should not be empty. + ## .. note:: Empty separator string results in returning an original string, + ## following the interpretation "split by no element". ## ## See also: ## * `rsplit iterator<#rsplit.i,string,string,int,bool>`_ ## * `splitLines iterator<#splitLines.i,string>`_ ## * `splitWhitespace iterator<#splitWhitespace.i,string,int>`_ ## * `split func<#split,string,string,int>`_ - assert sep.len > 0, "Empty separator" - splitCommon(s, sep, maxsplit, sep.len) + let sepLen = if sep.len == 0: 1 # prevents infinite loop + else: sep.len + splitCommon(s, sep, maxsplit, sepLen) template rsplitCommon(s, sep, maxsplit, sepLen) = @@ -591,16 +596,16 @@ iterator rsplit*(s: string, seps: set[char] = Whitespace, ## "bar" ## "foo" ## - ## Substrings are separated from the right by the set of chars `seps`. - ## - ## .. warning:: `seps` should not be empty. + ## Substrings are separated from the right by the set of chars `seps` + ## + ## .. note:: Empty separator set results in returning an original string, + ## following the interpretation "split by no element". ## ## See also: ## * `split iterator<#split.i,string,set[char],int>`_ ## * `splitLines iterator<#splitLines.i,string>`_ ## * `splitWhitespace iterator<#splitWhitespace.i,string,int>`_ ## * `rsplit func<#rsplit,string,set[char],int>`_ - assert seps.card > 0, "Empty separator" rsplitCommon(s, seps, maxsplit, 1) iterator rsplit*(s: string, sep: string, maxsplit: int = -1, @@ -619,17 +624,19 @@ iterator rsplit*(s: string, sep: string, maxsplit: int = -1, ## "bar" ## "foo" ## - ## Substrings are separated from the right by the string `sep`. - ## - ## .. warning:: `sep` should not be empty. + ## Substrings are separated from the right by the string `sep` + ## + ## .. note:: Empty separator string results in returning an original string, + ## following the interpretation "split by no element". ## ## See also: ## * `split iterator<#split.i,string,string,int>`_ ## * `splitLines iterator<#splitLines.i,string>`_ ## * `splitWhitespace iterator<#splitWhitespace.i,string,int>`_ ## * `rsplit func<#rsplit,string,string,int>`_ - assert sep.len > 0, "Empty separator" - rsplitCommon(s, sep, maxsplit, sep.len) + let sepLen = if sep.len == 0: 1 # prevents infinite loop + else: sep.len + rsplitCommon(s, sep, maxsplit, sepLen) iterator splitLines*(s: string, keepEol = false): string = ## Splits the string `s` into its containing lines. @@ -740,7 +747,8 @@ func split*(s: string, seps: set[char] = Whitespace, maxsplit: int = -1): seq[ ## The same as the `split iterator <#split.i,string,set[char],int>`_ (see its ## documentation), but is a func that returns a sequence of substrings. ## - ## .. warning:: `seps` should not be empty. + ## .. note:: Empty separator set results in returning an original string, + ## following the interpretation "split by no element". ## ## See also: ## * `split iterator <#split.i,string,set[char],int>`_ @@ -750,6 +758,7 @@ func split*(s: string, seps: set[char] = Whitespace, maxsplit: int = -1): seq[ runnableExamples: doAssert "a,b;c".split({',', ';'}) == @["a", "b", "c"] doAssert "".split({' '}) == @[""] + doAssert "empty seps return unsplit s".split({}) == @["empty seps return unsplit s"] accResult(split(s, seps, maxsplit)) func split*(s: string, sep: string, maxsplit: int = -1): seq[string] {.rtl, @@ -759,7 +768,8 @@ func split*(s: string, sep: string, maxsplit: int = -1): seq[string] {.rtl, ## Substrings are separated by the string `sep`. This is a wrapper around the ## `split iterator <#split.i,string,string,int>`_. ## - ## .. warning:: `sep` should not be empty. + ## .. note:: Empty separator string results in returning an original string, + ## following the interpretation "split by no element". ## ## See also: ## * `split iterator <#split.i,string,string,int>`_ @@ -773,6 +783,7 @@ func split*(s: string, sep: string, maxsplit: int = -1): seq[string] {.rtl, doAssert "a largely spaced sentence".split(" ") == @["a", "", "largely", "", "", "", "spaced", "sentence"] doAssert "a largely spaced sentence".split(" ", maxsplit = 1) == @["a", " largely spaced sentence"] + doAssert "empty sep returns unsplit s".split("") == @["empty sep returns unsplit s"] accResult(split(s, sep, maxsplit)) func rsplit*(s: string, sep: char, maxsplit: int = -1): seq[string] {.rtl, @@ -822,7 +833,8 @@ func rsplit*(s: string, seps: set[char] = Whitespace, ## .. code-block:: nim ## @["Root#Object#Method", "Index"] ## - ## .. warning:: `seps` should not be empty. + ## .. note:: Empty separator set results in returning an original string, + ## following the interpretation "split by no element". ## ## See also: ## * `rsplit iterator <#rsplit.i,string,set[char],int>`_ @@ -851,7 +863,8 @@ func rsplit*(s: string, sep: string, maxsplit: int = -1): seq[string] {.rtl, ## .. code-block:: nim ## @["Root#Object#Method", "Index"] ## - ## .. warning:: `sep` should not be empty. + ## .. note:: Empty separator string results in returning an original string, + ## following the interpretation "split by no element". ## ## See also: ## * `rsplit iterator <#rsplit.i,string,string,int,bool>`_ @@ -867,6 +880,7 @@ func rsplit*(s: string, sep: string, maxsplit: int = -1): seq[string] {.rtl, doAssert "".rsplit("Elon Musk") == @[""] doAssert "a largely spaced sentence".rsplit(" ") == @["a", "", "largely", "", "", "", "spaced", "sentence"] + doAssert "empty sep returns unsplit s".rsplit("") == @["empty sep returns unsplit s"] accResult(rsplit(s, sep, maxsplit)) result.reverse() diff --git a/tests/js/tstdlib_various.nim b/tests/js/tstdlib_various.nim index 4b5ce1de85..1e584f735e 100644 --- a/tests/js/tstdlib_various.nim +++ b/tests/js/tstdlib_various.nim @@ -150,12 +150,7 @@ block tsplit2: s.add("#") s.add(w) - var errored = false - try: - discard "hello".split("") - except AssertionDefect: - errored = true - doAssert errored + doAssert "true".split("") == @["true"] block txmlgen: var nim = "Nim" diff --git a/tests/stdlib/tstdlib_various.nim b/tests/stdlib/tstdlib_various.nim index ce9c9a7c57..bac5018fae 100644 --- a/tests/stdlib/tstdlib_various.nim +++ b/tests/stdlib/tstdlib_various.nim @@ -27,7 +27,6 @@ Hi Andreas! How do you feel, Rumpf? [2, 3, 4, 5] [2, 3, 4, 5, 6] [1, 2, 3, 4, 5, 6] -true

Nim

''' """ @@ -207,11 +206,7 @@ block tsplit2: s.add("#") s.add(w) - try: - discard "hello".split("") - echo "false" - except AssertionDefect: - echo "true" + doAssert "true".split("") == @["true"] diff --git a/tests/stdlib/tstrutils.nim b/tests/stdlib/tstrutils.nim index d53e9d8b4b..847e226560 100644 --- a/tests/stdlib/tstrutils.nim +++ b/tests/stdlib/tstrutils.nim @@ -58,8 +58,10 @@ template main() = doAssert "".split(" ") == @[""] doAssert "".split({' '}) == @[""] # Empty separators: - doAssertRaises(AssertionDefect): discard s.split({}) - doAssertRaises(AssertionDefect): discard s.split("") + doAssert "".split({}) == @[""] + doAssert "".split("") == @[""] + doAssert s.split({}) == @[s] + doAssert s.split("") == @[s] block: # splitLines let fixture = "a\nb\rc\r\nd" @@ -70,8 +72,7 @@ template main() = block: # rsplit doAssert rsplit("foo bar", seps = Whitespace) == @["foo", "bar"] doAssert rsplit(" foo bar", seps = Whitespace, maxsplit = 1) == @[" foo", "bar"] - doAssert rsplit(" foo bar ", seps = Whitespace, maxsplit = 1) == @[ - " foo bar", ""] + doAssert rsplit(" foo bar ", seps = Whitespace, maxsplit = 1) == @[" foo bar", ""] doAssert rsplit(":foo:bar", sep = ':') == @["", "foo", "bar"] doAssert rsplit(":foo:bar", sep = ':', maxsplit = 2) == @["", "foo", "bar"] doAssert rsplit(":foo:bar", sep = ':', maxsplit = 3) == @["", "foo", "bar"] @@ -81,8 +82,11 @@ template main() = doAssert "".rsplit(" ") == @[""] doAssert "".rsplit({' '}) == @[""] # Empty separators: - doAssertRaises(AssertionDefect): discard "".rsplit({}) - doAssertRaises(AssertionDefect): discard "".rsplit("") + let s = " this is an example " + doAssert "".rsplit({}) == @[""] + doAssert "".rsplit("") == @[""] + doAssert s.rsplit({}) == @[s] + doAssert s.rsplit("") == @[s] block: # splitWhitespace let s = " this is an example "