[feature] stdlib: strutils.multiReplace for character sets (#24805)

Adds a `multiReplace` overload that performs multiple replacements based on character sets in a single pass. Useful for string sanitization. Follows existing `multiReplace` semantics. Note: an initial version copied the substring overload's logic with a `while` loop and a named block `break`, but Godbolt showed that it produced slightly larger assembly using higher registers than the final version. - [x] Tests - [x] changelog.md (cherry picked from commit 909f3b8b79)
2026-07-27 02:46:29 +00:00 · 2025-03-25 10:40:01 +04:00
parent e68a91c8df
commit 81eabe3b9e
3 changed files with 57 additions and 4 deletions
--- a/changelog.md
+++ b/changelog.md
@@ -25,6 +25,8 @@ errors.
 - `setutils.symmetricDifference` along with its operator version
  `` setutils.`-+-` `` and in-place version `setutils.toggle` have been added
  to more efficiently calculate the symmetric difference of bitsets.
+- `strutils.multiReplace` overload for character set replacements in a single pass.
+  Useful for string sanitization. Follows existing `multiReplace` semantics.

 [//]: # "Changes:"
 - `std/math` The `^` symbol now supports floating-point as exponent in addition to the Natural type.
--- a/lib/pure/strutils.nim
+++ b/lib/pure/strutils.nim
@@ -2202,7 +2202,8 @@ func replace*(s, sub: string, by = ""): string {.rtl,
  ## * `replace func<#replace,string,char,char>`_ for replacing
  ##   single characters
  ## * `replaceWord func<#replaceWord,string,string,string>`_
-  ## * `multiReplace func<#multiReplace,string,varargs[]>`_
+  ## * `multiReplace func<#multiReplace,string,varargs[]>`_ for substrings
+  ## * `multiReplace func<#multiReplace,openArray[char],varargs[]>`_ for single characters
  result = ""
  let subLen = sub.len
  if subLen == 0:
@@ -2245,7 +2246,8 @@ func replace*(s: string, sub, by: char): string {.rtl,
  ## See also:
  ## * `find func<#find,string,char,Natural,int>`_
  ## * `replaceWord func<#replaceWord,string,string,string>`_
-  ## * `multiReplace func<#multiReplace,string,varargs[]>`_
+  ## * `multiReplace func<#multiReplace,string,varargs[]>`_ for substrings
+  ## * `multiReplace func<#multiReplace,openArray[char],varargs[]>`_ for single characters
  result = newString(s.len)
  var i = 0
  while i < s.len:
@@ -2330,7 +2332,39 @@ func multiReplace*(s: string, replacements: varargs[(string, string)]): string =
      add result, s[i]
      inc(i)

-
+func multiReplace*(s: openArray[char]; replacements: varargs[(set[char], char)]): string {.noinit.} =
+  ## Performs multiple character replacements in a single pass through the input.
+  ##
+  ## `multiReplace` scans the input `s` from left to right and, at each
+  ## position, applies the first `(set, char)` pair whose set contains the
+  ## character. Characters matching no set are copied unchanged, so the result
+  ## always has the same length as `s`. Useful for sanitizing strings.
+  ##
+  ## The order of the `replacements` matters:
+  ##   - The first matching replacement is applied
+  ##   - Subsequent replacements are not considered for the same character
+  ##
+  ## See also:
+  ## * `multiReplace func<#multiReplace,string,varargs[]>`_ for substrings
+  runnableExamples:
+    const WinSanitationRules = [
+      ({'\0'..'\31'}, ' '),
+      ({'"'}, '\''),
+      ({'/', '\\', ':', '|'}, '-'),
+      ({'*', '?', '<', '>'}, '_'),
+    ]
+    # Sanitize a filename with Windows-incompatible characters
+    const file = "a/file:with?invalid*chars.txt"
+    doAssert file.multiReplace(WinSanitationRules) == "a-file-with_invalid_chars.txt"
+  {.cast(noSideEffect).}:
+    result = newStringUninit(s.len) # every index is assigned by the loop below
+  for i in 0..<s.len:
+    var nextChar = s[i]
+    for subs, by in replacements.items:
+      if nextChar in subs:
+        nextChar = by
+        break # first match wins; remaining pairs are skipped for this char
+    result[i] = nextChar

 func insertSep*(s: string, sep = '_', digits = 3): string {.rtl,
    extern: "nsuInsertSep".} =
--- a/tests/stdlib/tstrutils.nim
+++ b/tests/stdlib/tstrutils.nim
@@ -575,12 +575,29 @@ template main() =
    doAssert "-lda-ldz -ld abc".replaceWord("-ld") == "-lda-ldz  abc"
    doAssert "-lda-ldz -ld abc".replaceWord("") == "-lda-ldz -ld abc"

-  block: # multiReplace
+  block: # multiReplace substrings
    doAssert "abba".multiReplace(("a", "b"), ("b", "a")) == "baab"
    doAssert "Hello World.".multiReplace(("ello", "ELLO"), ("World.",
        "PEOPLE!")) == "HELLO PEOPLE!"
    doAssert "aaaa".multiReplace(("a", "aa"), ("aa", "bb")) == "aaaaaaaa"

+  block: # multiReplace characters
+    # https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
+    const SanitationRules = [
+        ({'\0'..'\31'}, ' '),
+        ({'"'}, '\''),
+        ({'/', '\\', ':', '|'}, '-'),
+        ({'*', '?', '<', '>'}, '_'),
+      ]
+    # Basic character set replacements
+    doAssert multiReplace("abba", SanitationRules) == "abba"
+    doAssert multiReplace("a/b\\c:d", SanitationRules) == "a-b-c-d"
+    doAssert multiReplace("a*b?c", SanitationRules) == "a_b_c"
+    doAssert multiReplace("\0\3test", SanitationRules) == "  test"
+    doAssert multiReplace("testquote\"", SanitationRules) == "testquote'"
+    doAssert multiReplace("", SanitationRules) == ""
+    doAssert multiReplace("/\\:*?\"\0<>", ({'\0'..'\255'}, '.')) == "........."
+
  # `parseEnum`, ref issue #14030
  # check enum defined at top level # xxx this is probably irrelevant, and pollutes scope
  # for remaining tests