From 81eabe3b9e4ade6b2f6b50a6c25c391d83bc2921 Mon Sep 17 00:00:00 2001 From: Zoom Date: Tue, 25 Mar 2025 10:40:01 +0400 Subject: [PATCH] [feature] stdlib: strutils.multiReplace for character sets (#24805) Multiple replacements based on character sets in a single pass. Useful for string sanitation. Follows existing `multiReplace` semantics. Note: initially copied the substring version logic with a `while` and a named block break, but Godbolt showed it had produced slightly larger assembly using higher registers than the final version. - [x] Tests - [x] changelog.md (cherry picked from commit 909f3b8b798a8e2526dc19a1b8e91698402e85fb) --- changelog.md | 2 ++ lib/pure/strutils.nim | 40 +++++++++++++++++++++++++++++++++++--- tests/stdlib/tstrutils.nim | 19 +++++++++++++++++- 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/changelog.md b/changelog.md index b9671147f0..ad5ab5f0e3 100644 --- a/changelog.md +++ b/changelog.md @@ -25,6 +25,8 @@ errors. - `setutils.symmetricDifference` along with its operator version `` setutils.`-+-` `` and in-place version `setutils.toggle` have been added to more efficiently calculate the symmetric difference of bitsets. +- `strutils.multiReplace` overload for character set replacements in a single pass. + Useful for string sanitation. Follows existing multiReplace semantics. [//]: # "Changes:" - `std/math` The `^` symbol now supports floating-point as exponent in addition to the Natural type. diff --git a/lib/pure/strutils.nim b/lib/pure/strutils.nim index 687dedd514..c941afd085 100644 --- a/lib/pure/strutils.nim +++ b/lib/pure/strutils.nim @@ -2202,7 +2202,8 @@ func replace*(s, sub: string, by = ""): string {.rtl, ## * `replace func<#replace,string,char,char>`_ for replacing ## single characters ## * `replaceWord func<#replaceWord,string,string,string>`_ - ## * `multiReplace func<#multiReplace,string,varargs[]>`_ + ## * `multiReplace func<#multiReplace,string,varargs[]>`_ for substrings + ## * `multiReplace func<#multiReplace,openArray[char],varargs[]>`_ for single characters result = "" let subLen = sub.len if subLen == 0: @@ -2245,7 +2246,8 @@ func replace*(s: string, sub, by: char): string {.rtl, ## See also: ## * `find func<#find,string,char,Natural,int>`_ ## * `replaceWord func<#replaceWord,string,string,string>`_ - ## * `multiReplace func<#multiReplace,string,varargs[]>`_ + ## * `multiReplace func<#multiReplace,string,varargs[]>`_ for substrings + ## * `multiReplace func<#multiReplace,openArray[char],varargs[]>`_ for single characters result = newString(s.len) var i = 0 while i < s.len: @@ -2330,7 +2332,39 @@ func multiReplace*(s: string, replacements: varargs[(string, string)]): string = add result, s[i] inc(i) - +func multiReplace*(s: openArray[char]; replacements: varargs[(set[char], char)]): string {.noinit.} = + ## Performs multiple character replacements in a single pass through the input. + ## + ## `multiReplace` scans the input `s` from left to right and replaces + ## characters based on character sets, applying the first matching replacement + ## at each position. Useful for sanitizing or transforming strings with + ## predefined character mappings. + ## + ## The order of the `replacements` matters: + ## - First matching replacement is applied + ## - Subsequent replacements are not considered for the same character + ## + ## See also: + ## - `multiReplace(s: string; replacements: varargs[(string, string)]) <#multiReplace,string,varargs[]>`_, + runnableExamples: + const WinSanitationRules = [ + ({'\0'..'\31'}, ' '), + ({'"'}, '\''), + ({'/', '\\', ':', '|'}, '-'), + ({'*', '?', '<', '>'}, '_'), + ] + # Sanitize a filename with Windows-incompatible characters + const file = "a/file:with?invalid*chars.txt" + doAssert file.multiReplace(WinSanitationRules) == "a-file-with_invalid_chars.txt" + {.cast(noSideEffect).}: + result = newStringUninit(s.len) + for i in 0..'}, '_'), + ] + # Basic character set replacements + doAssert multiReplace("abba", SanitationRules) == "abba" + doAssert multiReplace("a/b\\c:d", SanitationRules) == "a-b-c-d" + doAssert multiReplace("a*b?c", SanitationRules) == "a_b_c" + doAssert multiReplace("\0\3test", SanitationRules) == " test" + doAssert multiReplace("testquote\"", SanitationRules) == "testquote'" + doAssert multiReplace("", SanitationRules) == "" + doAssert multiReplace("/\\:*?\"\0<>", ({'\0'..'\255'}, '.')) == "........." + # `parseEnum`, ref issue #14030 # check enum defined at top level # xxx this is probably irrelevant, and pollutes scope # for remaining tests