[feature] stdlib: strutils.multiReplace for character sets (#24805)

Adds a `multiReplace` overload that performs multiple replacements based on
character sets in a single pass. Useful for string sanitization. Follows the
existing `multiReplace` semantics.

Note: the first implementation copied the substring version's logic, using a
`while` loop and a named-block `break`, but Godbolt showed that it produced
slightly larger assembly and used higher registers than the final version.

- [x] Tests
- [x] changelog.md

(cherry picked from commit 909f3b8b79)
This commit is contained in:
Zoom
2025-03-25 10:40:01 +04:00
committed by narimiran
parent e68a91c8df
commit 81eabe3b9e
3 changed files with 57 additions and 4 deletions

View File

@@ -25,6 +25,8 @@ errors.
- `setutils.symmetricDifference` along with its operator version
`` setutils.`-+-` `` and in-place version `setutils.toggle` have been added
to more efficiently calculate the symmetric difference of bitsets.
- `strutils.multiReplace` now has an overload that performs character-set replacements in a single pass.
It is useful for string sanitization and follows the existing `multiReplace` semantics.
[//]: # "Changes:"
- `std/math` The `^` symbol now supports floating-point as exponent in addition to the Natural type.

View File

@@ -2202,7 +2202,8 @@ func replace*(s, sub: string, by = ""): string {.rtl,
## * `replace func<#replace,string,char,char>`_ for replacing
## single characters
## * `replaceWord func<#replaceWord,string,string,string>`_
## * `multiReplace func<#multiReplace,string,varargs[]>`_
## * `multiReplace func<#multiReplace,string,varargs[]>`_ for substrings
## * `multiReplace func<#multiReplace,openArray[char],varargs[]>`_ for single characters
result = ""
let subLen = sub.len
if subLen == 0:
@@ -2245,7 +2246,8 @@ func replace*(s: string, sub, by: char): string {.rtl,
## See also:
## * `find func<#find,string,char,Natural,int>`_
## * `replaceWord func<#replaceWord,string,string,string>`_
## * `multiReplace func<#multiReplace,string,varargs[]>`_
## * `multiReplace func<#multiReplace,string,varargs[]>`_ for substrings
## * `multiReplace func<#multiReplace,openArray[char],varargs[]>`_ for single characters
result = newString(s.len)
var i = 0
while i < s.len:
@@ -2330,7 +2332,39 @@ func multiReplace*(s: string, replacements: varargs[(string, string)]): string =
add result, s[i]
inc(i)
func multiReplace*(s: openArray[char]; replacements: varargs[(set[char], char)]): string {.noinit.} =
  ## Replaces single characters of `s` according to character-set rules,
  ## visiting every input character exactly once.
  ##
  ## For each character of `s`, the `replacements` are tried in the order
  ## given:
  ## - the first pair whose set contains the character supplies the output
  ##   character
  ## - the remaining pairs are skipped for that position
  ##
  ## A character that no set contains is copied unchanged, so the result
  ## always has the same length as `s`. Handy for sanitizing or transforming
  ## strings with a fixed character mapping.
  ##
  ## See also:
  ## * `multiReplace func<#multiReplace,string,varargs[]>`_ for substrings
  runnableExamples:
    const WinSanitationRules = [
      ({'\0'..'\31'}, ' '),
      ({'"'}, '\''),
      ({'/', '\\', ':', '|'}, '-'),
      ({'*', '?', '<', '>'}, '_'),
    ]
    # Sanitize a filename with Windows-incompatible characters
    const file = "a/file:with?invalid*chars.txt"
    doAssert file.multiReplace(WinSanitationRules) == "a-file-with_invalid_chars.txt"
  {.cast(noSideEffect).}:
    # Every position of the uninitialized buffer is written by the loop below.
    result = newStringUninit(s.len)
    for idx in 0 ..< s.len:
      let current = s[idx]
      var replaced = current
      # First matching rule wins; later rules are never consulted.
      for charset, substitute in replacements.items:
        if current in charset:
          replaced = substitute
          break
      result[idx] = replaced
func insertSep*(s: string, sep = '_', digits = 3): string {.rtl,
extern: "nsuInsertSep".} =

View File

@@ -575,12 +575,29 @@ template main() =
doAssert "-lda-ldz -ld abc".replaceWord("-ld") == "-lda-ldz abc"
doAssert "-lda-ldz -ld abc".replaceWord("") == "-lda-ldz -ld abc"
block: # multiReplace
block: # multiReplace substrings
doAssert "abba".multiReplace(("a", "b"), ("b", "a")) == "baab"
doAssert "Hello World.".multiReplace(("ello", "ELLO"), ("World.",
"PEOPLE!")) == "HELLO PEOPLE!"
doAssert "aaaa".multiReplace(("a", "aa"), ("aa", "bb")) == "aaaaaaaa"
block: # multiReplace characters
# https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions
const SanitationRules = [
({'\0'..'\31'}, ' '),
({'"'}, '\''),
({'/', '\\', ':', '|'}, '-'),
({'*', '?', '<', '>'}, '_'),
]
# Basic character set replacements
doAssert multiReplace("abba", SanitationRules) == "abba"
doAssert multiReplace("a/b\\c:d", SanitationRules) == "a-b-c-d"
doAssert multiReplace("a*b?c", SanitationRules) == "a_b_c"
doAssert multiReplace("\0\3test", SanitationRules) == " test"
doAssert multiReplace("testquote\"", SanitationRules) == "testquote'"
doAssert multiReplace("", SanitationRules) == ""
doAssert multiReplace("/\\:*?\"\0<>", ({'\0'..'\255'}, '.')) == "........."
# `parseEnum`, ref issue #14030
# check enum defined at top level # xxx this is probably irrelevant, and pollutes scope
# for remaining tests