From 2679b3221cc56f593ea4b08a2370591b0e2dad21 Mon Sep 17 00:00:00 2001
From: lit
Date: Tue, 11 Nov 2025 19:01:07 +0800
Subject: [PATCH] fixes #19846; std/unicode.strip trailing big chars (#25274)

fixes #19846
---
Review notes (text between the `---` line and the diff is ignored by `git am`):
- getRuneHeadIdx never indexes before s[0] any more. The previous revision
  stepped back unconditionally, so malformed input such as strip("\x80")
  read s[-1], and strip("ab\x80") handed index -1 to fastRuneAt.
- getRuneHeadIdx returns `idx` unchanged when s[idx] is already the first
  byte of a Rune; the previous revision stepped into the preceding Rune.
- tests: added malformed-input cases; fixed the comment that claimed the
  last byte of "\u9020" is \x0a (it is \xA0: E9 80 A0).
- Hunk line counts and the diffstat were recomputed for these changes; the
  blob ids on the `index` lines are still those of the original revision.

 lib/pure/unicode.nim      | 35 +++++++++++++++++++++++++----------
 tests/stdlib/tunicode.nim | 21 +++++++++++++++++++++
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim
index a953ce8ccd..6337c25a05 100644
--- a/lib/pure/unicode.nim
+++ b/lib/pure/unicode.nim
@@ -1037,6 +1037,30 @@ proc split*(s: openArray[char], sep: Rune, maxsplit: int = -1): seq[string] {.no
   ## that returns a sequence of substrings.
   accResult(split(s, sep, maxsplit))
 
+func getRuneHeadIdx(s: openArray[char], idx: int): int =
+  ## Returns the index of the first byte of the Rune that `s[idx]` belongs to.
+  ##
+  ## `idx` must be a valid index into `s`. If `s[idx]` already starts a Rune
+  ## (ASCII or lead byte), or no lead byte covering `idx` is found within the
+  ## preceding 3 bytes (malformed UTF-8), `idx` itself is returned, so the
+  ## result is always in `0..idx`.
+  result = idx
+  if uint(s[idx]) shr 6 != 0b10:
+    return # ASCII (0b0xxx_xxxx) or lead byte (0b11xx_xxxx): already the head
+  for back in 1..min(3, idx): # at most 3 continuation bytes; stay inside `s`
+    let b = uint(s[idx - back])
+    if b shr 6 == 0b10:
+      continue # 0b10xx_xxxx: another continuation byte, keep walking back
+    # number of continuation bytes announced by the candidate lead byte
+    let trail =
+      if b shr 5 == 0b110: 1
+      elif b shr 4 == 0b1110: 2
+      elif b shr 3 == 0b11110: 3
+      else: 0
+    if trail >= back: # only accept a lead byte whose Rune reaches `idx`
+      result = idx - back
+    return
+
 proc strip*(s: openArray[char], leading = true, trailing = true,
             runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect,
             rtl, extern: "nucStrip".} =
@@ -1073,18 +1097,9 @@ proc strip*(s: openArray[char], leading = true, trailing = true,
       xI: int
       rune: Rune
     while i >= 0:
+      i = getRuneHeadIdx(s, i)
       xI = i
       fastRuneAt(s, xI, rune)
-      var yI = i - 1
-      while yI >= 0:
-        var
-          yIend = yI
-          pRune: Rune
-        fastRuneAt(s, yIend, pRune)
-        if yIend < xI: break
-        i = yI
-        rune = pRune
-        dec(yI)
       if not runes.contains(rune):
         eI = xI - 1
         break
diff --git a/tests/stdlib/tunicode.nim b/tests/stdlib/tunicode.nim
index b9e68b15b4..a272d16c92 100644
--- a/tests/stdlib/tunicode.nim
+++ b/tests/stdlib/tunicode.nim
@@ -194,6 +194,27 @@ block stripTests:
   doAssert(strip("×text×", leading = false, runes = ["×".asRune]) == "×text")
   doAssert(strip("×text×", trailing = false, runes = ["×".asRune]) == "text×")
 
+  doAssert(strip("\u2000") == "")
+  doAssert(strip("a\u2000") == "a")
+
+  # bug #19846
+  block:
+    # check against unicode whose utf8 byteLen > 2
+    doAssert(strip("‟„”“‛‚’‘‗•STR•‗‘’‚‛“”„‟", runes = "•‗‘’‚‛“”„‟".toRunes) == "STR")
+    let chi = "abc\u8377\u9020"
+    doAssert(strip(chi, leading = false, runes = ["\u9020".asRune]) == "abc\u8377")
+    doAssert(strip(chi) == chi) # the last byte of `chi` is \xA0, which is in unicodeSpaces
+
+    let
+      grinning_face = "\u{1f600}"
+      thinking_face = "\u{1f914}"
+    doAssert(strip(grinning_face & thinking_face & thinking_face,
+      runes = thinking_face.toRunes) == grinning_face)
+
+    # malformed UTF-8 must neither index out of bounds nor lose bytes
+    doAssert(strip("\x80") == "\x80")
+    doAssert(strip("a\x80") == "a\x80")
+
 block repeatTests:
   doAssert repeat('c'.Rune, 5) == "ccccc"
   doAssert repeat("×".asRune, 5) == "×××××"