From 2679b3221cc56f593ea4b08a2370591b0e2dad21 Mon Sep 17 00:00:00 2001
From: lit <litlighilit@foxmail.com>
Date: Tue, 11 Nov 2025 19:01:07 +0800
Subject: [PATCH] fixes #19846; std/unicode.strip trailing big chars (#25274)

fixes #19846
---
 lib/pure/unicode.nim      | 24 ++++++++++++++----------
 tests/stdlib/tunicode.nim | 17 +++++++++++++++++
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim
index a953ce8ccd..6337c25a05 100644
--- a/lib/pure/unicode.nim
+++ b/lib/pure/unicode.nim
@@ -1037,6 +1037,19 @@ proc split*(s: openArray[char], sep: Rune, maxsplit: int = -1): seq[string] {.no
   ## that returns a sequence of substrings.
   accResult(split(s, sep, maxsplit))
 
+func getRuneHeadIdx(s: openArray[char], idx: int): int =
+  ## Returns the index of the first byte of the Rune that contains `s[idx]`.
+  ##
+  ## Looks back over at most three UTF-8 continuation bytes (`0b10xx_xxxx`)
+  ## and never reads below index 0. When no lead byte is found (malformed
+  ## input), `idx` is returned so the stray byte is treated as its own unit.
+  result = idx # fallback: ASCII byte, or no lead byte within reach
+  for back in 0..min(idx, 3):
+    let c = s[idx - back]
+    if c >= '\xC0': # 0b11xx_xxxx: lead byte of a multi-byte Rune
+      return idx - back
+    if c <= '\x7F': break # 0b0xxx_xxxx: ASCII ends the search
+
 proc strip*(s: openArray[char], leading = true, trailing = true,
             runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect,
             rtl, extern: "nucStrip".} =
@@ -1073,18 +1086,9 @@ proc strip*(s: openArray[char], leading = true, trailing = true,
       xI: int
       rune: Rune
     while i >= 0:
+      i = getRuneHeadIdx(s, i)
       xI = i
       fastRuneAt(s, xI, rune)
-      var yI = i - 1
-      while yI >= 0:
-        var
-          yIend = yI
-          pRune: Rune
-        fastRuneAt(s, yIend, pRune)
-        if yIend < xI: break
-        i = yI
-        rune = pRune
-        dec(yI)
       if not runes.contains(rune):
         eI = xI - 1
         break
diff --git a/tests/stdlib/tunicode.nim b/tests/stdlib/tunicode.nim
index b9e68b15b4..a272d16c92 100644
--- a/tests/stdlib/tunicode.nim
+++ b/tests/stdlib/tunicode.nim
@@ -194,6 +194,23 @@ block stripTests:
   doAssert(strip("×text×", leading = false, runes = ["×".asRune]) == "×text")
   doAssert(strip("×text×", trailing = false, runes = ["×".asRune]) == "text×")
 
+  doAssert(strip("\u2000") == "")
+  doAssert(strip("a\u2000") == "a")
+
+  # bug #19846
+  block:
+    # check against runes whose UTF-8 encoding is longer than 2 bytes
+    doAssert(strip("‟„”“‛‚’‘‗•STR•‗‘’‚‛“”„‟", runes = "•‗‘’‚‛“”„‟".toRunes) == "STR")
+    let chi = "abc\u8377\u9020"
+    doAssert(strip(chi, leading = false, runes = ["\u9020".asRune]) == "abc\u8377")
+    doAssert(strip(chi) == chi)  # the last byte of chi is \xA0 (U+00A0 is in unicodeSpaces); it must not be stripped
+
+    let
+      grinning_face = "\u{1f600}"
+      thinking_face = "\u{1f914}"
+    doAssert(strip(grinning_face & thinking_face & thinking_face,
+                   runes = thinking_face.toRunes) == grinning_face)
+
 block repeatTests:
   doAssert repeat('c'.Rune, 5) == "ccccc"
   doAssert repeat("×".asRune, 5) == "×××××"