Merge pull request #30272 from bfredl/replace_emoji

fix(multibyte): handle backspace of wide clusters in replace mode
2025-09-28 05:58:33 +00:00 · 2024-09-06 12:08:26 +02:00
parent 9570ad24f5 fa99afe35e
commit c81cb02dd6
9 changed files with 195 additions and 150 deletions
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -523,12 +523,14 @@ int utf_ptr2cells(const char *p_in)
 }

 /// Convert a UTF-8 byte sequence to a character number.
-/// Doesn't handle ascii! only multibyte and illegal sequences.
+/// Doesn't handle ascii! only multibyte and illegal sequences. ASCII (including NUL)
+/// are treated like illegal sequences.
 ///
 /// @param[in]  p      String to convert.
 /// @param[in]  len    Length of the character in bytes, 0 or 1 if illegal.
 ///
-/// @return Unicode codepoint. A negative value when the sequence is illegal.
+/// @return Unicode codepoint. A negative value when the sequence is illegal (or
+///         ASCII, including NUL).
 int32_t utf_ptr2CharInfo_impl(uint8_t const *p, uintptr_t const len)
  FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
 {
@@ -1780,15 +1782,15 @@ int utf_head_off(const char *base_in, const char *p_in)
    start--;
  }

-  uint8_t cur_len = utf8len_tab[*start];
-  int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len);
-  if (cur_code < 0) {
+  const uint8_t last_len = utf8len_tab[*start];
+  int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)last_len);
+  if (cur_code < 0 || p - start >= last_len) {
    return 0;  // p must be part of an illegal sequence
  }
-  const uint8_t * const safe_end = start + cur_len;
+  const uint8_t * const safe_end = start + last_len;

  int cur_bc = utf8proc_get_property(cur_code)->boundclass;
-  if (always_break(cur_bc)) {
+  if (always_break(cur_bc) || start == base) {
    return (int)(p - start);
  }

@@ -1796,18 +1798,23 @@ int utf_head_off(const char *base_in, const char *p_in)
  const uint8_t *cur_pos = start;
  const uint8_t *const p_start = start;

-  if (start == base) {
-    return (int)(p - start);
-  }
+  while (true) {
+    if (start[-1] == NUL) {
+      break;
+    }
+
+    start--;
+    if (*start < 0x80) {  // stop on ascii, we are done
+      break;
+    }

-  start--;
-  while (*start >= 0x80) {  // stop on ascii, we are done
    while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
      start--;
    }

-    int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]);
-    if (prev_code < 0) {
+    int prev_len = utf8len_tab[*start];
+    int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)prev_len);
+    if (prev_code < 0 || prev_len < cur_pos - start) {
      start = cur_pos;  // start at valid sequence after invalid bytes
      break;
    }
@@ -1822,12 +1829,10 @@ int utf_head_off(const char *base_in, const char *p_in)
    cur_pos = start;
    cur_bc = prev_bc;
    cur_code = prev_code;
-
-    start--;
  }

  // hot path: we are already on the first codepoint of a sequence
-  if (start == p_start) {
+  if (start == p_start && last_len > p - start) {
    return (int)(p - start);
  }