Merge pull request #30272 from bfredl/replace_emoji

fix(multibyte): handle backspace of wide clusters in replace mode
This commit is contained in:
bfredl
2024-09-06 12:08:26 +02:00
committed by GitHub
9 changed files with 195 additions and 150 deletions

View File

@@ -523,12 +523,14 @@ int utf_ptr2cells(const char *p_in)
}
/// Convert a UTF-8 byte sequence to a character number.
/// Doesn't handle ascii! only multibyte and illegal sequences.
/// Doesn't handle ascii! only multibyte and illegal sequences. ASCII (including NUL)
/// are treated like illegal sequences.
///
/// @param[in] p String to convert.
/// @param[in] len Length of the character in bytes, 0 or 1 if illegal.
///
/// @return Unicode codepoint. A negative value when the sequence is illegal.
/// @return Unicode codepoint. A negative value when the sequence is illegal (or
/// ASCII, including NUL).
int32_t utf_ptr2CharInfo_impl(uint8_t const *p, uintptr_t const len)
FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
{
@@ -1780,15 +1782,15 @@ int utf_head_off(const char *base_in, const char *p_in)
start--;
}
uint8_t cur_len = utf8len_tab[*start];
int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len);
if (cur_code < 0) {
const uint8_t last_len = utf8len_tab[*start];
int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)last_len);
if (cur_code < 0 || p - start >= last_len) {
return 0; // p must be part of an illegal sequence
}
const uint8_t * const safe_end = start + cur_len;
const uint8_t * const safe_end = start + last_len;
int cur_bc = utf8proc_get_property(cur_code)->boundclass;
if (always_break(cur_bc)) {
if (always_break(cur_bc) || start == base) {
return (int)(p - start);
}
@@ -1796,18 +1798,23 @@ int utf_head_off(const char *base_in, const char *p_in)
const uint8_t *cur_pos = start;
const uint8_t *const p_start = start;
if (start == base) {
return (int)(p - start);
}
while (true) {
if (start[-1] == NUL) {
break;
}
start--;
if (*start < 0x80) { // stop on ascii, we are done
break;
}
start--;
while (*start >= 0x80) { // stop on ascii, we are done
while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
start--;
}
int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]);
if (prev_code < 0) {
int prev_len = utf8len_tab[*start];
int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)prev_len);
if (prev_code < 0 || prev_len < cur_pos - start) {
start = cur_pos; // start at valid sequence after invalid bytes
break;
}
@@ -1822,12 +1829,10 @@ int utf_head_off(const char *base_in, const char *p_in)
cur_pos = start;
cur_bc = prev_bc;
cur_code = prev_code;
start--;
}
// hot path: we are already on the first codepoint of a sequence
if (start == p_start) {
if (start == p_start && last_len > p - start) {
return (int)(p - start);
}