perf: don't decode utf8 character multiple times in getvcol()

The optimized virtual column calculation loop in getvcol()
was decoding the current character twice: once in ptr2cells()
and the second time in utfc_ptr2len(). For combining characters, they were
decoded up to 2 times in utfc_ptr2len(). Additionally, the function used to
decode the character could be further optimised.
This commit is contained in:
VanaIgr
2023-12-17 16:48:27 -06:00
committed by zeertzjq
parent 2f2f12122f
commit b5653984e5
6 changed files with 230 additions and 61 deletions

View File

@@ -6,6 +6,7 @@
#include "nvim/cmdexpand_defs.h" // IWYU pragma: keep
#include "nvim/eval/typval_defs.h" // IWYU pragma: keep
#include "nvim/macros_defs.h"
#include "nvim/mbyte_defs.h" // IWYU pragma: keep
#include "nvim/types_defs.h" // IWYU pragma: keep
@@ -13,6 +14,10 @@
# include "mbyte.h.generated.h"
#endif
// NOTE(review): judging by the name, this is presumably the number of screen
// cells used to display an invalid byte -- confirm against the users of this
// constant.
enum { kInvalidByteCells = 4 };
// Return byte length of character that starts with byte "b".
// Returns 1 for a single-byte character.
// MB_BYTE2LEN_CHECK() can be used to count a special key as one byte.
@@ -44,3 +49,64 @@ extern const uint8_t utf8len_tab[256];
// multi-byte characters if needed. Only use with "p" > "s" !
#define MB_PTR_BACK(s, p) \
(p -= utf_head_off((char *)(s), (char *)(p) - 1) + 1)
static inline CharInfo utf_ptr2CharInfo(char const *p_in)
  REAL_FATTR_NONNULL_ALL REAL_FATTR_PURE REAL_FATTR_WARN_UNUSED_RESULT REAL_FATTR_ALWAYS_INLINE;

/// Decode the character at the start of a UTF-8 byte sequence.
/// Handles ASCII, multibyte sequences and illegal sequences.
///
/// @param[in]  p_in  String to decode.
///
/// @return Information about the character. When the sequence is illegal,
///         'value' is negative and 'len' is 1.
static inline CharInfo utf_ptr2CharInfo(char const *const p_in)
{
  uint8_t const *const bytes = (uint8_t const *)p_in;
  uint8_t const lead = bytes[0];

  // Fast path: a byte below 0x80 is a complete single-byte character.
  if (lead < 0x80) {
    return (CharInfo){ .value = lead, .len = 1 };
  }

  // The lead byte selects the expected sequence length; the helper does the
  // actual decoding.
  int const seq_len = utf8len_tab[lead];
  int32_t const decoded = utf_ptr2CharInfo_impl(bytes, (uintptr_t)seq_len);

  // A negative result is reported with a length of one byte.
  return (CharInfo){ .value = decoded, .len = decoded < 0 ? 1 : seq_len };
}
static inline StrCharInfo utfc_next(StrCharInfo cur)
  REAL_FATTR_NONNULL_ALL REAL_FATTR_ALWAYS_INLINE REAL_FATTR_PURE;

/// Return information about the character following "cur".
/// Composing and combining characters that follow "cur" are considered
/// a part of the current character and are skipped.
///
/// @param[in]  cur  Current position ('ptr') and its decoded character ('chr').
///                  'ptr' must not point to NUL.
///
/// @return Position of the next character together with its decoded value
///         and byte length. When the decoded value is negative, 'len' is 1.
static inline StrCharInfo utfc_next(StrCharInfo cur)
{
  uint8_t *pos = (uint8_t *)(cur.ptr + cur.chr.len);
  // Code point the candidate at "pos" is tested against for composition.
  int32_t last_code = cur.chr.value;

  for (;;) {
    uint8_t const lead = *pos;

    // Bytes below 0x80 end the scan immediately; this is hinted as the
    // likely branch.
    if (EXPECT(lead < 0x80U, true)) {
      return (StrCharInfo){
        .ptr = (char *)pos,
        .chr = (CharInfo){ .value = lead, .len = 1 },
      };
    }

    uint8_t const seq_len = utf8len_tab[lead];
    int32_t const code = utf_ptr2CharInfo_impl(pos, (uintptr_t)seq_len);

    if (utf_char_composinglike(last_code, code)) {
      // Still part of the current character: step over it and keep scanning.
      last_code = code;
      pos += seq_len;
      continue;
    }

    return (StrCharInfo){
      .ptr = (char *)pos,
      .chr = (CharInfo){ .value = code, .len = (code < 0 ? 1 : seq_len) },
    };
  }
}