fix(mbyte): fix bugs in utf_cp_*_off() functions

Problems:
- Illegal bytes after valid UTF-8 char cause utf_cp_*_off() to fail.
- When stream isn't NUL-terminated, utf_cp_*_off() may go over the end.
Solution: Don't go over end of the char of end of the string.
This commit is contained in:
VanaIgr
2024-02-26 04:12:55 -06:00
committed by GitHub
parent 8b4e269156
commit ad5a155b1f
9 changed files with 134 additions and 93 deletions

View File

@@ -1884,99 +1884,52 @@ void mb_copy_char(const char **const fp, char **const tp)
*fp += l;
}
/// Return the offset from "p_in" to the first byte of a character. When "p_in" is
/// at the start of a character 0 is returned, otherwise the offset to the next
/// character. Can start anywhere in a stream of bytes.
int mb_off_next(const char *base, const char *p_in)
{
const uint8_t *p = (uint8_t *)p_in;
int i;
if (*p < 0x80) { // be quick for ASCII
return 0;
}
// Find the next character that isn't 10xx.xxxx
for (i = 0; (p[i] & 0xc0) == 0x80; i++) {}
if (i > 0) {
int j;
// Check for illegal sequence.
for (j = 0; p - j > (uint8_t *)base; j++) {
if ((p[-j] & 0xc0) != 0x80) {
break;
}
}
if (utf8len_tab[p[-j]] != i + j) {
return 0;
}
}
return i;
}
/// Return the offset from `p_in` to the last byte of the codepoint it points
/// to. Can start anywhere in a stream of bytes.
/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes
/// of the codepoint it points to.
/// "p_in" can point anywhere in a stream of bytes.
/// "p_len" limits number of bytes after "p_in".
/// Note: Counts individual codepoints of composed characters separately.
int utf_cp_tail_off(const char *base, const char *p_in)
CharBoundsOff utf_cp_bounds_len(char const *base, char const *p_in, int p_len)
FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL
{
const uint8_t *p = (uint8_t *)p_in;
int i;
int j;
if (*p == NUL) {
return 0;
assert(base <= p_in && p_len > 0);
uint8_t const *const b = (uint8_t *)base;
uint8_t const *const p = (uint8_t *)p_in;
if (*p < 0x80U) { // be quick for ASCII
return (CharBoundsOff){ 0, 1 };
}
// Find the last character that is 10xx.xxxx
for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
// Check for illegal sequence.
for (j = 0; p_in - j > base; j++) {
if ((p[-j] & 0xc0) != 0x80) {
break;
int const max_first_off = -MIN((int)(p - b), MB_MAXCHAR - 1);
int first_off = 0;
for (; utf_is_trail_byte(p[first_off]); first_off--) {
if (first_off == max_first_off) { // failed to find first byte
return (CharBoundsOff){ 0, 1 };
}
}
if (utf8len_tab[p[-j]] != i + j + 1) {
return 0;
int const max_end_off = utf8len_tab[p[first_off]] + first_off;
if (max_end_off <= 0 || max_end_off > p_len) { // illegal or incomplete sequence
return (CharBoundsOff){ 0, 1 };
}
return i;
for (int end_off = 1; end_off < max_end_off; end_off++) {
if (!utf_is_trail_byte(p[end_off])) { // not enough trail bytes
return (CharBoundsOff){ 0, 1 };
}
}
return (CharBoundsOff){ .begin_off = (int8_t)-first_off, .end_off = (int8_t)max_end_off };
}
/// Return the offset from "p" to the first byte of the codepoint it points
/// to. Can start anywhere in a stream of bytes.
/// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
/// separately.
///
/// @param[in] base Pointer to start of string
/// @param[in] p Pointer to byte for which to return the offset to the previous codepoint
//
/// @return 0 if invalid sequence, else number of bytes to previous codepoint
int utf_cp_head_off(const char *base, const char *p)
/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes
/// of the codepoint it points to.
/// "p_in" can point anywhere in a stream of bytes.
/// Stream must be NUL-terminated.
/// Note: Counts individual codepoints of composed characters separately.
CharBoundsOff utf_cp_bounds(char const *base, char const *p_in)
FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL
{
int i;
if (*p == NUL) {
return 0;
}
// Find the first character that is not 10xx.xxxx
for (i = 0; p - i >= base; i++) {
if (((uint8_t)p[-i] & 0xc0) != 0x80) {
break;
}
}
// Find the last character that is 10xx.xxxx (condition terminates on NUL)
int j = 1;
while (((uint8_t)p[j] & 0xc0) == 0x80) {
j++;
}
// Check for illegal sequence.
if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
return 0;
}
return i;
return utf_cp_bounds_len(base, p_in, INT_MAX);
}
// Find the next illegal byte sequence.