mirror of
				https://github.com/neovim/neovim.git
				synced 2025-10-26 12:27:24 +00:00 
			
		
		
		
	fix(mbyte): fix bugs in utf_cp_*_off() functions
Problems: - Illegal bytes after valid UTF-8 char cause utf_cp_*_off() to fail. - When stream isn't NUL-terminated, utf_cp_*_off() may go over the end. Solution: Don't go over end of the char of end of the string.
This commit is contained in:
		| @@ -203,7 +203,8 @@ information. | |||||||
|   mb_off2cells                                         utf_off2cells |   mb_off2cells                                         utf_off2cells | ||||||
|   mb_ptr2char                                          utf_ptr2char |   mb_ptr2char                                          utf_ptr2char | ||||||
|   mb_head_off                                          utf_head_off |   mb_head_off                                          utf_head_off | ||||||
|   mb_tail_off                                         utf_cp_tail_off |   mb_tail_off                                          utf_cp_bounds | ||||||
|  |   mb_off_next                                          utf_cp_bounds | ||||||
|   mb_lefthalve                                        grid_lefthalve |   mb_lefthalve                                        grid_lefthalve | ||||||
|   mb_fix_col                                           grid_fix_col |   mb_fix_col                                           grid_fix_col | ||||||
|   utf_off2cells                                       grid_off2cells |   utf_off2cells                                       grid_off2cells | ||||||
|   | |||||||
| @@ -3436,7 +3436,7 @@ void maketitle(void) | |||||||
|       int len = (int)strlen(buf_p); |       int len = (int)strlen(buf_p); | ||||||
|       if (len > 100) { |       if (len > 100) { | ||||||
|         len -= 100; |         len -= 100; | ||||||
|         len += utf_cp_tail_off(buf_p, buf_p + len) + 1; |         len += utf_cp_bounds(buf_p, buf_p + len).end_off; | ||||||
|         buf_p += len; |         buf_p += len; | ||||||
|       } |       } | ||||||
|       STRCPY(icon_str, buf_p); |       STRCPY(icon_str, buf_p); | ||||||
|   | |||||||
| @@ -1512,8 +1512,8 @@ static int command_line_erase_chars(CommandLineState *s) | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (s->c == K_DEL) { |   if (s->c == K_DEL) { | ||||||
|     ccline.cmdpos += mb_off_next(ccline.cmdbuff, |     CharBoundsOff bounds = utf_cp_bounds(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos); | ||||||
|                                  ccline.cmdbuff + ccline.cmdpos); |     ccline.cmdpos += bounds.begin_off != 0 ? bounds.end_off : 0; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (ccline.cmdpos > 0) { |   if (ccline.cmdpos > 0) { | ||||||
|   | |||||||
| @@ -277,7 +277,7 @@ void line_do_arabic_shape(schar_T *buf, int cols) | |||||||
|       // Too bigly, discard one code-point. |       // Too bigly, discard one code-point. | ||||||
|       // This should be enough as c0 cannot grow more than from 2 to 4 bytes |       // This should be enough as c0 cannot grow more than from 2 to 4 bytes | ||||||
|       // (base arabic to extended arabic) |       // (base arabic to extended arabic) | ||||||
|       rest -= (size_t)utf_cp_head_off(scbuf + off, scbuf + off + rest - 1) + 1; |       rest -= (size_t)utf_cp_bounds(scbuf + off, scbuf + off + rest - 1).begin_off + 1; | ||||||
|     } |     } | ||||||
|     memcpy(scbuf_new + len, scbuf + off, rest); |     memcpy(scbuf_new + len, scbuf + off, rest); | ||||||
|     buf[i] = schar_from_buf(scbuf_new, len + rest); |     buf[i] = schar_from_buf(scbuf_new, len + rest); | ||||||
|   | |||||||
| @@ -226,11 +226,12 @@ static int nlua_str_utf_start(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL | |||||||
|   size_t s1_len; |   size_t s1_len; | ||||||
|   const char *s1 = luaL_checklstring(lstate, 1, &s1_len); |   const char *s1 = luaL_checklstring(lstate, 1, &s1_len); | ||||||
|   ptrdiff_t offset = luaL_checkinteger(lstate, 2); |   ptrdiff_t offset = luaL_checkinteger(lstate, 2); | ||||||
|   if (offset < 0 || offset > (intptr_t)s1_len) { |   if (offset <= 0 || offset > (intptr_t)s1_len) { | ||||||
|     return luaL_error(lstate, "index out of range"); |     return luaL_error(lstate, "index out of range"); | ||||||
|   } |   } | ||||||
|   int head_offset = -utf_cp_head_off(s1, s1 + offset - 1); |   size_t const off = (size_t)(offset - 1); | ||||||
|   lua_pushinteger(lstate, head_offset); |   int head_off = -utf_cp_bounds_len(s1, s1 + off, (int)(s1_len - off)).begin_off; | ||||||
|  |   lua_pushinteger(lstate, head_off); | ||||||
|   return 1; |   return 1; | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -246,11 +247,12 @@ static int nlua_str_utf_end(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL | |||||||
|   size_t s1_len; |   size_t s1_len; | ||||||
|   const char *s1 = luaL_checklstring(lstate, 1, &s1_len); |   const char *s1 = luaL_checklstring(lstate, 1, &s1_len); | ||||||
|   ptrdiff_t offset = luaL_checkinteger(lstate, 2); |   ptrdiff_t offset = luaL_checkinteger(lstate, 2); | ||||||
|   if (offset < 0 || offset > (intptr_t)s1_len) { |   if (offset <= 0 || offset > (intptr_t)s1_len) { | ||||||
|     return luaL_error(lstate, "index out of range"); |     return luaL_error(lstate, "index out of range"); | ||||||
|   } |   } | ||||||
|   int tail_offset = utf_cp_tail_off(s1, s1 + offset - 1); |   size_t const off = (size_t)(offset - 1); | ||||||
|   lua_pushinteger(lstate, tail_offset); |   int tail_off = utf_cp_bounds_len(s1, s1 + off, (int)(s1_len - off)).end_off - 1; | ||||||
|  |   lua_pushinteger(lstate, tail_off); | ||||||
|   return 1; |   return 1; | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										117
									
								
								src/nvim/mbyte.c
									
									
									
									
									
								
							
							
						
						
									
										117
									
								
								src/nvim/mbyte.c
									
									
									
									
									
								
							| @@ -1884,99 +1884,52 @@ void mb_copy_char(const char **const fp, char **const tp) | |||||||
|   *fp += l; |   *fp += l; | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Return the offset from "p_in" to the first byte of a character.  When "p_in" is | /// Returns the offset in bytes from "p_in" to the first and one-past-end bytes | ||||||
| /// at the start of a character 0 is returned, otherwise the offset to the next | /// of the codepoint it points to. | ||||||
| /// character.  Can start anywhere in a stream of bytes. | /// "p_in" can point anywhere in a stream of bytes. | ||||||
| int mb_off_next(const char *base, const char *p_in) | /// "p_len" limits number of bytes after "p_in". | ||||||
| { |  | ||||||
|   const uint8_t *p = (uint8_t *)p_in; |  | ||||||
|   int i; |  | ||||||
|  |  | ||||||
|   if (*p < 0x80) {              // be quick for ASCII |  | ||||||
|     return 0; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Find the next character that isn't 10xx.xxxx |  | ||||||
|   for (i = 0; (p[i] & 0xc0) == 0x80; i++) {} |  | ||||||
|   if (i > 0) { |  | ||||||
|     int j; |  | ||||||
|     // Check for illegal sequence. |  | ||||||
|     for (j = 0; p - j > (uint8_t *)base; j++) { |  | ||||||
|       if ((p[-j] & 0xc0) != 0x80) { |  | ||||||
|         break; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     if (utf8len_tab[p[-j]] != i + j) { |  | ||||||
|       return 0; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   return i; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// Return the offset from `p_in` to the last byte of the codepoint it points |  | ||||||
| /// to.  Can start anywhere in a stream of bytes. |  | ||||||
| /// Note: Counts individual codepoints of composed characters separately. | /// Note: Counts individual codepoints of composed characters separately. | ||||||
| int utf_cp_tail_off(const char *base, const char *p_in) | CharBoundsOff utf_cp_bounds_len(char const *base, char const *p_in, int p_len) | ||||||
|  |   FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL | ||||||
| { | { | ||||||
|   const uint8_t *p = (uint8_t *)p_in; |   assert(base <= p_in && p_len > 0); | ||||||
|   int i; |   uint8_t const *const b = (uint8_t *)base; | ||||||
|   int j; |   uint8_t const *const p = (uint8_t *)p_in; | ||||||
|  |   if (*p < 0x80U) {  // be quick for ASCII | ||||||
|   if (*p == NUL) { |     return (CharBoundsOff){ 0, 1 }; | ||||||
|     return 0; |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // Find the last character that is 10xx.xxxx |   int const max_first_off = -MIN((int)(p - b), MB_MAXCHAR - 1); | ||||||
|   for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {} |   int first_off = 0; | ||||||
|  |   for (; utf_is_trail_byte(p[first_off]); first_off--) { | ||||||
|   // Check for illegal sequence. |     if (first_off == max_first_off) {  // failed to find first byte | ||||||
|   for (j = 0; p_in - j > base; j++) { |       return (CharBoundsOff){ 0, 1 }; | ||||||
|     if ((p[-j] & 0xc0) != 0x80) { |  | ||||||
|       break; |  | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (utf8len_tab[p[-j]] != i + j + 1) { |   int const max_end_off = utf8len_tab[p[first_off]] + first_off; | ||||||
|     return 0; |   if (max_end_off <= 0 || max_end_off > p_len) {  // illegal or incomplete sequence | ||||||
|  |     return (CharBoundsOff){ 0, 1 }; | ||||||
|   } |   } | ||||||
|   return i; |  | ||||||
|  |   for (int end_off = 1; end_off < max_end_off; end_off++) { | ||||||
|  |     if (!utf_is_trail_byte(p[end_off])) {  // not enough trail bytes | ||||||
|  |       return (CharBoundsOff){ 0, 1 }; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   return (CharBoundsOff){ .begin_off = (int8_t)-first_off, .end_off = (int8_t)max_end_off }; | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Return the offset from "p" to the first byte of the codepoint it points | /// Returns the offset in bytes from "p_in" to the first and one-past-end bytes | ||||||
| /// to. Can start anywhere in a stream of bytes. | /// of the codepoint it points to. | ||||||
| /// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters | /// "p_in" can point anywhere in a stream of bytes. | ||||||
| /// separately. | /// Stream must be NUL-terminated. | ||||||
| /// | /// Note: Counts individual codepoints of composed characters separately. | ||||||
| /// @param[in] base  Pointer to start of string | CharBoundsOff utf_cp_bounds(char const *base, char const *p_in) | ||||||
| /// @param[in] p     Pointer to byte for which to return the offset to the previous codepoint |   FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL | ||||||
| // |  | ||||||
| /// @return 0 if invalid sequence, else number of bytes to previous codepoint |  | ||||||
| int utf_cp_head_off(const char *base, const char *p) |  | ||||||
| { | { | ||||||
|   int i; |   return utf_cp_bounds_len(base, p_in, INT_MAX); | ||||||
|  |  | ||||||
|   if (*p == NUL) { |  | ||||||
|     return 0; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Find the first character that is not 10xx.xxxx |  | ||||||
|   for (i = 0; p - i >= base; i++) { |  | ||||||
|     if (((uint8_t)p[-i] & 0xc0) != 0x80) { |  | ||||||
|       break; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Find the last character that is 10xx.xxxx (condition terminates on NUL) |  | ||||||
|   int j = 1; |  | ||||||
|   while (((uint8_t)p[j] & 0xc0) == 0x80) { |  | ||||||
|     j++; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Check for illegal sequence. |  | ||||||
|   if (utf8len_tab[(uint8_t)p[-i]] != j + i) { |  | ||||||
|     return 0; |  | ||||||
|   } |  | ||||||
|   return i; |  | ||||||
| } | } | ||||||
|  |  | ||||||
| // Find the next illegal byte sequence. | // Find the next illegal byte sequence. | ||||||
|   | |||||||
| @@ -52,6 +52,16 @@ extern const uint8_t utf8len_tab[256]; | |||||||
| #define MB_PTR_BACK(s, p) \ | #define MB_PTR_BACK(s, p) \ | ||||||
|   (p -= utf_head_off((char *)(s), (char *)(p) - 1) + 1) |   (p -= utf_head_off((char *)(s), (char *)(p) - 1) + 1) | ||||||
|  |  | ||||||
|  | /// Check whether a given UTF-8 byte is a trailing byte (10xx.xxxx). | ||||||
|  | static inline bool utf_is_trail_byte(uint8_t byte) | ||||||
|  |   REAL_FATTR_CONST REAL_FATTR_ALWAYS_INLINE; | ||||||
|  |  | ||||||
|  | static inline bool utf_is_trail_byte(uint8_t const byte) | ||||||
|  | { | ||||||
|  |   // uint8_t is for clang to use smaller cmp | ||||||
|  |   return (uint8_t)(byte & 0xC0U) == 0x80U; | ||||||
|  | } | ||||||
|  |  | ||||||
| static inline CharInfo utf_ptr2CharInfo(char const *p_in) | static inline CharInfo utf_ptr2CharInfo(char const *p_in) | ||||||
|   REAL_FATTR_NONNULL_ALL REAL_FATTR_PURE REAL_FATTR_WARN_UNUSED_RESULT REAL_FATTR_ALWAYS_INLINE; |   REAL_FATTR_NONNULL_ALL REAL_FATTR_PURE REAL_FATTR_WARN_UNUSED_RESULT REAL_FATTR_ALWAYS_INLINE; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -66,3 +66,8 @@ typedef struct { | |||||||
|   char *ptr;     ///< Pointer to the first byte of the character. |   char *ptr;     ///< Pointer to the first byte of the character. | ||||||
|   CharInfo chr;  ///< Information about the character. |   CharInfo chr;  ///< Information about the character. | ||||||
| } StrCharInfo; | } StrCharInfo; | ||||||
|  |  | ||||||
|  | typedef struct { | ||||||
|  |   int8_t begin_off;  ///< Offset to the first byte of the codepoint. | ||||||
|  |   int8_t end_off;    ///< Offset to one past the end byte of the codepoint. | ||||||
|  | } CharBoundsOff; | ||||||
|   | |||||||
| @@ -203,4 +203,74 @@ describe('mbyte', function() | |||||||
|       ) |       ) | ||||||
|     end) |     end) | ||||||
|   end) |   end) | ||||||
|  |  | ||||||
|  |   describe('utf_cp_bounds_len', function() | ||||||
|  |     local to_cstr = helpers.to_cstr | ||||||
|  |  | ||||||
|  |     local tests = { | ||||||
|  |       { | ||||||
|  |         name = 'for valid string', | ||||||
|  |         str = 'iÀiiⱠiⱠⱠ𐀀i', | ||||||
|  |         offsets = { | ||||||
|  |           b = { 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0 }, | ||||||
|  |           e = { 1, 2, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 3, 2, 1, 4, 3, 2, 1, 1 }, | ||||||
|  |         }, | ||||||
|  |       }, | ||||||
|  |       { | ||||||
|  |         name = 'for string with incomplete sequence', | ||||||
|  |         str = 'i\xC3iÀⱠiÀ\xE2\xB1Ⱡ\xF0\x90\x80', | ||||||
|  |         offsets = { | ||||||
|  |           b = { 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0 }, | ||||||
|  |           e = { 1, 1, 1, 2, 1, 3, 2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1 }, | ||||||
|  |         }, | ||||||
|  |       }, | ||||||
|  |       { | ||||||
|  |         name = 'for string with trailing bytes after multibyte', | ||||||
|  |         str = 'iÀ\xA0Ⱡ\xA0Ⱡ𐀀\xA0i', | ||||||
|  |         offsets = { | ||||||
|  |           b = { 0, 0, 1, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 3, 0, 0 }, | ||||||
|  |           e = { 1, 2, 1, 1, 3, 2, 1, 1, 3, 2, 1, 4, 3, 2, 1, 1, 1 }, | ||||||
|  |         }, | ||||||
|  |       }, | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     for _, test in ipairs(tests) do | ||||||
|  |       itp(test.name, function() | ||||||
|  |         local cstr = to_cstr(test.str) | ||||||
|  |         local b_offsets, e_offsets = {}, {} | ||||||
|  |         for i = 1, #test.str do | ||||||
|  |           local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, #test.str - (i - 1)) | ||||||
|  |           table.insert(b_offsets, result.begin_off) | ||||||
|  |           table.insert(e_offsets, result.end_off) | ||||||
|  |         end | ||||||
|  |         eq(test.offsets, { b = b_offsets, e = e_offsets }) | ||||||
|  |       end) | ||||||
|  |     end | ||||||
|  |  | ||||||
|  |     itp('does not read before start', function() | ||||||
|  |       local str = '𐀀' | ||||||
|  |       local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } } | ||||||
|  |       local cstr = to_cstr(str) + 1 | ||||||
|  |       local b_offsets, e_offsets = {}, {} | ||||||
|  |       for i = 1, 3 do | ||||||
|  |         local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1)) | ||||||
|  |         table.insert(b_offsets, result.begin_off) | ||||||
|  |         table.insert(e_offsets, result.end_off) | ||||||
|  |       end | ||||||
|  |       eq(expected_offsets, { b = b_offsets, e = e_offsets }) | ||||||
|  |     end) | ||||||
|  |  | ||||||
|  |     itp('does not read past the end', function() | ||||||
|  |       local str = '𐀀' | ||||||
|  |       local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } } | ||||||
|  |       local cstr = to_cstr(str) | ||||||
|  |       local b_offsets, e_offsets = {}, {} | ||||||
|  |       for i = 1, 3 do | ||||||
|  |         local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1)) | ||||||
|  |         table.insert(b_offsets, result.begin_off) | ||||||
|  |         table.insert(e_offsets, result.end_off) | ||||||
|  |       end | ||||||
|  |       eq(expected_offsets, { b = b_offsets, e = e_offsets }) | ||||||
|  |     end) | ||||||
|  |   end) | ||||||
| end) | end) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 VanaIgr
					VanaIgr