mirror of
				https://github.com/neovim/neovim.git
				synced 2025-10-26 12:27:24 +00:00 
			
		
		
		
	lua: support getting UTF-32 and UTF-16 sizes of replaced text
This commit is contained in:
		| @@ -208,14 +208,17 @@ they are allowed. | ||||
|  | ||||
| |nvim_buf_attach| will take keyword args for the callbacks. "on_lines" will | ||||
| receive parameters ("lines", {buf}, {changedtick}, {firstline}, {lastline}, | ||||
| {new_lastline}, {old_bytecount}). | ||||
| {new_lastline}, {old_byte_size}[, {old_utf32_size}, {old_utf16_size}]). | ||||
| Unlike remote channel events the text contents are not passed. The new text can | ||||
| be accessed inside the callback as | ||||
|  | ||||
|     `vim.api.nvim_buf_get_lines(buf, firstline, new_lastline, true)` | ||||
|  | ||||
| {old_bytecount} is the total size of the replaced region {firstline} to | ||||
| {lastline} in bytes, including the final newline after {lastline}. | ||||
| {old_byte_size} is the total size of the replaced region {firstline} to | ||||
| {lastline} in bytes, including the final newline after {lastline}. If | ||||
| `utf_sizes` is set to true in |nvim_buf_attach()| keyword args, then the | ||||
| UTF-32 and UTF-16 sizes of the deleted region are also passed as additional | ||||
| arguments {old_utf32_size} and {old_utf16_size}. | ||||
|  | ||||
| "on_changedtick" is invoked when |b:changedtick| was incremented but no text | ||||
| was changed. The parameters received are ("changedtick", {buf}, {changedtick}). | ||||
|   | ||||
| @@ -109,9 +109,11 @@ String buffer_get_line(Buffer buffer, Integer index, Error *err) | ||||
| ///        `nvim_buf_lines_event`. Otherwise, the first notification will be | ||||
| ///        a `nvim_buf_changedtick_event`. Not used for lua callbacks. | ||||
| /// @param  opts  Optional parameters. | ||||
| ///               `on_lines`: lua callback received on change. | ||||
| ///               `on_lines`:       lua callback received on change. | ||||
| ///               `on_changedtick`: lua callback received on changedtick | ||||
| ///                                 increment without text change. | ||||
| ///               `utf_sizes`:      include UTF-32 and UTF-16 size of | ||||
| ///                                 the replaced region. | ||||
| ///               See |api-buffer-updates-lua| for more information | ||||
| /// @param[out] err Error details, if any | ||||
| /// @return False when updates couldn't be enabled because the buffer isn't | ||||
| @@ -156,6 +158,12 @@ Boolean nvim_buf_attach(uint64_t channel_id, | ||||
|       } | ||||
|       cb.on_detach = v->data.luaref; | ||||
|       v->data.integer = LUA_NOREF; | ||||
|     } else if (is_lua && strequal("utf_sizes", k.data)) { | ||||
|       if (v->type != kObjectTypeBoolean) { | ||||
|         api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean"); | ||||
|         goto error; | ||||
|       } | ||||
|       cb.utf_sizes = v->data.boolean; | ||||
|     } else { | ||||
|       api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data); | ||||
|       goto error; | ||||
| @@ -1196,6 +1204,7 @@ Dictionary nvim__buf_stats(Buffer buffer, Error *err) | ||||
|   // NB: this should be zero at any time API functions are called, | ||||
|   // this exists to debug issues | ||||
|   PUT(rv, "dirty_bytes", INTEGER_OBJ((Integer)buf->deleted_bytes)); | ||||
|  | ||||
|   return rv; | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -459,8 +459,9 @@ typedef struct { | ||||
|   LuaRef on_lines; | ||||
|   LuaRef on_changedtick; | ||||
|   LuaRef on_detach; | ||||
|   bool utf_sizes; | ||||
| } BufUpdateCallbacks; | ||||
| #define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF } | ||||
| #define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF, false } | ||||
|  | ||||
| #define BUF_HAS_QF_ENTRY 1 | ||||
| #define BUF_HAS_LL_ENTRY 2 | ||||
| @@ -802,12 +803,24 @@ struct file_buffer { | ||||
|  | ||||
|   kvec_t(BufhlLine *) b_bufhl_move_space;  // temporary space for highlights | ||||
|  | ||||
|   // array of channelids which have asked to receive updates for this | ||||
|   // array of channel_id:s which have asked to receive updates for this | ||||
|   // buffer. | ||||
|   kvec_t(uint64_t) update_channels; | ||||
|   // array of lua callbacks for buffer updates. | ||||
|   kvec_t(BufUpdateCallbacks) update_callbacks; | ||||
|  | ||||
|   // whether an update callback has requested codepoint size of deleted regions. | ||||
|   bool update_need_codepoints; | ||||
|  | ||||
|   // Measurements of the deleted or replaced region since the last update | ||||
|   // event. Some consumers of buffer changes need to know the byte size (like | ||||
|   // tree-sitter) or the corresponding UTF-32/UTF-16 size (like LSP) of the | ||||
|   // deleted text. | ||||
|   size_t deleted_bytes; | ||||
|   size_t deleted_codepoints; | ||||
|   size_t deleted_codeunits; | ||||
|  | ||||
|   // The number of times the current line has been flushed in the memline. | ||||
|   int flush_count; | ||||
|  | ||||
|   int b_diff_failed;    // internal diff failed for this buffer | ||||
|   | ||||
| @@ -26,6 +26,9 @@ bool buf_updates_register(buf_T *buf, uint64_t channel_id, | ||||
|  | ||||
|   if (channel_id == LUA_INTERNAL_CALL) { | ||||
|     kv_push(buf->update_callbacks, cb); | ||||
|     if (cb.utf_sizes) { | ||||
|       buf->update_need_codepoints = true; | ||||
|     } | ||||
|     return true; | ||||
|   } | ||||
|  | ||||
| @@ -169,7 +172,9 @@ void buf_updates_send_changes(buf_T *buf, | ||||
|                               int64_t num_removed, | ||||
|                               bool send_tick) | ||||
| { | ||||
|   size_t deleted_bytes = ml_flush_deleted_bytes(buf); | ||||
|   size_t deleted_codepoints, deleted_codeunits; | ||||
|   size_t deleted_bytes = ml_flush_deleted_bytes(buf, &deleted_codepoints, | ||||
|                                                 &deleted_codeunits); | ||||
|  | ||||
|   if (!buf_updates_active(buf)) { | ||||
|     return; | ||||
| @@ -233,8 +238,8 @@ void buf_updates_send_changes(buf_T *buf, | ||||
|     bool keep = true; | ||||
|     if (cb.on_lines != LUA_NOREF) { | ||||
|       Array args = ARRAY_DICT_INIT; | ||||
|       Object items[6]; | ||||
|       args.size = 6; | ||||
|       Object items[8]; | ||||
|       args.size = 6;  // may be increased to 8 below | ||||
|       args.items = items; | ||||
|  | ||||
|       // the first argument is always the buffer handle | ||||
| @@ -254,6 +259,11 @@ void buf_updates_send_changes(buf_T *buf, | ||||
|  | ||||
|       // byte count of previous contents | ||||
|       args.items[5] = INTEGER_OBJ((Integer)deleted_bytes); | ||||
|       if (cb.utf_sizes) { | ||||
|         args.size = 8; | ||||
|         args.items[6] = INTEGER_OBJ((Integer)deleted_codepoints); | ||||
|         args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits); | ||||
|       } | ||||
|       textlock++; | ||||
|       Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true); | ||||
|       textlock--; | ||||
|   | ||||
| @@ -1756,6 +1756,8 @@ failed: | ||||
|       linecnt--; | ||||
|     } | ||||
|     curbuf->deleted_bytes = 0; | ||||
|     curbuf->deleted_codepoints = 0; | ||||
|     curbuf->deleted_codeunits = 0; | ||||
|     linecnt = curbuf->b_ml.ml_line_count - linecnt; | ||||
|     if (filesize == 0) | ||||
|       linecnt = 0; | ||||
|   | ||||
| @@ -627,6 +627,8 @@ EXTERN pos_T Insstart_orig; | ||||
| EXTERN int orig_line_count INIT(= 0);       /* Line count when "gR" started */ | ||||
| EXTERN int vr_lines_changed INIT(= 0);      /* #Lines changed by "gR" so far */ | ||||
|  | ||||
| // increase around internal delete/replace | ||||
| EXTERN int inhibit_delete_count INIT(= 0); | ||||
|  | ||||
| /* | ||||
|  * These flags are set based upon 'fileencoding'. | ||||
|   | ||||
| @@ -1438,6 +1438,39 @@ int utf16_to_utf8(const wchar_t *strw, char **str) | ||||
|  | ||||
| #endif | ||||
|  | ||||
| /// Measure the length of a string in corresponding UTF-32 and UTF-16 units. | ||||
| /// | ||||
| /// Invalid UTF-8 bytes, or embedded surrogates, count as one code point/unit | ||||
| /// each. | ||||
| /// | ||||
| /// The out parameters are incremented, not overwritten. This is used to | ||||
| /// measure the size of a buffer region consisting of multiple line segments. | ||||
| /// | ||||
| /// @param s the string | ||||
| /// @param len maximum length in bytes (an earlier NUL terminates) | ||||
| /// @param[in,out] codepoints incremented with UTF-32 code point size | ||||
| /// @param[in,out] codeunits incremented with UTF-16 code unit size | ||||
| void mb_utflen(const char_u *s, size_t len, size_t *codepoints, | ||||
|                size_t *codeunits) | ||||
|   FUNC_ATTR_NONNULL_ALL | ||||
| { | ||||
|   size_t count = 0, extra = 0; | ||||
|   size_t clen; | ||||
|   for (size_t i = 0; i < len && s[i] != NUL; i += clen) { | ||||
|     clen = utf_ptr2len_len(s+i, len-i); | ||||
|     // NB: a one-byte step (e.g. an invalid sequence byte) uses the byte value. | ||||
|     // We only care whether the char fits in the BMP; if not, it adds to `extra`. | ||||
|     int c = (clen > 1) ? utf_ptr2char(s+i) : s[i]; | ||||
|     count++; | ||||
|     if (c > 0xFFFF) { | ||||
|       extra++; | ||||
|     } | ||||
|   } | ||||
|   *codepoints += count; | ||||
|   *codeunits += count + extra; | ||||
| } | ||||
|  | ||||
|  | ||||
| /* | ||||
|  * Version of strnicmp() that handles multi-byte characters. | ||||
|  * Needed for Big5, Shift-JIS and UTF-8 encoding.  Other DBCS encodings can | ||||
|   | ||||
| @@ -2383,6 +2383,23 @@ static int ml_append_int( | ||||
|   return OK; | ||||
| } | ||||
|  | ||||
| /// Account for the size of a line that is being deleted or replaced in curbuf. | ||||
| /// | ||||
| /// Adds the byte size of the text, plus one for the NL char, to | ||||
| /// curbuf->deleted_bytes. When curbuf->update_need_codepoints is set, the | ||||
| /// UTF-32 and UTF-16 sizes are accumulated as well, using mb_utflen(). | ||||
| /// Does nothing while inhibit_delete_count is nonzero. | ||||
| /// | ||||
| /// @param ptr  the text of the line | ||||
| /// @param len  length of the text in bytes, not counting the NL char, or -1 | ||||
| ///             to have it measured with STRLEN() | ||||
| void ml_add_deleted_len(char_u *ptr, ssize_t len) | ||||
| { | ||||
|   if (inhibit_delete_count) { | ||||
|     return; | ||||
|   } | ||||
|   if (len == -1) { | ||||
|     len = STRLEN(ptr); | ||||
|   } | ||||
|   curbuf->deleted_bytes += len+1; | ||||
|   if (curbuf->update_need_codepoints) { | ||||
|     mb_utflen(ptr, len, &curbuf->deleted_codepoints, | ||||
|               &curbuf->deleted_codeunits); | ||||
|     curbuf->deleted_codepoints++;  // NL char | ||||
|     curbuf->deleted_codeunits++; | ||||
|   } | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * Replace line lnum, with buffering, in current buffer. | ||||
|  * | ||||
| @@ -2408,19 +2425,17 @@ int ml_replace(linenr_T lnum, char_u *line, bool copy) | ||||
|   if (copy) { | ||||
|     line = vim_strsave(line); | ||||
|   } | ||||
|   if (curbuf->b_ml.ml_line_lnum != lnum) {           /* other line buffered */ | ||||
|     ml_flush_line(curbuf);                          /* flush it */ | ||||
|   } else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) {  /* same line allocated */ | ||||
|     // TODO FIXME: see other "TODO FIXME" | ||||
|     curbuf->deleted_bytes += STRLEN(curbuf->b_ml.ml_line_ptr)+1; | ||||
|     xfree(curbuf->b_ml.ml_line_ptr);             /* free it */ | ||||
|     readlen = false; // already read it. | ||||
|   if (curbuf->b_ml.ml_line_lnum != lnum) {  // other line buffered | ||||
|     ml_flush_line(curbuf);  // flush it | ||||
|   } else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) {  // same line allocated | ||||
|     ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, -1); | ||||
|     readlen = false;  // already added the length | ||||
|  | ||||
|     xfree(curbuf->b_ml.ml_line_ptr);  // free it | ||||
|   } | ||||
|  | ||||
|   if (readlen) { | ||||
|     if (true) { // TODO: buffer updates active | ||||
|       curbuf->deleted_bytes += STRLEN(ml_get_buf(curbuf, lnum, false))+1; | ||||
|     } | ||||
|   if (readlen && kv_size(curbuf->update_callbacks)) { | ||||
|     ml_add_deleted_len(ml_get_buf(curbuf, lnum, false), -1); | ||||
|   } | ||||
|  | ||||
|   curbuf->b_ml.ml_line_ptr = line; | ||||
| @@ -2504,7 +2519,10 @@ static int ml_delete_int(buf_T *buf, linenr_T lnum, bool message) | ||||
|   else | ||||
|     line_size = ((dp->db_index[idx - 1]) & DB_INDEX_MASK) - line_start; | ||||
|  | ||||
|   buf->deleted_bytes += line_size; | ||||
|   // Line should always have an NL char internally (represented as NUL), | ||||
|   // even if 'noeol' is set. | ||||
|   assert(line_size >= 1); | ||||
|   ml_add_deleted_len((char_u *)dp + line_start, line_size-1); | ||||
|  | ||||
|   /* | ||||
|    * special case: If there is only one line in the data block it becomes empty. | ||||
| @@ -2690,10 +2708,14 @@ void ml_clearmarked(void) | ||||
|   return; | ||||
| } | ||||
|  | ||||
| size_t ml_flush_deleted_bytes(buf_T *buf) | ||||
| size_t ml_flush_deleted_bytes(buf_T *buf, size_t *codepoints, size_t *codeunits) | ||||
| { | ||||
|   size_t ret = buf->deleted_bytes; | ||||
|   *codepoints = buf->deleted_codepoints; | ||||
|   *codeunits = buf->deleted_codeunits; | ||||
|   buf->deleted_bytes = 0; | ||||
|   buf->deleted_codepoints = 0; | ||||
|   buf->deleted_codeunits = 0; | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -780,6 +780,7 @@ open_line ( | ||||
|     did_append = FALSE; | ||||
|   } | ||||
|  | ||||
|   inhibit_delete_count++; | ||||
|   if (newindent | ||||
|       || did_si | ||||
|       ) { | ||||
| @@ -821,6 +822,7 @@ open_line ( | ||||
|       did_si = false; | ||||
|     } | ||||
|   } | ||||
|   inhibit_delete_count--; | ||||
|  | ||||
|   /* | ||||
|    * In REPLACE mode, for each character in the extra leader, there must be | ||||
| @@ -1685,7 +1687,7 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine) | ||||
|   bool was_alloced = ml_line_alloced();     // check if oldp was allocated | ||||
|   char_u *newp; | ||||
|   if (was_alloced) { | ||||
|     curbuf->deleted_bytes += (size_t)oldlen+1; | ||||
|     ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, oldlen); | ||||
|     newp = oldp;                            // use same allocated memory | ||||
|   } else {                                  // need to allocate a new line | ||||
|     newp = xmalloc((size_t)(oldlen + 1 - count)); | ||||
|   | ||||
| @@ -13,7 +13,8 @@ local origlines = {"original line 1", | ||||
|                    "original line 3", | ||||
|                    "original line 4", | ||||
|                    "original line 5", | ||||
|                    "original line 6"} | ||||
|                    "original line 6", | ||||
|                    "    indented line"} | ||||
|  | ||||
| describe('lua: buffer event callbacks', function() | ||||
|   before_each(function() | ||||
| @@ -21,14 +22,14 @@ describe('lua: buffer event callbacks', function() | ||||
|     exec_lua([[ | ||||
|       local events = {} | ||||
|  | ||||
|       function test_register(bufnr, id, changedtick) | ||||
|       function test_register(bufnr, id, changedtick, utf_sizes) | ||||
|         local function callback(...) | ||||
|           table.insert(events, {id, ...}) | ||||
|           if test_unreg == id then | ||||
|             return true | ||||
|           end | ||||
|         end | ||||
|         local opts = {on_lines=callback, on_detach=callback} | ||||
|         local opts = {on_lines=callback, on_detach=callback, utf_sizes=utf_sizes} | ||||
|         if changedtick then | ||||
|           opts.on_changedtick = callback | ||||
|         end | ||||
| @@ -48,18 +49,26 @@ describe('lua: buffer event callbacks', function() | ||||
|   -- assert the wrong thing), but masks errors with unflushed lines (as | ||||
|   -- nvim_buf_get_offset forces a flush of the memline). To be safe run the | ||||
|   -- test both ways. | ||||
|   local function check(verify) | ||||
|   local function check(verify,utf_sizes) | ||||
|     local lastsize | ||||
|     meths.buf_set_lines(0, 0, -1, true, origlines) | ||||
|     if verify then | ||||
|       lastsize = meths.buf_get_offset(0, meths.buf_line_count(0)) | ||||
|     end | ||||
|     exec_lua("return test_register(...)", 0, "test1") | ||||
|     exec_lua("return test_register(...)", 0, "test1",false,utf_sizes) | ||||
|     local tick = meths.buf_get_changedtick(0) | ||||
|  | ||||
|     local verify_name = "test1" | ||||
|     local function check_events(expected) | ||||
|       local events = exec_lua("return get_events(...)" ) | ||||
|       if utf_sizes then | ||||
|         -- this test case uses ASCII only, so sizes should be the same. | ||||
|         -- Unicode is tested below. | ||||
|         for _, event in ipairs(expected) do | ||||
|           event[9] = event[8] | ||||
|           event[10] = event[8] | ||||
|         end | ||||
|       end | ||||
|       eq(expected, events) | ||||
|       if verify then | ||||
|         for _, event in ipairs(events) do | ||||
| @@ -75,6 +84,7 @@ describe('lua: buffer event callbacks', function() | ||||
|       end | ||||
|     end | ||||
|  | ||||
|     command('set autoindent') | ||||
|     command('normal! GyyggP') | ||||
|     tick = tick + 1 | ||||
|     check_events({{ "test1", "lines", 1, tick, 0, 0, 1, 0}}) | ||||
| @@ -83,7 +93,7 @@ describe('lua: buffer event callbacks', function() | ||||
|     tick = tick + 1 | ||||
|     check_events({{ "test1", "lines", 1, tick, 3, 5, 4, 32 }}) | ||||
|  | ||||
|     exec_lua("return test_register(...)", 0, "test2", true) | ||||
|     exec_lua("return test_register(...)", 0, "test2", true, utf_sizes) | ||||
|     tick = tick + 1 | ||||
|     command('undo') | ||||
|  | ||||
| @@ -124,7 +134,13 @@ describe('lua: buffer event callbacks', function() | ||||
|     tick = tick + 1 | ||||
|     check_events({{ "test2", "lines", 1, tick, 4, 5, 5, 19 }}) | ||||
|  | ||||
|     feed('<esc>') | ||||
|     feed('<esc>Go') | ||||
|     tick = tick + 1 | ||||
|     check_events({{ "test2", "lines", 1, tick, 11, 11, 12, 0 }}) | ||||
|  | ||||
|     feed('x') | ||||
|     tick = tick + 1 | ||||
|     check_events({{ "test2", "lines", 1, tick, 11, 12, 12, 5 }}) | ||||
|  | ||||
|     command('bwipe!') | ||||
|     check_events({{ "test2", "detach", 1 }}) | ||||
| @@ -137,4 +153,54 @@ describe('lua: buffer event callbacks', function() | ||||
|   it('works with verify', function() | ||||
|     check(true) | ||||
|   end) | ||||
|  | ||||
|   it('works with utf_sizes and ASCII text', function() | ||||
|     check(false,true) | ||||
|   end) | ||||
|  | ||||
|   it('works with utf_sizes and unicode text', function() | ||||
|     local unicode_text = {"ascii text", | ||||
|                           "latin text åäö", | ||||
|                           "BMP text ɧ αλφά", | ||||
|                           "BMP text 汉语 ↥↧", | ||||
|                           "SMP 🤦 🦄🦃", | ||||
|                           "combining å بِيَّة"} | ||||
|     meths.buf_set_lines(0, 0, -1, true, unicode_text) | ||||
|     feed('gg') | ||||
|     exec_lua("return test_register(...)", 0, "test1", false, true) | ||||
|     local tick = meths.buf_get_changedtick(0) | ||||
|  | ||||
|     feed('dd') | ||||
|     tick = tick + 1 | ||||
|     eq({{ "test1", "lines", 1, tick, 0, 1, 0, 11, 11, 11 }}, exec_lua("return get_events(...)" )) | ||||
|  | ||||
|     feed('A<bs>') | ||||
|     tick = tick + 1 | ||||
|     eq({{ "test1", "lines", 1, tick, 0, 1, 1, 18, 15, 15 }}, exec_lua("return get_events(...)" )) | ||||
|  | ||||
|     feed('<esc>jylp') | ||||
|     tick = tick + 1 | ||||
|     eq({{ "test1", "lines", 1, tick, 1, 2, 2, 21, 16, 16 }}, exec_lua("return get_events(...)" )) | ||||
|  | ||||
|     feed('+eea<cr>') | ||||
|     tick = tick + 1 | ||||
|     eq({{ "test1", "lines", 1, tick, 2, 3, 4, 23, 15, 15 }}, exec_lua("return get_events(...)" )) | ||||
|  | ||||
|     feed('<esc>jdw') | ||||
|     tick = tick + 1 | ||||
|     -- non-BMP chars count as 2 UTF-16 codeunits | ||||
|     eq({{ "test1", "lines", 1, tick, 4, 5, 5, 18, 9, 12 }}, exec_lua("return get_events(...)" )) | ||||
|  | ||||
|     feed('+rx') | ||||
|     tick = tick + 1 | ||||
|     -- count the individual codepoints of a composed character. | ||||
|     eq({{ "test1", "lines", 1, tick, 5, 6, 6, 27, 20, 20 }}, exec_lua("return get_events(...)" )) | ||||
|  | ||||
|     feed('kJ') | ||||
|     tick = tick + 1 | ||||
|     -- NB: this is inefficient (but not really wrong). | ||||
|     eq({{ "test1", "lines", 1,   tick, 4, 5, 5, 14, 5, 8 }, | ||||
|         { "test1", "lines", 1, tick+1, 5, 6, 5, 27, 20, 20 }}, exec_lua("return get_events(...)" )) | ||||
|   end) | ||||
|  | ||||
| end) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Björn Linse
					Björn Linse