lua: support getting UTF-32 and UTF-16 sizes of replaced text

2025-10-26 12:27:24 +00:00 · 2019-08-04 12:22:22 +02:00
parent b0e26199ec
commit c0993ed343
10 changed files with 192 additions and 30 deletions
--- a/runtime/doc/api.txt
+++ b/runtime/doc/api.txt
@@ -208,14 +208,17 @@ they are allowed.

 |nvim_buf_attach| will take keyword args for the callbacks. "on_lines" will
 receive parameters ("lines", {buf}, {changedtick}, {firstline}, {lastline},
-{new_lastline}, {old_bytecount}).
+{new_lastline}, {old_byte_size}[, {old_utf32_size}, {old_utf16_size}]).
 Unlike remote channel events the text contents are not passed. The new text can
 be accessed inside the callback as

    `vim.api.nvim_buf_get_lines(buf, firstline, new_lastline, true)`

-{old_bytecount} is the total size of the replaced region {firstline} to
-{lastline} in bytes, including the final newline after {lastline}.
+{old_byte_size} is the total size of the replaced region {firstline} to
+{lastline} in bytes, including the final newline after {lastline}. If
+`utf_sizes` is set to true in |nvim_buf_attach()| keyword args, then the
+UTF-32 and UTF-16 sizes of the deleted region are also passed as additional
+arguments {old_utf32_size} and {old_utf16_size}.

 "on_changedtick" is invoked when |b:changedtick| was incremented but no text
 was changed. The parameters recieved are ("changedtick", {buf}, {changedtick}).
--- a/src/nvim/api/buffer.c
+++ b/src/nvim/api/buffer.c
@@ -109,9 +109,11 @@ String buffer_get_line(Buffer buffer, Integer index, Error *err)
 ///        `nvim_buf_lines_event`. Otherwise, the first notification will be
 ///        a `nvim_buf_changedtick_event`. Not used for lua callbacks.
 /// @param  opts  Optional parameters.
-///               `on_lines`: lua callback received on change.
+///               `on_lines`:       lua callback received on change.
 ///               `on_changedtick`: lua callback received on changedtick
 ///                                 increment without text change.
+///               `utf_sizes`:      include UTF-32 and UTF-16 size of
+///                                 the replaced region.
 ///               See |api-buffer-updates-lua| for more information
 /// @param[out] err Error details, if any
 /// @return False when updates couldn't be enabled because the buffer isn't
@@ -156,6 +158,12 @@ Boolean nvim_buf_attach(uint64_t channel_id,
      }
      cb.on_detach = v->data.luaref;
      v->data.integer = LUA_NOREF;
+    } else if (is_lua && strequal("utf_sizes", k.data)) {
+      if (v->type != kObjectTypeBoolean) {
+        api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean");
+        goto error;
+      }
+      cb.utf_sizes = v->data.boolean;
    } else {
      api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
      goto error;
@@ -1196,6 +1204,7 @@ Dictionary nvim__buf_stats(Buffer buffer, Error *err)
  // NB: this should be zero at any time API functions are called,
  // this exists to debug issues
  PUT(rv, "dirty_bytes", INTEGER_OBJ((Integer)buf->deleted_bytes));
+
  return rv;
 }

--- a/src/nvim/buffer_defs.h
+++ b/src/nvim/buffer_defs.h
@@ -459,8 +459,9 @@ typedef struct {
  LuaRef on_lines;
  LuaRef on_changedtick;
  LuaRef on_detach;
+  bool utf_sizes;
 } BufUpdateCallbacks;
-#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF }
+#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF, false }

 #define BUF_HAS_QF_ENTRY 1
 #define BUF_HAS_LL_ENTRY 2
@@ -802,12 +803,24 @@ struct file_buffer {

  kvec_t(BufhlLine *) b_bufhl_move_space;  // temporary space for highlights

-  // array of channelids which have asked to receive updates for this
+  // array of channel_id:s which have asked to receive updates for this
  // buffer.
  kvec_t(uint64_t) update_channels;
+  // array of lua callbacks for buffer updates.
  kvec_t(BufUpdateCallbacks) update_callbacks;

+  // whether an update callback has requested codepoint size of deleted regions.
+  bool update_need_codepoints;
+
+  // Measurements of the deleted or replaced region since the last update
+  // event. Some consumers of buffer changes need to know the byte size (like
+  // tree-sitter) or the corresponding UTF-32/UTF-16 size (like LSP) of the
+  // deleted text.
  size_t deleted_bytes;
+  size_t deleted_codepoints;
+  size_t deleted_codeunits;
+
+  // The number of times the current line has been flushed in the memline.
  int flush_count;

  int b_diff_failed;    // internal diff failed for this buffer
--- a/src/nvim/buffer_updates.c
+++ b/src/nvim/buffer_updates.c
@@ -26,6 +26,9 @@ bool buf_updates_register(buf_T *buf, uint64_t channel_id,

  if (channel_id == LUA_INTERNAL_CALL) {
    kv_push(buf->update_callbacks, cb);
+    if (cb.utf_sizes) {
+      buf->update_need_codepoints = true;
+    }
    return true;
  }

@@ -169,7 +172,9 @@ void buf_updates_send_changes(buf_T *buf,
                              int64_t num_removed,
                              bool send_tick)
 {
-  size_t deleted_bytes = ml_flush_deleted_bytes(buf);
+  size_t deleted_codepoints, deleted_codeunits;
+  size_t deleted_bytes = ml_flush_deleted_bytes(buf, &deleted_codepoints,
+                                                &deleted_codeunits);

  if (!buf_updates_active(buf)) {
    return;
@@ -233,8 +238,8 @@ void buf_updates_send_changes(buf_T *buf,
    bool keep = true;
    if (cb.on_lines != LUA_NOREF) {
      Array args = ARRAY_DICT_INIT;
-      Object items[6];
-      args.size = 6;
+      Object items[8];
+      args.size = 6;  // may be increased to 8 below
      args.items = items;

      // the first argument is always the buffer handle
@@ -254,6 +259,11 @@ void buf_updates_send_changes(buf_T *buf,

      // byte count of previous contents
      args.items[5] = INTEGER_OBJ((Integer)deleted_bytes);
+      if (cb.utf_sizes) {
+        args.size = 8;
+        args.items[6] = INTEGER_OBJ((Integer)deleted_codepoints);
+        args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits);
+      }
      textlock++;
      Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true);
      textlock--;
--- a/src/nvim/fileio.c
+++ b/src/nvim/fileio.c
@@ -1756,6 +1756,8 @@ failed:
      linecnt--;
    }
    curbuf->deleted_bytes = 0;
+    curbuf->deleted_codepoints = 0;
+    curbuf->deleted_codeunits = 0;
    linecnt = curbuf->b_ml.ml_line_count - linecnt;
    if (filesize == 0)
      linecnt = 0;
--- a/src/nvim/globals.h
+++ b/src/nvim/globals.h
@@ -627,6 +627,8 @@ EXTERN pos_T Insstart_orig;
 EXTERN int orig_line_count INIT(= 0);       /* Line count when "gR" started */
 EXTERN int vr_lines_changed INIT(= 0);      /* #Lines changed by "gR" so far */

+// increase around internal delete/replace
+EXTERN int inhibit_delete_count INIT(= 0);

 /*
 * These flags are set based upon 'fileencoding'.
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -1438,6 +1438,39 @@ int utf16_to_utf8(const wchar_t *strw, char **str)

 #endif

+/// Measure the length of a string in corresponding UTF-32 and UTF-16 units.
+///
+/// Invalid UTF-8 bytes, or embedded surrogates, count as one code point/unit
+/// each.
+///
+/// The out parameters are incremented. This is used to measure the size of
+/// a buffer region consisting of multiple line segments.
+///
+/// @param s the string
+/// @param len maximum length (an earlier NUL terminates)
+/// @param[out] codepoints incremented with UTF-32 code point size
+/// @param[out] codeunits incremented with UTF-16 code unit size
+void mb_utflen(const char_u *s, size_t len, size_t *codepoints,
+               size_t *codeunits)
+  FUNC_ATTR_NONNULL_ALL
+{
+  size_t count = 0, extra = 0;
+  size_t clen;
+  for (size_t i = 0; i < len && s[i] != NUL; i += clen) {
+    clen = utf_ptr2len_len(s+i, len-i);
+    // NB: gets the byte value of invalid sequence bytes.
+    // we only care whether the char fits in the BMP or not
+    int c = (clen > 1) ? utf_ptr2char(s+i) : s[i];
+    count++;
+    if (c > 0xFFFF) {
+      extra++;
+    }
+  }
+  *codepoints += count;
+  *codeunits += count + extra;
+}
+
+
 /*
 * Version of strnicmp() that handles multi-byte characters.
 * Needed for Big5, Shift-JIS and UTF-8 encoding.  Other DBCS encodings can
--- a/src/nvim/memline.c
+++ b/src/nvim/memline.c
@@ -2383,6 +2383,23 @@ static int ml_append_int(
  return OK;
 }

+void ml_add_deleted_len(char_u *ptr, ssize_t len)
+{
+  if (inhibit_delete_count) {
+    return;
+  }
+  if (len == -1) {
+    len = STRLEN(ptr);
+  }
+  curbuf->deleted_bytes += len+1;
+  if (curbuf->update_need_codepoints) {
+    mb_utflen(ptr, len, &curbuf->deleted_codepoints,
+              &curbuf->deleted_codeunits);
+    curbuf->deleted_codepoints++;  // NL char
+    curbuf->deleted_codeunits++;
+  }
+}
+
 /*
 * Replace line lnum, with buffering, in current buffer.
 *
@@ -2408,19 +2425,17 @@ int ml_replace(linenr_T lnum, char_u *line, bool copy)
  if (copy) {
    line = vim_strsave(line);
  }
-  if (curbuf->b_ml.ml_line_lnum != lnum) {           /* other line buffered */
-    ml_flush_line(curbuf);                          /* flush it */
-  } else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) {  /* same line allocated */
-    // TODO FIXME: see other "TODO FIXME"
-    curbuf->deleted_bytes += STRLEN(curbuf->b_ml.ml_line_ptr)+1;
-    xfree(curbuf->b_ml.ml_line_ptr);             /* free it */
-    readlen = false; // already read it.
+  if (curbuf->b_ml.ml_line_lnum != lnum) {  // other line buffered
+    ml_flush_line(curbuf);  // flush it
+  } else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) {  // same line allocated
+    ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, -1);
+    readlen = false;  // already added the length
+
+    xfree(curbuf->b_ml.ml_line_ptr);  // free it
  }

-  if (readlen) {
-    if (true) { // TODO: buffer updates active
-      curbuf->deleted_bytes += STRLEN(ml_get_buf(curbuf, lnum, false))+1;
-    }
+  if (readlen && kv_size(curbuf->update_callbacks)) {
+    ml_add_deleted_len(ml_get_buf(curbuf, lnum, false), -1);
  }

  curbuf->b_ml.ml_line_ptr = line;
@@ -2504,7 +2519,10 @@ static int ml_delete_int(buf_T *buf, linenr_T lnum, bool message)
  else
    line_size = ((dp->db_index[idx - 1]) & DB_INDEX_MASK) - line_start;

-  buf->deleted_bytes += line_size;
+  // Line should always have an NL char internally (represented as NUL),
+  // even if 'noeol' is set.
+  assert(line_size >= 1);
+  ml_add_deleted_len((char_u *)dp + line_start, line_size-1);

  /*
   * special case: If there is only one line in the data block it becomes empty.
@@ -2690,10 +2708,14 @@ void ml_clearmarked(void)
  return;
 }

-size_t ml_flush_deleted_bytes(buf_T *buf)
+size_t ml_flush_deleted_bytes(buf_T *buf, size_t *codepoints, size_t *codeunits)
 {
  size_t ret = buf->deleted_bytes;
+  *codepoints = buf->deleted_codepoints;
+  *codeunits = buf->deleted_codeunits;
  buf->deleted_bytes = 0;
+  buf->deleted_codepoints = 0;
+  buf->deleted_codeunits = 0;
  return ret;
 }

--- a/src/nvim/misc1.c
+++ b/src/nvim/misc1.c
@@ -780,6 +780,7 @@ open_line (
    did_append = FALSE;
  }

+  inhibit_delete_count++;
  if (newindent
      || did_si
      ) {
@@ -821,6 +822,7 @@ open_line (
      did_si = false;
    }
  }
+  inhibit_delete_count--;

  /*
   * In REPLACE mode, for each character in the extra leader, there must be
@@ -1685,7 +1687,7 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
  bool was_alloced = ml_line_alloced();     // check if oldp was allocated
  char_u *newp;
  if (was_alloced) {
-    curbuf->deleted_bytes += (size_t)oldlen+1;
+    ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, oldlen);
    newp = oldp;                            // use same allocated memory
  } else {                                  // need to allocate a new line
    newp = xmalloc((size_t)(oldlen + 1 - count));
--- a/test/functional/lua/buffer_updates_spec.lua
+++ b/test/functional/lua/buffer_updates_spec.lua
@@ -13,7 +13,8 @@ local origlines = {"original line 1",
                   "original line 3",
                   "original line 4",
                   "original line 5",
-                   "original line 6"}
+                   "original line 6",
+                   "    indented line"}

 describe('lua: buffer event callbacks', function()
  before_each(function()
@@ -21,14 +22,14 @@ describe('lua: buffer event callbacks', function()
    exec_lua([[
      local events = {}

-      function test_register(bufnr, id, changedtick)
+      function test_register(bufnr, id, changedtick, utf_sizes)
        local function callback(...)
          table.insert(events, {id, ...})
          if test_unreg == id then
            return true
          end
        end
-        local opts = {on_lines=callback, on_detach=callback}
+        local opts = {on_lines=callback, on_detach=callback, utf_sizes=utf_sizes}
        if changedtick then
          opts.on_changedtick = callback
        end
@@ -48,18 +49,26 @@ describe('lua: buffer event callbacks', function()
  -- assert the wrong thing), but masks errors with unflushed lines (as
  -- nvim_buf_get_offset forces a flush of the memline). To be safe run the
  -- test both ways.
-  local function check(verify)
+  local function check(verify,utf_sizes)
    local lastsize
    meths.buf_set_lines(0, 0, -1, true, origlines)
    if verify then
      lastsize = meths.buf_get_offset(0, meths.buf_line_count(0))
    end
-    exec_lua("return test_register(...)", 0, "test1")
+    exec_lua("return test_register(...)", 0, "test1",false,utf_sizes)
    local tick = meths.buf_get_changedtick(0)

    local verify_name = "test1"
    local function check_events(expected)
      local events = exec_lua("return get_events(...)" )
+      if utf_sizes then
+        -- this test case uses ASCII only, so sizes should be the same.
+        -- Unicode is tested below.
+        for _, event in ipairs(expected) do
+          event[9] = event[8]
+          event[10] = event[8]
+        end
+      end
      eq(expected, events)
      if verify then
        for _, event in ipairs(events) do
@@ -75,6 +84,7 @@ describe('lua: buffer event callbacks', function()
      end
    end

+    command('set autoindent')
    command('normal! GyyggP')
    tick = tick + 1
    check_events({{ "test1", "lines", 1, tick, 0, 0, 1, 0}})
@@ -83,7 +93,7 @@ describe('lua: buffer event callbacks', function()
    tick = tick + 1
    check_events({{ "test1", "lines", 1, tick, 3, 5, 4, 32 }})

-    exec_lua("return test_register(...)", 0, "test2", true)
+    exec_lua("return test_register(...)", 0, "test2", true, utf_sizes)
    tick = tick + 1
    command('undo')

@@ -124,7 +134,13 @@ describe('lua: buffer event callbacks', function()
    tick = tick + 1
    check_events({{ "test2", "lines", 1, tick, 4, 5, 5, 19 }})

-    feed('<esc>')
+    feed('<esc>Go')
+    tick = tick + 1
+    check_events({{ "test2", "lines", 1, tick, 11, 11, 12, 0 }})
+
+    feed('x')
+    tick = tick + 1
+    check_events({{ "test2", "lines", 1, tick, 11, 12, 12, 5 }})

    command('bwipe!')
    check_events({{ "test2", "detach", 1 }})
@@ -137,4 +153,54 @@ describe('lua: buffer event callbacks', function()
  it('works with verify', function()
    check(true)
  end)
+
+  it('works with utf_sizes and ASCII text', function()
+    check(false,true)
+  end)
+
+  it('works with utf_sizes and unicode text', function()
+    local unicode_text = {"ascii text",
+                          "latin text åäö",
+                          "BMP text ɧ αλφά",
+                          "BMP text 汉语 ↥↧",
+                          "SMP 🤦 🦄🦃",
+                          "combining å بِيَّة"}
+    meths.buf_set_lines(0, 0, -1, true, unicode_text)
+    feed('gg')
+    exec_lua("return test_register(...)", 0, "test1", false, true)
+    local tick = meths.buf_get_changedtick(0)
+
+    feed('dd')
+    tick = tick + 1
+    eq({{ "test1", "lines", 1, tick, 0, 1, 0, 11, 11, 11 }}, exec_lua("return get_events(...)" ))
+
+    feed('A<bs>')
+    tick = tick + 1
+    eq({{ "test1", "lines", 1, tick, 0, 1, 1, 18, 15, 15 }}, exec_lua("return get_events(...)" ))
+
+    feed('<esc>jylp')
+    tick = tick + 1
+    eq({{ "test1", "lines", 1, tick, 1, 2, 2, 21, 16, 16 }}, exec_lua("return get_events(...)" ))
+
+    feed('+eea<cr>')
+    tick = tick + 1
+    eq({{ "test1", "lines", 1, tick, 2, 3, 4, 23, 15, 15 }}, exec_lua("return get_events(...)" ))
+
+    feed('<esc>jdw')
+    tick = tick + 1
+    -- non-BMP chars count as 2 UTF-16 code units
+    eq({{ "test1", "lines", 1, tick, 4, 5, 5, 18, 9, 12 }}, exec_lua("return get_events(...)" ))
+
+    feed('+rx')
+    tick = tick + 1
+    -- count the individual codepoints of a composed character.
+    eq({{ "test1", "lines", 1, tick, 5, 6, 6, 27, 20, 20 }}, exec_lua("return get_events(...)" ))
+
+    feed('kJ')
+    tick = tick + 1
+    -- NB: this is inefficient (but not really wrong).
+    eq({{ "test1", "lines", 1,   tick, 4, 5, 5, 14, 5, 8 },
+        { "test1", "lines", 1, tick+1, 5, 6, 5, 27, 20, 20 }}, exec_lua("return get_events(...)" ))
+  end)
+
 end)