lua: support getting UTF-32 and UTF-16 sizes of replaced text

This commit is contained in:
Björn Linse
2019-08-04 12:22:22 +02:00
parent b0e26199ec
commit c0993ed343
10 changed files with 192 additions and 30 deletions

View File

@@ -208,14 +208,17 @@ they are allowed.
|nvim_buf_attach| will take keyword args for the callbacks. "on_lines" will |nvim_buf_attach| will take keyword args for the callbacks. "on_lines" will
receive parameters ("lines", {buf}, {changedtick}, {firstline}, {lastline}, receive parameters ("lines", {buf}, {changedtick}, {firstline}, {lastline},
{new_lastline}, {old_bytecount}). {new_lastline}, {old_byte_size}[, {old_utf32_size}, {old_utf16_size}]).
Unlike remote channel events the text contents are not passed. The new text can Unlike remote channel events the text contents are not passed. The new text can
be accessed inside the callback as be accessed inside the callback as
`vim.api.nvim_buf_get_lines(buf, firstline, new_lastline, true)` `vim.api.nvim_buf_get_lines(buf, firstline, new_lastline, true)`
{old_bytecount} is the total size of the replaced region {firstline} to {old_byte_size} is the total size of the replaced region {firstline} to
{lastline} in bytes, including the final newline after {lastline}. {lastline} in bytes, including the final newline after {lastline}. If
`utf_sizes` is set to true in |nvim_buf_attach()| keyword args, then the
UTF-32 and UTF-16 sizes of the deleted region are also passed as additional
arguments {old_utf32_size} and {old_utf16_size}.
"on_changedtick" is invoked when |b:changedtick| was incremented but no text "on_changedtick" is invoked when |b:changedtick| was incremented but no text
was changed. The parameters received are ("changedtick", {buf}, {changedtick}). was changed. The parameters received are ("changedtick", {buf}, {changedtick}).

View File

@@ -109,9 +109,11 @@ String buffer_get_line(Buffer buffer, Integer index, Error *err)
/// `nvim_buf_lines_event`. Otherwise, the first notification will be /// `nvim_buf_lines_event`. Otherwise, the first notification will be
/// a `nvim_buf_changedtick_event`. Not used for lua callbacks. /// a `nvim_buf_changedtick_event`. Not used for lua callbacks.
/// @param opts Optional parameters. /// @param opts Optional parameters.
/// `on_lines`: lua callback received on change. /// `on_lines`: lua callback received on change.
/// `on_changedtick`: lua callback received on changedtick /// `on_changedtick`: lua callback received on changedtick
/// increment without text change. /// increment without text change.
/// `utf_sizes`: include UTF-32 and UTF-16 size of
/// the replaced region.
/// See |api-buffer-updates-lua| for more information /// See |api-buffer-updates-lua| for more information
/// @param[out] err Error details, if any /// @param[out] err Error details, if any
/// @return False when updates couldn't be enabled because the buffer isn't /// @return False when updates couldn't be enabled because the buffer isn't
@@ -156,6 +158,12 @@ Boolean nvim_buf_attach(uint64_t channel_id,
} }
cb.on_detach = v->data.luaref; cb.on_detach = v->data.luaref;
v->data.integer = LUA_NOREF; v->data.integer = LUA_NOREF;
} else if (is_lua && strequal("utf_sizes", k.data)) {
if (v->type != kObjectTypeBoolean) {
api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean");
goto error;
}
cb.utf_sizes = v->data.boolean;
} else { } else {
api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data); api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
goto error; goto error;
@@ -1196,6 +1204,7 @@ Dictionary nvim__buf_stats(Buffer buffer, Error *err)
// NB: this should be zero at any time API functions are called, // NB: this should be zero at any time API functions are called,
// this exists to debug issues // this exists to debug issues
PUT(rv, "dirty_bytes", INTEGER_OBJ((Integer)buf->deleted_bytes)); PUT(rv, "dirty_bytes", INTEGER_OBJ((Integer)buf->deleted_bytes));
return rv; return rv;
} }

View File

@@ -459,8 +459,9 @@ typedef struct {
LuaRef on_lines; LuaRef on_lines;
LuaRef on_changedtick; LuaRef on_changedtick;
LuaRef on_detach; LuaRef on_detach;
bool utf_sizes;
} BufUpdateCallbacks; } BufUpdateCallbacks;
#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF } #define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF, false }
#define BUF_HAS_QF_ENTRY 1 #define BUF_HAS_QF_ENTRY 1
#define BUF_HAS_LL_ENTRY 2 #define BUF_HAS_LL_ENTRY 2
@@ -802,12 +803,24 @@ struct file_buffer {
kvec_t(BufhlLine *) b_bufhl_move_space; // temporary space for highlights kvec_t(BufhlLine *) b_bufhl_move_space; // temporary space for highlights
// array of channelids which have asked to receive updates for this // array of channel_id:s which have asked to receive updates for this
// buffer. // buffer.
kvec_t(uint64_t) update_channels; kvec_t(uint64_t) update_channels;
// array of lua callbacks for buffer updates.
kvec_t(BufUpdateCallbacks) update_callbacks; kvec_t(BufUpdateCallbacks) update_callbacks;
// whether an update callback has requested codepoint size of deleted regions.
bool update_need_codepoints;
// Measurements of the deleted or replaced region since the last update
// event. Some consumers of buffer changes need to know the byte size (like
// tree-sitter) or the corresponding UTF-32/UTF-16 size (like LSP) of the
// deleted text.
size_t deleted_bytes; size_t deleted_bytes;
size_t deleted_codepoints;
size_t deleted_codeunits;
// The number of times the current line has been flushed in the memline.
int flush_count; int flush_count;
int b_diff_failed; // internal diff failed for this buffer int b_diff_failed; // internal diff failed for this buffer

View File

@@ -26,6 +26,9 @@ bool buf_updates_register(buf_T *buf, uint64_t channel_id,
if (channel_id == LUA_INTERNAL_CALL) { if (channel_id == LUA_INTERNAL_CALL) {
kv_push(buf->update_callbacks, cb); kv_push(buf->update_callbacks, cb);
if (cb.utf_sizes) {
buf->update_need_codepoints = true;
}
return true; return true;
} }
@@ -169,7 +172,9 @@ void buf_updates_send_changes(buf_T *buf,
int64_t num_removed, int64_t num_removed,
bool send_tick) bool send_tick)
{ {
size_t deleted_bytes = ml_flush_deleted_bytes(buf); size_t deleted_codepoints, deleted_codeunits;
size_t deleted_bytes = ml_flush_deleted_bytes(buf, &deleted_codepoints,
&deleted_codeunits);
if (!buf_updates_active(buf)) { if (!buf_updates_active(buf)) {
return; return;
@@ -233,8 +238,8 @@ void buf_updates_send_changes(buf_T *buf,
bool keep = true; bool keep = true;
if (cb.on_lines != LUA_NOREF) { if (cb.on_lines != LUA_NOREF) {
Array args = ARRAY_DICT_INIT; Array args = ARRAY_DICT_INIT;
Object items[6]; Object items[8];
args.size = 6; args.size = 6; // may be increased to 8 below
args.items = items; args.items = items;
// the first argument is always the buffer handle // the first argument is always the buffer handle
@@ -254,6 +259,11 @@ void buf_updates_send_changes(buf_T *buf,
// byte count of previous contents // byte count of previous contents
args.items[5] = INTEGER_OBJ((Integer)deleted_bytes); args.items[5] = INTEGER_OBJ((Integer)deleted_bytes);
if (cb.utf_sizes) {
args.size = 8;
args.items[6] = INTEGER_OBJ((Integer)deleted_codepoints);
args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits);
}
textlock++; textlock++;
Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true); Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true);
textlock--; textlock--;

View File

@@ -1756,6 +1756,8 @@ failed:
linecnt--; linecnt--;
} }
curbuf->deleted_bytes = 0; curbuf->deleted_bytes = 0;
curbuf->deleted_codepoints = 0;
curbuf->deleted_codeunits = 0;
linecnt = curbuf->b_ml.ml_line_count - linecnt; linecnt = curbuf->b_ml.ml_line_count - linecnt;
if (filesize == 0) if (filesize == 0)
linecnt = 0; linecnt = 0;

View File

@@ -627,6 +627,8 @@ EXTERN pos_T Insstart_orig;
EXTERN int orig_line_count INIT(= 0); /* Line count when "gR" started */ EXTERN int orig_line_count INIT(= 0); /* Line count when "gR" started */
EXTERN int vr_lines_changed INIT(= 0); /* #Lines changed by "gR" so far */ EXTERN int vr_lines_changed INIT(= 0); /* #Lines changed by "gR" so far */
// increase around internal delete/replace
EXTERN int inhibit_delete_count INIT(= 0);
/* /*
* These flags are set based upon 'fileencoding'. * These flags are set based upon 'fileencoding'.

View File

@@ -1438,6 +1438,39 @@ int utf16_to_utf8(const wchar_t *strw, char **str)
#endif #endif
/// Count how many UTF-32 code points and UTF-16 code units a string occupies.
///
/// A byte that is not part of a valid UTF-8 sequence, and an embedded
/// surrogate, each contribute one code point and one code unit.
///
/// The results are ADDED to the values already stored in the out parameters,
/// so a caller can accumulate the size of a buffer region made up of several
/// line segments by calling this once per segment.
///
/// @param s the string
/// @param len maximum number of bytes to examine (an earlier NUL terminates)
/// @param[out] codepoints incremented with UTF-32 code point size
/// @param[out] codeunits incremented with UTF-16 code unit size
void mb_utflen(const char_u *s, size_t len, size_t *codepoints,
               size_t *codeunits)
  FUNC_ATTR_NONNULL_ALL
{
  size_t n_chars = 0;   // characters seen so far
  size_t n_astral = 0;  // how many of them lie above U+FFFF
  size_t pos = 0;

  while (pos < len && s[pos] != NUL) {
    // Byte length of the character at "pos", as reported by
    // utf_ptr2len_len() for the remaining "len - pos" bytes.
    const size_t step = utf_ptr2len_len(s + pos, len - pos);

    // NB: for a single byte (ASCII or an invalid sequence byte) the raw byte
    // value is used. Only "does it fit in the BMP?" matters here.
    int ch;
    if (step > 1) {
      ch = utf_ptr2char(s + pos);
    } else {
      ch = s[pos];
    }

    n_chars++;
    if (ch > 0xFFFF) {
      // Outside the BMP: costs one extra UTF-16 code unit.
      n_astral++;
    }
    pos += step;
  }

  *codepoints += n_chars;
  *codeunits += n_chars + n_astral;
}
/* /*
* Version of strnicmp() that handles multi-byte characters. * Version of strnicmp() that handles multi-byte characters.
* Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can * Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can

View File

@@ -2383,6 +2383,23 @@ static int ml_append_int(
return OK; return OK;
} }
/// Add the size of one line's text to the current buffer's running totals of
/// deleted/replaced text.
///
/// Does nothing while "inhibit_delete_count" is non-zero. Otherwise the byte
/// total always grows by the text length plus one for the line's NL char.
/// The UTF-32/UTF-16 totals are only maintained when the buffer has
/// "update_need_codepoints" set.
///
/// @param ptr start of the line text
/// @param len length of the text in bytes, or -1 to measure the
///            NUL-terminated string at "ptr" with STRLEN()
void ml_add_deleted_len(char_u *ptr, ssize_t len)
{
  if (inhibit_delete_count) {
    return;
  }

  // -1 asks us to measure the NUL-terminated string ourselves.
  const size_t text_len = (len == -1) ? STRLEN(ptr) : (size_t)len;

  // +1 accounts for the NL char that ends the line.
  curbuf->deleted_bytes += text_len + 1;

  if (!curbuf->update_need_codepoints) {
    return;
  }

  mb_utflen(ptr, text_len, &curbuf->deleted_codepoints,
            &curbuf->deleted_codeunits);
  // NL char: one code point and one code unit.
  curbuf->deleted_codepoints += 1;
  curbuf->deleted_codeunits += 1;
}
/* /*
* Replace line lnum, with buffering, in current buffer. * Replace line lnum, with buffering, in current buffer.
* *
@@ -2408,19 +2425,17 @@ int ml_replace(linenr_T lnum, char_u *line, bool copy)
if (copy) { if (copy) {
line = vim_strsave(line); line = vim_strsave(line);
} }
if (curbuf->b_ml.ml_line_lnum != lnum) { /* other line buffered */ if (curbuf->b_ml.ml_line_lnum != lnum) { // other line buffered
ml_flush_line(curbuf); /* flush it */ ml_flush_line(curbuf); // flush it
} else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) { /* same line allocated */ } else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) { // same line allocated
// TODO FIXME: see other "TODO FIXME" ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, -1);
curbuf->deleted_bytes += STRLEN(curbuf->b_ml.ml_line_ptr)+1; readlen = false; // already added the length
xfree(curbuf->b_ml.ml_line_ptr); /* free it */
readlen = false; // already read it. xfree(curbuf->b_ml.ml_line_ptr); // free it
} }
if (readlen) { if (readlen && kv_size(curbuf->update_callbacks)) {
if (true) { // TODO: buffer updates active ml_add_deleted_len(ml_get_buf(curbuf, lnum, false), -1);
curbuf->deleted_bytes += STRLEN(ml_get_buf(curbuf, lnum, false))+1;
}
} }
curbuf->b_ml.ml_line_ptr = line; curbuf->b_ml.ml_line_ptr = line;
@@ -2504,7 +2519,10 @@ static int ml_delete_int(buf_T *buf, linenr_T lnum, bool message)
else else
line_size = ((dp->db_index[idx - 1]) & DB_INDEX_MASK) - line_start; line_size = ((dp->db_index[idx - 1]) & DB_INDEX_MASK) - line_start;
buf->deleted_bytes += line_size; // Line should always have an NL char internally (represented as NUL),
// even if 'noeol' is set.
assert(line_size >= 1);
ml_add_deleted_len((char_u *)dp + line_start, line_size-1);
/* /*
* special case: If there is only one line in the data block it becomes empty. * special case: If there is only one line in the data block it becomes empty.
@@ -2690,10 +2708,14 @@ void ml_clearmarked(void)
return; return;
} }
size_t ml_flush_deleted_bytes(buf_T *buf) size_t ml_flush_deleted_bytes(buf_T *buf, size_t *codepoints, size_t *codeunits)
{ {
size_t ret = buf->deleted_bytes; size_t ret = buf->deleted_bytes;
*codepoints = buf->deleted_codepoints;
*codeunits = buf->deleted_codeunits;
buf->deleted_bytes = 0; buf->deleted_bytes = 0;
buf->deleted_codepoints = 0;
buf->deleted_codeunits = 0;
return ret; return ret;
} }

View File

@@ -780,6 +780,7 @@ open_line (
did_append = FALSE; did_append = FALSE;
} }
inhibit_delete_count++;
if (newindent if (newindent
|| did_si || did_si
) { ) {
@@ -821,6 +822,7 @@ open_line (
did_si = false; did_si = false;
} }
} }
inhibit_delete_count--;
/* /*
* In REPLACE mode, for each character in the extra leader, there must be * In REPLACE mode, for each character in the extra leader, there must be
@@ -1685,7 +1687,7 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
bool was_alloced = ml_line_alloced(); // check if oldp was allocated bool was_alloced = ml_line_alloced(); // check if oldp was allocated
char_u *newp; char_u *newp;
if (was_alloced) { if (was_alloced) {
curbuf->deleted_bytes += (size_t)oldlen+1; ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, oldlen);
newp = oldp; // use same allocated memory newp = oldp; // use same allocated memory
} else { // need to allocate a new line } else { // need to allocate a new line
newp = xmalloc((size_t)(oldlen + 1 - count)); newp = xmalloc((size_t)(oldlen + 1 - count));

View File

@@ -13,7 +13,8 @@ local origlines = {"original line 1",
"original line 3", "original line 3",
"original line 4", "original line 4",
"original line 5", "original line 5",
"original line 6"} "original line 6",
" indented line"}
describe('lua: buffer event callbacks', function() describe('lua: buffer event callbacks', function()
before_each(function() before_each(function()
@@ -21,14 +22,14 @@ describe('lua: buffer event callbacks', function()
exec_lua([[ exec_lua([[
local events = {} local events = {}
function test_register(bufnr, id, changedtick) function test_register(bufnr, id, changedtick, utf_sizes)
local function callback(...) local function callback(...)
table.insert(events, {id, ...}) table.insert(events, {id, ...})
if test_unreg == id then if test_unreg == id then
return true return true
end end
end end
local opts = {on_lines=callback, on_detach=callback} local opts = {on_lines=callback, on_detach=callback, utf_sizes=utf_sizes}
if changedtick then if changedtick then
opts.on_changedtick = callback opts.on_changedtick = callback
end end
@@ -48,18 +49,26 @@ describe('lua: buffer event callbacks', function()
-- assert the wrong thing), but masks errors with unflushed lines (as -- assert the wrong thing), but masks errors with unflushed lines (as
-- nvim_buf_get_offset forces a flush of the memline). To be safe run the -- nvim_buf_get_offset forces a flush of the memline). To be safe run the
-- test both ways. -- test both ways.
local function check(verify) local function check(verify,utf_sizes)
local lastsize local lastsize
meths.buf_set_lines(0, 0, -1, true, origlines) meths.buf_set_lines(0, 0, -1, true, origlines)
if verify then if verify then
lastsize = meths.buf_get_offset(0, meths.buf_line_count(0)) lastsize = meths.buf_get_offset(0, meths.buf_line_count(0))
end end
exec_lua("return test_register(...)", 0, "test1") exec_lua("return test_register(...)", 0, "test1",false,utf_sizes)
local tick = meths.buf_get_changedtick(0) local tick = meths.buf_get_changedtick(0)
local verify_name = "test1" local verify_name = "test1"
local function check_events(expected) local function check_events(expected)
local events = exec_lua("return get_events(...)" ) local events = exec_lua("return get_events(...)" )
if utf_sizes then
-- this test case uses ASCII only, so sizes should be the same.
-- Unicode is tested below.
for _, event in ipairs(expected) do
event[9] = event[8]
event[10] = event[8]
end
end
eq(expected, events) eq(expected, events)
if verify then if verify then
for _, event in ipairs(events) do for _, event in ipairs(events) do
@@ -75,6 +84,7 @@ describe('lua: buffer event callbacks', function()
end end
end end
command('set autoindent')
command('normal! GyyggP') command('normal! GyyggP')
tick = tick + 1 tick = tick + 1
check_events({{ "test1", "lines", 1, tick, 0, 0, 1, 0}}) check_events({{ "test1", "lines", 1, tick, 0, 0, 1, 0}})
@@ -83,7 +93,7 @@ describe('lua: buffer event callbacks', function()
tick = tick + 1 tick = tick + 1
check_events({{ "test1", "lines", 1, tick, 3, 5, 4, 32 }}) check_events({{ "test1", "lines", 1, tick, 3, 5, 4, 32 }})
exec_lua("return test_register(...)", 0, "test2", true) exec_lua("return test_register(...)", 0, "test2", true, utf_sizes)
tick = tick + 1 tick = tick + 1
command('undo') command('undo')
@@ -124,7 +134,13 @@ describe('lua: buffer event callbacks', function()
tick = tick + 1 tick = tick + 1
check_events({{ "test2", "lines", 1, tick, 4, 5, 5, 19 }}) check_events({{ "test2", "lines", 1, tick, 4, 5, 5, 19 }})
feed('<esc>') feed('<esc>Go')
tick = tick + 1
check_events({{ "test2", "lines", 1, tick, 11, 11, 12, 0 }})
feed('x')
tick = tick + 1
check_events({{ "test2", "lines", 1, tick, 11, 12, 12, 5 }})
command('bwipe!') command('bwipe!')
check_events({{ "test2", "detach", 1 }}) check_events({{ "test2", "detach", 1 }})
@@ -137,4 +153,54 @@ describe('lua: buffer event callbacks', function()
it('works with verify', function() it('works with verify', function()
check(true) check(true)
end) end)
-- Run the shared check() scenario with verify=false and utf_sizes=true.
it('works with utf_sizes and ASCII text', function()
check(false,true)
end)
-- Checks the sizes reported to an on_lines callback registered with
-- utf_sizes=true when the replaced text contains non-ASCII characters.
--
-- Each recorded event is {id, "lines", buf, changedtick, firstline, lastline,
-- new_lastline, old_byte_size, old_utf32_size, old_utf16_size}. The three
-- trailing numbers describe the text that was replaced, including its final
-- newline.
it('works with utf_sizes and unicode text', function()
-- One line per kind of text: ASCII, Latin, BMP, non-BMP (SMP) and combining
-- characters. The expected sizes below depend on the exact bytes of these
-- strings.
local unicode_text = {"ascii text",
"latin text åäö",
"BMP text ɧ αλφά",
"BMP text 汉语 ↥↧",
"SMP 🤦 🦄🦃",
"combining å بِيَّة"}
meths.buf_set_lines(0, 0, -1, true, unicode_text)
feed('gg')
exec_lua("return test_register(...)", 0, "test1", false, true)
local tick = meths.buf_get_changedtick(0)
-- ASCII only: byte, UTF-32 and UTF-16 sizes are all equal.
feed('dd')
tick = tick + 1
eq({{ "test1", "lines", 1, tick, 0, 1, 0, 11, 11, 11 }}, exec_lua("return get_events(...)" ))
-- two-byte characters: the byte size is larger than the code point count.
feed('A<bs>')
tick = tick + 1
eq({{ "test1", "lines", 1, tick, 0, 1, 1, 18, 15, 15 }}, exec_lua("return get_events(...)" ))
feed('<esc>jylp')
tick = tick + 1
eq({{ "test1", "lines", 1, tick, 1, 2, 2, 21, 16, 16 }}, exec_lua("return get_events(...)" ))
-- three-byte BMP characters still count as one UTF-16 code unit each.
feed('+eea<cr>')
tick = tick + 1
eq({{ "test1", "lines", 1, tick, 2, 3, 4, 23, 15, 15 }}, exec_lua("return get_events(...)" ))
feed('<esc>jdw')
tick = tick + 1
-- non-BMP chars count as 2 UTF-16 codeunits
eq({{ "test1", "lines", 1, tick, 4, 5, 5, 18, 9, 12 }}, exec_lua("return get_events(...)" ))
feed('+rx')
tick = tick + 1
-- count the individual codepoints of a composed character.
eq({{ "test1", "lines", 1, tick, 5, 6, 6, 27, 20, 20 }}, exec_lua("return get_events(...)" ))
feed('kJ')
tick = tick + 1
-- NB: this is inefficient (but not really wrong).
-- The join is reported as two separate "lines" events with consecutive
-- changedtick values.
eq({{ "test1", "lines", 1, tick, 4, 5, 5, 14, 5, 8 },
{ "test1", "lines", 1, tick+1, 5, 6, 5, 27, 20, 20 }}, exec_lua("return get_events(...)" ))
end)
end) end)