feat(stdlib): overload vim.str_byteindex, vim.str_utfindex #30735

PROBLEM: There are several limitations to vim.str_byteindex, vim.str_utfindex: 1. They throw when given out-of-range indexes. An invalid (often user/lsp-provided) index doesn't feel exceptional and should be handled by the caller. `:help dev-error-patterns` suggests that `retval, errmsg` is the preferred way to handle this kind of failure. 2. They cannot accept an encoding. So LSP needs wrapper functions. #25272 3. The current signatures are not extensible. * Calling: The function currently uses a fairly opaque boolean value to identify the encoding. * Returns: The fact it can throw requires wrapping in pcall. 4. The current name doesn't follow suggestions in `:h dev-naming` and I think `get` would be suitable. SOLUTION: - Because these are performance-sensitive, don't introduce `opts`. - Introduce an "overload" that accepts `encoding:string` and `strict_indexing:bool` params. ```lua local col = vim.str_utfindex(line, encoding, [index, [strict_indexing]]) ``` Support the old versions by dispatching on the type of argument 2, and deprecate that form. ```lua vim.str_utfindex(line) -- (utf-32 length, utf-16 length), deprecated vim.str_utfindex(line, index) -- (utf-32 index, utf-16 index), deprecated vim.str_utfindex(line, 'utf-16') -- utf-16 length vim.str_utfindex(line, 'utf-16', index) -- utf-16 index vim.str_utfindex(line, 'utf-16', math.huge) -- error: index out of range vim.str_utfindex(line, 'utf-16', math.huge, false) -- utf-16 length ```
2025-09-05 19:08:15 +00:00 · 2024-10-23 14:33:57 +01:00
parent 3a86b60032
commit 230b0c7f02
5 changed files with 283 additions and 68 deletions
--- a/runtime/doc/lua.txt
+++ b/runtime/doc/lua.txt
@@ -1000,22 +1000,6 @@ vim.schedule({fn})                                            *vim.schedule()*
    Parameters: ~
      • {fn}  (`fun()`)
 vim.str_byteindex({str}, {index}, {use_utf16})           *vim.str_byteindex()*
    Convert UTF-32 or UTF-16 {index} to byte index. If {use_utf16} is not
    supplied, it defaults to false (use UTF-32). Returns the byte index.
    Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|. An {index}
    in the middle of a UTF-16 sequence is rounded upwards to the end of that
    sequence.
    Parameters: ~
      • {str}        (`string`)
      • {index}      (`integer`)
      • {use_utf16}  (`boolean?`)
    Return: ~
        (`integer`)
 vim.str_utf_end({str}, {index})                            *vim.str_utf_end()*
    Gets the distance (in bytes) from the last byte of the codepoint
    (character) that {index} points to.
@@ -1073,23 +1057,6 @@ vim.str_utf_start({str}, {index})                        *vim.str_utf_start()*
    Return: ~
        (`integer`)
 vim.str_utfindex({str}, {index})                          *vim.str_utfindex()*
    Convert byte index to UTF-32 and UTF-16 indices. If {index} is not
    supplied, the length of the string is used. All indices are zero-based.
    Embedded NUL bytes are treated as terminating the string. Invalid UTF-8
    bytes, and embedded surrogates are counted as one code point each. An
    {index} in the middle of a UTF-8 sequence is rounded upwards to the end of
    that sequence.
    Parameters: ~
      • {str}    (`string`)
      • {index}  (`integer?`)
    Return (multiple): ~
        (`integer`) UTF-32 index
        (`integer`) UTF-16 index
 vim.stricmp({a}, {b})                                          *vim.stricmp()*
    Compares strings case-insensitively.
@@ -1776,6 +1743,44 @@ vim.schedule_wrap({fn})                                  *vim.schedule_wrap()*
      • |vim.schedule()|
      • |vim.in_fast_event()|
                                                         *vim.str_byteindex()*
 vim.str_byteindex({s}, {encoding}, {index}, {strict_indexing})
    Convert UTF-32, UTF-16 or UTF-8 {index} to byte index. If
    {strict_indexing} is false then an out of range index will return
    byte length instead of throwing an error.
    Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|. An {index}
    in the middle of a UTF-16 sequence is rounded upwards to the end of that
    sequence.
    Parameters: ~
      • {s}                (`string`)
      • {encoding}         (`"utf-8"|"utf-16"|"utf-32"`)
      • {index}            (`integer`)
      • {strict_indexing}  (`boolean?`) default: true
    Return: ~
        (`integer`)
                                                          *vim.str_utfindex()*
 vim.str_utfindex({s}, {encoding}, {index}, {strict_indexing})
    Convert byte index to UTF-32, UTF-16 or UTF-8 indices. If {index} is not
    supplied, the length of the string is used. All indices are zero-based.
    If {strict_indexing} is false then an out of range index will return
    string length instead of throwing an error. Invalid UTF-8 bytes, and
    embedded surrogates are counted as one code point each. An {index} in the
    middle of a UTF-8 sequence is rounded upwards to the end of that sequence.
    Parameters: ~
      • {s}                (`string`)
      • {encoding}         (`"utf-8"|"utf-16"|"utf-32"`)
      • {index}            (`integer?`)
      • {strict_indexing}  (`boolean?`) default: true
    Return: ~
        (`integer`)
 vim.system({cmd}, {opts}, {on_exit})                            *vim.system()*
    Runs a system command or throws an error if {cmd} cannot be run.
--- a/runtime/lua/vim/_editor.lua
+++ b/runtime/lua/vim/_editor.lua
@@ -68,6 +68,12 @@ vim.log = {
  },
 }
 -- Set of encoding names accepted by vim.str_byteindex() and
 -- vim.str_utfindex(); used as a lookup table when validating {encoding}.
 local utfs = { ['utf-8'] = true, ['utf-16'] = true, ['utf-32'] = true }
 -- TODO(lewis6991): document that the signature is system({cmd}, [{opts},] {on_exit})
 --- Runs a system command or throws an error if {cmd} cannot be run.
 ---
@@ -714,7 +720,127 @@ function vim._on_key(buf, typed_buf)
  end
 end
--- Generates a list of possible completions for the string.
+--- Convert UTF-32, UTF-16 or UTF-8 {index} to byte index.
 --- If {strict_indexing} is false
 --- then an out of range index will return byte length
 --- instead of throwing an error.
 ---
 --- Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|.
 --- An {index} in the middle of a UTF-16 sequence is rounded upwards to
 --- the end of that sequence.
 ---@param s string
 ---@param encoding "utf-8"|"utf-16"|"utf-32"
 ---@param index integer
 ---@param strict_indexing? boolean # default: true
 ---@return integer
 function vim.str_byteindex(s, encoding, index, strict_indexing)
   if type(encoding) == 'number' then
     -- Legacy signature: str_byteindex({str}, {index}, {use_utf16}).
     -- A numeric second argument means the caller used the old form, so the
     -- arguments are shifted by one: `encoding` carries the index and `index`
     -- carries the optional use_utf16 flag (UTF-32 when omitted).
     local old_index = encoding
     local use_utf16 = index or false
     -- The C implementation returns nil for an out-of-range index; the legacy
     -- form always throws in that case.
     return vim.__str_byteindex(s, old_index, use_utf16) or error('index out of range')
   end

   vim.validate('s', s, 'string')
   vim.validate('index', index, 'number')

   local len = #s

   -- Fast path: index 0 and the empty string always map to byte 0.
   -- NOTE: this returns before {encoding} and {strict_indexing} are validated.
   if index == 0 or len == 0 then
     return 0
   end

   vim.validate('encoding', encoding, function(v)
     return utfs[v], 'invalid encoding'
   end)

   vim.validate('strict_indexing', strict_indexing, 'boolean', true)
   if strict_indexing == nil then
     strict_indexing = true
   end

   if encoding == 'utf-8' then
     -- A UTF-8 index already is a byte index, so only the range is checked.
     -- Negative indexes are rejected too, matching the UTF-16/UTF-32 path
     -- below, where the C implementation reports them as out of range.
     if index < 0 or index > len then
       if strict_indexing then
         error('index out of range')
       end
       return len
     end
     return index
   end

   -- UTF-16/UTF-32: nil from the C implementation means "out of range".
   local byteidx = vim.__str_byteindex(s, index, encoding == 'utf-16')
   if byteidx then
     return byteidx
   end
   if strict_indexing then
     error('index out of range')
   end
   -- Non-strict indexing clamps to the byte length of the string.
   return len
 end
 --- Convert byte index to UTF-32, UTF-16 or UTF-8 indices. If {index} is not
 --- supplied, the length of the string is used. All indices are zero-based.
 ---
 --- If {strict_indexing} is false then an out of range index will return string
 --- length instead of throwing an error.
 --- Invalid UTF-8 bytes, and embedded surrogates are counted as one code point
 --- each. An {index} in the middle of a UTF-8 sequence is rounded upwards to the end of
 --- that sequence.
 ---@param s string
 ---@param encoding "utf-8"|"utf-16"|"utf-32"
 ---@param index? integer
 ---@param strict_indexing? boolean # default: true
 ---@return integer
 function vim.str_utfindex(s, encoding, index, strict_indexing)
   if encoding == nil or type(encoding) == 'number' then
     -- Legacy signature: str_utfindex({str}, {index?}).
     -- A missing or numeric second argument means the caller used the old
     -- form, where `encoding` actually carries the optional byte index and
     -- both the UTF-32 and the UTF-16 index are returned.
     local old_index = encoding
     local col32, col16 = vim.__str_utfindex(s, old_index) --[[@as integer,integer]]
     -- The C implementation returns nil for an out-of-range index; the legacy
     -- form always throws in that case.
     if not col32 or not col16 then
       error('index out of range')
     end
     return col32, col16
   end

   vim.validate('s', s, 'string')
   vim.validate('index', index, 'number', true)
   if not index then
     -- No index means "length of the whole string": ask for an index past
     -- the end and force non-strict indexing so it is clamped, not rejected.
     index = math.huge
     strict_indexing = false
   end

   -- Fast path: byte 0 is index 0 in every encoding.
   -- NOTE: this returns before {encoding} and {strict_indexing} are validated.
   if index == 0 then
     return 0
   end

   vim.validate('encoding', encoding, function(v)
     return utfs[v], 'invalid encoding'
   end)

   vim.validate('strict_indexing', strict_indexing, 'boolean', true)
   if strict_indexing == nil then
     strict_indexing = true
   end

   if encoding == 'utf-8' then
     -- A byte index already is a UTF-8 index, so only the range is checked.
     -- Negative indexes are rejected too, matching the UTF-16/UTF-32 path
     -- below, where the C implementation reports them as out of range.
     local len = #s
     if index >= 0 and index <= len then
       return index
     end
     if strict_indexing then
       error('index out of range')
     end
     return len
   end

   -- UTF-16/UTF-32: the C implementation returns both indexes, or nil for
   -- both when the byte index is out of range.
   local col32, col16 = vim.__str_utfindex(s, index) --[[@as integer?,integer?]]
   local col = encoding == 'utf-16' and col16 or col32
   if col then
     return col
   end
   if strict_indexing then
     error('index out of range')
   end
   -- Non-strict indexing clamps to the length of the whole string in the
   -- requested encoding.
   local max32, max16 = vim.__str_utfindex(s) --[[@as integer,integer]]
   return encoding == 'utf-16' and max16 or max32
 end
 --- Generates a list of possible completions for the string.
 --- String has the pattern.
 ---
 --- 1. Can we get it to just return things in the global namespace with that name prefix
--- a/runtime/lua/vim/_meta/builtin.lua
+++ b/runtime/lua/vim/_meta/builtin.lua
@@ -112,18 +112,6 @@ function vim.rpcrequest(channel, method, ...) end
 --- equal, {a} is greater than {b} or {a} is lesser than {b}, respectively.
 function vim.stricmp(a, b) end
 --- Convert UTF-32 or UTF-16 {index} to byte index. If {use_utf16} is not
 --- supplied, it defaults to false (use UTF-32). Returns the byte index.
 ---
 --- Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|.
 --- An {index} in the middle of a UTF-16 sequence is rounded upwards to
 --- the end of that sequence.
 --- @param str string
 --- @param index integer
 --- @param use_utf16? boolean
 --- @return integer
 function vim.str_byteindex(str, index, use_utf16) end
 --- Gets a list of the starting byte positions of each UTF-8 codepoint in the given string.
 ---
 --- Embedded NUL bytes are treated as terminating the string.
@@ -173,19 +161,6 @@ function vim.str_utf_start(str, index) end
 --- @return integer
 function vim.str_utf_end(str, index) end
 --- Convert byte index to UTF-32 and UTF-16 indices. If {index} is not
 --- supplied, the length of the string is used. All indices are zero-based.
 ---
 --- Embedded NUL bytes are treated as terminating the string. Invalid UTF-8
 --- bytes, and embedded surrogates are counted as one code point each. An
 --- {index} in the middle of a UTF-8 sequence is rounded upwards to the end of
 --- that sequence.
 --- @param str string
 --- @param index? integer
 --- @return integer # UTF-32 index
 --- @return integer # UTF-16 index
 function vim.str_utfindex(str, index) end
 --- The result is a String, which is the text {str} converted from
 --- encoding {from} to encoding {to}. When the conversion fails `nil` is
 --- returned.  When some characters could not be converted they
--- a/src/nvim/lua/stdlib.c
+++ b/src/nvim/lua/stdlib.c
@@ -181,7 +181,9 @@ int nlua_str_utfindex(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
  } else {
    idx = luaL_checkinteger(lstate, 2);
    if (idx < 0 || idx > (intptr_t)s1_len) {
-      return luaL_error(lstate, "index out of range");
+      lua_pushnil(lstate);
      lua_pushnil(lstate);
      return 2;
    }
  }
@@ -272,7 +274,8 @@ int nlua_str_byteindex(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
  const char *s1 = luaL_checklstring(lstate, 1, &s1_len);
  intptr_t idx = luaL_checkinteger(lstate, 2);
  if (idx < 0) {
-    return luaL_error(lstate, "index out of range");
+    lua_pushnil(lstate);
    return 1;
  }
  bool use_utf16 = false;
  if (lua_gettop(lstate) >= 3) {
@@ -281,7 +284,8 @@ int nlua_str_byteindex(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
  ssize_t byteidx = mb_utf_index_to_bytes(s1, s1_len, (size_t)idx, use_utf16);
  if (byteidx == -1) {
-    return luaL_error(lstate, "index out of range");
+    lua_pushnil(lstate);
    return 1;
  }
  lua_pushinteger(lstate, (lua_Integer)byteidx);
@@ -695,10 +699,10 @@ void nlua_state_add_stdlib(lua_State *const lstate, bool is_thread)
    lua_setfield(lstate, -2, "stricmp");
    // str_utfindex
    lua_pushcfunction(lstate, &nlua_str_utfindex);
-    lua_setfield(lstate, -2, "str_utfindex");
+    lua_setfield(lstate, -2, "__str_utfindex");
    // str_byteindex
    lua_pushcfunction(lstate, &nlua_str_byteindex);
-    lua_setfield(lstate, -2, "str_byteindex");
+    lua_setfield(lstate, -2, "__str_byteindex");
    // str_utf_pos
    lua_pushcfunction(lstate, &nlua_str_utf_pos);
    lua_setfield(lstate, -2, "str_utf_pos");
--- a/test/functional/lua/vim_spec.lua
+++ b/test/functional/lua/vim_spec.lua
@@ -312,21 +312,106 @@ describe('lua stdlib', function()
      49,
      51,
    }
    local indices8 = {
      [0] = 0,
      1,
      2,
      3,
      4,
      5,
      6,
      7,
      8,
      9,
      10,
      11,
      12,
      13,
      14,
      15,
      16,
      17,
      18,
      19,
      20,
      21,
      22,
      23,
      24,
      25,
      26,
      27,
      28,
      29,
      30,
      31,
      32,
      33,
      34,
      35,
      36,
      37,
      38,
      39,
      40,
      41,
      42,
      43,
      44,
      45,
      46,
      47,
      48,
      49,
      50,
      51,
    }
    for i, k in pairs(indices32) do
      eq(k, exec_lua('return vim.str_byteindex(_G.test_text, ...)', i), i)
      eq(k, exec_lua('return vim.str_byteindex(_G.test_text, ..., false)', i), i)
      eq(k, exec_lua('return vim.str_byteindex(_G.test_text, "utf-32", ...)', i), i)
    end
    for i, k in pairs(indices16) do
      eq(k, exec_lua('return vim.str_byteindex(_G.test_text, ..., true)', i), i)
      eq(k, exec_lua('return vim.str_byteindex(_G.test_text, "utf-16", ...)', i), i)
    end
-    eq(
+    for i, k in pairs(indices8) do
      eq(k, exec_lua('return vim.str_byteindex(_G.test_text, "utf-8", ...)', i), i)
    end
    matches(
      'index out of range',
      pcall_err(exec_lua, 'return vim.str_byteindex(_G.test_text, ...)', #indices32 + 1)
    )
-    eq(
+    matches(
      'index out of range',
      pcall_err(exec_lua, 'return vim.str_byteindex(_G.test_text, ..., true)', #indices16 + 1)
    )
-    local i32, i16 = 0, 0
+    matches(
      'index out of range',
      pcall_err(exec_lua, 'return vim.str_byteindex(_G.test_text, "utf-16", ...)', #indices16 + 1)
    )
    matches(
      'index out of range',
      pcall_err(exec_lua, 'return vim.str_byteindex(_G.test_text, "utf-32", ...)', #indices32 + 1)
    )
    matches(
      'invalid encoding',
      pcall_err(exec_lua, 'return vim.str_byteindex("hello", "madeupencoding", 1)')
    )
    eq(
      indices32[#indices32],
      exec_lua('return vim.str_byteindex(_G.test_text, "utf-32", 99999, false)')
    )
    eq(
      indices16[#indices16],
      exec_lua('return vim.str_byteindex(_G.test_text, "utf-16", 99999, false)')
    )
    eq(
      indices8[#indices8],
      exec_lua('return vim.str_byteindex(_G.test_text, "utf-8", 99999, false)')
    )
    eq(2, exec_lua('return vim.str_byteindex("é", "utf-16", 2, false)'))
    local i32, i16, i8 = 0, 0, 0
    local len = 51
    for k = 0, len do
      if indices32[i32] < k then
@@ -338,9 +423,29 @@ describe('lua stdlib', function()
          i16 = i16 + 1
        end
      end
      if indices8[i8] < k then
        i8 = i8 + 1
      end
      eq({ i32, i16 }, exec_lua('return {vim.str_utfindex(_G.test_text, ...)}', k), k)
      eq({ i32 }, exec_lua('return {vim.str_utfindex(_G.test_text, "utf-32", ...)}', k), k)
      eq({ i16 }, exec_lua('return {vim.str_utfindex(_G.test_text, "utf-16", ...)}', k), k)
      eq({ i8 }, exec_lua('return {vim.str_utfindex(_G.test_text, "utf-8", ...)}', k), k)
    end
-    eq(
+
    eq({ #indices32, #indices16 }, exec_lua('return {vim.str_utfindex(_G.test_text)}'))
    eq(#indices32, exec_lua('return vim.str_utfindex(_G.test_text, "utf-32", math.huge, false)'))
    eq(#indices16, exec_lua('return vim.str_utfindex(_G.test_text, "utf-16", math.huge, false)'))
    eq(#indices8, exec_lua('return vim.str_utfindex(_G.test_text, "utf-8", math.huge, false)'))
    eq(#indices32, exec_lua('return vim.str_utfindex(_G.test_text, "utf-32")'))
    eq(#indices16, exec_lua('return vim.str_utfindex(_G.test_text, "utf-16")'))
    eq(#indices8, exec_lua('return vim.str_utfindex(_G.test_text, "utf-8")'))
    matches(
      'invalid encoding',
      pcall_err(exec_lua, 'return vim.str_utfindex(_G.test_text, "madeupencoding", ...)', 1)
    )
    matches(
      'index out of range',
      pcall_err(exec_lua, 'return vim.str_utfindex(_G.test_text, ...)', len + 1)
    )