vim-patch:8.2.2233: cannot convert a byte index into a character index (#13978)

Problem: Cannot convert a byte index into a character index. Solution: Add charidx(). (Yegappan Lakshmanan, closes vim/vim#7561) 17793ef23a
2025-12-16 03:15:39 +00:00 · 2021-02-23 10:13:14 +09:00
parent 9d5f842807
commit 0450e155d4
5 changed files with 98 additions and 0 deletions
--- a/runtime/doc/eval.txt
+++ b/runtime/doc/eval.txt
@@ -2077,6 +2077,8 @@ changenr()			Number	current change number
 chanclose({id}[, {stream}])	Number	Closes a channel or one of its streams
 chansend({id}, {data})		Number	Writes {data} to channel
 char2nr({expr}[, {utf8}])	Number	ASCII/UTF8 value of first char in {expr}
+charidx({string}, {idx} [, {countcc}])
+				Number  char index of byte {idx} in {string}
 cindent({lnum})		Number	C indent for line {lnum}
 clearmatches([{win}])		none	clear all matches
 col({expr})			Number	column nr of cursor or mark
@@ -3027,6 +3029,29 @@ char2nr({expr} [, {utf8}])					*char2nr()*
 		A combining character is a separate character.
 		|nr2char()| does the opposite.

+							*charidx()*
+charidx({string}, {idx} [, {countcc}])
+		Return the character index of the byte at {idx} in {string}.
+		The index of the first character is zero.
+		If there are no multibyte characters the returned value is
+		equal to {idx}.
+		When {countcc} is omitted or zero, then composing characters
+		are not counted separately; their byte length is added to the
+		preceding base character.
+		When {countcc} is set to 1, then composing characters are
+		counted as separate characters.
+		Returns -1 if the arguments are invalid or if {idx} is greater
+		than the index of the last byte in {string}.  An error is
+		given if the first argument is not a string, the second
+		argument is not a number or when the third argument is present
+		and is not zero or one.
+		See |byteidx()| and |byteidxcomp()| for getting the byte index
+		from the character index.
+		Examples: >
+			echo charidx('áb́ć', 3)		returns 1
+			echo charidx('áb́ć', 6, 1)	returns 4
+			echo charidx('áb́ć', 16)		returns -1
+
 cindent({lnum})						*cindent()*
 		Get the amount of indent for line {lnum} according the C
 		indenting rules, as with 'cindent'.
--- a/runtime/doc/usr_41.txt
+++ b/runtime/doc/usr_41.txt
@@ -613,6 +613,7 @@ String manipulation:					*string-functions*
 	iconv()			convert text from one encoding to another
 	byteidx()		byte index of a character in a string
 	byteidxcomp()		like byteidx() but count composing characters
+	charidx()		character index of a byte in a string
 	repeat()		repeat a string multiple times
 	eval()			evaluate a string expression
 	execute()		execute an Ex command and get the output
--- a/src/nvim/eval.lua
+++ b/src/nvim/eval.lua
@@ -63,6 +63,7 @@ return {
    chanclose={args={1, 2}},
    chansend={args=2},
    char2nr={args={1, 2}},
+    charidx={args={2, 3}},
    cindent={args=1},
    clearmatches={args={0, 1}},
    col={args=1},
--- a/src/nvim/eval/funcs.c
+++ b/src/nvim/eval/funcs.c
@@ -940,6 +940,52 @@ static void f_char2nr(typval_T *argvars, typval_T *rettv, FunPtr fptr)
      (const char_u *)tv_get_string(&argvars[0]));
 }

+// "charidx()" function: set rettv to the character index of the byte at
+// argvars[1] in string argvars[0], or -1 if invalid or at/past the final NUL.
+// A third argument of 1 counts composing characters separately.
+static void f_charidx(typval_T *argvars, typval_T *rettv, FunPtr fptr)
+{
+  rettv->vval.v_number = -1;  // result on every early return below
+
+  if (argvars[0].v_type != VAR_STRING
+      || argvars[1].v_type != VAR_NUMBER
+      || (argvars[2].v_type != VAR_UNKNOWN
+          && argvars[2].v_type != VAR_NUMBER)) {
+    EMSG(_(e_invarg));
+    return;
+  }
+
+  const char *str = tv_get_string_chk(&argvars[0]);
+  varnumber_T idx = tv_get_number_chk(&argvars[1], NULL);
+  if (str == NULL || idx < 0) {
+    return;
+  }
+  // Keep the full varnumber_T width: casting to int before the range check
+  // could wrap an out-of-range value into 0 or 1 and let it pass.
+  varnumber_T countcc = 0;
+  if (argvars[2].v_type != VAR_UNKNOWN) {
+    countcc = tv_get_number(&argvars[2]);
+  }
+  if (countcc < 0 || countcc > 1) {
+    EMSG(_(e_invarg));
+    return;
+  }
+
+  int (*ptr2len)(const char_u *) = countcc ? utf_ptr2len : utfc_ptr2len;
+
+  // Compare offsets instead of forming "str + idx": that pointer is out of
+  // bounds (undefined behaviour) whenever idx exceeds the string length.
+  int len = 0;
+  for (const char *p = str; p - str <= idx; len++) {
+    if (*p == NUL) {
+      return;
+    }
+    p += ptr2len((const char_u *)p);
+  }
+
+  rettv->vval.v_number = len > 0 ? len - 1 : 0;
+}
+
 /*
 * "cindent(lnum)" function
 */
--- a/src/nvim/testdir/test_functions.vim
+++ b/src/nvim/testdir/test_functions.vim
@@ -833,6 +833,31 @@ func Test_byte2line_line2byte()
  bw!
 endfunc

+" Test for charidx(): byte index -> character index, plus error handling
+func Test_charidx()
+  let a = 'xáb́y'
+  call assert_equal(0, charidx(a, 0))
+  call assert_equal(1, charidx(a, 3))
+  call assert_equal(2, charidx(a, 4))
+  call assert_equal(3, charidx(a, 7))
+  call assert_equal(-1, charidx(a, 8))
+  call assert_equal(-1, charidx('', 0))
+
+  " with {countcc} = 1 each composing character is counted separately
+  call assert_equal(0, charidx(a, 0, 1))
+  call assert_equal(2, charidx(a, 2, 1))
+  call assert_equal(3, charidx(a, 4, 1))
+  call assert_equal(5, charidx(a, 7, 1))
+  call assert_equal(-1, charidx(a, 8, 1))
+  call assert_equal(-1, charidx('', 0, 1))
+
+  call assert_fails('let x = charidx([], 1)', 'E474:')
+  call assert_fails('let x = charidx("abc", [])', 'E474:')
+  call assert_fails('let x = charidx("abc", 1, [])', 'E474:')
+  call assert_fails('let x = charidx("abc", 1, -1)', 'E474:')
+  call assert_fails('let x = charidx("abc", 1, 2)', 'E474:')
+endfunc
+
 func Test_count()
  let l = ['a', 'a', 'A', 'b']
  call assert_equal(2, count(l, 'a'))