feat(mbyte): support extended grapheme clusters including more emoji

Use the grapheme break algorithm from utf8proc to support grapheme clusters from recent Unicode versions. Handle the variation selector VS16, which turns some codepoints into double-width emoji. Because the width of a character can now depend on the codepoints that follow it, we need to use ptr2cells rather than char2cells when possible.
2025-11-28 05:00:44 +00:00 · 2024-08-08 10:42:08 +02:00
parent 4353996d0f
commit cfdf68a7ac
34 changed files with 657 additions and 221 deletions
--- a/test/functional/api/vim_spec.lua
+++ b/test/functional/api/vim_spec.lua
@@ -1435,6 +1435,28 @@ describe('API', function()
    it('cannot handle NULs', function()
      eq(0, api.nvim_strwidth('\0abc'))
    end)
+
+    it('can handle emoji with variant selectors and ZWJ', function()
+      local selector = '❤️'
+      eq(2, fn.strchars(selector))
+      eq(1, fn.strcharlen(selector))
+      eq(2, api.nvim_strwidth(selector))
+
+      local no_selector = '❤'
+      eq(1, fn.strchars(no_selector))
+      eq(1, fn.strcharlen(no_selector))
+      eq(1, api.nvim_strwidth(no_selector))
+
+      local selector_zwj_selector = '🏳️‍⚧️'
+      eq(5, fn.strchars(selector_zwj_selector))
+      eq(1, fn.strcharlen(selector_zwj_selector))
+      eq(2, api.nvim_strwidth(selector_zwj_selector))
+
+      local emoji_zwj_emoji = '🧑‍🌾'
+      eq(3, fn.strchars(emoji_zwj_emoji))
+      eq(1, fn.strcharlen(emoji_zwj_emoji))
+      eq(2, api.nvim_strwidth(emoji_zwj_emoji))
+    end)
  end)

  describe('nvim_get_current_line, nvim_set_current_line', function()
--- a/test/functional/ui/decorations_spec.lua
+++ b/test/functional/ui/decorations_spec.lua
@@ -5620,6 +5620,27 @@ l5
      ]]
    })
  end)
+
+  it('supports emoji as signs', function()
+    insert(example_test3)
+    feed 'gg'
+    api.nvim_buf_set_extmark(0, ns, 1, 0, {sign_text='🧑‍🌾'})
+    -- VS16 can change width of character
+    api.nvim_buf_set_extmark(0, ns, 2, 0, {sign_text='❤️'})
+    api.nvim_buf_set_extmark(0, ns, 3, 0, {sign_text='❤'})
+    api.nvim_buf_set_extmark(0, ns, 4, 0, {sign_text='❤x'})
+    screen:expect([[
+      {7:  }^l1                                              |
+      🧑‍🌾l2                                              |
+      ❤️l3                                              |
+      ❤ l4                                              |
+      ❤xl5                                              |
+      {7:  }                                                |
+      {1:~                                                 }|*3
+                                                        |
+    ]])
+    eq("Invalid 'sign_text'", pcall_err(api.nvim_buf_set_extmark, 0, ns, 5, 0, {sign_text='❤️x'}))
+  end)
 end)

 describe('decorations: virt_text', function()
--- a/test/functional/ui/messages_spec.lua
+++ b/test/functional/ui/messages_spec.lua
@@ -1436,6 +1436,41 @@ vimComment     xxx match /\s"[^\-:.%#=*].*$/ms=s+1,lc=1  excludenl contains=@vim
    }
  end)

+  it('supports nvim_echo messages with emoji', function()
+    -- stylua: ignore
+    async_meths.nvim_echo(
+      { { 'wow, 🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️\nvariant ❤️ one\nvariant ❤ two' } }, true, {}
+    )
+
+    screen:expect([[
+                                                                  |
+      {1:~                                                           }|
+      {3:                                                            }|
+      wow, 🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️                                             |
+      variant ❤️ one                                              |
+      variant ❤ two                                               |
+      {6:Press ENTER or type command to continue}^                     |
+    ]])
+
+    feed '<cr>'
+    screen:expect([[
+      ^                                                            |
+      {1:~                                                           }|*5
+                                                                  |
+    ]])
+
+    feed ':messages<cr>'
+    screen:expect([[
+                                                                  |
+      {1:~                                                           }|
+      {3:                                                            }|
+      wow, 🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️                                             |
+      variant ❤️ one                                              |
+      variant ❤ two                                               |
+      {6:Press ENTER or type command to continue}^                     |
+    ]])
+  end)
+
  it('prints lines in Ex mode correctly with a burst of carriage returns #19341', function()
    command('set number')
    api.nvim_buf_set_lines(0, 0, 0, true, { 'aaa', 'bbb', 'ccc' })
--- a/test/functional/ui/multibyte_spec.lua
+++ b/test/functional/ui/multibyte_spec.lua
@@ -296,6 +296,86 @@ describe('multibyte rendering', function()
    ]],
    }
  end)
+
+  it('supports emoji with variant selectors and ZWJ', function()
+    command('set ruler')
+    insert('🏳️‍⚧️')
+    screen:expect([[
+      ^🏳️‍⚧️                                                          |
+      {1:~                                                           }|*4
+                                                1,1           All |
+    ]])
+
+    feed('a word<esc>')
+    screen:expect([[
+      🏳️‍⚧️ wor^d                                                     |
+      {1:~                                                           }|*4
+                                                1,21-7        All |
+    ]])
+
+    feed('0')
+    screen:expect([[
+      ^🏳️‍⚧️ word                                                     |
+      {1:~                                                           }|*4
+                                                1,1           All |
+    ]])
+
+    feed('l')
+    screen:expect([[
+        🏳️‍⚧️^ word                                                     |
+        {1:~                                                           }|*4
+                                                  1,17-3        All |
+    ]])
+
+    feed('h')
+    screen:expect([[
+      ^🏳️‍⚧️ word                                                     |
+      {1:~                                                           }|*4
+                                                1,1           All |
+    ]])
+
+    feed('o❤️ variant selected<esc>')
+    screen:expect([[
+      🏳️‍⚧️ word                                                     |
+      ❤️ variant selecte^d                                         |
+      {1:~                                                           }|*3
+                                                2,23-19       All |
+    ]])
+
+    feed('0')
+    screen:expect([[
+      🏳️‍⚧️ word                                                     |
+      ^❤️ variant selected                                         |
+      {1:~                                                           }|*3
+                                                2,1           All |
+    ]])
+
+    feed('l')
+    screen:expect([[
+      🏳️‍⚧️ word                                                     |
+      ❤️^ variant selected                                         |
+      {1:~                                                           }|*3
+                                                2,7-3         All |
+    ]])
+
+    feed('h')
+    screen:expect([[
+      🏳️‍⚧️ word                                                     |
+      ^❤️ variant selected                                         |
+      {1:~                                                           }|*3
+                                                2,1           All |
+    ]])
+
+    -- without selector: single width (note column 18 and not 19)
+    feed('o❤ variant selected<esc>')
+    screen:expect([[
+      🏳️‍⚧️ word                                                     |
+      ❤️ variant selected                                         |
+      ❤ variant selecte^d                                          |
+      {1:~                                                           }|*2
+                                                3,20-18       All |
+    ]])
+  end)
 end)

 describe('multibyte rendering: statusline', function()
@@ -348,11 +428,12 @@ describe('multibyte rendering: statusline', function()
  it('non-printable followed by MAX_MCO unicode combination points', function()
    command('set statusline≠⃯ᷰ⃐⃧⃝')
    -- U+9F + U+1DF0 + U+20EF + U+0338 + U+20D0 + U+20E7 + U+20DD
+    -- TODO: not ideal; it would be better to show a plain ">" followed by a space carrying the combining chars
    screen:expect([[
-    ^                                        |
-    {1:~                                       }|
-    {3:<9f><1df0><20ef><0338><20d0><20e7><20dd>}|
-                                            |
+      ^                                        |
+      {1:~                                       }|
+      {3:<9f≯⃯ᷰ⃐⃧⃝                                    }|
+                                              |
    ]])
  end)

@@ -368,9 +449,20 @@ describe('multibyte rendering: statusline', function()
    }
  end)

-  it('unprintable chars in filename with default stl', function()
+  it('emoji with ZWJ in filename with default stl', function()
    command('file 🧑‍💻')
-    -- TODO: this is wrong but avoids a crash
+    screen:expect {
+      grid = [[
+      ^                                        |
+      {1:~                                       }|
+      {3:🧑‍💻                                      }|
+                                              |
+    ]],
+    }
+  end)
+
+  it('unprintable chars in filename with default stl', function()
+    command('file 🧑💻')
    screen:expect {
      grid = [[
      ^                                        |
@@ -381,15 +473,27 @@ describe('multibyte rendering: statusline', function()
    }
  end)

-  it('unprintable chars in filename with custom stl', function()
+  it('emoji with ZWJ in filename with custom stl', function()
    command('set statusline=xx%#ErrorMsg#%f%##yy')
    command('file 🧑‍💻')
-    -- TODO: this is also wrong but also avoids a crash
    screen:expect {
      grid = [[
      ^                                        |
      {1:~                                       }|
-      {3:xx}{9:🧑<200d>💻}{3:yy                          }|
+      {3:xx}{9:🧑‍💻}{3:yy                                  }|
+                                              |
+    ]],
+    }
+  end)
+
+  it('unprintable chars in filename with custom stl', function()
+    command('set statusline=xx%#ErrorMsg#%f%##yy')
+    command('file 🧑💻')
+    screen:expect {
+      grid = [[
+      ^                                        |
+      {1:~                                       }|
+      {3:xx}{9:🧑<200b>💻}{3:yy                          }|
                                              |
    ]],
    }
--- a/test/old/testdir/test_functions.vim
+++ b/test/old/testdir/test_functions.vim
@@ -3663,7 +3663,7 @@ func Test_string_reverse()
    call assert_equal('', reverse(v:_null_string))
    for [s1, s2] in [['', ''], ['a', 'a'], ['ab', 'ba'], ['abc', 'cba'],
                   \ ['abcd', 'dcba'], ['«-«-»-»', '»-»-«-«'],
-                   \ ['🇦', '🇦'], ['🇦🇧', '🇧🇦'], ['🇦🇧🇨', '🇨🇧🇦'],
+                   \ ['🇦', '🇦'], ['🇦🇧', '🇦🇧'], ['🇦🇧🇨', '🇨🇦🇧'],
                   \ ['🇦«🇧-🇨»🇩', '🇩»🇨-🇧«🇦']]
      call assert_equal(s2, reverse(s1))
    endfor
--- a/test/old/testdir/test_normal.vim
+++ b/test/old/testdir/test_normal.vim
@@ -3897,9 +3897,9 @@ func Test_normal_count_after_operator()
  bw!
 endfunc

-func Test_normal_gj_on_extra_wide_char()
+func Test_normal_gj_on_6_cell_wide_unprintable_char()
  new | 25vsp
-  let text='1 foooooooo ar e  ins‍zwe1 foooooooo ins‍zwei' .
+  let text='1 foooooooo ar e  inszwe1 foooooooo inszwei' .
         \ ' i drei vier fünf sechs sieben acht un zehn elf zwöfl' .
         \ ' dreizehn v ierzehn fünfzehn'
  put =text
--- a/test/unit/mbyte_spec.lua
+++ b/test/unit/mbyte_spec.lua
@@ -3,8 +3,15 @@ local itp = t.gen_itp(it)

 local ffi = t.ffi
 local eq = t.eq
+local to_cstr = t.to_cstr
+local ok = t.ok

-local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
+local lib = t.cimport(
+  './src/nvim/mbyte.h',
+  './src/nvim/charset.h',
+  './src/nvim/grid.h',
+  './src/nvim/option_vars.h'
+)

 describe('mbyte', function()
  -- Convert from bytes to string
@@ -45,12 +52,21 @@ describe('mbyte', function()
    end)
  end

-  describe('utfc_ptr2schar_len', function()
+  describe('utfc_ptr2schar', function()
    local function test_seq(seq)
      local firstc = ffi.new('int[1]')
      local buf = ffi.new('char[32]')
-      lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
-      return { ffi.string(buf), firstc[0] }
+      lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc))
+      local str = ffi.string(buf) -- NUL-terminated contents of buf as a Lua string
+      if 1 > 2 then -- never true; flip the condition to print str as hex bytes when debugging
+        local tabel = {}
+        for i = 1, #str do
+          table.insert(tabel, string.format('0x%02x', string.byte(str, i)))
+        end
+        print('{ ' .. table.concat(tabel, ', ') .. ' }')
+        io.stdout:flush()
+      end
+      return { str, firstc[0] }
    end

    local function byte(val)
@@ -88,7 +104,9 @@ describe('mbyte', function()
      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })

      -- Combining character is U+0300
-      eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
+      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 })
+      -- invalid start byte for combining
+      eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
@@ -102,18 +120,21 @@ describe('mbyte', function()
    itp('4-byte sequences', function()
      -- No following combining character
      eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
+      eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 })
      -- No second UTF-8 character
      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })

      -- Combining character U+0300
-      eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc })
+      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc })

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
      -- No following UTF-8 character
      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
      -- Combining character U+0301
-      eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
+      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 })
+      -- U+0080 : not a valid start char
+      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })

      -- One UTF-8 character
      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
@@ -126,36 +147,36 @@ describe('mbyte', function()
      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })

      -- Combining character U+0300
-      eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 })
+      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 })

      -- Combining characters U+0300 and U+0301
-      eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 })
+      eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 })
      -- Combining characters U+0300, U+0301, U+0302
      eq(
-        { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
-        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
+        { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
+        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
      )
      -- Combining characters U+0300, U+0301, U+0302, U+0303
      eq(
-        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f },
-        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
+        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 },
+        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
      )
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
      eq(
-        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f },
-        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
+        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 },
+        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
      )
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
      eq(
-        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f },
-        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
+        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 },
+        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
      )

      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
      eq(
-        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f },
+        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 },
        test_seq {
-          0x7f,
+          0x29,
          0xcc,
          0x80,
          0xcc,
@@ -175,18 +196,18 @@ describe('mbyte', function()

      -- Only three following combining characters U+0300, U+0301, U+0302
      eq(
-        { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
-        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
+        { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
+        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
      )

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
      -- No following UTF-8 character
-      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 })
+      eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 })
      -- Combining character U+0301
-      eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f })
+      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f })
      -- Combining character U+0301
-      eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc })
+      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc })

      -- One UTF-8 character
      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
@@ -205,8 +226,6 @@ describe('mbyte', function()
  end)

  describe('utf_cp_bounds_len', function()
-    local to_cstr = t.to_cstr
-
    local tests = {
      {
        name = 'for valid string',
@@ -273,4 +292,52 @@ describe('mbyte', function()
      eq(expected_offsets, { b = b_offsets, e = e_offsets })
    end)
  end)
+
+  itp('utf_head_off', function()
+    local function check(str, expected_glyphs) -- split str into clusters, then verify utf_head_off per byte
+      local len = #str
+      local cstr = to_cstr(str)
+      local breaks = { 0 } -- SOT
+      local pos = 0
+      local mb_glyphs = {} -- clusters longer than one byte, in order of appearance
+      while pos < len do
+        local clen = lib.utfc_ptr2len(cstr + pos)
+        ok(clen > 0) -- otherwise we get stuck
+        if clen > 1 then
+          table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
+        end
+        pos = pos + clen
+        table.insert(breaks, pos)
+      end
+      eq(len, breaks[#breaks]) -- include EOT as break
+      -- we could also send in breaks, but this is more human readable
+      eq(expected_glyphs, mb_glyphs) -- expected value first, like every other eq() in this file
+
+      for i = 1, #breaks - 1 do
+        local start, next_break = breaks[i], breaks[i + 1] -- byte range [start, next_break) is one cluster
+
+        for p = start, next_break - 1 do
+          eq(p - start, lib.utf_head_off(cstr, cstr + p))
+        end
+      end
+      eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe
+    end
+    -- stylua doesn't like ZWJ chars..
+    -- stylua: ignore start
+    check('hej och hå 🧑‍🌾!', { 'å', '🧑‍🌾' })
+    -- emoji only (various kinds of combinations, use g8 to see them)
+    check("🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️", {"🏳️‍⚧️", "🧑‍🌾", "❤️", "😂", "🏴‍☠️"})
+    check('🏳️‍⚧️xy🧑‍🌾\r❤️😂å🏴‍☠️', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', 'å', '🏴‍☠️', '' })
+
+    check('🇦🅱️ 🇦🇽 🇦🇨🇦 🇲🇽🇹🇱',{'🇦', '🅱️', '🇦🇽', '🇦🇨', '🇦', '🇲🇽', '🇹🇱'})
+    check('🏴󠁧󠁢󠁳󠁣󠁴󠁿🏴󠁧󠁢󠁷󠁬󠁳󠁿', {'🏴󠁧󠁢󠁳󠁣󠁴󠁿', '🏴󠁧󠁢󠁷󠁬󠁳󠁿'})
+
+    lib.p_arshape = true -- default
+    check('سلام', { 'س', 'لا', 'م' })
+    lib.p_arshape = false
+    check('سلام', { 'س', 'ل', 'ا', 'م' })
+
+    check('L̓̉̑̒̌̚ơ̗̌̒̄̀ŕ̈̈̎̐̕è̇̅̄̄̐m̖̟̟̅̄̚', {'L̓̉̑̒̌̚', 'ơ̗̌̒̄̀', 'ŕ̈̈̎̐̕', 'è̇̅̄̄̐', 'm̖̟̟̅̄̚'})
+    -- stylua: ignore end
+  end)
 end)