feat(mbyte): support extended grapheme clusters including more emoji

Use the grapheme break algorithm from utf8proc to support grapheme
clusters from recent Unicode versions.

Handle the variation selector VS16 (U+FE0F), which turns some codepoints
into double-width emoji. Since the width of a character can now depend on
the codepoint that follows it, we need to use ptr2cells rather than
char2cells when possible.
This commit is contained in:
bfredl
2024-08-08 10:42:08 +02:00
parent 4353996d0f
commit cfdf68a7ac
34 changed files with 657 additions and 221 deletions

View File

@@ -1435,6 +1435,28 @@ describe('API', function()
it('cannot handle NULs', function()
eq(0, api.nvim_strwidth('\0abc'))
end)
it('can handle emoji with variant selectors and ZWJ', function()
local selector = '❤️'
eq(2, fn.strchars(selector))
eq(1, fn.strcharlen(selector))
eq(2, api.nvim_strwidth(selector))
local no_selector = ''
eq(1, fn.strchars(no_selector))
eq(1, fn.strcharlen(no_selector))
eq(1, api.nvim_strwidth(no_selector))
local selector_zwj_selector = '🏳️‍⚧️'
eq(5, fn.strchars(selector_zwj_selector))
eq(1, fn.strcharlen(selector_zwj_selector))
eq(2, api.nvim_strwidth(selector_zwj_selector))
local emoji_zwj_emoji = '🧑‍🌾'
eq(3, fn.strchars(emoji_zwj_emoji))
eq(1, fn.strcharlen(emoji_zwj_emoji))
eq(2, api.nvim_strwidth(emoji_zwj_emoji))
end)
end)
describe('nvim_get_current_line, nvim_set_current_line', function()

View File

@@ -5620,6 +5620,27 @@ l5
]]
})
end)
it('supports emoji as signs', function()
insert(example_test3)
feed 'gg'
api.nvim_buf_set_extmark(0, ns, 1, 0, {sign_text='🧑‍🌾'})
-- VS16 can change width of character
api.nvim_buf_set_extmark(0, ns, 2, 0, {sign_text='❤️'})
api.nvim_buf_set_extmark(0, ns, 3, 0, {sign_text=''})
api.nvim_buf_set_extmark(0, ns, 4, 0, {sign_text='❤x'})
screen:expect([[
{7: }^l1 |
🧑🌾l2 |
l3 |
❤ l4 |
❤xl5 |
{7: } |
{1:~ }|*3
|
]])
eq("Invalid 'sign_text'", pcall_err(api.nvim_buf_set_extmark, 0, ns, 5, 0, {sign_text='x'}))
end)
end)
describe('decorations: virt_text', function()

View File

@@ -1436,6 +1436,41 @@ vimComment xxx match /\s"[^\-:.%#=*].*$/ms=s+1,lc=1 excludenl contains=@vim
}
end)
it('supports nvim_echo messages with emoji', function()
-- stylua: ignore
async_meths.nvim_echo(
{ { 'wow, 🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️\nvariant ❤️ one\nvariant ❤ two' } }, true, {}
)
screen:expect([[
|
{1:~ }|
{3: }|
wow, 🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️ |
variant ❤️ one |
variant ❤ two |
{6:Press ENTER or type command to continue}^ |
]])
feed '<cr>'
screen:expect([[
^ |
{1:~ }|*5
|
]])
feed ':messages<cr>'
screen:expect([[
|
{1:~ }|
{3: }|
wow, 🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️ |
variant ❤️ one |
variant ❤ two |
{6:Press ENTER or type command to continue}^ |
]])
end)
it('prints lines in Ex mode correctly with a burst of carriage returns #19341', function()
command('set number')
api.nvim_buf_set_lines(0, 0, 0, true, { 'aaa', 'bbb', 'ccc' })

View File

@@ -296,6 +296,86 @@ describe('multibyte rendering', function()
]],
}
end)
it('supports emoji with variant selectors and ZWJ', function()
command('set ruler')
insert('🏳️‍⚧️')
screen:expect([[
^🏳️‍⚧️ |
{1:~ }|*4
1,1 All |
]])
feed('a word<esc>')
screen:expect([[
🏳️‍⚧️ wor^d |
{1:~ }|*4
1,21-7 All |
]])
feed('0')
screen:expect([[
^🏳️‍⚧️ word |
{1:~ }|*4
1,1 All |
]])
feed('l')
screen:expect([[
🏳️‍⚧️^ word |
{1:~ }|*4
1,17-3 All |
]])
feed('h')
screen:expect([[
^🏳️‍⚧️ word |
{1:~ }|*4
1,1 All |
]])
feed('o❤ variant selected<esc>')
screen:expect([[
🏳️‍⚧️ word |
❤️ variant selecte^d |
{1:~ }|*3
2,23-19 All |
]])
feed('0')
screen:expect([[
🏳️‍⚧️ word |
^❤️ variant selected |
{1:~ }|*3
2,1 All |
]])
feed('l')
screen:expect([[
🏳️‍⚧️ word |
❤️^ variant selected |
{1:~ }|*3
2,7-3 All |
]])
feed('h')
screen:expect([[
🏳️‍⚧️ word |
^❤️ variant selected |
{1:~ }|*3
2,1 All |
]])
-- without selector: single width (note column 18 and not 19)
feed('o❤ variant selected<esc>')
screen:expect([[
🏳️‍⚧️ word |
❤️ variant selected |
❤ variant selecte^d |
{1:~ }|*2
3,20-18 All |
]])
end)
end)
describe('multibyte rendering: statusline', function()
@@ -348,11 +428,12 @@ describe('multibyte rendering: statusline', function()
it('non-printable followed by MAX_MCO unicode combination points', function()
command('set statusline=Ÿ̸⃯ᷰ⃐⃧⃝')
-- U+9F + U+1DF0 + U+20EF + U+0338 + U+20D0 + U+20E7 + U+20DD
-- TODO: not ideal, better with plain ">" and then space+combining
screen:expect([[
^ |
{1:~ }|
{3:<9f><1df0><20ef><0338><20d0><20e7><20dd>}|
|
^ |
{1:~ }|
{3:<9f≯⃯ᷰ⃐⃧⃝ }|
|
]])
end)
@@ -368,9 +449,20 @@ describe('multibyte rendering: statusline', function()
}
end)
it('unprintable chars in filename with default stl', function()
it('emoji with ZWJ in filename with default stl', function()
command('file 🧑‍💻')
-- TODO: this is wrong but avoids a crash
screen:expect {
grid = [[
^ |
{1:~ }|
{3:🧑‍💻 }|
|
]],
}
end)
it('unprintable chars in filename with default stl', function()
command('file 🧑​💻')
screen:expect {
grid = [[
^ |
@@ -381,15 +473,27 @@ describe('multibyte rendering: statusline', function()
}
end)
it('unprintable chars in filename with custom stl', function()
it('emoji with ZWJ in filename with custom stl', function()
command('set statusline=xx%#ErrorMsg#%f%##yy')
command('file 🧑‍💻')
-- TODO: this is also wrong but also avoids a crash
screen:expect {
grid = [[
^ |
{1:~ }|
{3:xx}{9:🧑<200d>💻}{3:yy }|
{3:xx}{9:🧑💻}{3:yy }|
|
]],
}
end)
it('unprintable chars in filename with custom stl', function()
command('set statusline=xx%#ErrorMsg#%f%##yy')
command('file 🧑​💻')
screen:expect {
grid = [[
^ |
{1:~ }|
{3:xx}{9:🧑<200b>💻}{3:yy }|
|
]],
}

View File

@@ -3663,7 +3663,7 @@ func Test_string_reverse()
call assert_equal('', reverse(v:_null_string))
for [s1, s2] in [['', ''], ['a', 'a'], ['ab', 'ba'], ['abc', 'cba'],
\ ['abcd', 'dcba'], ['«-«-»-»', '»-»-«-«'],
\ ['🇦', '🇦'], ['🇦🇧', '🇧🇦'], ['🇦🇧🇨', '🇨🇧🇦'],
\ ['🇦', '🇦'], ['🇦🇧', '🇦🇧'], ['🇦🇧🇨', '🇨🇦🇧'],
\ ['🇦«🇧-🇨»🇩', '🇩»🇨-🇧«🇦']]
call assert_equal(s2, reverse(s1))
endfor

View File

@@ -3897,9 +3897,9 @@ func Test_normal_count_after_operator()
bw!
endfunc
func Test_normal_gj_on_extra_wide_char()
func Test_normal_gj_on_6_cell_wide_unprintable_char()
new | 25vsp
let text='1 foooooooo ar e inszwe1 foooooooo inszwei' .
let text='1 foooooooo ar e inszwe1 foooooooo inszwei' .
\ ' i drei vier fünf sechs sieben acht un zehn elf zwöfl' .
\ ' dreizehn v ierzehn fünfzehn'
put =text

View File

@@ -3,8 +3,15 @@ local itp = t.gen_itp(it)
local ffi = t.ffi
local eq = t.eq
local to_cstr = t.to_cstr
local ok = t.ok
local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
local lib = t.cimport(
'./src/nvim/mbyte.h',
'./src/nvim/charset.h',
'./src/nvim/grid.h',
'./src/nvim/option_vars.h'
)
describe('mbyte', function()
-- Convert from bytes to string
@@ -45,12 +52,21 @@ describe('mbyte', function()
end)
end
describe('utfc_ptr2schar_len', function()
describe('utfc_ptr2schar', function()
local function test_seq(seq)
local firstc = ffi.new('int[1]')
local buf = ffi.new('char[32]')
lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
return { ffi.string(buf), firstc[0] }
lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc))
local str = ffi.string(buf)
if 1 > 2 then -- for debugging
local tabel = {}
for i = 1, #str do
table.insert(tabel, string.format('0x%02x', string.byte(str, i)))
end
print('{ ' .. table.concat(tabel, ', ') .. ' }')
io.stdout:flush()
end
return { str, firstc[0] }
end
local function byte(val)
@@ -88,7 +104,9 @@ describe('mbyte', function()
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })
-- Combining character is U+0300
eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 })
-- invalid start byte for combining
eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
@@ -102,18 +120,21 @@ describe('mbyte', function()
itp('4-byte sequences', function()
-- No following combining character
eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 })
-- No second UTF-8 character
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })
-- Combining character U+0300
eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc })
eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc })
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
-- No following UTF-8 character
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
-- Combining character U+0301
eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 })
-- U+0080 : not a valid start char
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
-- One UTF-8 character
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
@@ -126,36 +147,36 @@ describe('mbyte', function()
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })
-- Combining character U+0300
eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 })
eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 })
-- Combining characters U+0300 and U+0301
eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 })
eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 })
-- Combining characters U+0300, U+0301, U+0302
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f },
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 },
test_seq {
0x7f,
0x29,
0xcc,
0x80,
0xcc,
@@ -175,18 +196,18 @@ describe('mbyte', function()
-- Only three following combining characters U+0300, U+0301, U+0302
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
)
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
-- No following UTF-8 character
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 })
eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 })
-- Combining character U+0301
eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f })
eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f })
-- Combining character U+0301
eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc })
eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc })
-- One UTF-8 character
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
@@ -205,8 +226,6 @@ describe('mbyte', function()
end)
describe('utf_cp_bounds_len', function()
local to_cstr = t.to_cstr
local tests = {
{
name = 'for valid string',
@@ -273,4 +292,52 @@ describe('mbyte', function()
eq(expected_offsets, { b = b_offsets, e = e_offsets })
end)
end)
itp('utf_head_off', function()
-- Segments `str` into clusters with utfc_ptr2len() and cross-checks
-- utf_head_off() against the resulting boundaries.
--
-- str:             Lua string to walk.
-- expected_glyphs: list of the clusters longer than one byte that the walk
--                  should produce, in order.
local function check(str, expected_glyphs)
  local len = #str
  local cstr = to_cstr(str)
  local breaks = { 0 } -- SOT
  local pos = 0
  local mb_glyphs = {}
  while pos < len do
    local clen = lib.utfc_ptr2len(cstr + pos)
    ok(clen > 0) -- otherwise we get stuck
    if clen > 1 then
      -- string.sub() is 1-based and inclusive, hence the +1 on the start
      table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
    end
    pos = pos + clen
    table.insert(breaks, pos)
  end
  -- eq() takes (expected, actual)
  eq(len, breaks[#breaks]) -- include EOT as break
  -- we could also send in breaks, but this is more human readable
  eq(expected_glyphs, mb_glyphs)
  for i = 1, #breaks - 1 do
    -- "stop" rather than "next", so the builtin next() is not shadowed
    local start, stop = breaks[i], breaks[i + 1]
    for p = start, stop - 1 do
      -- every byte of a cluster reports its distance from the cluster start
      eq(p - start, lib.utf_head_off(cstr, cstr + p))
    end
  end
  eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe
end
-- stylua doesn't like ZWJ chars..
-- stylua: ignore start
check('hej och hå 🧑‍🌾!', { 'å', '🧑‍🌾' })
-- emoji only (various kinds of combinations, use g8 to see them)
check("🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️", {"🏳️‍⚧️", "🧑‍🌾", "❤️", "😂", "🏴‍☠️"})
check('🏳xy🧑🌾\r❤️😂å🏴‍☠️€', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', 'å', '🏴‍☠️', '€' })
check('🇦🅱️ 🇦🇽 🇦🇨🇦 🇲🇽🇹🇱',{'🇦', '🅱️', '🇦🇽', '🇦🇨', '🇦', '🇲🇽', '🇹🇱'})
check('🏴󠁧󠁢󠁳󠁣󠁴󠁿🏴󠁧󠁢󠁷󠁬󠁳󠁿', {'🏴󠁧󠁢󠁳󠁣󠁴󠁿', '🏴󠁧󠁢󠁷󠁬󠁳󠁿'})
lib.p_arshape = true -- default
check('سلام', { 'س', 'لا', 'م' })
lib.p_arshape = false
check('سلام', { 'س', 'ل', 'ا', 'م' })
check('L̓̉̑̒̌̚ơ̗̌̒̄̀ŕ̈̈̎̐̕è̇̅̄̄̐m̖̟̟̅̄̚', {'L̓̉̑̒̌̚', 'ơ̗̌̒̄̀', 'ŕ̈̈̎̐̕', 'è̇̅̄̄̐', 'm̖̟̟̅̄̚'})
-- stylua: ignore end
end)
end)