feat(mbyte): support extended grapheme clusters including more emoji

Use the grapheme break algorithm from utf8proc to support grapheme
clusters from recent unicode versions.

Handle variant selector VS16 turning some codepoints into double-width
emoji. This means we need to use ptr2cells rather than char2cells when
possible.
This commit is contained in:
bfredl
2024-08-08 10:42:08 +02:00
parent 4353996d0f
commit cfdf68a7ac
34 changed files with 657 additions and 221 deletions

View File

@@ -3,8 +3,15 @@ local itp = t.gen_itp(it)
local ffi = t.ffi
local eq = t.eq
local to_cstr = t.to_cstr
local ok = t.ok
local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
local lib = t.cimport(
'./src/nvim/mbyte.h',
'./src/nvim/charset.h',
'./src/nvim/grid.h',
'./src/nvim/option_vars.h'
)
describe('mbyte', function()
-- Convert from bytes to string
@@ -45,12 +52,21 @@ describe('mbyte', function()
end)
end
describe('utfc_ptr2schar_len', function()
describe('utfc_ptr2schar', function()
local function test_seq(seq)
local firstc = ffi.new('int[1]')
local buf = ffi.new('char[32]')
lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
return { ffi.string(buf), firstc[0] }
lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc))
local str = ffi.string(buf)
if 1 > 2 then -- for debugging
local tabel = {}
for i = 1, #str do
table.insert(tabel, string.format('0x%02x', string.byte(str, i)))
end
print('{ ' .. table.concat(tabel, ', ') .. ' }')
io.stdout:flush()
end
return { str, firstc[0] }
end
local function byte(val)
@@ -88,7 +104,9 @@ describe('mbyte', function()
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })
-- Combining character is U+0300
eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 })
-- invalid start byte for combining
eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
@@ -102,18 +120,21 @@ describe('mbyte', function()
itp('4-byte sequences', function()
-- No following combining character
eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 })
-- No second UTF-8 character
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })
-- Combining character U+0300
eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc })
eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc })
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
-- No following UTF-8 character
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
-- Combining character U+0301
eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 })
-- U+0080 : not a valid start char
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
-- One UTF-8 character
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
@@ -126,36 +147,36 @@ describe('mbyte', function()
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })
-- Combining character U+0300
eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 })
eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 })
-- Combining characters U+0300 and U+0301
eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 })
eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 })
-- Combining characters U+0300, U+0301, U+0302
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f },
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 },
test_seq {
0x7f,
0x29,
0xcc,
0x80,
0xcc,
@@ -175,18 +196,18 @@ describe('mbyte', function()
-- Only three following combining characters U+0300, U+0301, U+0302
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
)
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
-- No following UTF-8 character
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 })
eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 })
-- Combining character U+0301
eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f })
eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f })
-- Combining character U+0301
eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc })
eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc })
-- One UTF-8 character
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
@@ -205,8 +226,6 @@ describe('mbyte', function()
end)
describe('utf_cp_bounds_len', function()
local to_cstr = t.to_cstr
local tests = {
{
name = 'for valid string',
@@ -273,4 +292,52 @@ describe('mbyte', function()
eq(expected_offsets, { b = b_offsets, e = e_offsets })
end)
end)
itp('utf_head_off', function()
local function check(str, expected_glyphs)
local len = #str
local cstr = to_cstr(str)
local breaks = { 0 } -- SOT
local pos = 0
local mb_glyphs = {}
while pos < len do
local clen = lib.utfc_ptr2len(cstr + pos)
ok(clen > 0) -- otherwise we get stuck
if clen > 1 then
table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
end
pos = pos + clen
table.insert(breaks, pos)
end
eq(breaks[#breaks], len) -- include EOT as break
-- we could also send in breaks, but this is more human readable
eq(mb_glyphs, expected_glyphs)
for i = 1, #breaks - 1 do
local start, next = breaks[i], breaks[i + 1]
for p = start, next - 1 do
eq(p - start, lib.utf_head_off(cstr, cstr + p))
end
end
eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe
end
-- stylua doesn't like ZWJ chars..
-- stylua: ignore start
check('hej och hå 🧑‍🌾!', { 'å', '🧑‍🌾' })
-- emoji only (various kinds of combinations, use g8 to see them)
check("🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️", {"🏳️‍⚧️", "🧑‍🌾", "❤️", "😂", "🏴‍☠️"})
check('🏳xy🧑🌾\r❤️😂å🏴‍☠️€', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', 'å', '🏴‍☠️', '€' })
check('🇦🅱️ 🇦🇽 🇦🇨🇦 🇲🇽🇹🇱',{'🇦', '🅱️', '🇦🇽', '🇦🇨', '🇦', '🇲🇽', '🇹🇱'})
check('🏴󠁧󠁢󠁳󠁣󠁴󠁿🏴󠁧󠁢󠁷󠁬󠁳󠁿', {'🏴󠁧󠁢󠁳󠁣󠁴󠁿', '🏴󠁧󠁢󠁷󠁬󠁳󠁿'})
lib.p_arshape = true -- default
check('سلام', { 'س', 'لا', 'م' })
lib.p_arshape = false
check('سلام', { 'س', 'ل', 'ا', 'م' })
check('L̓̉̑̒̌̚ơ̗̌̒̄̀ŕ̈̈̎̐̕è̇̅̄̄̐m̖̟̟̅̄̚', {'L̓̉̑̒̌̚', 'ơ̗̌̒̄̀', 'ŕ̈̈̎̐̕', 'è̇̅̄̄̐', 'm̖̟̟̅̄̚'})
-- stylua: ignore end
end)
end)