feat(mbyte): support extended grapheme clusters including more emoji
Use the grapheme break algorithm from utf8proc to support grapheme clusters from recent Unicode versions. Handle the variation selector VS16 (U+FE0F), which turns some codepoints into double-width emoji. This means we need to use ptr2cells() rather than char2cells() when possible.
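As a rough sketch of the intended effect (expected values under this change, assuming the VS16 and grapheme-cluster handling described above; strwidth() and strcharlen() are the usual Vimscript functions exposed through vim.fn, not APIs added by this commit):

    -- U+2764 HEAVY BLACK HEART alone keeps its default single-cell width.
    print(vim.fn.strwidth('\u{2764}'))                      -- expected: 1
    -- With VS16 (U+FE0F) the cluster requests emoji presentation, so it is
    -- measured via ptr2cells and should occupy two screen cells.
    print(vim.fn.strwidth('\u{2764}\u{FE0F}'))              -- expected: 2
    -- A ZWJ sequence (U+1F9D1 U+200D U+1F33E, "farmer") should be kept
    -- together as one extended grapheme cluster by the utf8proc algorithm.
    print(vim.fn.strcharlen('\u{1F9D1}\u{200D}\u{1F33E}'))  -- expected: 1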
@@ -3,8 +3,15 @@ local itp = t.gen_itp(it)

local ffi = t.ffi
local eq = t.eq
local to_cstr = t.to_cstr
local ok = t.ok

local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
local lib = t.cimport(
  './src/nvim/mbyte.h',
  './src/nvim/charset.h',
  './src/nvim/grid.h',
  './src/nvim/option_vars.h'
)

describe('mbyte', function()
  -- Convert from bytes to string
@@ -45,12 +52,21 @@ describe('mbyte', function()
    end)
  end

  describe('utfc_ptr2schar_len', function()
  describe('utfc_ptr2schar', function()
    local function test_seq(seq)
      local firstc = ffi.new('int[1]')
      local buf = ffi.new('char[32]')
      lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
      return { ffi.string(buf), firstc[0] }
      lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc))
      local str = ffi.string(buf)
      if 1 > 2 then -- for debugging
        local tabel = {}
        for i = 1, #str do
          table.insert(tabel, string.format('0x%02x', string.byte(str, i)))
        end
        print('{ ' .. table.concat(tabel, ', ') .. ' }')
        io.stdout:flush()
      end
      return { str, firstc[0] }
    end

    local function byte(val)
@@ -88,7 +104,9 @@ describe('mbyte', function()
      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })

      -- Combining character is U+0300
      eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 })
      -- invalid start byte for combining
      eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
@@ -102,18 +120,21 @@ describe('mbyte', function()
    itp('4-byte sequences', function()
      -- No following combining character
      eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
      eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 })
      -- No second UTF-8 character
      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })

      -- Combining character U+0300
      eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc })
      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc })

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
      -- No following UTF-8 character
      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
      -- Combining character U+0301
      eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 })
      -- U+0080 : not a valid start char
      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })

      -- One UTF-8 character
      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
@@ -126,36 +147,36 @@ describe('mbyte', function()
      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })

      -- Combining character U+0300
      eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 })
      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 })

      -- Combining characters U+0300 and U+0301
      eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 })
      eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 })
      -- Combining characters U+0300, U+0301, U+0302
      eq(
        { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
        { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
      )
      -- Combining characters U+0300, U+0301, U+0302, U+0303
      eq(
        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f },
        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 },
        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
      )
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
      eq(
        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f },
        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 },
        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
      )
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
      eq(
        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f },
        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 },
        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
      )

      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
      eq(
        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f },
        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 },
        test_seq {
          0x7f,
          0x29,
          0xcc,
          0x80,
          0xcc,
@@ -175,18 +196,18 @@ describe('mbyte', function()

      -- Only three following combining characters U+0300, U+0301, U+0302
      eq(
        { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
        { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
      )

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
      -- No following UTF-8 character
      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 })
      eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 })
      -- Combining character U+0301
      eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f })
      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f })
      -- Combining character U+0301
      eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc })
      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc })

      -- One UTF-8 character
      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
@@ -205,8 +226,6 @@ describe('mbyte', function()
  end)

  describe('utf_cp_bounds_len', function()
    local to_cstr = t.to_cstr

    local tests = {
      {
        name = 'for valid string',
@@ -273,4 +292,52 @@ describe('mbyte', function()
      eq(expected_offsets, { b = b_offsets, e = e_offsets })
    end)
  end)

  itp('utf_head_off', function()
    local function check(str, expected_glyphs)
      local len = #str
      local cstr = to_cstr(str)
      local breaks = { 0 } -- SOT
      local pos = 0
      local mb_glyphs = {}
      while pos < len do
        local clen = lib.utfc_ptr2len(cstr + pos)
        ok(clen > 0) -- otherwise we get stuck
        if clen > 1 then
          table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
        end
        pos = pos + clen
        table.insert(breaks, pos)
      end
      eq(breaks[#breaks], len) -- include EOT as break
      -- we could also send in breaks, but this is more human readable
      eq(mb_glyphs, expected_glyphs)

      for i = 1, #breaks - 1 do
        local start, next = breaks[i], breaks[i + 1]

        for p = start, next - 1 do
          eq(p - start, lib.utf_head_off(cstr, cstr + p))
        end
      end
      eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe
    end
    -- stylua doesn't like ZWJ chars..
    -- stylua: ignore start
    check('hej och hå 🧑‍🌾!', { 'å', '🧑‍🌾' })
    -- emoji only (various kinds of combinations, use g8 to see them)
    check("🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️", {"🏳️‍⚧️", "🧑‍🌾", "❤️", "😂", "🏴‍☠️"})
    check('🏳️‍⚧️xy🧑‍🌾\r❤️😂å🏴‍☠️', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', 'å', '🏴‍☠️', '' })

    check('🇦🅱️ 🇦🇽 🇦🇨🇦 🇲🇽🇹🇱',{'🇦', '🅱️', '🇦🇽', '🇦🇨', '🇦', '🇲🇽', '🇹🇱'})
    check('🏴🏴', {'🏴', '🏴'})

    lib.p_arshape = true -- default
    check('سلام', { 'س', 'لا', 'م' })
    lib.p_arshape = false
    check('سلام', { 'س', 'ل', 'ا', 'م' })

    check('L̓̉̑̒̌̚ơ̗̌̒̄̀ŕ̈̈̎̐̕è̇̅̄̄̐m̖̟̟̅̄̚', {'L̓̉̑̒̌̚', 'ơ̗̌̒̄̀', 'ŕ̈̈̎̐̕', 'è̇̅̄̄̐', 'm̖̟̟̅̄̚'})
    -- stylua: ignore end
  end)
end)