refactor(grid): make screen rendering more multibyte than ever before

Problem: buffer text with composing chars is converted from UTF-8
to an array of up to seven UTF-32 values and then converted back
to UTF-8 strings.

Solution: Convert buffer text directly to UTF-8 based schar_T values.

The limit of the text size is now in schar_T bytes, which is currently
31+1 but could easily be raised, as it no longer multiplies the size
of the entire screen grid when not used; the full size is only required
for temporary scratch buffers.

This also does some general cleanup to win_line text handling, which was
unnecessarily complicated due to multibyte rendering being an "opt-in"
feature long ago. Nowadays, a char is just a char, regardless of whether
it consists of one ASCII byte or multiple bytes.
This commit is contained in:
bfredl
2023-11-06 14:52:27 +01:00
parent 20ec4c776a
commit b522cb1ac3
26 changed files with 399 additions and 602 deletions

View File

@@ -4,17 +4,9 @@ local itp = helpers.gen_itp(it)
local ffi = helpers.ffi
local eq = helpers.eq
local mbyte = helpers.cimport("./src/nvim/mbyte.h")
local charset = helpers.cimport('./src/nvim/charset.h')
local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
describe('mbyte', function()
-- Array for composing characters
local intp = ffi.typeof('int[?]')
local function to_intp()
  -- Allocate a fresh `int[7]` for composing characters.
  -- how to get MAX_MCO from globals.h?
  -- With a single initializer, LuaJIT's FFI repeats it for every element,
  -- so all seven slots start out as 1 rather than 0.
  local slot_count = 7
  local fill_value = 1
  return intp(slot_count, fill_value)
end
-- Convert from bytes to string
local function to_string(bytes)
local s = {}
@@ -30,14 +22,14 @@ describe('mbyte', function()
itp('utf_ptr2char', function()
-- For strings with length 1 the first byte is returned.
for c = 0, 255 do
eq(c, mbyte.utf_ptr2char(to_string({c, 0})))
eq(c, lib.utf_ptr2char(to_string({c, 0})))
end
-- Some ill formed byte sequences that should not be recognized as UTF-8
-- First byte: 0xc0 or 0xc1
-- Second byte: 0x80 .. 0xbf
--eq(0x00c0, mbyte.utf_ptr2char(to_string({0xc0, 0x80})))
--eq(0x00c1, mbyte.utf_ptr2char(to_string({0xc1, 0xbf})))
--eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80})))
--eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf})))
--
-- Sequences with more than four bytes
end)
@@ -47,240 +39,133 @@ describe('mbyte', function()
local char_p = ffi.typeof('char[?]')
for c = n * 0x1000, n * 0x1000 + 0xFFF do
local p = char_p(4, 0)
mbyte.utf_char2bytes(c, p)
eq(c, mbyte.utf_ptr2char(p))
eq(charset.vim_iswordc(c), charset.vim_iswordp(p))
lib.utf_char2bytes(c, p)
eq(c, lib.utf_ptr2char(p))
eq(lib.vim_iswordc(c), lib.vim_iswordp(p))
end
end)
end
describe('utfc_ptr2char_len', function()
describe('utfc_ptr2schar_len', function()
--- Run `utfc_ptr2schar_len` on a byte sequence and unpack the result.
-- @param seq array of byte values; its length `#seq` is passed as the
--            length argument to `utfc_ptr2schar_len`
-- @return a two-element table: the string that `schar_get` wrote into the
--         scratch buffer, and the value stored through the `firstc`
--         out-parameter
local function test_seq(seq)
  -- One-element int array used as an out-parameter.
  local first_out = ffi.new("int[1]")
  -- Scratch buffer that schar_get fills; ffi.string reads it up to the
  -- first NUL byte.
  -- NOTE(review): 32 presumably matches the maximum schar_T text size
  -- (31 bytes + NUL) -- confirm against grid.h.
  local text_out = ffi.new("char[32]")
  local input = to_string(seq)
  local sc = lib.utfc_ptr2schar_len(input, #seq, first_out)
  lib.schar_get(text_out, sc)
  return {ffi.string(text_out), first_out[0]}
end
--- Build the expected `test_seq` result for a single-byte character.
-- @param val numeric byte value
-- @return a two-element table: the one-byte string for `val`, and `val`
local function byte(val)
  local as_text = string.char(val)
  return {as_text, val}
end
itp('1-byte sequences', function()
local pcc = to_intp()
for c = 0, 255 do
eq(c, mbyte.utfc_ptr2char_len(to_string({c}), pcc, 1))
eq(0, pcc[0])
eq({'', 0}, test_seq{0})
for c = 1, 127 do
eq(byte(c), test_seq{c})
end
for c = 128, 255 do
eq({'', c}, test_seq{c})
end
end)
itp('2-byte sequences', function()
local pcc = to_intp()
-- No combining characters
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f}), pcc, 2))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0x7f})
-- No combining characters
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80}), pcc, 2))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0x80})
-- No UTF-8 sequence
pcc = to_intp()
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f}), pcc, 2))
eq(0, pcc[0])
eq({'', 0xc2}, test_seq{0xc2, 0x7f})
-- One UTF-8 character
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80}), pcc, 2))
eq(0, pcc[0])
eq({'\xc2\x80', 0x80}, test_seq{0xc2, 0x80})
-- No UTF-8 sequence
pcc = to_intp()
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0xc0}), pcc, 2))
eq(0, pcc[0])
eq({'', 0xc2}, test_seq{0xc2, 0xc0})
end)
itp('3-byte sequences', function()
local pcc = to_intp()
-- No second UTF-8 character
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80, 0x80}), pcc, 3))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0x80, 0x80})
-- No combining character
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0x80}), pcc, 3))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0xc2, 0x80})
-- Combining character is U+0300
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80}), pcc, 3))
eq(0x0300, pcc[0])
eq(0x0000, pcc[1])
eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80})
-- No UTF-8 sequence
pcc = to_intp()
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc}), pcc, 3))
eq(0, pcc[0])
eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc})
-- Incomplete combining character
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc}), pcc, 3))
eq(0, pcc[0])
eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc})
-- One UTF-8 character
pcc = to_intp()
eq(0x20d0, mbyte.utfc_ptr2char_len(to_string({0xe2, 0x83, 0x90}), pcc, 3))
eq(0, pcc[0])
-- One UTF-8 character (composing only)
eq({" \xe2\x83\x90", 0x20d0}, test_seq{0xe2, 0x83, 0x90})
end)
itp('4-byte sequences', function()
local pcc = to_intp()
-- No following combining character
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80}), pcc, 4))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80})
-- No second UTF-8 character
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80}), pcc, 4))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80})
-- Combining character U+0300
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 4))
eq(0x0300, pcc[0])
eq(0x0000, pcc[1])
eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc})
-- No UTF-8 sequence
pcc = to_intp()
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80}), pcc, 4))
eq(0, pcc[0])
eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80})
-- No following UTF-8 character
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc}), pcc, 4))
eq(0, pcc[0])
eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc})
-- Combining character U+0301
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81}), pcc, 4))
eq(0x0301, pcc[0])
eq(0x0000, pcc[1])
eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81})
-- One UTF-8 character
pcc = to_intp()
eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80}), pcc, 4))
eq(0, pcc[0])
eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80})
end)
itp('5+-byte sequences', function()
local pcc = to_intp()
-- No following combining character
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80, 0x80})
-- No second UTF-8 character
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80, 0x80}), pcc, 5))
eq(0, pcc[0])
eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80, 0x80})
-- Combining character U+0300
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 5))
eq(0x0300, pcc[0])
eq(0x0000, pcc[1])
eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x00})
-- Combining characters U+0300 and U+0301
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81}), pcc, 5))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0000, pcc[2])
eq({"\x7f\xcc\x80\xcc\x81", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81})
-- Combining characters U+0300, U+0301, U+0302
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}), pcc, 7))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0000, pcc[3])
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82})
-- Combining characters U+0300, U+0301, U+0302, U+0303
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}), pcc, 9))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0303, pcc[3])
eq(0x0000, pcc[4])
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83})
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}), pcc, 11))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0303, pcc[3])
eq(0x0304, pcc[4])
eq(0x0000, pcc[5])
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
-- U+0305
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0303, pcc[3])
eq(0x0304, pcc[4])
eq(0x0305, pcc[5])
eq(1, pcc[6])
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84})
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85})
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
-- U+0305, U+0306, but only save six (= MAX_MCO).
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86}), pcc, 15))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0303, pcc[3])
eq(0x0304, pcc[4])
eq(0x0305, pcc[5])
eq(0x0001, pcc[6])
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86})
-- Only three following combining characters U+0300, U+0301, U+0302
pcc = to_intp()
eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13))
eq(0x0300, pcc[0])
eq(0x0301, pcc[1])
eq(0x0302, pcc[2])
eq(0x0000, pcc[3])
eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85})
-- No UTF-8 sequence
pcc = to_intp()
eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
eq(0, pcc[0])
eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80, 0x80})
-- No following UTF-8 character
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc, 0x80}), pcc, 5))
eq(0, pcc[0])
eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc, 0x80})
-- Combining character U+0301
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0x7f}), pcc, 5))
eq(0x0301, pcc[0])
eq(0x0000, pcc[1])
eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0x7f})
-- Combining character U+0301
pcc = to_intp()
eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0xcc}), pcc, 5))
eq(0x0301, pcc[0])
eq(0x0000, pcc[1])
eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0xcc})
-- One UTF-8 character
pcc = to_intp()
eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x7f}), pcc, 5))
eq(0, pcc[0])
eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x7f})
-- One UTF-8 character
pcc = to_intp()
eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x80}), pcc, 5))
eq(0, pcc[0])
eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x80})
-- One UTF-8 character
pcc = to_intp()
eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0xcc}), pcc, 5))
eq(0, pcc[0])
eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xcc})
-- Combining characters U+1AB0 and U+0301
pcc = to_intp()
eq(0x100000, mbyte.utfc_ptr2char_len(to_string(
{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}), pcc, 9))
eq(0x1ab0, pcc[0])
eq(0x0301, pcc[1])
eq(0x0000, pcc[2])
eq({"\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81})
end)
end)