mirror of
https://github.com/neovim/neovim.git
synced 2025-09-05 19:08:15 +00:00
vim-patch:9.1.1276: inline word diff treats multibyte chars as word char (#33323)
Problem: inline word diff treats multibyte chars as word char
(after 9.1.1243)
Solution: treat all non-alphanumeric characters as non-word characters
(Yee Cheng Chin)
Previously inline word diff simply used Vim's definition of keyword to
determine what is a word, which leads to multi-byte character classes
such as emojis and CJK (Chinese/Japanese/Korean) characters all
classifying as word characters, leading to entire sentences being
grouped as a single word which does not provide meaningful information
in a diff highlight.
Fix this by treating all non-alphanumeric characters (with class number
above 2) as non-word characters, as there is usually no benefit in using
word diff on them. These include CJK characters, emojis, and also
subscript/superscript numbers. Meanwhile, multi-byte characters like
Cyrillic and Greek letters will still continue to be considered as words.
Note that this is slightly inconsistent with how words are defined
elsewhere, as Vim usually considers any character with class >=2 to be
a "word".
related: vim/vim#16881 (diff inline highlight)
closes: vim/vim#17050
9aa120f7ad
Co-authored-by: Yee Cheng Chin <ychin.git@gmail.com>
This commit is contained in:
@@ -2132,7 +2132,10 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
difference.
|
||||
word Use internal diff to perform a
|
||||
|word|-wise diff and highlight the
|
||||
difference.
|
||||
difference. Non-alphanumeric
|
||||
multi-byte characters such as emoji
|
||||
and CJK characters are considered
|
||||
individual words.
|
||||
|
||||
internal Use the internal diff library. This is
|
||||
ignored when 'diffexpr' is set. *E960*
|
||||
|
5
runtime/lua/vim/_meta/options.lua
generated
5
runtime/lua/vim/_meta/options.lua
generated
@@ -1729,7 +1729,10 @@ vim.go.dex = vim.go.diffexpr
|
||||
--- difference.
|
||||
--- word Use internal diff to perform a
|
||||
--- `word`-wise diff and highlight the
|
||||
--- difference.
|
||||
--- difference. Non-alphanumeric
|
||||
--- multi-byte characters such as emoji
|
||||
--- and CJK characters are considered
|
||||
--- individual words.
|
||||
---
|
||||
--- internal Use the internal diff library. This is
|
||||
--- ignored when 'diffexpr' is set. *E960*
|
||||
|
@@ -2990,10 +2990,15 @@ static void diff_find_change_inline_diff(diff_T *dp)
|
||||
|
||||
char *s = curline;
|
||||
while (*s != NUL) {
|
||||
// Always use the first buffer's 'iskeyword' to have a consistent diff
|
||||
bool new_in_keyword = false;
|
||||
if (diff_flags & DIFF_INLINE_WORD) {
|
||||
new_in_keyword = vim_iswordp_buf(s, curtab->tp_diffbuf[file1_idx]);
|
||||
// Always use the first buffer's 'iskeyword' to have a
|
||||
// consistent diff.
|
||||
// For multibyte chars, only treat alphanumeric chars
|
||||
// (class 2) as "word", as other classes such as emojis and
|
||||
// CJK ideographs do not usually benefit from word diff as
|
||||
// Vim doesn't have a good way to segment them.
|
||||
new_in_keyword = (mb_get_class_tab(s, curtab->tp_diffbuf[file1_idx]->b_chartab) == 2);
|
||||
}
|
||||
if (in_keyword && !new_in_keyword) {
|
||||
ga_append(curstr, NL);
|
||||
|
@@ -417,11 +417,11 @@ void remove_bom(char *s)
|
||||
}
|
||||
}
|
||||
|
||||
// Get class of pointer:
|
||||
// 0 for blank or NUL
|
||||
// 1 for punctuation
|
||||
// 2 for an (ASCII) word character
|
||||
// >2 for other word characters
|
||||
/// Get class of pointer:
|
||||
/// 0 for blank or NUL
|
||||
/// 1 for punctuation
|
||||
/// 2 for an alphanumeric word character
|
||||
/// >2 for other word characters, including CJK and emoji
|
||||
int mb_get_class(const char *p)
|
||||
FUNC_ATTR_PURE
|
||||
{
|
||||
|
@@ -2286,7 +2286,10 @@ local options = {
|
||||
difference.
|
||||
word Use internal diff to perform a
|
||||
|word|-wise diff and highlight the
|
||||
difference.
|
||||
difference. Non-alphanumeric
|
||||
multi-byte characters such as emoji
|
||||
and CJK characters are considered
|
||||
individual words.
|
||||
|
||||
internal Use the internal diff library. This is
|
||||
ignored when 'diffexpr' is set. *E960*
|
||||
|
@@ -2485,6 +2485,22 @@ it('diff mode inline highlighting', function()
|
||||
|
||||
command('windo set iskeyword& | 1wincmd w')
|
||||
|
||||
screen:try_resize(75, 20)
|
||||
command('wincmd =')
|
||||
-- word diff: test handling of multi-byte characters. Only alphanumeric chars
|
||||
-- (e.g. Greek alphabet, but not CJK/emoji) count as words.
|
||||
WriteDiffFiles(
|
||||
'🚀⛵️一二三ひらがなΔέλτα Δelta foobar',
|
||||
'🚀🛸一二四ひらなδέλτα δelta foobar'
|
||||
)
|
||||
command('set diffopt=internal,filler diffopt+=inline:word')
|
||||
screen:expect([[
|
||||
{7: }{4:^🚀}{27:⛵️}{4:一二}{27:三}{4:ひら}{100:が}{4:な}{27:Δέλτα}{4: }{27:Δelta}{4: fooba}│{7: }{4:🚀}{27:🛸}{4:一二}{27:四}{4:ひらな}{27:δέλτα}{4: }{27:δelta}{4: foobar }|
|
||||
{1:~ }│{1:~ }|*17
|
||||
{3:Xdifile1 }{2:Xdifile2 }|
|
||||
|
|
||||
]])
|
||||
|
||||
screen:try_resize(69, 20)
|
||||
command('wincmd =')
|
||||
-- char diff: should slide highlight to whitespace boundary if possible for
|
||||
|
@@ -2193,6 +2193,11 @@ func Test_diff_inline()
|
||||
call term_sendkeys(buf, ":set iskeyword+=+\<CR>:wincmd w\<CR>:diffupdate\<CR>")
|
||||
" Use the previous screen dump as 2nd buffer's iskeyword does not matter
|
||||
call VerifyInternal(buf, "Test_diff_inline_word_01", " diffopt+=inline:word")
|
||||
|
||||
call term_sendkeys(buf, ":windo set iskeyword&\<CR>:1wincmd w\<CR>")
|
||||
|
||||
" word diff: test handling of multi-byte characters. Only alphanumeric chars
|
||||
" (e.g. Greek alphabet, but not CJK/emoji) count as words.
|
||||
call WriteDiffFiles(buf, ["🚀⛵️一二三ひらがなΔέλτα Δelta foobar"], ["🚀🛸一二四ひらなδέλτα δelta foobar"])
|
||||
call VerifyInternal(buf, "Test_diff_inline_word_03", " diffopt+=inline:word")
|
||||
|
||||
|
Reference in New Issue
Block a user