mirror of
https://github.com/neovim/neovim.git
synced 2025-12-10 08:32:42 +00:00
Merge pull request #30014 from bfredl/neoemoji
support emojis with ZWJ and variant selectors
This commit is contained in:
@@ -646,6 +646,12 @@ widespread as file format.
|
|||||||
A composing or combining character is used to change the meaning of the
|
A composing or combining character is used to change the meaning of the
|
||||||
character before it. The combining characters are drawn on top of the
|
character before it. The combining characters are drawn on top of the
|
||||||
preceding character.
|
preceding character.
|
||||||
|
|
||||||
|
Nvim largely follows the definition of extended grapheme clusters in UAX#29
|
||||||
|
in the Unicode standard, with some modifications: An ASCII char will always
|
||||||
|
start a new cluster. In addition 'arabicshape' enables the combining of some
|
||||||
|
Arabic letters, when they are shaped to be displayed together in a single cell.
|
||||||
|
|
||||||
Too big combined characters cannot be displayed, but they can still be
|
Too big combined characters cannot be displayed, but they can still be
|
||||||
inspected using the |g8| and |ga| commands described below.
|
inspected using the |g8| and |ga| commands described below.
|
||||||
When editing text a composing character is mostly considered part of the
|
When editing text a composing character is mostly considered part of the
|
||||||
|
|||||||
@@ -200,6 +200,12 @@ These existing features changed their behavior.
|
|||||||
top lines are calculated using screen line numbers which take virtual lines
|
top lines are calculated using screen line numbers which take virtual lines
|
||||||
into account.
|
into account.
|
||||||
|
|
||||||
|
• The implementation of grapheme clusters (or combining chars |mbyte-combining|)
|
||||||
|
was upgraded to closely follow extended grapheme clusters as defined by UAX#29
|
||||||
|
in the Unicode standard. Notably, this enables proper display of many
|
||||||
|
more emoji characters than before, including those encoded with multiple
|
||||||
|
emoji codepoints combined with ZWJ (zero width joiner) codepoints.
|
||||||
|
|
||||||
==============================================================================
|
==============================================================================
|
||||||
REMOVED FEATURES *news-removed*
|
REMOVED FEATURES *news-removed*
|
||||||
|
|
||||||
|
|||||||
@@ -2217,9 +2217,12 @@ A jump table for the options with a short description can be found at |Q_op|.
|
|||||||
global
|
global
|
||||||
When on all Unicode emoji characters are considered to be full width.
|
When on all Unicode emoji characters are considered to be full width.
|
||||||
This excludes "text emoji" characters, which are normally displayed as
|
This excludes "text emoji" characters, which are normally displayed as
|
||||||
single width. Unfortunately there is no good specification for this
|
single width. However, such "text emoji" are treated as full-width
|
||||||
and it has been determined on trial-and-error basis. Use the
|
emoji if they are followed by the U+FE0F variant selector.
|
||||||
|setcellwidths()| function to change the behavior.
|
|
||||||
|
Unfortunately there is no good specification for this and it has been
|
||||||
|
determined on a trial-and-error basis. Use the |setcellwidths()|
|
||||||
|
function to change the behavior.
|
||||||
|
|
||||||
*'encoding'* *'enc'*
|
*'encoding'* *'enc'*
|
||||||
'encoding' 'enc' string (default "utf-8")
|
'encoding' 'enc' string (default "utf-8")
|
||||||
|
|||||||
9
runtime/lua/vim/_meta/options.lua
generated
9
runtime/lua/vim/_meta/options.lua
generated
@@ -1829,9 +1829,12 @@ vim.go.ead = vim.go.eadirection
|
|||||||
|
|
||||||
--- When on all Unicode emoji characters are considered to be full width.
|
--- When on all Unicode emoji characters are considered to be full width.
|
||||||
--- This excludes "text emoji" characters, which are normally displayed as
|
--- This excludes "text emoji" characters, which are normally displayed as
|
||||||
--- single width. Unfortunately there is no good specification for this
|
--- single width. However, such "text emoji" are treated as full-width
|
||||||
--- and it has been determined on trial-and-error basis. Use the
|
--- emoji if they are followed by the U+FE0F variant selector.
|
||||||
--- `setcellwidths()` function to change the behavior.
|
---
|
||||||
|
--- Unfortunately there is no good specification for this and it has been
|
||||||
|
--- determined on a trial-and-error basis. Use the `setcellwidths()`
|
||||||
|
--- function to change the behavior.
|
||||||
---
|
---
|
||||||
--- @type boolean
|
--- @type boolean
|
||||||
vim.o.emoji = true
|
vim.o.emoji = true
|
||||||
|
|||||||
@@ -571,7 +571,7 @@ Integer nvim_buf_set_extmark(Buffer buffer, Integer ns_id, Integer line, Integer
|
|||||||
String c = opts->conceal;
|
String c = opts->conceal;
|
||||||
if (c.size > 0) {
|
if (c.size > 0) {
|
||||||
int ch;
|
int ch;
|
||||||
hl.conceal_char = utfc_ptr2schar_len(c.data, (int)c.size, &ch);
|
hl.conceal_char = utfc_ptr2schar(c.data, &ch);
|
||||||
if (!hl.conceal_char || !vim_isprintc(ch)) {
|
if (!hl.conceal_char || !vim_isprintc(ch)) {
|
||||||
api_set_error(err, kErrorTypeValidation, "conceal char has to be printable");
|
api_set_error(err, kErrorTypeValidation, "conceal char has to be printable");
|
||||||
goto error;
|
goto error;
|
||||||
|
|||||||
@@ -847,7 +847,7 @@ void remote_ui_raw_line(RemoteUI *ui, Integer grid, Integer row, Integer startco
|
|||||||
char sc_buf[MAX_SCHAR_SIZE];
|
char sc_buf[MAX_SCHAR_SIZE];
|
||||||
schar_get(sc_buf, chunk[i]);
|
schar_get(sc_buf, chunk[i]);
|
||||||
remote_ui_put(ui, sc_buf);
|
remote_ui_put(ui, sc_buf);
|
||||||
if (utf_ambiguous_width(utf_ptr2char(sc_buf))) {
|
if (utf_ambiguous_width(sc_buf)) {
|
||||||
ui->client_col = -1; // force cursor update
|
ui->client_col = -1; // force cursor update
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -896,14 +896,15 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
|
|||||||
// delete the last combining character.
|
// delete the last combining character.
|
||||||
if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) {
|
if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) {
|
||||||
char *p0 = oldp + col;
|
char *p0 = oldp + col;
|
||||||
if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) {
|
GraphemeState state = GRAPHEME_STATE_INIT;
|
||||||
|
if (utf_composinglike(p0, p0 + utf_ptr2len(p0), &state)) {
|
||||||
// Find the last composing char, there can be several.
|
// Find the last composing char, there can be several.
|
||||||
int n = col;
|
int n = col;
|
||||||
do {
|
do {
|
||||||
col = n;
|
col = n;
|
||||||
count = utf_ptr2len(oldp + n);
|
count = utf_ptr2len(oldp + n);
|
||||||
n += count;
|
n += count;
|
||||||
} while (utf_composinglike(oldp + col, oldp + n));
|
} while (utf_composinglike(oldp + col, oldp + n, &state));
|
||||||
fixpos = false;
|
fixpos = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1694,7 +1695,7 @@ bool open_line(int dir, int flags, int second_line_indent, bool *did_do_comment)
|
|||||||
}
|
}
|
||||||
if (curbuf->b_p_ai || (flags & OPENLINE_DELSPACES)) {
|
if (curbuf->b_p_ai || (flags & OPENLINE_DELSPACES)) {
|
||||||
while ((*p_extra == ' ' || *p_extra == '\t')
|
while ((*p_extra == ' ' || *p_extra == '\t')
|
||||||
&& !utf_iscomposing(utf_ptr2char(p_extra + 1))) {
|
&& !utf_iscomposing_first(utf_ptr2char(p_extra + 1))) {
|
||||||
if (REPLACE_NORMAL(State)) {
|
if (REPLACE_NORMAL(State)) {
|
||||||
replace_push(*p_extra);
|
replace_push(*p_extra);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1865,7 +1865,7 @@ static void printdigraph(const digr_T *dp, result_T *previous)
|
|||||||
p = buf;
|
p = buf;
|
||||||
|
|
||||||
// add a space to draw a composing char on
|
// add a space to draw a composing char on
|
||||||
if (utf_iscomposing(dp->result)) {
|
if (utf_iscomposing_first(dp->result)) {
|
||||||
*p++ = ' ';
|
*p++ = ' ';
|
||||||
}
|
}
|
||||||
p += utf_char2bytes(dp->result, p);
|
p += utf_char2bytes(dp->result, p);
|
||||||
|
|||||||
@@ -1826,7 +1826,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
|
|||||||
|
|
||||||
// If a double-width char doesn't fit display a '>' in the last column.
|
// If a double-width char doesn't fit display a '>' in the last column.
|
||||||
// Don't advance the pointer but put the character at the start of the next line.
|
// Don't advance the pointer but put the character at the start of the next line.
|
||||||
if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
|
if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) {
|
||||||
mb_c = '>';
|
mb_c = '>';
|
||||||
mb_l = 1;
|
mb_l = 1;
|
||||||
(void)mb_l;
|
(void)mb_l;
|
||||||
@@ -1922,7 +1922,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
|
|||||||
// If a double-width char doesn't fit display a '>' in the
|
// If a double-width char doesn't fit display a '>' in the
|
||||||
// last column; the character is displayed at the start of the
|
// last column; the character is displayed at the start of the
|
||||||
// next line.
|
// next line.
|
||||||
if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
|
if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) {
|
||||||
mb_schar = schar_from_ascii('>');
|
mb_schar = schar_from_ascii('>');
|
||||||
mb_c = '>';
|
mb_c = '>';
|
||||||
mb_l = 1;
|
mb_l = 1;
|
||||||
@@ -2393,6 +2393,12 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
|
|||||||
|| (decor_conceal && decor_state.conceal_char)
|
|| (decor_conceal && decor_state.conceal_char)
|
||||||
|| wp->w_p_cole == 1)
|
|| wp->w_p_cole == 1)
|
||||||
&& wp->w_p_cole != 3) {
|
&& wp->w_p_cole != 3) {
|
||||||
|
if (schar_cells(mb_schar) > 1) {
|
||||||
|
// When the first char to be concealed is double-width,
|
||||||
|
// need to advance one more virtual column.
|
||||||
|
wlv.n_extra++;
|
||||||
|
}
|
||||||
|
|
||||||
// First time at this concealed item: display one
|
// First time at this concealed item: display one
|
||||||
// character.
|
// character.
|
||||||
if (has_match_conc && match_conc) {
|
if (has_match_conc && match_conc) {
|
||||||
@@ -2410,12 +2416,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
|
|||||||
mb_schar = schar_from_ascii(' ');
|
mb_schar = schar_from_ascii(' ');
|
||||||
}
|
}
|
||||||
|
|
||||||
if (utf_char2cells(mb_c) > 1) {
|
|
||||||
// When the first char to be concealed is double-width,
|
|
||||||
// need to advance one more virtual column.
|
|
||||||
wlv.n_extra++;
|
|
||||||
}
|
|
||||||
|
|
||||||
mb_c = schar_get_first_codepoint(mb_schar);
|
mb_c = schar_get_first_codepoint(mb_schar);
|
||||||
|
|
||||||
prev_syntax_id = syntax_seqnr;
|
prev_syntax_id = syntax_seqnr;
|
||||||
@@ -2484,7 +2484,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
|
|||||||
&& mb_schar != NUL) {
|
&& mb_schar != NUL) {
|
||||||
mb_schar = wp->w_p_lcs_chars.prec;
|
mb_schar = wp->w_p_lcs_chars.prec;
|
||||||
lcs_prec_todo = NUL;
|
lcs_prec_todo = NUL;
|
||||||
if (utf_char2cells(mb_c) > 1) {
|
if (schar_cells(mb_schar) > 1) {
|
||||||
// Double-width character being overwritten by the "precedes"
|
// Double-width character being overwritten by the "precedes"
|
||||||
// character, need to fill up half the character.
|
// character, need to fill up half the character.
|
||||||
wlv.sc_extra = schar_from_ascii(MB_FILLER_CHAR);
|
wlv.sc_extra = schar_from_ascii(MB_FILLER_CHAR);
|
||||||
@@ -2725,7 +2725,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
|
|||||||
|
|
||||||
linebuf_vcol[wlv.off] = wlv.vcol;
|
linebuf_vcol[wlv.off] = wlv.vcol;
|
||||||
|
|
||||||
if (utf_char2cells(mb_c) > 1) {
|
if (schar_cells(mb_schar) > 1) {
|
||||||
// Need to fill two screen columns.
|
// Need to fill two screen columns.
|
||||||
wlv.off++;
|
wlv.off++;
|
||||||
wlv.col++;
|
wlv.col++;
|
||||||
@@ -2744,7 +2744,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
|
|||||||
wlv.off++;
|
wlv.off++;
|
||||||
wlv.col++;
|
wlv.col++;
|
||||||
} else if (wp->w_p_cole > 0 && is_concealing) {
|
} else if (wp->w_p_cole > 0 && is_concealing) {
|
||||||
bool concealed_wide = utf_char2cells(mb_c) > 1;
|
bool concealed_wide = schar_cells(mb_schar) > 1;
|
||||||
|
|
||||||
wlv.skip_cells--;
|
wlv.skip_cells--;
|
||||||
wlv.vcol_off_co++;
|
wlv.vcol_off_co++;
|
||||||
|
|||||||
@@ -2832,6 +2832,8 @@ int replace_push_mb(char *p)
|
|||||||
{
|
{
|
||||||
int l = utfc_ptr2len(p);
|
int l = utfc_ptr2len(p);
|
||||||
|
|
||||||
|
// TODO(bfredl): stop doing this insanity and instead use utf_head_off() when popping.
|
||||||
|
// or just keep a secondary array with char byte lengths
|
||||||
for (int j = l - 1; j >= 0; j--) {
|
for (int j = l - 1; j >= 0; j--) {
|
||||||
replace_push(p[j]);
|
replace_push(p[j]);
|
||||||
}
|
}
|
||||||
@@ -2911,7 +2913,9 @@ static void mb_replace_pop_ins(int cc)
|
|||||||
for (int i = 1; i < n; i++) {
|
for (int i = 1; i < n; i++) {
|
||||||
buf[i] = (uint8_t)replace_pop();
|
buf[i] = (uint8_t)replace_pop();
|
||||||
}
|
}
|
||||||
if (utf_iscomposing(utf_ptr2char((char *)buf))) {
|
// TODO(bfredl): by fixing replace_push_mb, upgrade to use
|
||||||
|
// the new composing algorithm
|
||||||
|
if (utf_iscomposing_legacy(utf_ptr2char((char *)buf))) {
|
||||||
ins_bytes_len((char *)buf, (size_t)n);
|
ins_bytes_len((char *)buf, (size_t)n);
|
||||||
} else {
|
} else {
|
||||||
// Not a composing char, put it back.
|
// Not a composing char, put it back.
|
||||||
@@ -3843,7 +3847,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
|
|||||||
space_sci = sci;
|
space_sci = sci;
|
||||||
space_vcol = vcol;
|
space_vcol = vcol;
|
||||||
}
|
}
|
||||||
vcol += charsize_nowrap(curbuf, use_ts, vcol, sci.chr.value);
|
vcol += charsize_nowrap(curbuf, sci.ptr, use_ts, vcol, sci.chr.value);
|
||||||
sci = utfc_next(sci);
|
sci = utfc_next(sci);
|
||||||
prev_space = cur_space;
|
prev_space = cur_space;
|
||||||
}
|
}
|
||||||
@@ -3859,7 +3863,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
|
|||||||
// Find the position to stop backspacing.
|
// Find the position to stop backspacing.
|
||||||
// Use charsize_nowrap() so that virtual text and wrapping are ignored.
|
// Use charsize_nowrap() so that virtual text and wrapping are ignored.
|
||||||
while (true) {
|
while (true) {
|
||||||
int size = charsize_nowrap(curbuf, use_ts, space_vcol, space_sci.chr.value);
|
int size = charsize_nowrap(curbuf, space_sci.ptr, use_ts, space_vcol, space_sci.chr.value);
|
||||||
if (space_vcol + size > want_vcol) {
|
if (space_vcol + size > want_vcol) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -3930,7 +3934,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
|
|||||||
bool has_composing = false;
|
bool has_composing = false;
|
||||||
if (p_deco) {
|
if (p_deco) {
|
||||||
char *p0 = get_cursor_pos_ptr();
|
char *p0 = get_cursor_pos_ptr();
|
||||||
has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0));
|
has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0), NULL);
|
||||||
}
|
}
|
||||||
del_char(false);
|
del_char(false);
|
||||||
// If there are combining characters and 'delcombine' is set
|
// If there are combining characters and 'delcombine' is set
|
||||||
|
|||||||
@@ -204,7 +204,7 @@ void do_ascii(exarg_T *eap)
|
|||||||
IObuff[iobuff_len++] = ' ';
|
IObuff[iobuff_len++] = ' ';
|
||||||
}
|
}
|
||||||
IObuff[iobuff_len++] = '<';
|
IObuff[iobuff_len++] = '<';
|
||||||
if (utf_iscomposing(c)) {
|
if (utf_iscomposing_first(c)) {
|
||||||
IObuff[iobuff_len++] = ' '; // Draw composing char on top of a space.
|
IObuff[iobuff_len++] = ' '; // Draw composing char on top of a space.
|
||||||
}
|
}
|
||||||
iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len);
|
iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len);
|
||||||
|
|||||||
@@ -2118,7 +2118,7 @@ static int command_line_handle_key(CommandLineState *s)
|
|||||||
s->do_abbr = false; // don't do abbreviation now
|
s->do_abbr = false; // don't do abbreviation now
|
||||||
ccline.special_char = NUL;
|
ccline.special_char = NUL;
|
||||||
// may need to remove ^ when composing char was typed
|
// may need to remove ^ when composing char was typed
|
||||||
if (utf_iscomposing(s->c) && !cmd_silent) {
|
if (utf_iscomposing_first(s->c) && !cmd_silent) {
|
||||||
if (ui_has(kUICmdline)) {
|
if (ui_has(kUICmdline)) {
|
||||||
// TODO(bfredl): why not make unputcmdline also work with true?
|
// TODO(bfredl): why not make unputcmdline also work with true?
|
||||||
unputcmdline();
|
unputcmdline();
|
||||||
@@ -3585,7 +3585,9 @@ void put_on_cmdline(const char *str, int len, bool redraw)
|
|||||||
// backup to the character before it. There could be two of them.
|
// backup to the character before it. There could be two of them.
|
||||||
int i = 0;
|
int i = 0;
|
||||||
int c = utf_ptr2char(ccline.cmdbuff + ccline.cmdpos);
|
int c = utf_ptr2char(ccline.cmdbuff + ccline.cmdpos);
|
||||||
while (ccline.cmdpos > 0 && utf_iscomposing(c)) {
|
// TODO(bfredl): this can be corrected/simplified as utf_head_off implements the
|
||||||
|
// correct grapheme cluster breaks
|
||||||
|
while (ccline.cmdpos > 0 && utf_iscomposing_legacy(c)) {
|
||||||
i = utf_head_off(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos - 1) + 1;
|
i = utf_head_off(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos - 1) + 1;
|
||||||
ccline.cmdpos -= i;
|
ccline.cmdpos -= i;
|
||||||
len += i;
|
len += i;
|
||||||
|
|||||||
@@ -186,6 +186,24 @@ size_t schar_len(schar_T sc)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int schar_cells(schar_T sc)
|
||||||
|
{
|
||||||
|
// hot path
|
||||||
|
#ifdef ORDER_BIG_ENDIAN
|
||||||
|
if (!(sc & 0x80FFFFFF)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if (sc < 0x80) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
char sc_buf[MAX_SCHAR_SIZE];
|
||||||
|
schar_get(sc_buf, sc);
|
||||||
|
return utf_ptr2cells(sc_buf);
|
||||||
|
}
|
||||||
|
|
||||||
/// gets first raw UTF-8 byte of an schar
|
/// gets first raw UTF-8 byte of an schar
|
||||||
static char schar_get_first_byte(schar_T sc)
|
static char schar_get_first_byte(schar_T sc)
|
||||||
{
|
{
|
||||||
@@ -428,14 +446,19 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
|
|||||||
const int max_col = grid_line_maxcol;
|
const int max_col = grid_line_maxcol;
|
||||||
while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) {
|
while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) {
|
||||||
// check if this is the first byte of a multibyte
|
// check if this is the first byte of a multibyte
|
||||||
int mbyte_blen = len > 0
|
int mbyte_blen;
|
||||||
? utfc_ptr2len_len(ptr, (int)((text + len) - ptr))
|
if (len >= 0) {
|
||||||
: utfc_ptr2len(ptr);
|
int maxlen = (int)((text + len) - ptr);
|
||||||
|
mbyte_blen = utfc_ptr2len_len(ptr, maxlen);
|
||||||
|
if (mbyte_blen > maxlen) {
|
||||||
|
mbyte_blen = 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
mbyte_blen = utfc_ptr2len(ptr);
|
||||||
|
}
|
||||||
int firstc;
|
int firstc;
|
||||||
schar_T schar = len >= 0
|
schar_T schar = utfc_ptrlen2schar(ptr, mbyte_blen, &firstc);
|
||||||
? utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc)
|
int mbyte_cells = utf_ptr2cells_len(ptr, mbyte_blen);
|
||||||
: utfc_ptr2schar(ptr, &firstc);
|
|
||||||
int mbyte_cells = utf_char2cells(firstc);
|
|
||||||
if (mbyte_cells > 2 || schar == 0) {
|
if (mbyte_cells > 2 || schar == 0) {
|
||||||
mbyte_cells = 1;
|
mbyte_cells = 1;
|
||||||
schar = schar_from_char(0xFFFD);
|
schar = schar_from_char(0xFFFD);
|
||||||
|
|||||||
288
src/nvim/mbyte.c
288
src/nvim/mbyte.c
@@ -511,20 +511,30 @@ int utf_char2cells(int c)
|
|||||||
|
|
||||||
/// Return the number of display cells character at "*p" occupies.
|
/// Return the number of display cells character at "*p" occupies.
|
||||||
/// This doesn't take care of unprintable characters, use ptr2cells() for that.
|
/// This doesn't take care of unprintable characters, use ptr2cells() for that.
|
||||||
int utf_ptr2cells(const char *p)
|
int utf_ptr2cells(const char *p_in)
|
||||||
{
|
{
|
||||||
|
const uint8_t *p = (const uint8_t *)p_in;
|
||||||
// Need to convert to a character number.
|
// Need to convert to a character number.
|
||||||
if ((uint8_t)(*p) >= 0x80) {
|
if ((*p) >= 0x80) {
|
||||||
int c = utf_ptr2char(p);
|
int len = utf8len_tab[*p];
|
||||||
|
int32_t c = utf_ptr2CharInfo_impl(p, (uintptr_t)len);
|
||||||
// An illegal byte is displayed as <xx>.
|
// An illegal byte is displayed as <xx>.
|
||||||
if (utf_ptr2len(p) == 1 || c == NUL) {
|
if (c <= 0) {
|
||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
// If the char is ASCII it must be an overlong sequence.
|
// If the char is ASCII it must be an overlong sequence.
|
||||||
if (c < 0x80) {
|
if (c < 0x80) {
|
||||||
return char2cells(c);
|
return char2cells(c);
|
||||||
}
|
}
|
||||||
return utf_char2cells(c);
|
int cells = utf_char2cells(c);
|
||||||
|
if (cells == 1 && p_emoji
|
||||||
|
&& intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
|
||||||
|
int c2 = utf_ptr2char(p_in + len);
|
||||||
|
if (c2 == 0xFE0F) {
|
||||||
|
return 2; // emoji presentation
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return cells;
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@@ -603,7 +613,8 @@ int utf_ptr2cells_len(const char *p, int size)
|
|||||||
{
|
{
|
||||||
// Need to convert to a wide character.
|
// Need to convert to a wide character.
|
||||||
if (size > 0 && (uint8_t)(*p) >= 0x80) {
|
if (size > 0 && (uint8_t)(*p) >= 0x80) {
|
||||||
if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) {
|
int len = utf_ptr2len_len(p, size);
|
||||||
|
if (len < utf8len_tab[(uint8_t)(*p)]) {
|
||||||
return 1; // truncated
|
return 1; // truncated
|
||||||
}
|
}
|
||||||
int c = utf_ptr2char(p);
|
int c = utf_ptr2char(p);
|
||||||
@@ -615,7 +626,16 @@ int utf_ptr2cells_len(const char *p, int size)
|
|||||||
if (c < 0x80) {
|
if (c < 0x80) {
|
||||||
return char2cells(c);
|
return char2cells(c);
|
||||||
}
|
}
|
||||||
return utf_char2cells(c);
|
int cells = utf_char2cells(c);
|
||||||
|
if (cells == 1 && p_emoji && size > len
|
||||||
|
&& intable(emoji_all, ARRAY_SIZE(emoji_all), c)
|
||||||
|
&& utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) {
|
||||||
|
int c2 = utf_ptr2char(p + len);
|
||||||
|
if (c2 == 0xFE0F) {
|
||||||
|
return 2; // emoji presentation
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return cells;
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@@ -648,8 +668,8 @@ size_t mb_string2cells_len(const char *str, size_t size)
|
|||||||
size_t clen = 0;
|
size_t clen = 0;
|
||||||
|
|
||||||
for (const char *p = str; *p != NUL && p < str + size;
|
for (const char *p = str; *p != NUL && p < str + size;
|
||||||
p += utfc_ptr2len_len(p, (int)size + (int)(p - str))) {
|
p += utfc_ptr2len_len(p, (int)size - (int)(p - str))) {
|
||||||
clen += (size_t)utf_ptr2cells(p);
|
clen += (size_t)utf_ptr2cells_len(p, (int)size - (int)(p - str));
|
||||||
}
|
}
|
||||||
|
|
||||||
return clen;
|
return clen;
|
||||||
@@ -793,29 +813,48 @@ int mb_cptr2char_adv(const char **pp)
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check if the character pointed to by "p2" is a composing character when it
|
/// When "c" is the first char of a string, determine if it needs to be prefixed
|
||||||
/// comes after "p1". For Arabic sometimes "ab" is replaced with "c", which
|
/// by a space byte to be drawn correctly, and not merge with the space left of
|
||||||
/// behaves like a composing character.
|
/// the string.
|
||||||
bool utf_composinglike(const char *p1, const char *p2)
|
bool utf_iscomposing_first(int c)
|
||||||
{
|
{
|
||||||
int c2 = utf_ptr2char(p2);
|
return c >= 128 && !utf8proc_grapheme_break(' ', c);
|
||||||
if (utf_iscomposing(c2)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (!arabic_maycombine(c2)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return arabic_combine(utf_ptr2char(p1), c2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check if the next character is a composing character when it
|
/// Check if the character pointed to by "p2" is a composing character when it
|
||||||
/// comes after the first. For Arabic sometimes "ab" is replaced with "c", which
|
/// comes after "p1".
|
||||||
/// behaves like a composing character.
|
///
|
||||||
/// returns false for negative values
|
/// We use the definition in UAX#29 as implemented by utf8proc with the following
|
||||||
bool utf_char_composinglike(int32_t const first, int32_t const next)
|
/// exceptions:
|
||||||
FUNC_ATTR_PURE
|
///
|
||||||
|
/// - ASCII chars always begin a new cluster. This is a long assumed invariant
|
||||||
|
/// in the code base and very useful for performance (we can exit early for ASCII
|
||||||
|
/// all over the place, branch predictor go brrr in ASCII-only text).
|
||||||
|
/// As of Unicode 15.1 this will only break BOUNDCLASS_PREPEND followed by ASCII,
|
||||||
|
/// which should be exceedingly rare (these PREPEND chars are expected to be
|
||||||
|
/// followed by multibyte chars within the same script family)
|
||||||
|
///
|
||||||
|
/// - When 'arabicshape' is active, some pairs of Arabic letters "ab" are replaced with
|
||||||
|
/// "c" taking one single cell, which behaves like a cluster.
|
||||||
|
///
|
||||||
|
/// @param "state" should be set to GRAPHEME_STATE_INIT before first call
|
||||||
|
/// it is allowed to be null, but will then not handle some longer
|
||||||
|
/// sequences, like ZWJ based emoji
|
||||||
|
bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state)
|
||||||
|
FUNC_ATTR_NONNULL_ARG(1, 2)
|
||||||
{
|
{
|
||||||
return utf_iscomposing(next) || arabic_combine(first, next);
|
if ((uint8_t)(*p2) < 128) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int first = utf_ptr2char(p1);
|
||||||
|
int second = utf_ptr2char(p2);
|
||||||
|
|
||||||
|
if (!utf8proc_grapheme_break_stateful(first, second, state)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return arabic_combine(first, second);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the screen char at the beginning of a string
|
/// Get the screen char at the beginning of a string
|
||||||
@@ -834,7 +873,7 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
|
|||||||
{
|
{
|
||||||
int c = utf_ptr2char(p);
|
int c = utf_ptr2char(p);
|
||||||
*firstc = c; // NOT optional, you are gonna need it
|
*firstc = c; // NOT optional, you are gonna need it
|
||||||
bool first_compose = utf_iscomposing(c);
|
bool first_compose = utf_iscomposing_first(c);
|
||||||
size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
|
size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
|
||||||
size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
|
size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
|
||||||
|
|
||||||
@@ -845,16 +884,13 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
|
|||||||
return schar_from_buf_first(p, len, first_compose);
|
return schar_from_buf_first(p, len, first_compose);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the screen char at the beginning of a string with length
|
/// Get the screen char from a char with a known length
|
||||||
///
|
///
|
||||||
/// Like utfc_ptr2schar but use no more than p[maxlen].
|
/// Like utfc_ptr2schar but use no more than p[maxlen].
|
||||||
schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
|
schar_T utfc_ptrlen2schar(const char *p, int len, int *firstc)
|
||||||
FUNC_ATTR_NONNULL_ALL
|
FUNC_ATTR_NONNULL_ALL
|
||||||
{
|
{
|
||||||
assert(maxlen > 0);
|
if ((len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
|
||||||
|
|
||||||
size_t len = (size_t)utf_ptr2len_len(p, maxlen);
|
|
||||||
if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
|
|
||||||
// invalid or truncated sequence
|
// invalid or truncated sequence
|
||||||
*firstc = (uint8_t)(*p);
|
*firstc = (uint8_t)(*p);
|
||||||
return 0;
|
return 0;
|
||||||
@@ -862,11 +898,13 @@ schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
|
|||||||
|
|
||||||
int c = utf_ptr2char(p);
|
int c = utf_ptr2char(p);
|
||||||
*firstc = c;
|
*firstc = c;
|
||||||
bool first_compose = utf_iscomposing(c);
|
bool first_compose = utf_iscomposing_first(c);
|
||||||
maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
|
int maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
|
||||||
len = (size_t)utfc_ptr2len_len(p, maxlen);
|
if (len > maxlen) {
|
||||||
|
len = utfc_ptr2len_len(p, maxlen);
|
||||||
|
}
|
||||||
|
|
||||||
return schar_from_buf_first(p, len, first_compose);
|
return schar_from_buf_first(p, (size_t)len, first_compose);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Caller must ensure there is space for `first_compose`
|
/// Caller must ensure there is space for `first_compose`
|
||||||
@@ -964,8 +1002,9 @@ int utfc_ptr2len(const char *const p)
|
|||||||
|
|
||||||
// Check for composing characters.
|
// Check for composing characters.
|
||||||
int prevlen = 0;
|
int prevlen = 0;
|
||||||
|
GraphemeState state = GRAPHEME_STATE_INIT;
|
||||||
while (true) {
|
while (true) {
|
||||||
if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
|
if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len, &state)) {
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -996,9 +1035,10 @@ int utfc_ptr2len_len(const char *p, int size)
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check for composing characters. We can handle only the first six, but
|
// Check for composing characters. We can only display a limited amount, but
|
||||||
// skip all of them (otherwise the cursor would get stuck).
|
// skip all of them (otherwise the cursor would get stuck).
|
||||||
int prevlen = 0;
|
int prevlen = 0;
|
||||||
|
GraphemeState state = GRAPHEME_STATE_INIT;
|
||||||
while (len < size) {
|
while (len < size) {
|
||||||
if ((uint8_t)p[len] < 0x80) {
|
if ((uint8_t)p[len] < 0x80) {
|
||||||
break;
|
break;
|
||||||
@@ -1011,7 +1051,7 @@ int utfc_ptr2len_len(const char *p, int size)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!utf_composinglike(p + prevlen, p + len)) {
|
if (!utf_composinglike(p + prevlen, p + len, &state)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1084,11 +1124,18 @@ int utf_char2bytes(const int c, char *const buf)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return true if "c" is a composing UTF-8 character.
|
/// Return true if "c" is a legacy composing UTF-8 character.
|
||||||
/// This means it will be drawn on top of the preceding character.
|
///
|
||||||
|
/// This is deprecated in favour of utf_composinglike() which uses the modern
|
||||||
|
/// stateful algorithm to determine grapheme clusters. Still available
|
||||||
|
/// to support some legacy code which hasn't been refactored yet.
|
||||||
|
///
|
||||||
|
/// To check if a char would combine with a preceding space, use
|
||||||
|
/// utf_iscomposing_first() instead.
|
||||||
|
///
|
||||||
/// Based on code from Markus Kuhn.
|
/// Based on code from Markus Kuhn.
|
||||||
/// Returns false for negative values.
|
/// Returns false for negative values.
|
||||||
bool utf_iscomposing(int c)
|
bool utf_iscomposing_legacy(int c)
|
||||||
{
|
{
|
||||||
return intable(combining, ARRAY_SIZE(combining), c);
|
return intable(combining, ARRAY_SIZE(combining), c);
|
||||||
}
|
}
|
||||||
@@ -1278,8 +1325,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
|
|||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool utf_ambiguous_width(int c)
|
bool utf_ambiguous_width(const char *p)
|
||||||
{
|
{
|
||||||
|
int c = utf_ptr2char(p);
|
||||||
return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
|
return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
|
||||||
|| intable(emoji_all, ARRAY_SIZE(emoji_all), c));
|
|| intable(emoji_all, ARRAY_SIZE(emoji_all), c));
|
||||||
}
|
}
|
||||||
@@ -1666,6 +1714,26 @@ void show_utf8(void)
|
|||||||
msg(IObuff, 0);
|
msg(IObuff, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// @return true if boundclass bc always starts a new cluster regardless of what's before
|
||||||
|
/// false negatives are allowed (perf cost, not correctness)
|
||||||
|
static bool always_break(int bc)
|
||||||
|
{
|
||||||
|
return (bc == UTF8PROC_BOUNDCLASS_CONTROL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// @return true if bc2 always starts a cluster after bc1
|
||||||
|
/// false negatives are allowed (perf cost, not correctness)
|
||||||
|
static bool always_break_two(int bc1, int bc2)
|
||||||
|
{
|
||||||
|
// don't check for UTF8PROC_BOUNDCLASS_CONTROL for bc2 as it either has been checked by
|
||||||
|
// "always_break" on first iteration or when it was bc1 in the previous iteration
|
||||||
|
return ((bc1 != UTF8PROC_BOUNDCLASS_PREPEND && bc2 == UTF8PROC_BOUNDCLASS_OTHER)
|
||||||
|
|| (bc1 >= UTF8PROC_BOUNDCLASS_CR && bc1 <= UTF8PROC_BOUNDCLASS_CONTROL)
|
||||||
|
|| (bc2 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
|
||||||
|
&& (bc1 == UTF8PROC_BOUNDCLASS_OTHER
|
||||||
|
|| bc1 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC)));
|
||||||
|
}
|
||||||
|
|
||||||
/// Return offset from "p" to the start of a character, including composing characters.
|
/// Return offset from "p" to the start of a character, including composing characters.
|
||||||
/// "base" must be the start of the string, which must be NUL terminated.
|
/// "base" must be the start of the string, which must be NUL terminated.
|
||||||
/// If "p" points to the NUL at the end of the string return 0.
|
/// If "p" points to the NUL at the end of the string return 0.
|
||||||
@@ -1679,50 +1747,108 @@ int utf_head_off(const char *base_in, const char *p_in)
|
|||||||
const uint8_t *base = (uint8_t *)base_in;
|
const uint8_t *base = (uint8_t *)base_in;
|
||||||
const uint8_t *p = (uint8_t *)p_in;
|
const uint8_t *p = (uint8_t *)p_in;
|
||||||
|
|
||||||
// Skip backwards over trailing bytes: 10xx.xxxx
|
const uint8_t *start = p;
|
||||||
// Skip backwards again if on a composing char.
|
|
||||||
const uint8_t *q;
|
|
||||||
for (q = p;; q--) {
|
|
||||||
// Move s to the last byte of this char.
|
|
||||||
const uint8_t *s;
|
|
||||||
for (s = q; (s[1] & 0xc0) == 0x80; s++) {}
|
|
||||||
|
|
||||||
// Move q to the first byte of this char.
|
// move start to the first byte of this codepoint
|
||||||
while (q > base && (*q & 0xc0) == 0x80) {
|
// might stop on a continuation byte if overlong, handled by utf_ptr2CharInfo_impl
|
||||||
q--;
|
while (start > base && (*start & 0xc0) == 0x80 && (p - start) < 6) {
|
||||||
}
|
start--;
|
||||||
// Check for illegal sequence. Do allow an illegal byte after where we
|
|
||||||
// started.
|
|
||||||
int len = utf8len_tab[*q];
|
|
||||||
if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) {
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (q <= base) {
|
uint8_t cur_len = utf8len_tab[*start];
|
||||||
|
int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len);
|
||||||
|
if (cur_code < 0) {
|
||||||
|
return 0; // p must be part of an illegal sequence
|
||||||
|
}
|
||||||
|
const uint8_t * const safe_end = start + cur_len;
|
||||||
|
|
||||||
|
int cur_bc = utf8proc_get_property(cur_code)->boundclass;
|
||||||
|
if (always_break(cur_bc)) {
|
||||||
|
return (int)(p - start);
|
||||||
|
}
|
||||||
|
|
||||||
|
// backtrack to find the start of a cluster. we might go too far, checked in the next loop
|
||||||
|
const uint8_t *cur_pos = start;
|
||||||
|
const uint8_t *const p_start = start;
|
||||||
|
|
||||||
|
if (start == base) {
|
||||||
|
return (int)(p - start);
|
||||||
|
}
|
||||||
|
|
||||||
|
start--;
|
||||||
|
while (*start >= 0x80) { // stop on ascii, we are done
|
||||||
|
while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
|
||||||
|
start--;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]);
|
||||||
|
if (prev_code < 0) {
|
||||||
|
start = cur_pos; // start at valid sequence after invalid bytes
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
int c = utf_ptr2char((char *)q);
|
int prev_bc = utf8proc_get_property(prev_code)->boundclass;
|
||||||
if (utf_iscomposing(c)) {
|
if (always_break_two(prev_bc, cur_bc) && !arabic_combine(prev_code, cur_code)) {
|
||||||
continue;
|
start = cur_pos; // prev_code cannot be a part of this cluster
|
||||||
}
|
break;
|
||||||
|
} else if (start == base) {
|
||||||
if (arabic_maycombine(c)) {
|
|
||||||
// Advance to get a sneak peek at the next char
|
|
||||||
const uint8_t *j = q;
|
|
||||||
j--;
|
|
||||||
// Move j to the first byte of this char.
|
|
||||||
while (j > base && (*j & 0xc0) == 0x80) {
|
|
||||||
j--;
|
|
||||||
}
|
|
||||||
if (arabic_combine(utf_ptr2char((char *)j), c)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
cur_pos = start;
|
||||||
|
cur_bc = prev_bc;
|
||||||
|
cur_code = prev_code;
|
||||||
|
|
||||||
|
start--;
|
||||||
|
}
|
||||||
|
|
||||||
|
// hot path: we are already on the first codepoint of a sequence
|
||||||
|
if (start == p_start) {
|
||||||
|
return (int)(p - start);
|
||||||
|
}
|
||||||
|
|
||||||
|
const uint8_t *q = start;
|
||||||
|
while (q < p) {
|
||||||
|
// don't need to find end of cluster. once we reached the codepoint of p, we are done
|
||||||
|
int len = utfc_ptr2len_len((const char *)q, (int)(safe_end - q));
|
||||||
|
|
||||||
|
if (q + len > p) {
|
||||||
return (int)(p - q);
|
return (int)(p - q);
|
||||||
|
}
|
||||||
|
|
||||||
|
q += len;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Assumes caller already handles ascii. see `utfc_next`
|
||||||
|
StrCharInfo utfc_next_impl(StrCharInfo cur)
|
||||||
|
{
|
||||||
|
int32_t prev_code = cur.chr.value;
|
||||||
|
uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
|
||||||
|
GraphemeState state = GRAPHEME_STATE_INIT;
|
||||||
|
assert(*next >= 0x80);
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
uint8_t const next_len = utf8len_tab[*next];
|
||||||
|
int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
|
||||||
|
if (utf8proc_grapheme_break_stateful(prev_code, next_code, &state)
|
||||||
|
&& !arabic_combine(prev_code, next_code)) {
|
||||||
|
return (StrCharInfo){
|
||||||
|
.ptr = (char *)next,
|
||||||
|
.chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
prev_code = next_code;
|
||||||
|
next += next_len;
|
||||||
|
if (EXPECT(*next < 0x80U, true)) {
|
||||||
|
return (StrCharInfo){
|
||||||
|
.ptr = (char *)next,
|
||||||
|
.chr = (CharInfo){ .value = *next, .len = 1 },
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Whether space is NOT allowed before/after 'c'.
|
// Whether space is NOT allowed before/after 'c'.
|
||||||
@@ -2681,7 +2807,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
|
|||||||
c = 0x100; break; // not in latin9
|
c = 0x100; break; // not in latin9
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!utf_iscomposing(c)) { // skip composing chars
|
if (!utf_iscomposing_legacy(c)) { // skip composing chars
|
||||||
if (c < 0x100) {
|
if (c < 0x100) {
|
||||||
*d++ = (uint8_t)c;
|
*d++ = (uint8_t)c;
|
||||||
} else if (vcp->vc_fail) {
|
} else if (vcp->vc_fail) {
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <sys/types.h> // IWYU pragma: keep
|
#include <sys/types.h> // IWYU pragma: keep
|
||||||
|
#include <utf8proc.h>
|
||||||
#include <uv.h> // IWYU pragma: keep
|
#include <uv.h> // IWYU pragma: keep
|
||||||
|
|
||||||
#include "nvim/cmdexpand_defs.h" // IWYU pragma: keep
|
#include "nvim/cmdexpand_defs.h" // IWYU pragma: keep
|
||||||
@@ -11,6 +12,9 @@
|
|||||||
#include "nvim/mbyte_defs.h" // IWYU pragma: keep
|
#include "nvim/mbyte_defs.h" // IWYU pragma: keep
|
||||||
#include "nvim/types_defs.h" // IWYU pragma: keep
|
#include "nvim/types_defs.h" // IWYU pragma: keep
|
||||||
|
|
||||||
|
typedef utf8proc_int32_t GraphemeState;
|
||||||
|
#define GRAPHEME_STATE_INIT 0
|
||||||
|
|
||||||
#ifdef INCLUDE_GENERATED_DECLARATIONS
|
#ifdef INCLUDE_GENERATED_DECLARATIONS
|
||||||
# include "mbyte.h.generated.h"
|
# include "mbyte.h.generated.h"
|
||||||
# include "mbyte.h.inline.generated.h"
|
# include "mbyte.h.inline.generated.h"
|
||||||
@@ -92,28 +96,16 @@ static inline CharInfo utf_ptr2CharInfo(char const *const p_in)
|
|||||||
static inline StrCharInfo utfc_next(StrCharInfo cur)
|
static inline StrCharInfo utfc_next(StrCharInfo cur)
|
||||||
FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE
|
FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE
|
||||||
{
|
{
|
||||||
int32_t prev_code = cur.chr.value;
|
// handle ASCII case inline
|
||||||
uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
|
uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
|
||||||
|
|
||||||
while (true) {
|
|
||||||
if (EXPECT(*next < 0x80U, true)) {
|
if (EXPECT(*next < 0x80U, true)) {
|
||||||
return (StrCharInfo){
|
return (StrCharInfo){
|
||||||
.ptr = (char *)next,
|
.ptr = (char *)next,
|
||||||
.chr = (CharInfo){ .value = *next, .len = 1 },
|
.chr = (CharInfo){ .value = *next, .len = 1 },
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
uint8_t const next_len = utf8len_tab[*next];
|
|
||||||
int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
|
|
||||||
if (!utf_char_composinglike(prev_code, next_code)) {
|
|
||||||
return (StrCharInfo){
|
|
||||||
.ptr = (char *)next,
|
|
||||||
.chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
prev_code = next_code;
|
return utfc_next_impl(cur);
|
||||||
next += next_len;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline StrCharInfo utf_ptr2StrCharInfo(char *ptr)
|
static inline StrCharInfo utf_ptr2StrCharInfo(char *ptr)
|
||||||
|
|||||||
@@ -446,9 +446,7 @@ void trunc_string(const char *s, char *buf, int room_in, int buflen)
|
|||||||
// Last part: End of the string.
|
// Last part: End of the string.
|
||||||
half = i = (int)strlen(s);
|
half = i = (int)strlen(s);
|
||||||
while (true) {
|
while (true) {
|
||||||
do {
|
|
||||||
half = half - utf_head_off(s, s + half - 1) - 1;
|
half = half - utf_head_off(s, s + half - 1) - 1;
|
||||||
} while (half > 0 && utf_iscomposing(utf_ptr2char(s + half)));
|
|
||||||
n = ptr2cells(s + half);
|
n = ptr2cells(s + half);
|
||||||
if (len + n > room || half == 0) {
|
if (len + n > room || half == 0) {
|
||||||
break;
|
break;
|
||||||
|
|||||||
@@ -837,7 +837,10 @@ static void normal_get_additional_char(NormalState *s)
|
|||||||
while ((s->c = vpeekc()) > 0
|
while ((s->c = vpeekc()) > 0
|
||||||
&& (s->c >= 0x100 || MB_BYTE2LEN(vpeekc()) > 1)) {
|
&& (s->c >= 0x100 || MB_BYTE2LEN(vpeekc()) > 1)) {
|
||||||
s->c = plain_vgetc();
|
s->c = plain_vgetc();
|
||||||
if (!utf_iscomposing(s->c)) {
|
// TODO(bfredl): only allowing up to two composing chars is cringe af.
|
||||||
|
// Could reuse/abuse schar_T to at least allow us to input anything we are able
|
||||||
|
// to display and use the stateful utf8proc algorithm like utf_composinglike
|
||||||
|
if (!utf_iscomposing_legacy(s->c)) {
|
||||||
vungetc(s->c); // it wasn't, put it back
|
vungetc(s->c); // it wasn't, put it back
|
||||||
break;
|
break;
|
||||||
} else if (s->ca.ncharC1 == 0) {
|
} else if (s->ca.ncharC1 == 0) {
|
||||||
|
|||||||
@@ -2326,9 +2326,12 @@ return {
|
|||||||
desc = [=[
|
desc = [=[
|
||||||
When on all Unicode emoji characters are considered to be full width.
|
When on all Unicode emoji characters are considered to be full width.
|
||||||
This excludes "text emoji" characters, which are normally displayed as
|
This excludes "text emoji" characters, which are normally displayed as
|
||||||
single width. Unfortunately there is no good specification for this
|
single width. However, such "text emoji" are treated as full-width
|
||||||
and it has been determined on trial-and-error basis. Use the
|
emoji if they are followed by the U+FE0F variant selector.
|
||||||
|setcellwidths()| function to change the behavior.
|
|
||||||
|
Unfortunately there is no good specification for this and it has been
|
||||||
|
determined on trial-and-error basis. Use the |setcellwidths()|
|
||||||
|
function to change the behavior.
|
||||||
]=],
|
]=],
|
||||||
full_name = 'emoji',
|
full_name = 'emoji',
|
||||||
redraw = { 'all_windows', 'ui_option' },
|
redraw = { 'all_windows', 'ui_option' },
|
||||||
|
|||||||
@@ -146,7 +146,7 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco
|
|||||||
} else if (cur_char < 0) {
|
} else if (cur_char < 0) {
|
||||||
size = kInvalidByteCells;
|
size = kInvalidByteCells;
|
||||||
} else {
|
} else {
|
||||||
size = char2cells(cur_char);
|
size = ptr2cells(cur);
|
||||||
is_doublewidth = size == 2 && cur_char > 0x80;
|
is_doublewidth = size == 2 && cur_char > 0x80;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -337,8 +337,8 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco
|
|||||||
///
|
///
|
||||||
/// @see charsize_regular
|
/// @see charsize_regular
|
||||||
/// @see charsize_fast
|
/// @see charsize_fast
|
||||||
static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, colnr_T const vcol,
|
static inline CharSize charsize_fast_impl(win_T *const wp, const char *cur, bool use_tabstop,
|
||||||
int32_t const cur_char)
|
colnr_T const vcol, int32_t const cur_char)
|
||||||
FUNC_ATTR_PURE FUNC_ATTR_ALWAYS_INLINE
|
FUNC_ATTR_PURE FUNC_ATTR_ALWAYS_INLINE
|
||||||
{
|
{
|
||||||
// A tab gets expanded, depending on the current column
|
// A tab gets expanded, depending on the current column
|
||||||
@@ -352,7 +352,11 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col
|
|||||||
if (cur_char < 0) {
|
if (cur_char < 0) {
|
||||||
width = kInvalidByteCells;
|
width = kInvalidByteCells;
|
||||||
} else {
|
} else {
|
||||||
width = char2cells(cur_char);
|
// TODO(bfredl): perf: often cur_char is enough at this point to determine width.
|
||||||
|
// we likely want a specialized version of utf_ptr2StrCharInfo also determining
|
||||||
|
// the ptr2cells width at the same time without any extra decoding. (also applies
|
||||||
|
// to charsize_regular and charsize_nowrap)
|
||||||
|
width = ptr2cells(cur);
|
||||||
}
|
}
|
||||||
|
|
||||||
// If a double-width char doesn't fit at the end of a line, it wraps to the next line,
|
// If a double-width char doesn't fit at the end of a line, it wraps to the next line,
|
||||||
@@ -371,23 +375,23 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col
|
|||||||
/// Can be used if CSType is kCharsizeFast.
|
/// Can be used if CSType is kCharsizeFast.
|
||||||
///
|
///
|
||||||
/// @see charsize_regular
|
/// @see charsize_regular
|
||||||
CharSize charsize_fast(CharsizeArg *csarg, colnr_T const vcol, int32_t const cur_char)
|
CharSize charsize_fast(CharsizeArg *csarg, const char *cur, colnr_T vcol, int32_t cur_char)
|
||||||
FUNC_ATTR_PURE
|
FUNC_ATTR_PURE
|
||||||
{
|
{
|
||||||
return charsize_fast_impl(csarg->win, csarg->use_tabstop, vcol, cur_char);
|
return charsize_fast_impl(csarg->win, cur, csarg->use_tabstop, vcol, cur_char);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the number of cells taken up on the screen at given virtual column.
|
/// Get the number of cells taken up on the screen at given virtual column.
|
||||||
///
|
///
|
||||||
/// @see win_chartabsize()
|
/// @see win_chartabsize()
|
||||||
int charsize_nowrap(buf_T *buf, bool use_tabstop, colnr_T vcol, int32_t cur_char)
|
int charsize_nowrap(buf_T *buf, const char *cur, bool use_tabstop, colnr_T vcol, int32_t cur_char)
|
||||||
{
|
{
|
||||||
if (cur_char == TAB && use_tabstop) {
|
if (cur_char == TAB && use_tabstop) {
|
||||||
return tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array);
|
return tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array);
|
||||||
} else if (cur_char < 0) {
|
} else if (cur_char < 0) {
|
||||||
return kInvalidByteCells;
|
return kInvalidByteCells;
|
||||||
} else {
|
} else {
|
||||||
return char2cells(cur_char);
|
return ptr2cells(cur);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -467,7 +471,7 @@ int linesize_fast(CharsizeArg const *const csarg, int vcol_arg, colnr_T const le
|
|||||||
|
|
||||||
StrCharInfo ci = utf_ptr2StrCharInfo(line);
|
StrCharInfo ci = utf_ptr2StrCharInfo(line);
|
||||||
while (ci.ptr - line < len && *ci.ptr != NUL) {
|
while (ci.ptr - line < len && *ci.ptr != NUL) {
|
||||||
vcol += charsize_fast_impl(wp, use_tabstop, vcol_arg, ci.chr.value).width;
|
vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol_arg, ci.chr.value).width;
|
||||||
ci = utfc_next(ci);
|
ci = utfc_next(ci);
|
||||||
if (vcol > MAXCOL) {
|
if (vcol > MAXCOL) {
|
||||||
vcol_arg = MAXCOL;
|
vcol_arg = MAXCOL;
|
||||||
@@ -530,7 +534,7 @@ void getvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *en
|
|||||||
char_size = (CharSize){ .width = 1 };
|
char_size = (CharSize){ .width = 1 };
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
char_size = charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value);
|
char_size = charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value);
|
||||||
StrCharInfo const next = utfc_next(ci);
|
StrCharInfo const next = utfc_next(ci);
|
||||||
if (next.ptr - line > end_col) {
|
if (next.ptr - line > end_col) {
|
||||||
break;
|
break;
|
||||||
@@ -627,7 +631,7 @@ void getvvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *e
|
|||||||
if (pos->col < ml_get_buf_len(wp->w_buffer, pos->lnum)) {
|
if (pos->col < ml_get_buf_len(wp->w_buffer, pos->lnum)) {
|
||||||
int c = utf_ptr2char(ptr + pos->col);
|
int c = utf_ptr2char(ptr + pos->col);
|
||||||
if ((c != TAB) && vim_isprintc(c)) {
|
if ((c != TAB) && vim_isprintc(c)) {
|
||||||
endadd = (colnr_T)(char2cells(c) - 1);
|
endadd = (colnr_T)(ptr2cells(ptr + pos->col) - 1);
|
||||||
if (coladd > endadd) {
|
if (coladd > endadd) {
|
||||||
// past end of line
|
// past end of line
|
||||||
endadd = 0;
|
endadd = 0;
|
||||||
@@ -824,7 +828,7 @@ int plines_win_col(win_T *wp, linenr_T lnum, long column)
|
|||||||
if (cstype == kCharsizeFast) {
|
if (cstype == kCharsizeFast) {
|
||||||
bool const use_tabstop = csarg.use_tabstop;
|
bool const use_tabstop = csarg.use_tabstop;
|
||||||
while (*ci.ptr != NUL && --column >= 0) {
|
while (*ci.ptr != NUL && --column >= 0) {
|
||||||
vcol += charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value).width;
|
vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value).width;
|
||||||
ci = utfc_next(ci);
|
ci = utfc_next(ci);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ static inline CharSize win_charsize(CSType cstype, int vcol, char *ptr, int32_t
|
|||||||
FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE
|
FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE
|
||||||
{
|
{
|
||||||
if (cstype == kCharsizeFast) {
|
if (cstype == kCharsizeFast) {
|
||||||
return charsize_fast(csarg, vcol, chr);
|
return charsize_fast(csarg, ptr, vcol, chr);
|
||||||
} else {
|
} else {
|
||||||
return charsize_regular(csarg, ptr, vcol, chr);
|
return charsize_regular(csarg, ptr, vcol, chr);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3031,7 +3031,7 @@ static bool use_multibytecode(int c)
|
|||||||
{
|
{
|
||||||
return utf_char2len(c) > 1
|
return utf_char2len(c) > 1
|
||||||
&& (re_multi_type(peekchr()) != NOT_MULTI
|
&& (re_multi_type(peekchr()) != NOT_MULTI
|
||||||
|| utf_iscomposing(c));
|
|| utf_iscomposing_legacy(c));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Emit (if appropriate) a byte of code
|
// Emit (if appropriate) a byte of code
|
||||||
@@ -4326,7 +4326,7 @@ static uint8_t *regatom(int *flagp)
|
|||||||
}
|
}
|
||||||
// When '.' is followed by a composing char ignore the dot, so that
|
// When '.' is followed by a composing char ignore the dot, so that
|
||||||
// the composing char is matched here.
|
// the composing char is matched here.
|
||||||
if (c == Magic('.') && utf_iscomposing(peekchr())) {
|
if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
|
||||||
c = getchr();
|
c = getchr();
|
||||||
goto do_multibyte;
|
goto do_multibyte;
|
||||||
}
|
}
|
||||||
@@ -5001,9 +5001,10 @@ do_multibyte:
|
|||||||
int l;
|
int l;
|
||||||
|
|
||||||
// Need to get composing character too.
|
// Need to get composing character too.
|
||||||
|
GraphemeState state = GRAPHEME_STATE_INIT;
|
||||||
while (true) {
|
while (true) {
|
||||||
l = utf_ptr2len(regparse);
|
l = utf_ptr2len(regparse);
|
||||||
if (!utf_composinglike(regparse, regparse + l)) {
|
if (!utf_composinglike(regparse, regparse + l, &state)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
regmbc(utf_ptr2char(regparse));
|
regmbc(utf_ptr2char(regparse));
|
||||||
@@ -6569,7 +6570,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
|
|||||||
// Check for following composing character, unless %C
|
// Check for following composing character, unless %C
|
||||||
// follows (skips over all composing chars).
|
// follows (skips over all composing chars).
|
||||||
if (status != RA_NOMATCH
|
if (status != RA_NOMATCH
|
||||||
&& utf_composinglike((char *)rex.input, (char *)rex.input + len)
|
&& utf_composinglike((char *)rex.input, (char *)rex.input + len, NULL)
|
||||||
&& !rex.reg_icombine
|
&& !rex.reg_icombine
|
||||||
&& OP(next) != RE_COMPOSING) {
|
&& OP(next) != RE_COMPOSING) {
|
||||||
// raaron: This code makes a composing character get
|
// raaron: This code makes a composing character get
|
||||||
@@ -6624,14 +6625,14 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
const int opndc = utf_ptr2char((char *)opnd);
|
const int opndc = utf_ptr2char((char *)opnd);
|
||||||
if (utf_iscomposing(opndc)) {
|
if (utf_iscomposing_legacy(opndc)) {
|
||||||
// When only a composing char is given match at any
|
// When only a composing char is given match at any
|
||||||
// position where that composing char appears.
|
// position where that composing char appears.
|
||||||
status = RA_NOMATCH;
|
status = RA_NOMATCH;
|
||||||
for (i = 0; rex.input[i] != NUL;
|
for (i = 0; rex.input[i] != NUL;
|
||||||
i += utf_ptr2len((char *)rex.input + i)) {
|
i += utf_ptr2len((char *)rex.input + i)) {
|
||||||
const int inpc = utf_ptr2char((char *)rex.input + i);
|
const int inpc = utf_ptr2char((char *)rex.input + i);
|
||||||
if (!utf_iscomposing(inpc)) {
|
if (!utf_iscomposing_legacy(inpc)) {
|
||||||
if (i > 0) {
|
if (i > 0) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -6654,7 +6655,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
|
|||||||
|
|
||||||
case RE_COMPOSING:
|
case RE_COMPOSING:
|
||||||
// Skip composing characters.
|
// Skip composing characters.
|
||||||
while (utf_iscomposing(utf_ptr2char((char *)rex.input))) {
|
while (utf_iscomposing_legacy(utf_ptr2char((char *)rex.input))) {
|
||||||
rex.input += utf_ptr2len((char *)rex.input);
|
rex.input += utf_ptr2len((char *)rex.input);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -10070,7 +10071,7 @@ static int nfa_regatom(void)
|
|||||||
}
|
}
|
||||||
// When '.' is followed by a composing char ignore the dot, so that
|
// When '.' is followed by a composing char ignore the dot, so that
|
||||||
// the composing char is matched here.
|
// the composing char is matched here.
|
||||||
if (c == Magic('.') && utf_iscomposing(peekchr())) {
|
if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
|
||||||
old_regparse = (uint8_t *)regparse;
|
old_regparse = (uint8_t *)regparse;
|
||||||
c = getchr();
|
c = getchr();
|
||||||
goto nfa_do_multibyte;
|
goto nfa_do_multibyte;
|
||||||
@@ -10705,7 +10706,7 @@ collection:
|
|||||||
nfa_do_multibyte:
|
nfa_do_multibyte:
|
||||||
// plen is length of current char with composing chars
|
// plen is length of current char with composing chars
|
||||||
if (utf_char2len(c) != (plen = utfc_ptr2len((char *)old_regparse))
|
if (utf_char2len(c) != (plen = utfc_ptr2len((char *)old_regparse))
|
||||||
|| utf_iscomposing(c)) {
|
|| utf_iscomposing_legacy(c)) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
|
|
||||||
// A base character plus composing characters, or just one
|
// A base character plus composing characters, or just one
|
||||||
@@ -14033,7 +14034,7 @@ static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text)
|
|||||||
}
|
}
|
||||||
if (match
|
if (match
|
||||||
// check that no composing char follows
|
// check that no composing char follows
|
||||||
&& !utf_iscomposing(utf_ptr2char((char *)s2))) {
|
&& !utf_iscomposing_legacy(utf_ptr2char((char *)s2))) {
|
||||||
cleanup_subexpr();
|
cleanup_subexpr();
|
||||||
if (REG_MULTI) {
|
if (REG_MULTI) {
|
||||||
rex.reg_startpos[0].lnum = rex.lnum;
|
rex.reg_startpos[0].lnum = rex.lnum;
|
||||||
@@ -14278,7 +14279,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
|
|||||||
// is not really a match.
|
// is not really a match.
|
||||||
if (!rex.reg_icombine
|
if (!rex.reg_icombine
|
||||||
&& rex.input != rex.line
|
&& rex.input != rex.line
|
||||||
&& utf_iscomposing(curc)) {
|
&& utf_iscomposing_legacy(curc)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
nfa_match = true;
|
nfa_match = true;
|
||||||
@@ -14622,7 +14623,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
|
|||||||
|
|
||||||
sta = t->state->out;
|
sta = t->state->out;
|
||||||
len = 0;
|
len = 0;
|
||||||
if (utf_iscomposing(sta->c)) {
|
if (utf_iscomposing_legacy(sta->c)) {
|
||||||
// Only match composing character(s), ignore base
|
// Only match composing character(s), ignore base
|
||||||
// character. Used for ".{composing}" and "{composing}"
|
// character. Used for ".{composing}" and "{composing}"
|
||||||
// (no preceding character).
|
// (no preceding character).
|
||||||
@@ -14724,7 +14725,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
|
|||||||
int j;
|
int j;
|
||||||
|
|
||||||
sta = t->state->out->out;
|
sta = t->state->out->out;
|
||||||
if (utf_iscomposing(sta->c)) {
|
if (utf_iscomposing_legacy(sta->c)) {
|
||||||
// Only match composing character(s), ignore base
|
// Only match composing character(s), ignore base
|
||||||
// character. Used for ".{composing}" and "{composing}"
|
// character. Used for ".{composing}" and "{composing}"
|
||||||
// (no preceding character).
|
// (no preceding character).
|
||||||
@@ -14846,7 +14847,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
|
|||||||
case NFA_ANY_COMPOSING:
|
case NFA_ANY_COMPOSING:
|
||||||
// On a composing character skip over it. Otherwise do
|
// On a composing character skip over it. Otherwise do
|
||||||
// nothing. Always matches.
|
// nothing. Always matches.
|
||||||
if (utf_iscomposing(curc)) {
|
if (utf_iscomposing_legacy(curc)) {
|
||||||
add_off = clen;
|
add_off = clen;
|
||||||
} else {
|
} else {
|
||||||
add_here = true;
|
add_here = true;
|
||||||
|
|||||||
@@ -1260,7 +1260,7 @@ int do_search(oparg_T *oap, int dirc, int search_delim, char *pat, size_t patlen
|
|||||||
// empty for the search_stat feature.
|
// empty for the search_stat feature.
|
||||||
if (!cmd_silent) {
|
if (!cmd_silent) {
|
||||||
msgbuf[0] = (char)dirc;
|
msgbuf[0] = (char)dirc;
|
||||||
if (utf_iscomposing(utf_ptr2char(p))) {
|
if (utf_iscomposing_first(utf_ptr2char(p))) {
|
||||||
// Use a space to draw the composing char on.
|
// Use a space to draw the composing char on.
|
||||||
msgbuf[1] = ' ';
|
msgbuf[1] = ' ';
|
||||||
memmove(msgbuf + 2, p, plen);
|
memmove(msgbuf + 2, p, plen);
|
||||||
|
|||||||
@@ -376,7 +376,7 @@ int init_sign_text(sign_T *sp, schar_T *sign_text, char *text)
|
|||||||
if (!vim_isprintc(c)) {
|
if (!vim_isprintc(c)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
int width = utf_char2cells(c);
|
int width = utf_ptr2cells(s);
|
||||||
if (width == 2) {
|
if (width == 2) {
|
||||||
sign_text[cells + 1] = 0;
|
sign_text[cells + 1] = 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1792,10 +1792,8 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
|
|||||||
// For changing a composing character adjust
|
// For changing a composing character adjust
|
||||||
// the score from SCORE_SUBST to
|
// the score from SCORE_SUBST to
|
||||||
// SCORE_SUBCOMP.
|
// SCORE_SUBCOMP.
|
||||||
if (utf_iscomposing(utf_ptr2char(tword + sp->ts_twordlen
|
if (utf_iscomposing_legacy(utf_ptr2char(tword + sp->ts_twordlen - sp->ts_tcharlen))
|
||||||
- sp->ts_tcharlen))
|
&& utf_iscomposing_legacy(utf_ptr2char(fword + sp->ts_fcharstart))) {
|
||||||
&& utf_iscomposing(utf_ptr2char(fword
|
|
||||||
+ sp->ts_fcharstart))) {
|
|
||||||
sp->ts_score -= SCORE_SUBST - SCORE_SUBCOMP;
|
sp->ts_score -= SCORE_SUBST - SCORE_SUBCOMP;
|
||||||
} else if (!soundfold
|
} else if (!soundfold
|
||||||
&& slang->sl_has_map
|
&& slang->sl_has_map
|
||||||
@@ -1811,7 +1809,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
|
|||||||
&& sp->ts_twordlen > sp->ts_tcharlen) {
|
&& sp->ts_twordlen > sp->ts_tcharlen) {
|
||||||
p = tword + sp->ts_twordlen - sp->ts_tcharlen;
|
p = tword + sp->ts_twordlen - sp->ts_tcharlen;
|
||||||
c = utf_ptr2char(p);
|
c = utf_ptr2char(p);
|
||||||
if (utf_iscomposing(c)) {
|
if (utf_iscomposing_legacy(c)) {
|
||||||
// Inserting a composing char doesn't
|
// Inserting a composing char doesn't
|
||||||
// count that much.
|
// count that much.
|
||||||
sp->ts_score -= SCORE_INS - SCORE_INSCOMP;
|
sp->ts_score -= SCORE_INS - SCORE_INSCOMP;
|
||||||
@@ -1876,7 +1874,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
|
|||||||
c = utf_ptr2char(fword + sp->ts_fidx);
|
c = utf_ptr2char(fword + sp->ts_fidx);
|
||||||
stack[depth].ts_fidx =
|
stack[depth].ts_fidx =
|
||||||
(uint8_t)(stack[depth].ts_fidx + utfc_ptr2len(fword + sp->ts_fidx));
|
(uint8_t)(stack[depth].ts_fidx + utfc_ptr2len(fword + sp->ts_fidx));
|
||||||
if (utf_iscomposing(c)) {
|
if (utf_iscomposing_legacy(c)) {
|
||||||
stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP;
|
stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP;
|
||||||
} else if (c == utf_ptr2char(fword + stack[depth].ts_fidx)) {
|
} else if (c == utf_ptr2char(fword + stack[depth].ts_fidx)) {
|
||||||
stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
|
stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ static bool did_add_space = false; ///< auto_format() added an extra space
|
|||||||
///< under the cursor
|
///< under the cursor
|
||||||
|
|
||||||
#define WHITECHAR(cc) (ascii_iswhite(cc) \
|
#define WHITECHAR(cc) (ascii_iswhite(cc) \
|
||||||
&& !utf_iscomposing(utf_ptr2char((char *)get_cursor_pos_ptr() + 1)))
|
&& !utf_iscomposing_first(utf_ptr2char((char *)get_cursor_pos_ptr() + 1)))
|
||||||
|
|
||||||
/// Return true if format option 'x' is in effect.
|
/// Return true if format option 'x' is in effect.
|
||||||
/// Take care of no formatting when 'paste' is set.
|
/// Take care of no formatting when 'paste' is set.
|
||||||
|
|||||||
@@ -109,6 +109,7 @@ struct TUIData {
|
|||||||
bool set_cursor_color_as_str;
|
bool set_cursor_color_as_str;
|
||||||
bool cursor_color_changed;
|
bool cursor_color_changed;
|
||||||
bool is_starting;
|
bool is_starting;
|
||||||
|
bool did_set_grapheme_cluster_mode;
|
||||||
FILE *screenshot;
|
FILE *screenshot;
|
||||||
cursorentry_T cursor_shapes[SHAPE_IDX_COUNT];
|
cursorentry_T cursor_shapes[SHAPE_IDX_COUNT];
|
||||||
HlAttrs clear_attrs;
|
HlAttrs clear_attrs;
|
||||||
@@ -220,6 +221,7 @@ static void tui_set_term_mode(TUIData *tui, TermMode mode, bool set)
|
|||||||
void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
|
void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
|
||||||
FUNC_ATTR_NONNULL_ALL
|
FUNC_ATTR_NONNULL_ALL
|
||||||
{
|
{
|
||||||
|
bool is_set = false;
|
||||||
switch (state) {
|
switch (state) {
|
||||||
case kTermModeNotRecognized:
|
case kTermModeNotRecognized:
|
||||||
case kTermModePermanentlySet:
|
case kTermModePermanentlySet:
|
||||||
@@ -228,6 +230,8 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
|
|||||||
// then there is nothing to do
|
// then there is nothing to do
|
||||||
break;
|
break;
|
||||||
case kTermModeSet:
|
case kTermModeSet:
|
||||||
|
is_set = true;
|
||||||
|
FALLTHROUGH;
|
||||||
case kTermModeReset:
|
case kTermModeReset:
|
||||||
// The terminal supports changing the given mode
|
// The terminal supports changing the given mode
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
@@ -240,6 +244,12 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
|
|||||||
signal_watcher_stop(&tui->winch_handle);
|
signal_watcher_stop(&tui->winch_handle);
|
||||||
tui_set_term_mode(tui, mode, true);
|
tui_set_term_mode(tui, mode, true);
|
||||||
break;
|
break;
|
||||||
|
case kTermModeGraphemeClusters:
|
||||||
|
if (!is_set) {
|
||||||
|
tui_set_term_mode(tui, mode, true);
|
||||||
|
tui->did_set_grapheme_cluster_mode = true;
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -434,6 +444,7 @@ static void terminfo_start(TUIData *tui)
|
|||||||
if (!nsterm) {
|
if (!nsterm) {
|
||||||
tui_request_term_mode(tui, kTermModeSynchronizedOutput);
|
tui_request_term_mode(tui, kTermModeSynchronizedOutput);
|
||||||
tui_request_term_mode(tui, kTermModeResizeEvents);
|
tui_request_term_mode(tui, kTermModeResizeEvents);
|
||||||
|
tui_request_term_mode(tui, kTermModeGraphemeClusters);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Don't use DECRQSS in screen or tmux, as they behave strangely when receiving it.
|
// Don't use DECRQSS in screen or tmux, as they behave strangely when receiving it.
|
||||||
@@ -494,7 +505,9 @@ static void terminfo_stop(TUIData *tui)
|
|||||||
|
|
||||||
// Disable resize events
|
// Disable resize events
|
||||||
tui_set_term_mode(tui, kTermModeResizeEvents, false);
|
tui_set_term_mode(tui, kTermModeResizeEvents, false);
|
||||||
|
if (tui->did_set_grapheme_cluster_mode) {
|
||||||
|
tui_set_term_mode(tui, kTermModeGraphemeClusters, false);
|
||||||
|
}
|
||||||
// May restore old title before exiting alternate screen.
|
// May restore old title before exiting alternate screen.
|
||||||
tui_set_title(tui, NULL_STRING);
|
tui_set_title(tui, NULL_STRING);
|
||||||
if (ui_client_exit_status == 0) {
|
if (ui_client_exit_status == 0) {
|
||||||
@@ -1010,7 +1023,7 @@ static void print_cell_at_pos(TUIData *tui, int row, int col, UCell *cell, bool
|
|||||||
char buf[MAX_SCHAR_SIZE];
|
char buf[MAX_SCHAR_SIZE];
|
||||||
schar_get(buf, cell->data);
|
schar_get(buf, cell->data);
|
||||||
int c = utf_ptr2char(buf);
|
int c = utf_ptr2char(buf);
|
||||||
bool is_ambiwidth = utf_ambiguous_width(c);
|
bool is_ambiwidth = utf_ambiguous_width(buf);
|
||||||
if (is_doublewidth && (is_ambiwidth || utf_char2cells(c) == 1)) {
|
if (is_doublewidth && (is_ambiwidth || utf_char2cells(c) == 1)) {
|
||||||
// If the server used setcellwidths() to treat a single-width char as double-width,
|
// If the server used setcellwidths() to treat a single-width char as double-width,
|
||||||
// it needs to be treated like an ambiguous-width char.
|
// it needs to be treated like an ambiguous-width char.
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ typedef struct TUIData TUIData;
|
|||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
kTermModeSynchronizedOutput = 2026,
|
kTermModeSynchronizedOutput = 2026,
|
||||||
|
kTermModeGraphemeClusters = 2027,
|
||||||
kTermModeResizeEvents = 2048,
|
kTermModeResizeEvents = 2048,
|
||||||
} TermMode;
|
} TermMode;
|
||||||
|
|
||||||
|
|||||||
@@ -1435,6 +1435,28 @@ describe('API', function()
|
|||||||
it('cannot handle NULs', function()
|
it('cannot handle NULs', function()
|
||||||
eq(0, api.nvim_strwidth('\0abc'))
|
eq(0, api.nvim_strwidth('\0abc'))
|
||||||
end)
|
end)
|
||||||
|
|
||||||
|
it('can handle emoji with variant selectors and ZWJ', function()
|
||||||
|
local selector = '❤️'
|
||||||
|
eq(2, fn.strchars(selector))
|
||||||
|
eq(1, fn.strcharlen(selector))
|
||||||
|
eq(2, api.nvim_strwidth(selector))
|
||||||
|
|
||||||
|
local no_selector = '❤'
|
||||||
|
eq(1, fn.strchars(no_selector))
|
||||||
|
eq(1, fn.strcharlen(no_selector))
|
||||||
|
eq(1, api.nvim_strwidth(no_selector))
|
||||||
|
|
||||||
|
local selector_zwj_selector = '🏳️⚧️'
|
||||||
|
eq(5, fn.strchars(selector_zwj_selector))
|
||||||
|
eq(1, fn.strcharlen(selector_zwj_selector))
|
||||||
|
eq(2, api.nvim_strwidth(selector_zwj_selector))
|
||||||
|
|
||||||
|
local emoji_zwj_emoji = '🧑🌾'
|
||||||
|
eq(3, fn.strchars(emoji_zwj_emoji))
|
||||||
|
eq(1, fn.strcharlen(emoji_zwj_emoji))
|
||||||
|
eq(2, api.nvim_strwidth(emoji_zwj_emoji))
|
||||||
|
end)
|
||||||
end)
|
end)
|
||||||
|
|
||||||
describe('nvim_get_current_line, nvim_set_current_line', function()
|
describe('nvim_get_current_line, nvim_set_current_line', function()
|
||||||
|
|||||||
@@ -5620,6 +5620,27 @@ l5
|
|||||||
]]
|
]]
|
||||||
})
|
})
|
||||||
end)
|
end)
|
||||||
|
|
||||||
|
it('supports emoji as signs', function()
|
||||||
|
insert(example_test3)
|
||||||
|
feed 'gg'
|
||||||
|
api.nvim_buf_set_extmark(0, ns, 1, 0, {sign_text='🧑🌾'})
|
||||||
|
-- VS16 can change width of character
|
||||||
|
api.nvim_buf_set_extmark(0, ns, 2, 0, {sign_text='❤️'})
|
||||||
|
api.nvim_buf_set_extmark(0, ns, 3, 0, {sign_text='❤'})
|
||||||
|
api.nvim_buf_set_extmark(0, ns, 4, 0, {sign_text='❤x'})
|
||||||
|
screen:expect([[
|
||||||
|
{7: }^l1 |
|
||||||
|
🧑🌾l2 |
|
||||||
|
❤️l3 |
|
||||||
|
❤ l4 |
|
||||||
|
❤xl5 |
|
||||||
|
{7: } |
|
||||||
|
{1:~ }|*3
|
||||||
|
|
|
||||||
|
]])
|
||||||
|
eq("Invalid 'sign_text'", pcall_err(api.nvim_buf_set_extmark, 0, ns, 5, 0, {sign_text='❤️x'}))
|
||||||
|
end)
|
||||||
end)
|
end)
|
||||||
|
|
||||||
describe('decorations: virt_text', function()
|
describe('decorations: virt_text', function()
|
||||||
|
|||||||
@@ -1436,6 +1436,41 @@ vimComment xxx match /\s"[^\-:.%#=*].*$/ms=s+1,lc=1 excludenl contains=@vim
|
|||||||
}
|
}
|
||||||
end)
|
end)
|
||||||
|
|
||||||
|
it('supports nvim_echo messages with emoji', function()
|
||||||
|
-- stylua: ignore
|
||||||
|
async_meths.nvim_echo(
|
||||||
|
{ { 'wow, 🏳️⚧️🧑🌾❤️😂🏴☠️\nvariant ❤️ one\nvariant ❤ two' } }, true, {}
|
||||||
|
)
|
||||||
|
|
||||||
|
screen:expect([[
|
||||||
|
|
|
||||||
|
{1:~ }|
|
||||||
|
{3: }|
|
||||||
|
wow, 🏳️⚧️🧑🌾❤️😂🏴☠️ |
|
||||||
|
variant ❤️ one |
|
||||||
|
variant ❤ two |
|
||||||
|
{6:Press ENTER or type command to continue}^ |
|
||||||
|
]])
|
||||||
|
|
||||||
|
feed '<cr>'
|
||||||
|
screen:expect([[
|
||||||
|
^ |
|
||||||
|
{1:~ }|*5
|
||||||
|
|
|
||||||
|
]])
|
||||||
|
|
||||||
|
feed ':messages<cr>'
|
||||||
|
screen:expect([[
|
||||||
|
|
|
||||||
|
{1:~ }|
|
||||||
|
{3: }|
|
||||||
|
wow, 🏳️⚧️🧑🌾❤️😂🏴☠️ |
|
||||||
|
variant ❤️ one |
|
||||||
|
variant ❤ two |
|
||||||
|
{6:Press ENTER or type command to continue}^ |
|
||||||
|
]])
|
||||||
|
end)
|
||||||
|
|
||||||
it('prints lines in Ex mode correctly with a burst of carriage returns #19341', function()
|
it('prints lines in Ex mode correctly with a burst of carriage returns #19341', function()
|
||||||
command('set number')
|
command('set number')
|
||||||
api.nvim_buf_set_lines(0, 0, 0, true, { 'aaa', 'bbb', 'ccc' })
|
api.nvim_buf_set_lines(0, 0, 0, true, { 'aaa', 'bbb', 'ccc' })
|
||||||
|
|||||||
@@ -296,6 +296,86 @@ describe('multibyte rendering', function()
|
|||||||
]],
|
]],
|
||||||
}
|
}
|
||||||
end)
|
end)
|
||||||
|
|
||||||
|
it('supports emoji with variant selectors and ZWJ', function()
|
||||||
|
command('set ruler')
|
||||||
|
insert('🏳️⚧️')
|
||||||
|
screen:expect([[
|
||||||
|
^🏳️⚧️ |
|
||||||
|
{1:~ }|*4
|
||||||
|
1,1 All |
|
||||||
|
]])
|
||||||
|
|
||||||
|
feed('a word<esc>')
|
||||||
|
screen:expect([[
|
||||||
|
🏳️⚧️ wor^d |
|
||||||
|
{1:~ }|*4
|
||||||
|
1,21-7 All |
|
||||||
|
]])
|
||||||
|
|
||||||
|
feed('0')
|
||||||
|
screen:expect([[
|
||||||
|
^🏳️⚧️ word |
|
||||||
|
{1:~ }|*4
|
||||||
|
1,1 All |
|
||||||
|
]])
|
||||||
|
|
||||||
|
feed('l')
|
||||||
|
screen:expect([[
|
||||||
|
🏳️⚧️^ word |
|
||||||
|
{1:~ }|*4
|
||||||
|
1,17-3 All |
|
||||||
|
]])
|
||||||
|
|
||||||
|
feed('h')
|
||||||
|
screen:expect([[
|
||||||
|
^🏳️⚧️ word |
|
||||||
|
{1:~ }|*4
|
||||||
|
1,1 All |
|
||||||
|
]])
|
||||||
|
|
||||||
|
feed('o❤️ variant selected<esc>')
|
||||||
|
screen:expect([[
|
||||||
|
🏳️⚧️ word |
|
||||||
|
❤️ variant selecte^d |
|
||||||
|
{1:~ }|*3
|
||||||
|
2,23-19 All |
|
||||||
|
]])
|
||||||
|
|
||||||
|
feed('0')
|
||||||
|
screen:expect([[
|
||||||
|
🏳️⚧️ word |
|
||||||
|
^❤️ variant selected |
|
||||||
|
{1:~ }|*3
|
||||||
|
2,1 All |
|
||||||
|
]])
|
||||||
|
|
||||||
|
feed('l')
|
||||||
|
screen:expect([[
|
||||||
|
🏳️⚧️ word |
|
||||||
|
❤️^ variant selected |
|
||||||
|
{1:~ }|*3
|
||||||
|
2,7-3 All |
|
||||||
|
]])
|
||||||
|
|
||||||
|
feed('h')
|
||||||
|
screen:expect([[
|
||||||
|
🏳️⚧️ word |
|
||||||
|
^❤️ variant selected |
|
||||||
|
{1:~ }|*3
|
||||||
|
2,1 All |
|
||||||
|
]])
|
||||||
|
|
||||||
|
-- without selector: single width (note column 18 and not 19)
|
||||||
|
feed('o❤ variant selected<esc>')
|
||||||
|
screen:expect([[
|
||||||
|
🏳️⚧️ word |
|
||||||
|
❤️ variant selected |
|
||||||
|
❤ variant selecte^d |
|
||||||
|
{1:~ }|*2
|
||||||
|
3,20-18 All |
|
||||||
|
]])
|
||||||
|
end)
|
||||||
end)
|
end)
|
||||||
|
|
||||||
describe('multibyte rendering: statusline', function()
|
describe('multibyte rendering: statusline', function()
|
||||||
@@ -348,10 +428,11 @@ describe('multibyte rendering: statusline', function()
|
|||||||
it('non-printable followed by MAX_MCO unicode combination points', function()
|
it('non-printable followed by MAX_MCO unicode combination points', function()
|
||||||
command('set statusline≠⃯ᷰ⃐⃧⃝')
|
command('set statusline≠⃯ᷰ⃐⃧⃝')
|
||||||
-- U+9F + U+1DF0 + U+20EF + U+0338 + U+20D0 + U+20E7 + U+20DD
|
-- U+9F + U+1DF0 + U+20EF + U+0338 + U+20D0 + U+20E7 + U+20DD
|
||||||
|
-- TODO: not ideal, better with plain ">" and then space+combining
|
||||||
screen:expect([[
|
screen:expect([[
|
||||||
^ |
|
^ |
|
||||||
{1:~ }|
|
{1:~ }|
|
||||||
{3:<9f><1df0><20ef><0338><20d0><20e7><20dd>}|
|
{3:<9f≯⃯ᷰ⃐⃧⃝ }|
|
||||||
|
|
|
|
||||||
]])
|
]])
|
||||||
end)
|
end)
|
||||||
@@ -368,9 +449,20 @@ describe('multibyte rendering: statusline', function()
|
|||||||
}
|
}
|
||||||
end)
|
end)
|
||||||
|
|
||||||
it('unprintable chars in filename with default stl', function()
|
it('emoji with ZWJ in filename with default stl', function()
|
||||||
command('file 🧑💻')
|
command('file 🧑💻')
|
||||||
-- TODO: this is wrong but avoids a crash
|
screen:expect {
|
||||||
|
grid = [[
|
||||||
|
^ |
|
||||||
|
{1:~ }|
|
||||||
|
{3:🧑💻 }|
|
||||||
|
|
|
||||||
|
]],
|
||||||
|
}
|
||||||
|
end)
|
||||||
|
|
||||||
|
it('unprintable chars in filename with default stl', function()
|
||||||
|
command('file 🧑💻')
|
||||||
screen:expect {
|
screen:expect {
|
||||||
grid = [[
|
grid = [[
|
||||||
^ |
|
^ |
|
||||||
@@ -381,15 +473,27 @@ describe('multibyte rendering: statusline', function()
|
|||||||
}
|
}
|
||||||
end)
|
end)
|
||||||
|
|
||||||
it('unprintable chars in filename with custom stl', function()
|
it('emoji with ZWJ in filename with custom stl', function()
|
||||||
command('set statusline=xx%#ErrorMsg#%f%##yy')
|
command('set statusline=xx%#ErrorMsg#%f%##yy')
|
||||||
command('file 🧑💻')
|
command('file 🧑💻')
|
||||||
-- TODO: this is also wrong but also avoids a crash
|
|
||||||
screen:expect {
|
screen:expect {
|
||||||
grid = [[
|
grid = [[
|
||||||
^ |
|
^ |
|
||||||
{1:~ }|
|
{1:~ }|
|
||||||
{3:xx}{9:🧑<200d>💻}{3:yy }|
|
{3:xx}{9:🧑💻}{3:yy }|
|
||||||
|
|
|
||||||
|
]],
|
||||||
|
}
|
||||||
|
end)
|
||||||
|
|
||||||
|
it('unprintable chars in filename with custom stl', function()
|
||||||
|
command('set statusline=xx%#ErrorMsg#%f%##yy')
|
||||||
|
command('file 🧑💻')
|
||||||
|
screen:expect {
|
||||||
|
grid = [[
|
||||||
|
^ |
|
||||||
|
{1:~ }|
|
||||||
|
{3:xx}{9:🧑<200b>💻}{3:yy }|
|
||||||
|
|
|
|
||||||
]],
|
]],
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3663,7 +3663,7 @@ func Test_string_reverse()
|
|||||||
call assert_equal('', reverse(v:_null_string))
|
call assert_equal('', reverse(v:_null_string))
|
||||||
for [s1, s2] in [['', ''], ['a', 'a'], ['ab', 'ba'], ['abc', 'cba'],
|
for [s1, s2] in [['', ''], ['a', 'a'], ['ab', 'ba'], ['abc', 'cba'],
|
||||||
\ ['abcd', 'dcba'], ['«-«-»-»', '»-»-«-«'],
|
\ ['abcd', 'dcba'], ['«-«-»-»', '»-»-«-«'],
|
||||||
\ ['🇦', '🇦'], ['🇦🇧', '🇧🇦'], ['🇦🇧🇨', '🇨🇧🇦'],
|
\ ['🇦', '🇦'], ['🇦🇧', '🇦🇧'], ['🇦🇧🇨', '🇨🇦🇧'],
|
||||||
\ ['🇦«🇧-🇨»🇩', '🇩»🇨-🇧«🇦']]
|
\ ['🇦«🇧-🇨»🇩', '🇩»🇨-🇧«🇦']]
|
||||||
call assert_equal(s2, reverse(s1))
|
call assert_equal(s2, reverse(s1))
|
||||||
endfor
|
endfor
|
||||||
|
|||||||
@@ -3897,9 +3897,9 @@ func Test_normal_count_after_operator()
|
|||||||
bw!
|
bw!
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
func Test_normal_gj_on_extra_wide_char()
|
func Test_normal_gj_on_6_cell_wide_unprintable_char()
|
||||||
new | 25vsp
|
new | 25vsp
|
||||||
let text='1 foooooooo ar e inszwe1 foooooooo inszwei' .
|
let text='1 foooooooo ar e inszwe1 foooooooo inszwei' .
|
||||||
\ ' i drei vier fünf sechs sieben acht un zehn elf zwöfl' .
|
\ ' i drei vier fünf sechs sieben acht un zehn elf zwöfl' .
|
||||||
\ ' dreizehn v ierzehn fünfzehn'
|
\ ' dreizehn v ierzehn fünfzehn'
|
||||||
put =text
|
put =text
|
||||||
|
|||||||
@@ -3,8 +3,15 @@ local itp = t.gen_itp(it)
|
|||||||
|
|
||||||
local ffi = t.ffi
|
local ffi = t.ffi
|
||||||
local eq = t.eq
|
local eq = t.eq
|
||||||
|
local to_cstr = t.to_cstr
|
||||||
|
local ok = t.ok
|
||||||
|
|
||||||
local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
|
local lib = t.cimport(
|
||||||
|
'./src/nvim/mbyte.h',
|
||||||
|
'./src/nvim/charset.h',
|
||||||
|
'./src/nvim/grid.h',
|
||||||
|
'./src/nvim/option_vars.h'
|
||||||
|
)
|
||||||
|
|
||||||
describe('mbyte', function()
|
describe('mbyte', function()
|
||||||
-- Convert from bytes to string
|
-- Convert from bytes to string
|
||||||
@@ -45,12 +52,21 @@ describe('mbyte', function()
|
|||||||
end)
|
end)
|
||||||
end
|
end
|
||||||
|
|
||||||
describe('utfc_ptr2schar_len', function()
|
describe('utfc_ptr2schar', function()
|
||||||
local function test_seq(seq)
|
local function test_seq(seq)
|
||||||
local firstc = ffi.new('int[1]')
|
local firstc = ffi.new('int[1]')
|
||||||
local buf = ffi.new('char[32]')
|
local buf = ffi.new('char[32]')
|
||||||
lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
|
lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc))
|
||||||
return { ffi.string(buf), firstc[0] }
|
local str = ffi.string(buf)
|
||||||
|
if 1 > 2 then -- for debugging
|
||||||
|
local tabel = {}
|
||||||
|
for i = 1, #str do
|
||||||
|
table.insert(tabel, string.format('0x%02x', string.byte(str, i)))
|
||||||
|
end
|
||||||
|
print('{ ' .. table.concat(tabel, ', ') .. ' }')
|
||||||
|
io.stdout:flush()
|
||||||
|
end
|
||||||
|
return { str, firstc[0] }
|
||||||
end
|
end
|
||||||
|
|
||||||
local function byte(val)
|
local function byte(val)
|
||||||
@@ -88,7 +104,9 @@ describe('mbyte', function()
|
|||||||
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })
|
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })
|
||||||
|
|
||||||
-- Combining character is U+0300
|
-- Combining character is U+0300
|
||||||
eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
|
eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 })
|
||||||
|
-- invalid start byte for combining
|
||||||
|
eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
|
||||||
|
|
||||||
-- No UTF-8 sequence
|
-- No UTF-8 sequence
|
||||||
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
|
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
|
||||||
@@ -102,18 +120,21 @@ describe('mbyte', function()
|
|||||||
itp('4-byte sequences', function()
|
itp('4-byte sequences', function()
|
||||||
-- No following combining character
|
-- No following combining character
|
||||||
eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
|
eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
|
||||||
|
eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 })
|
||||||
-- No second UTF-8 character
|
-- No second UTF-8 character
|
||||||
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })
|
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })
|
||||||
|
|
||||||
-- Combining character U+0300
|
-- Combining character U+0300
|
||||||
eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc })
|
eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc })
|
||||||
|
|
||||||
-- No UTF-8 sequence
|
-- No UTF-8 sequence
|
||||||
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
|
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
|
||||||
-- No following UTF-8 character
|
-- No following UTF-8 character
|
||||||
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
|
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
|
||||||
-- Combining character U+0301
|
-- Combining character U+0301
|
||||||
eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
|
eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 })
|
||||||
|
-- U+0080 : not a valid start char
|
||||||
|
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
|
||||||
|
|
||||||
-- One UTF-8 character
|
-- One UTF-8 character
|
||||||
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
|
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
|
||||||
@@ -126,36 +147,36 @@ describe('mbyte', function()
|
|||||||
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })
|
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })
|
||||||
|
|
||||||
-- Combining character U+0300
|
-- Combining character U+0300
|
||||||
eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 })
|
eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 })
|
||||||
|
|
||||||
-- Combining characters U+0300 and U+0301
|
-- Combining characters U+0300 and U+0301
|
||||||
eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 })
|
eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 })
|
||||||
-- Combining characters U+0300, U+0301, U+0302
|
-- Combining characters U+0300, U+0301, U+0302
|
||||||
eq(
|
eq(
|
||||||
{ '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
|
{ '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
|
||||||
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
|
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
|
||||||
)
|
)
|
||||||
-- Combining characters U+0300, U+0301, U+0302, U+0303
|
-- Combining characters U+0300, U+0301, U+0302, U+0303
|
||||||
eq(
|
eq(
|
||||||
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f },
|
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 },
|
||||||
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
|
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
|
||||||
)
|
)
|
||||||
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
|
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
|
||||||
eq(
|
eq(
|
||||||
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f },
|
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 },
|
||||||
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
|
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
|
||||||
)
|
)
|
||||||
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
|
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
|
||||||
eq(
|
eq(
|
||||||
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f },
|
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 },
|
||||||
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
|
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
|
||||||
)
|
)
|
||||||
|
|
||||||
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
|
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
|
||||||
eq(
|
eq(
|
||||||
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f },
|
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 },
|
||||||
test_seq {
|
test_seq {
|
||||||
0x7f,
|
0x29,
|
||||||
0xcc,
|
0xcc,
|
||||||
0x80,
|
0x80,
|
||||||
0xcc,
|
0xcc,
|
||||||
@@ -175,18 +196,18 @@ describe('mbyte', function()
|
|||||||
|
|
||||||
-- Only three following combining characters U+0300, U+0301, U+0302
|
-- Only three following combining characters U+0300, U+0301, U+0302
|
||||||
eq(
|
eq(
|
||||||
{ '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
|
{ '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
|
||||||
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
|
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
|
||||||
)
|
)
|
||||||
|
|
||||||
-- No UTF-8 sequence
|
-- No UTF-8 sequence
|
||||||
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
|
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
|
||||||
-- No following UTF-8 character
|
-- No following UTF-8 character
|
||||||
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 })
|
eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 })
|
||||||
-- Combining character U+0301
|
-- Combining character U+0301
|
||||||
eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f })
|
eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f })
|
||||||
-- Combining character U+0301
|
-- Combining character U+0301
|
||||||
eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc })
|
eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc })
|
||||||
|
|
||||||
-- One UTF-8 character
|
-- One UTF-8 character
|
||||||
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
|
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
|
||||||
@@ -205,8 +226,6 @@ describe('mbyte', function()
|
|||||||
end)
|
end)
|
||||||
|
|
||||||
describe('utf_cp_bounds_len', function()
|
describe('utf_cp_bounds_len', function()
|
||||||
local to_cstr = t.to_cstr
|
|
||||||
|
|
||||||
local tests = {
|
local tests = {
|
||||||
{
|
{
|
||||||
name = 'for valid string',
|
name = 'for valid string',
|
||||||
@@ -273,4 +292,52 @@ describe('mbyte', function()
|
|||||||
eq(expected_offsets, { b = b_offsets, e = e_offsets })
|
eq(expected_offsets, { b = b_offsets, e = e_offsets })
|
||||||
end)
|
end)
|
||||||
end)
|
end)
|
||||||
|
|
||||||
|
itp('utf_head_off', function()
|
||||||
|
local function check(str, expected_glyphs)
|
||||||
|
local len = #str
|
||||||
|
local cstr = to_cstr(str)
|
||||||
|
local breaks = { 0 } -- SOT
|
||||||
|
local pos = 0
|
||||||
|
local mb_glyphs = {}
|
||||||
|
while pos < len do
|
||||||
|
local clen = lib.utfc_ptr2len(cstr + pos)
|
||||||
|
ok(clen > 0) -- otherwise we get stuck
|
||||||
|
if clen > 1 then
|
||||||
|
table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
|
||||||
|
end
|
||||||
|
pos = pos + clen
|
||||||
|
table.insert(breaks, pos)
|
||||||
|
end
|
||||||
|
eq(breaks[#breaks], len) -- include EOT as break
|
||||||
|
-- we could also send in breaks, but this is more human readable
|
||||||
|
eq(mb_glyphs, expected_glyphs)
|
||||||
|
|
||||||
|
for i = 1, #breaks - 1 do
|
||||||
|
local start, next = breaks[i], breaks[i + 1]
|
||||||
|
|
||||||
|
for p = start, next - 1 do
|
||||||
|
eq(p - start, lib.utf_head_off(cstr, cstr + p))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe
|
||||||
|
end
|
||||||
|
-- stylua doesn't like ZWJ chars..
|
||||||
|
-- stylua: ignore start
|
||||||
|
check('hej och hå 🧑🌾!', { 'å', '🧑🌾' })
|
||||||
|
-- emoji only (various kinds of combinations, use g8 to see them)
|
||||||
|
check("🏳️⚧️🧑🌾❤️😂🏴☠️", {"🏳️⚧️", "🧑🌾", "❤️", "😂", "🏴☠️"})
|
||||||
|
check('🏳️⚧️xy🧑🌾\r❤️😂å🏴☠️', { '🏳️⚧️', '🧑🌾', '❤️', '😂', 'å', '🏴☠️', '' })
|
||||||
|
|
||||||
|
check('🇦🅱️ 🇦🇽 🇦🇨🇦 🇲🇽🇹🇱',{'🇦', '🅱️', '🇦🇽', '🇦🇨', '🇦', '🇲🇽', '🇹🇱'})
|
||||||
|
check('🏴🏴', {'🏴', '🏴'})
|
||||||
|
|
||||||
|
lib.p_arshape = true -- default
|
||||||
|
check('سلام', { 'س', 'لا', 'م' })
|
||||||
|
lib.p_arshape = false
|
||||||
|
check('سلام', { 'س', 'ل', 'ا', 'م' })
|
||||||
|
|
||||||
|
check('L̓̉̑̒̌̚ơ̗̌̒̄̀ŕ̈̈̎̐̕è̇̅̄̄̐m̖̟̟̅̄̚', {'L̓̉̑̒̌̚', 'ơ̗̌̒̄̀', 'ŕ̈̈̎̐̕', 'è̇̅̄̄̐', 'm̖̟̟̅̄̚'})
|
||||||
|
-- stylua: ignore end
|
||||||
|
end)
|
||||||
end)
|
end)
|
||||||
|
|||||||
Reference in New Issue
Block a user