mirror of
https://github.com/neovim/neovim.git
synced 2025-09-29 14:38:32 +00:00
encoding: cleanup mbyte.c given fixed encoding=utf-8
Eliminate mb_init(): Set "enc_utf8" and "has_mbyte" early. Eliminate "enc_unicode" and "enc_latin1like". init_chartab() and screenalloc() are already invoked elsewhere in the initialization process. The EncodingChanged autocmd cannot be triggered. At initialization, there are no spell files to reload.
This commit is contained in:
@@ -1612,9 +1612,7 @@ bool vim_islower(int c)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (enc_latin1like) {
|
return (latin1flags[c] & LATIN1LOWER) == LATIN1LOWER;
|
||||||
return (latin1flags[c] & LATIN1LOWER) == LATIN1LOWER;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return islower(c);
|
return islower(c);
|
||||||
}
|
}
|
||||||
@@ -1643,9 +1641,7 @@ bool vim_isupper(int c)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (enc_latin1like) {
|
return (latin1flags[c] & LATIN1UPPER) == LATIN1UPPER;
|
||||||
return (latin1flags[c] & LATIN1UPPER) == LATIN1UPPER;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return isupper(c);
|
return isupper(c);
|
||||||
}
|
}
|
||||||
@@ -1670,9 +1666,7 @@ int vim_toupper(int c)
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (enc_latin1like) {
|
return latin1upper[c];
|
||||||
return latin1upper[c];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return TOUPPER_LOC(c);
|
return TOUPPER_LOC(c);
|
||||||
}
|
}
|
||||||
@@ -1697,9 +1691,7 @@ int vim_tolower(int c)
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (enc_latin1like) {
|
return latin1lower[c];
|
||||||
return latin1lower[c];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return TOLOWER_LOC(c);
|
return TOLOWER_LOC(c);
|
||||||
}
|
}
|
||||||
|
@@ -4165,9 +4165,8 @@ static bool need_conversion(const char_u *fenc)
|
|||||||
same_encoding = (enc_flags != 0 && fenc_flags == enc_flags);
|
same_encoding = (enc_flags != 0 && fenc_flags == enc_flags);
|
||||||
}
|
}
|
||||||
if (same_encoding) {
|
if (same_encoding) {
|
||||||
/* Specified encoding matches with 'encoding'. This requires
|
// Specified file encoding matches UTF-8.
|
||||||
* conversion when 'encoding' is Unicode but not UTF-8. */
|
return false;
|
||||||
return enc_unicode != 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Encodings differ. However, conversion is not needed when 'enc' is any
|
/* Encodings differ. However, conversion is not needed when 'enc' is any
|
||||||
|
@@ -778,44 +778,18 @@ EXTERN int vr_lines_changed INIT(= 0); /* #Lines changed by "gR" so far */
|
|||||||
# define DBCS_2BYTE 1 /* 2byte- */
|
# define DBCS_2BYTE 1 /* 2byte- */
|
||||||
# define DBCS_DEBUG -1
|
# define DBCS_DEBUG -1
|
||||||
|
|
||||||
EXTERN int enc_dbcs INIT(= 0); /* One of DBCS_xxx values if
|
// mbyte flags that used to depend on 'encoding'. These are now deprecated, as
|
||||||
DBCS encoding */
|
// 'encoding' is always "utf-8". Code that uses them can be refactored to
|
||||||
EXTERN int enc_unicode INIT(= 0); /* 2: UCS-2 or UTF-16, 4: UCS-4 */
|
// remove dead code.
|
||||||
EXTERN bool enc_utf8 INIT(= false); /* UTF-8 encoded Unicode */
|
#define enc_dbcs false
|
||||||
EXTERN int enc_latin1like INIT(= TRUE); /* 'encoding' is latin1 comp. */
|
#define enc_utf8 true
|
||||||
EXTERN int has_mbyte INIT(= 0); /* any multi-byte encoding */
|
#define has_mbyte true
|
||||||
|
|
||||||
/// Encoding used when 'fencs' is set to "default"
|
/// Encoding used when 'fencs' is set to "default"
|
||||||
EXTERN char_u *fenc_default INIT(= NULL);
|
EXTERN char_u *fenc_default INIT(= NULL);
|
||||||
|
|
||||||
/*
|
// To speed up BYTELEN() we keep a table with the byte lengths for utf-8
|
||||||
* To speed up BYTELEN() we fill a table with the byte lengths whenever
|
EXTERN char utf8len_tab[256];
|
||||||
* enc_utf8 or enc_dbcs changes.
|
|
||||||
*/
|
|
||||||
EXTERN char mb_bytelen_tab[256];
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Function pointers, used to quickly get to the right function. Each has
|
|
||||||
* three possible values: latin_ (8-bit), utfc_ or utf_ (utf-8) and dbcs_
|
|
||||||
* (DBCS).
|
|
||||||
* The value is set in mb_init();
|
|
||||||
*/
|
|
||||||
/* length of char in bytes, including following composing chars */
|
|
||||||
EXTERN int (*mb_ptr2len)(const char_u *p) INIT(= latin_ptr2len);
|
|
||||||
/* idem, with limit on string length */
|
|
||||||
EXTERN int (*mb_ptr2len_len)(const char_u *p, int size) INIT(= latin_ptr2len_len);
|
|
||||||
/* byte length of char */
|
|
||||||
EXTERN int (*mb_char2len)(int c) INIT(= latin_char2len);
|
|
||||||
/* convert char to bytes, return the length */
|
|
||||||
EXTERN int (*mb_char2bytes)(int c, char_u *buf) INIT(= latin_char2bytes);
|
|
||||||
EXTERN int (*mb_ptr2cells)(const char_u *p) INIT(= latin_ptr2cells);
|
|
||||||
EXTERN int (*mb_ptr2cells_len)(const char_u *p, int size) INIT(
|
|
||||||
= latin_ptr2cells_len);
|
|
||||||
EXTERN int (*mb_char2cells)(int c) INIT(= latin_char2cells);
|
|
||||||
EXTERN int (*mb_off2cells)(unsigned off, unsigned max_off) INIT(
|
|
||||||
= latin_off2cells);
|
|
||||||
EXTERN int (*mb_ptr2char)(const char_u *p) INIT(= latin_ptr2char);
|
|
||||||
EXTERN int (*mb_head_off)(const char_u *base, const char_u *p) INIT(= latin_head_off);
|
|
||||||
|
|
||||||
# if defined(USE_ICONV) && defined(DYNAMIC_ICONV)
|
# if defined(USE_ICONV) && defined(DYNAMIC_ICONV)
|
||||||
/* Pointers to functions and variables to be loaded at runtime */
|
/* Pointers to functions and variables to be loaded at runtime */
|
||||||
|
@@ -122,32 +122,29 @@
|
|||||||
/* Whether to draw the vertical bar on the right side of the cell. */
|
/* Whether to draw the vertical bar on the right side of the cell. */
|
||||||
# define CURSOR_BAR_RIGHT (curwin->w_p_rl && (!(State & CMDLINE) || cmdmsg_rl))
|
# define CURSOR_BAR_RIGHT (curwin->w_p_rl && (!(State & CMDLINE) || cmdmsg_rl))
|
||||||
|
|
||||||
/*
|
// mb_ptr_adv(): advance a pointer to the next character, taking care of
|
||||||
* mb_ptr_adv(): advance a pointer to the next character, taking care of
|
// multi-byte characters if needed.
|
||||||
* multi-byte characters if needed.
|
// mb_ptr_back(): backup a pointer to the previous character, taking care of
|
||||||
* mb_ptr_back(): backup a pointer to the previous character, taking care of
|
// multi-byte characters if needed.
|
||||||
* multi-byte characters if needed.
|
// MB_COPY_CHAR(f, t): copy one char from "f" to "t" and advance the pointers.
|
||||||
* MB_COPY_CHAR(f, t): copy one char from "f" to "t" and advance the pointers.
|
// PTR2CHAR(): get character from pointer.
|
||||||
* PTR2CHAR(): get character from pointer.
|
|
||||||
*/
|
|
||||||
/* Get the length of the character p points to */
|
|
||||||
# define MB_PTR2LEN(p) (has_mbyte ? (*mb_ptr2len)(p) : 1)
|
|
||||||
/* Advance multi-byte pointer, skip over composing chars. */
|
|
||||||
# define mb_ptr_adv(p) (p += has_mbyte ? (*mb_ptr2len)((char_u *)p) : 1)
|
|
||||||
/* Advance multi-byte pointer, do not skip over composing chars. */
|
|
||||||
# define mb_cptr_adv(p) (p += \
|
|
||||||
enc_utf8 ? utf_ptr2len(p) : has_mbyte ? (*mb_ptr2len)(p) : 1)
|
|
||||||
/* Backup multi-byte pointer. Only use with "p" > "s" ! */
|
|
||||||
# define mb_ptr_back(s, p) (p -= has_mbyte ? ((*mb_head_off)((char_u *)s, (char_u *)p - 1) + 1) : 1)
|
|
||||||
/* get length of multi-byte char, not including composing chars */
|
|
||||||
# define mb_cptr2len(p) (enc_utf8 ? utf_ptr2len(p) : (*mb_ptr2len)(p))
|
|
||||||
|
|
||||||
# define MB_COPY_CHAR(f, t) \
|
// Get the length of the character p points to
|
||||||
if (has_mbyte) mb_copy_char((const char_u **)(&f), &t); \
|
# define MB_PTR2LEN(p) mb_ptr2len(p)
|
||||||
else *t++ = *f++
|
// Advance multi-byte pointer, skip over composing chars.
|
||||||
# define MB_CHARLEN(p) (has_mbyte ? mb_charlen(p) : (int)STRLEN(p))
|
# define mb_ptr_adv(p) (p += mb_ptr2len((char_u *)p))
|
||||||
# define MB_CHAR2LEN(c) (has_mbyte ? mb_char2len(c) : 1)
|
// Advance multi-byte pointer, do not skip over composing chars.
|
||||||
# define PTR2CHAR(p) (has_mbyte ? mb_ptr2char(p) : (int)*(p))
|
# define mb_cptr_adv(p) (p += utf_ptr2len(p))
|
||||||
|
// Backup multi-byte pointer. Only use with "p" > "s" !
|
||||||
|
# define mb_ptr_back(s, p) (p -= mb_head_off((char_u *)s, (char_u *)p - 1) + 1)
|
||||||
|
// get length of multi-byte char, not including composing chars
|
||||||
|
# define mb_cptr2len(p) utf_ptr2len(p)
|
||||||
|
|
||||||
|
# define MB_COPY_CHAR(f, t) mb_copy_char((const char_u **)(&f), &t);
|
||||||
|
|
||||||
|
# define MB_CHARLEN(p) mb_charlen(p)
|
||||||
|
# define MB_CHAR2LEN(c) mb_char2len(c)
|
||||||
|
# define PTR2CHAR(p) mb_ptr2char(p)
|
||||||
|
|
||||||
# define RESET_BINDING(wp) (wp)->w_p_scb = FALSE; (wp)->w_p_crb = FALSE
|
# define RESET_BINDING(wp) (wp)->w_p_scb = FALSE; (wp)->w_p_crb = FALSE
|
||||||
|
|
||||||
|
@@ -177,7 +177,6 @@ void early_init(void)
|
|||||||
fs_init();
|
fs_init();
|
||||||
handle_init();
|
handle_init();
|
||||||
|
|
||||||
(void)mb_init(); // init mb_bytelen_tab[] to ones
|
|
||||||
eval_init(); // init global variables
|
eval_init(); // init global variables
|
||||||
|
|
||||||
// Init the table of Normal mode commands.
|
// Init the table of Normal mode commands.
|
||||||
|
348
src/nvim/mbyte.c
348
src/nvim/mbyte.c
@@ -1,68 +1,27 @@
|
|||||||
/*
|
/// mbyte.c: Code specifically for handling multi-byte characters.
|
||||||
* mbyte.c: Code specifically for handling multi-byte characters.
|
/// Multibyte extensions partly by Sung-Hoon Baek
|
||||||
* Multibyte extensions partly by Sung-Hoon Baek
|
///
|
||||||
*
|
/// The encoding used in nvim is always UTF-8. "enc_utf8" and "has_mbyte" are
|
||||||
* The encoding used in the core is set with 'encoding'. When 'encoding' is
|
/// thus always true. "enc_dbcs" is always zero. The 'encoding' option is
|
||||||
* changed, the following four variables are set (for speed).
|
/// read-only and always reads "utf-8".
|
||||||
* Currently these types of character encodings are supported:
|
///
|
||||||
*
|
/// The cell width on the display needs to be determined from the character
|
||||||
* "enc_dbcs" When non-zero it tells the type of double byte character
|
/// value. Recognizing UTF-8 bytes is easy: 0xxx.xxxx is a single-byte char,
|
||||||
* encoding (Chinese, Korean, Japanese, etc.).
|
/// 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading byte of a multi-byte
|
||||||
* The cell width on the display is equal to the number of
|
/// character. To make things complicated, up to six composing characters
|
||||||
* bytes. (exception: DBCS_JPNU with first byte 0x8e)
|
/// are allowed. These are drawn on top of the first char. For most editing
|
||||||
* Recognizing the first or second byte is difficult, it
|
/// the sequence of bytes with composing characters included is considered to
|
||||||
* requires checking a byte sequence from the start.
|
/// be one character.
|
||||||
* "enc_utf8" When TRUE use Unicode characters in UTF-8 encoding.
|
///
|
||||||
* The cell width on the display needs to be determined from
|
/// UTF-8 is used everywhere in the core. This is in registers, text
|
||||||
* the character value.
|
/// manipulation, buffers, etc. Nvim core communicates with external plugins
|
||||||
* Recognizing bytes is easy: 0xxx.xxxx is a single-byte
|
/// and GUIs in this encoding.
|
||||||
* char, 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading
|
///
|
||||||
* byte of a multi-byte character.
|
/// The encoding of a file is specified with 'fileencoding'. Conversion
|
||||||
* To make things complicated, up to six composing characters
|
/// is to be done when it's different from "utf-8".
|
||||||
* are allowed. These are drawn on top of the first char.
|
///
|
||||||
* For most editing the sequence of bytes with composing
|
/// Vim scripts may contain an ":scriptencoding" command. This has an effect
|
||||||
* characters included is considered to be one character.
|
/// for some commands, like ":menutrans".
|
||||||
* "enc_unicode" When 2 use 16-bit Unicode characters (or UTF-16).
|
|
||||||
* When 4 use 32-bit Unicode characters.
|
|
||||||
* Internally characters are stored in UTF-8 encoding to
|
|
||||||
* avoid NUL bytes. Conversion happens when doing I/O.
|
|
||||||
* "enc_utf8" will also be TRUE.
|
|
||||||
*
|
|
||||||
* "has_mbyte" is set when "enc_dbcs" or "enc_utf8" is non-zero.
|
|
||||||
*
|
|
||||||
* If none of these is TRUE, 8-bit bytes are used for a character. The
|
|
||||||
* encoding isn't currently specified (TODO).
|
|
||||||
*
|
|
||||||
* 'encoding' specifies the encoding used in the core. This is in registers,
|
|
||||||
* text manipulation, buffers, etc. Conversion has to be done when characters
|
|
||||||
* in another encoding are received or sent:
|
|
||||||
*
|
|
||||||
* clipboard
|
|
||||||
* ^
|
|
||||||
* | (2)
|
|
||||||
* V
|
|
||||||
* +---------------+
|
|
||||||
* (1) | | (3)
|
|
||||||
* keyboard ----->| core |-----> display
|
|
||||||
* | |
|
|
||||||
* +---------------+
|
|
||||||
* ^
|
|
||||||
* | (4)
|
|
||||||
* V
|
|
||||||
* file
|
|
||||||
*
|
|
||||||
* (1) Typed characters arrive in the current locale.
|
|
||||||
* (2) Text will be made available with the encoding specified with
|
|
||||||
* 'encoding'. If this is not sufficient, system-specific conversion
|
|
||||||
* might be required.
|
|
||||||
* (3) For the GUI the correct font must be selected, no conversion done.
|
|
||||||
* (4) The encoding of the file is specified with 'fileencoding'. Conversion
|
|
||||||
* is to be done when it's different from 'encoding'.
|
|
||||||
*
|
|
||||||
* The ShaDa file is a special case: Only text is converted, not file names.
|
|
||||||
* Vim scripts may contain an ":encoding" command. This has an effect for
|
|
||||||
* some commands, like ":menutrans"
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
@@ -115,7 +74,7 @@ struct interval {
|
|||||||
* Bytes which are illegal when used as the first byte have a 1.
|
* Bytes which are illegal when used as the first byte have a 1.
|
||||||
* The NUL byte has length 1.
|
* The NUL byte has length 1.
|
||||||
*/
|
*/
|
||||||
static char utf8len_tab[256] =
|
char utf8len_tab[256] =
|
||||||
{
|
{
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||||
@@ -384,207 +343,6 @@ int enc_canon_props(const char_u *name)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Set up for using multi-byte characters.
|
|
||||||
* Called in three cases:
|
|
||||||
* - by main() to initialize (p_enc == NULL)
|
|
||||||
* - by set_init_1() after 'encoding' was set to its default.
|
|
||||||
* - by do_set() when 'encoding' has been set.
|
|
||||||
* p_enc must have been passed through enc_canonize() already.
|
|
||||||
* Sets the "enc_unicode", "enc_utf8", "enc_dbcs" and "has_mbyte" flags.
|
|
||||||
* Fills mb_bytelen_tab[] and returns NULL when there are no problems.
|
|
||||||
* When there is something wrong: Returns an error message and doesn't change
|
|
||||||
* anything.
|
|
||||||
*/
|
|
||||||
char_u * mb_init(void)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
int idx;
|
|
||||||
int n;
|
|
||||||
int enc_dbcs_new = 0;
|
|
||||||
#if defined(USE_ICONV) && !defined(WIN3264) && !defined(WIN32UNIX) \
|
|
||||||
&& !defined(MACOS)
|
|
||||||
# define LEN_FROM_CONV
|
|
||||||
vimconv_T vimconv;
|
|
||||||
char_u *p;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (p_enc == NULL) {
|
|
||||||
/* Just starting up: set the whole table to one's. */
|
|
||||||
for (i = 0; i < 256; ++i)
|
|
||||||
mb_bytelen_tab[i] = 1;
|
|
||||||
return NULL;
|
|
||||||
} else if (STRNCMP(p_enc, "8bit-", 5) == 0
|
|
||||||
|| STRNCMP(p_enc, "iso-8859-", 9) == 0) {
|
|
||||||
/* Accept any "8bit-" or "iso-8859-" name. */
|
|
||||||
enc_unicode = 0;
|
|
||||||
enc_utf8 = false;
|
|
||||||
} else if (STRNCMP(p_enc, "2byte-", 6) == 0) {
|
|
||||||
/* Unix: accept any "2byte-" name, assume current locale. */
|
|
||||||
enc_dbcs_new = DBCS_2BYTE;
|
|
||||||
} else if ((idx = enc_canon_search(p_enc)) >= 0) {
|
|
||||||
i = enc_canon_table[idx].prop;
|
|
||||||
if (i & ENC_UNICODE) {
|
|
||||||
/* Unicode */
|
|
||||||
enc_utf8 = true;
|
|
||||||
if (i & (ENC_2BYTE | ENC_2WORD))
|
|
||||||
enc_unicode = 2;
|
|
||||||
else if (i & ENC_4BYTE)
|
|
||||||
enc_unicode = 4;
|
|
||||||
else
|
|
||||||
enc_unicode = 0;
|
|
||||||
} else if (i & ENC_DBCS) {
|
|
||||||
/* 2byte, handle below */
|
|
||||||
enc_dbcs_new = enc_canon_table[idx].codepage;
|
|
||||||
} else {
|
|
||||||
/* Must be 8-bit. */
|
|
||||||
enc_unicode = 0;
|
|
||||||
enc_utf8 = false;
|
|
||||||
}
|
|
||||||
} else /* Don't know what encoding this is, reject it. */
|
|
||||||
return e_invarg;
|
|
||||||
|
|
||||||
if (enc_dbcs_new != 0) {
|
|
||||||
enc_unicode = 0;
|
|
||||||
enc_utf8 = false;
|
|
||||||
}
|
|
||||||
enc_dbcs = enc_dbcs_new;
|
|
||||||
has_mbyte = (enc_dbcs != 0 || enc_utf8);
|
|
||||||
|
|
||||||
|
|
||||||
/* Detect an encoding that uses latin1 characters. */
|
|
||||||
enc_latin1like = (enc_utf8 || STRCMP(p_enc, "latin1") == 0
|
|
||||||
|| STRCMP(p_enc, "iso-8859-15") == 0);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Set the function pointers.
|
|
||||||
*/
|
|
||||||
if (enc_utf8) {
|
|
||||||
mb_ptr2len = utfc_ptr2len;
|
|
||||||
mb_ptr2len_len = utfc_ptr2len_len;
|
|
||||||
mb_char2len = utf_char2len;
|
|
||||||
mb_char2bytes = utf_char2bytes;
|
|
||||||
mb_ptr2cells = utf_ptr2cells;
|
|
||||||
mb_ptr2cells_len = utf_ptr2cells_len;
|
|
||||||
mb_char2cells = utf_char2cells;
|
|
||||||
mb_off2cells = utf_off2cells;
|
|
||||||
mb_ptr2char = utf_ptr2char;
|
|
||||||
mb_head_off = utf_head_off;
|
|
||||||
} else if (enc_dbcs != 0) {
|
|
||||||
mb_ptr2len = dbcs_ptr2len;
|
|
||||||
mb_ptr2len_len = dbcs_ptr2len_len;
|
|
||||||
mb_char2len = dbcs_char2len;
|
|
||||||
mb_char2bytes = dbcs_char2bytes;
|
|
||||||
mb_ptr2cells = dbcs_ptr2cells;
|
|
||||||
mb_ptr2cells_len = dbcs_ptr2cells_len;
|
|
||||||
mb_char2cells = dbcs_char2cells;
|
|
||||||
mb_off2cells = dbcs_off2cells;
|
|
||||||
mb_ptr2char = dbcs_ptr2char;
|
|
||||||
mb_head_off = dbcs_head_off;
|
|
||||||
} else {
|
|
||||||
mb_ptr2len = latin_ptr2len;
|
|
||||||
mb_ptr2len_len = latin_ptr2len_len;
|
|
||||||
mb_char2len = latin_char2len;
|
|
||||||
mb_char2bytes = latin_char2bytes;
|
|
||||||
mb_ptr2cells = latin_ptr2cells;
|
|
||||||
mb_ptr2cells_len = latin_ptr2cells_len;
|
|
||||||
mb_char2cells = latin_char2cells;
|
|
||||||
mb_off2cells = latin_off2cells;
|
|
||||||
mb_ptr2char = latin_ptr2char;
|
|
||||||
mb_head_off = latin_head_off;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Fill the mb_bytelen_tab[] for MB_BYTE2LEN().
|
|
||||||
*/
|
|
||||||
#ifdef LEN_FROM_CONV
|
|
||||||
/* When 'encoding' is different from the current locale mblen() won't
|
|
||||||
* work. Use conversion to "utf-8" instead. */
|
|
||||||
vimconv.vc_type = CONV_NONE;
|
|
||||||
if (enc_dbcs) {
|
|
||||||
p = enc_locale();
|
|
||||||
if (p == NULL || STRCMP(p, p_enc) != 0) {
|
|
||||||
convert_setup(&vimconv, p_enc, (char_u *)"utf-8");
|
|
||||||
vimconv.vc_fail = true;
|
|
||||||
}
|
|
||||||
xfree(p);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for (i = 0; i < 256; ++i) {
|
|
||||||
/* Our own function to reliably check the length of UTF-8 characters,
|
|
||||||
* independent of mblen(). */
|
|
||||||
if (enc_utf8)
|
|
||||||
n = utf8len_tab[i];
|
|
||||||
else if (enc_dbcs == 0)
|
|
||||||
n = 1;
|
|
||||||
else {
|
|
||||||
char buf[MB_MAXBYTES + 1];
|
|
||||||
if (i == NUL) /* just in case mblen() can't handle "" */
|
|
||||||
n = 1;
|
|
||||||
else {
|
|
||||||
buf[0] = i;
|
|
||||||
buf[1] = 0;
|
|
||||||
#ifdef LEN_FROM_CONV
|
|
||||||
if (vimconv.vc_type != CONV_NONE) {
|
|
||||||
/*
|
|
||||||
* string_convert() should fail when converting the first
|
|
||||||
* byte of a double-byte character.
|
|
||||||
*/
|
|
||||||
p = string_convert(&vimconv, (char_u *)buf, NULL);
|
|
||||||
if (p != NULL) {
|
|
||||||
xfree(p);
|
|
||||||
n = 1;
|
|
||||||
} else
|
|
||||||
n = 2;
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* mblen() should return -1 for invalid (means the leading
|
|
||||||
* multibyte) character. However there are some platforms
|
|
||||||
* where mblen() returns 0 for invalid character.
|
|
||||||
* Therefore, following condition includes 0.
|
|
||||||
*/
|
|
||||||
ignored = mblen(NULL, 0); /* First reset the state. */
|
|
||||||
if (mblen(buf, (size_t)1) <= 0)
|
|
||||||
n = 2;
|
|
||||||
else
|
|
||||||
n = 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
mb_bytelen_tab[i] = n;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef LEN_FROM_CONV
|
|
||||||
convert_setup(&vimconv, NULL, NULL);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* The cell width depends on the type of multi-byte characters. */
|
|
||||||
(void)init_chartab();
|
|
||||||
|
|
||||||
/* When enc_utf8 is set or reset, (de)allocate ScreenLinesUC[] */
|
|
||||||
screenalloc(false);
|
|
||||||
|
|
||||||
#ifdef HAVE_WORKING_LIBINTL
|
|
||||||
/* GNU gettext 0.10.37 supports this feature: set the codeset used for
|
|
||||||
* translated messages independently from the current locale. */
|
|
||||||
(void)bind_textdomain_codeset(PROJECT_NAME,
|
|
||||||
enc_utf8 ? "utf-8" : (char *)p_enc);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
/* Fire an autocommand to let people do custom font setup. This must be
|
|
||||||
* after Vim has been setup for the new encoding. */
|
|
||||||
apply_autocmds(EVENT_ENCODINGCHANGED, NULL, (char_u *)"", FALSE, curbuf);
|
|
||||||
|
|
||||||
/* Need to reload spell dictionaries */
|
|
||||||
spell_reload();
|
|
||||||
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Return the size of the BOM for the current buffer:
|
* Return the size of the BOM for the current buffer:
|
||||||
* 0 - no BOM
|
* 0 - no BOM
|
||||||
@@ -597,20 +355,15 @@ int bomb_size(void)
|
|||||||
int n = 0;
|
int n = 0;
|
||||||
|
|
||||||
if (curbuf->b_p_bomb && !curbuf->b_p_bin) {
|
if (curbuf->b_p_bomb && !curbuf->b_p_bin) {
|
||||||
if (*curbuf->b_p_fenc == NUL) {
|
if (*curbuf->b_p_fenc == NUL
|
||||||
if (enc_utf8) {
|
|| STRCMP(curbuf->b_p_fenc, "utf-8") == 0) {
|
||||||
if (enc_unicode != 0)
|
|
||||||
n = enc_unicode;
|
|
||||||
else
|
|
||||||
n = 3;
|
|
||||||
}
|
|
||||||
} else if (STRCMP(curbuf->b_p_fenc, "utf-8") == 0)
|
|
||||||
n = 3;
|
n = 3;
|
||||||
else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
|
} else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
|
||||||
|| STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0)
|
|| STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0) {
|
||||||
n = 2;
|
n = 2;
|
||||||
else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0)
|
} else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0) {
|
||||||
n = 4;
|
n = 4;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
@@ -1010,7 +763,7 @@ int latin_ptr2cells_len(const char_u *p, int size)
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int utf_ptr2cells_len(const char_u *p, int size)
|
int utf_ptr2cells_len(const char_u *p, int size)
|
||||||
{
|
{
|
||||||
int c;
|
int c;
|
||||||
|
|
||||||
@@ -2232,26 +1985,20 @@ int mb_tail_off(char_u *base, char_u *p)
|
|||||||
if (*p == NUL)
|
if (*p == NUL)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if (enc_utf8) {
|
// Find the last character that is 10xx.xxxx
|
||||||
/* Find the last character that is 10xx.xxxx */
|
for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
|
||||||
for (i = 0; (p[i + 1] & 0xc0) == 0x80; ++i)
|
|
||||||
;
|
// Check for illegal sequence.
|
||||||
/* Check for illegal sequence. */
|
for (j = 0; p - j > base; j++) {
|
||||||
for (j = 0; p - j > base; ++j)
|
if ((p[-j] & 0xc0) != 0x80) {
|
||||||
if ((p[-j] & 0xc0) != 0x80)
|
break;
|
||||||
break;
|
}
|
||||||
if (utf8len_tab[p[-j]] != i + j + 1)
|
|
||||||
return 0;
|
|
||||||
return i;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* It can't be the first byte of a double-byte when not using DBCS, at the
|
if (utf8len_tab[p[-j]] != i + j + 1) {
|
||||||
* end of the string or the byte can't start a double-byte. */
|
|
||||||
if (enc_dbcs == 0 || p[1] == NUL || MB_BYTE2LEN(*p) == 1)
|
|
||||||
return 0;
|
return 0;
|
||||||
|
}
|
||||||
/* Return 1 when on the lead byte, 0 when on the tail byte. */
|
return i;
|
||||||
return 1 - dbcs_head_off(base, p);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -2466,13 +2213,10 @@ int mb_fix_col(int col, int row)
|
|||||||
{
|
{
|
||||||
col = check_col(col);
|
col = check_col(col);
|
||||||
row = check_row(row);
|
row = check_row(row);
|
||||||
if (has_mbyte && ScreenLines != NULL && col > 0
|
if (ScreenLines != NULL && col > 0
|
||||||
&& ((enc_dbcs
|
&& ScreenLines[LineOffset[row] + col] == 0) {
|
||||||
&& ScreenLines[LineOffset[row] + col] != NUL
|
|
||||||
&& dbcs_screen_head_off(ScreenLines + LineOffset[row],
|
|
||||||
ScreenLines + LineOffset[row] + col))
|
|
||||||
|| (enc_utf8 && ScreenLines[LineOffset[row] + col] == 0)))
|
|
||||||
return col - 1;
|
return col - 1;
|
||||||
|
}
|
||||||
return col;
|
return col;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -9,8 +9,8 @@
|
|||||||
* MB_BYTE2LEN_CHECK() can be used to count a special key as one byte.
|
* MB_BYTE2LEN_CHECK() can be used to count a special key as one byte.
|
||||||
* Don't call MB_BYTE2LEN(b) with b < 0 or b > 255!
|
* Don't call MB_BYTE2LEN(b) with b < 0 or b > 255!
|
||||||
*/
|
*/
|
||||||
#define MB_BYTE2LEN(b) mb_bytelen_tab[b]
|
#define MB_BYTE2LEN(b) utf8len_tab[b]
|
||||||
#define MB_BYTE2LEN_CHECK(b) (((b) < 0 || (b) > 255) ? 1 : mb_bytelen_tab[b])
|
#define MB_BYTE2LEN_CHECK(b) (((b) < 0 || (b) > 255) ? 1 : utf8len_tab[b])
|
||||||
|
|
||||||
/* properties used in enc_canon_table[] (first three mutually exclusive) */
|
/* properties used in enc_canon_table[] (first three mutually exclusive) */
|
||||||
#define ENC_8BIT 0x01
|
#define ENC_8BIT 0x01
|
||||||
@@ -28,6 +28,18 @@
|
|||||||
#define ENC_LATIN9 0x400 /* Latin9 */
|
#define ENC_LATIN9 0x400 /* Latin9 */
|
||||||
#define ENC_MACROMAN 0x800 /* Mac Roman (not Macro Man! :-) */
|
#define ENC_MACROMAN 0x800 /* Mac Roman (not Macro Man! :-) */
|
||||||
|
|
||||||
|
// TODO(bfredl): eventually we should keep only one of the namings
|
||||||
|
#define mb_ptr2len utfc_ptr2len
|
||||||
|
#define mb_ptr2len_len utfc_ptr2len_len
|
||||||
|
#define mb_char2len utf_char2len
|
||||||
|
#define mb_char2bytes utf_char2bytes
|
||||||
|
#define mb_ptr2cells utf_ptr2cells
|
||||||
|
#define mb_ptr2cells_len utf_ptr2cells_len
|
||||||
|
#define mb_char2cells utf_char2cells
|
||||||
|
#define mb_off2cells utf_off2cells
|
||||||
|
#define mb_ptr2char utf_ptr2char
|
||||||
|
#define mb_head_off utf_head_off
|
||||||
|
|
||||||
#ifdef INCLUDE_GENERATED_DECLARATIONS
|
#ifdef INCLUDE_GENERATED_DECLARATIONS
|
||||||
# include "mbyte.h.generated.h"
|
# include "mbyte.h.generated.h"
|
||||||
#endif
|
#endif
|
||||||
|
@@ -1936,8 +1936,7 @@ int swapchar(int op_type, pos_T *pos)
|
|||||||
if (c >= 0x80 && op_type == OP_ROT13)
|
if (c >= 0x80 && op_type == OP_ROT13)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
if (op_type == OP_UPPER && c == 0xdf
|
if (op_type == OP_UPPER && c == 0xdf) {
|
||||||
&& (enc_latin1like || STRCMP(p_enc, "iso-8859-2") == 0)) {
|
|
||||||
pos_T sp = curwin->w_cursor;
|
pos_T sp = curwin->w_cursor;
|
||||||
|
|
||||||
/* Special handling of German sharp s: change to "SS". */
|
/* Special handling of German sharp s: change to "SS". */
|
||||||
|
@@ -780,8 +780,11 @@ void set_init_1(void)
|
|||||||
}
|
}
|
||||||
fenc_default = p;
|
fenc_default = p;
|
||||||
|
|
||||||
// Initialize multibyte (utf-8) handling
|
#ifdef HAVE_WORKING_LIBINTL
|
||||||
mb_init();
|
// GNU gettext 0.10.37 supports this feature: set the codeset used for
|
||||||
|
// translated messages independently from the current locale.
|
||||||
|
(void)bind_textdomain_codeset(PROJECT_NAME, (char *)p_enc);
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Set the default for 'helplang'. */
|
/* Set the default for 'helplang'. */
|
||||||
set_helplang_default(get_mess_lang());
|
set_helplang_default(get_mess_lang());
|
||||||
|
@@ -5292,7 +5292,7 @@ void screen_puts_len(char_u *text, int textlen, int row, int col, int attr)
|
|||||||
int force_redraw_next = FALSE;
|
int force_redraw_next = FALSE;
|
||||||
int need_redraw;
|
int need_redraw;
|
||||||
|
|
||||||
const int l_has_mbyte = has_mbyte;
|
const bool l_has_mbyte = has_mbyte;
|
||||||
const bool l_enc_utf8 = enc_utf8;
|
const bool l_enc_utf8 = enc_utf8;
|
||||||
const int l_enc_dbcs = enc_dbcs;
|
const int l_enc_dbcs = enc_dbcs;
|
||||||
|
|
||||||
|
@@ -9266,9 +9266,7 @@ static void allcap_copy(char_u *word, char_u *wcopy)
|
|||||||
else
|
else
|
||||||
c = *s++;
|
c = *s++;
|
||||||
|
|
||||||
// We only change 0xdf to SS when we are certain latin1 is used. It
|
if (c == 0xdf) {
|
||||||
// would cause weird errors in other 8-bit encodings.
|
|
||||||
if (enc_latin1like && c == 0xdf) {
|
|
||||||
c = 'S';
|
c = 'S';
|
||||||
if (d - wcopy >= MAXWLEN - 1)
|
if (d - wcopy >= MAXWLEN - 1)
|
||||||
break;
|
break;
|
||||||
@@ -12602,7 +12600,7 @@ static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword)
|
|||||||
char_u *p;
|
char_u *p;
|
||||||
int wbadword[MAXWLEN];
|
int wbadword[MAXWLEN];
|
||||||
int wgoodword[MAXWLEN];
|
int wgoodword[MAXWLEN];
|
||||||
const int l_has_mbyte = has_mbyte;
|
const bool l_has_mbyte = has_mbyte;
|
||||||
|
|
||||||
if (l_has_mbyte) {
|
if (l_has_mbyte) {
|
||||||
// Get the characters from the multi-byte strings and put them in an
|
// Get the characters from the multi-byte strings and put them in an
|
||||||
|
@@ -31,8 +31,8 @@ void term_input_init(TermInput *input, Loop *loop)
|
|||||||
if (!term) {
|
if (!term) {
|
||||||
term = ""; // termkey_new_abstract assumes non-null (#2745)
|
term = ""; // termkey_new_abstract assumes non-null (#2745)
|
||||||
}
|
}
|
||||||
int enc_flag = enc_utf8 ? TERMKEY_FLAG_UTF8 : TERMKEY_FLAG_RAW;
|
|
||||||
input->tk = termkey_new_abstract(term, enc_flag);
|
input->tk = termkey_new_abstract(term, TERMKEY_FLAG_UTF8);
|
||||||
|
|
||||||
int curflags = termkey_get_canonflags(input->tk);
|
int curflags = termkey_get_canonflags(input->tk);
|
||||||
termkey_set_canonflags(input->tk, curflags | TERMKEY_CANON_DELBS);
|
termkey_set_canonflags(input->tk, curflags | TERMKEY_CANON_DELBS);
|
||||||
|
Reference in New Issue
Block a user