vim-patch:8.0.0519: character classes not well tested (#8460)

Problem:    Character classes are not well tested. They can differ between
            platforms.
Solution:   Add tests.  In the documentation make clear which classes depend
            on what library function.  Only use :cntrl: and :graph: for ASCII.
            (Kazunobu Kuriyama, Dominique Pelle, closes vim/vim#1560)
            Update the documentation.
0c078fc7db
This commit is contained in:
KunMing Xie
2018-06-02 01:57:22 +08:00
committed by Justin M. Keyes
parent c7350f542a
commit 49a497a67c
4 changed files with 91 additions and 33 deletions

View File

@@ -1071,25 +1071,27 @@ x A single character, with no special meaning, matches itself
- A character class expression is evaluated to the set of characters
belonging to that character class. The following character classes
are supported:
Name Contents ~
*[:alnum:]* [:alnum:] ASCII letters and digits
*[:alpha:]* [:alpha:] ASCII letters
*[:blank:]* [:blank:] space and tab characters
*[:cntrl:]* [:cntrl:] control characters
*[:digit:]* [:digit:] decimal digits
*[:graph:]* [:graph:] printable characters excluding space
*[:lower:]* [:lower:] lowercase letters (all letters when
Name Func Contents ~
*[:alnum:]* [:alnum:] isalnum ASCII letters and digits
*[:alpha:]* [:alpha:] isalpha ASCII letters
*[:blank:]* [:blank:] space and tab
*[:cntrl:]* [:cntrl:] iscntrl ASCII control characters
*[:digit:]* [:digit:] decimal digits '0' to '9'
*[:graph:]* [:graph:] isgraph ASCII printable characters excluding
space
*[:lower:]* [:lower:] (1) lowercase letters (all letters when
'ignorecase' is used)
*[:print:]* [:print:] printable characters including space
*[:punct:]* [:punct:] ASCII punctuation characters
*[:space:]* [:space:] whitespace characters
*[:upper:]* [:upper:] uppercase letters (all letters when
*[:print:]* [:print:] (2) printable characters including space
*[:punct:]* [:punct:] ispunct ASCII punctuation characters
*[:space:]* [:space:] whitespace characters: space, tab, CR,
NL, vertical tab, form feed
*[:upper:]* [:upper:] (3) uppercase letters (all letters when
'ignorecase' is used)
*[:xdigit:]* [:xdigit:] hexadecimal digits
*[:return:]* [:return:] the <CR> character
*[:tab:]* [:tab:] the <Tab> character
*[:escape:]* [:escape:] the <Esc> character
*[:backspace:]* [:backspace:] the <BS> character
*[:xdigit:]* [:xdigit:] hexadecimal digits: 0-9, a-f, A-F
*[:return:]* [:return:] the <CR> character
*[:tab:]* [:tab:] the <Tab> character
*[:escape:]* [:escape:] the <Esc> character
*[:backspace:]* [:backspace:] the <BS> character
The brackets in character class expressions are additional to the
brackets delimiting a collection. For example, the following is a
plausible pattern for a Unix filename: "[-./[:alnum:]_~]\+" That is,
@@ -1100,6 +1102,13 @@ x A single character, with no special meaning, matches itself
regexp engine. See |two-engines|. In the future these items may
work for multi-byte characters. For now, to get all "alpha"
characters you can use: [[:lower:][:upper:]].
The "Func" column shows which library function is used; the behavior
of that function depends on the system. For the numbered entries:
(1) Uses islower() for ASCII and Vim builtin rules for other
characters when built with the |+multi_byte| feature.
(2) Uses Vim builtin rules.
(3) As with (1) but using isupper()
*/[[=* *[==]*
- An equivalence class. This means that characters are matched that
have almost the same meaning, e.g., when ignoring accents. This

View File

@@ -2328,21 +2328,21 @@ collection:
regc('\t');
break;
case CLASS_CNTRL:
for (cu = 1; cu <= 255; cu++) {
for (cu = 1; cu <= 127; cu++) {
if (iscntrl(cu)) {
regmbc(cu);
}
}
break;
case CLASS_DIGIT:
for (cu = 1; cu <= 255; cu++) {
for (cu = 1; cu <= 127; cu++) {
if (ascii_isdigit(cu)) {
regmbc(cu);
}
}
break;
case CLASS_GRAPH:
for (cu = 1; cu <= 255; cu++) {
for (cu = 1; cu <= 127; cu++) {
if (isgraph(cu)) {
regmbc(cu);
}

View File

@@ -4358,16 +4358,18 @@ static int check_char_class(int class, int c)
return OK;
break;
case NFA_CLASS_CNTRL:
if (c >= 1 && c <= 255 && iscntrl(c))
if (c >= 1 && c <= 127 && iscntrl(c)) {
return OK;
}
break;
case NFA_CLASS_DIGIT:
if (ascii_isdigit(c))
return OK;
break;
case NFA_CLASS_GRAPH:
if (c >= 1 && c <= 255 && isgraph(c))
if (c >= 1 && c <= 127 && isgraph(c)) {
return OK;
}
break;
case NFA_CLASS_LOWER:
if (mb_islower(c) && c != 170 && c != 186) {

View File

@@ -35,12 +35,21 @@ func s:classes_test()
set isprint=@,161-255
call assert_equal('Motörhead', matchstr('Motörhead', '[[:print:]]\+'))
let alphachars = ''
let lowerchars = ''
let upperchars = ''
let alnumchars = ''
let alphachars = ''
let backspacechar = ''
let blankchars = ''
let cntrlchars = ''
let digitchars = ''
let escapechar = ''
let graphchars = ''
let lowerchars = ''
let printchars = ''
let punctchars = ''
let returnchar = ''
let spacechars = ''
let tabchar = ''
let upperchars = ''
let xdigitchars = ''
let i = 1
while i <= 255
@@ -48,21 +57,48 @@ func s:classes_test()
if c =~ '[[:alpha:]]'
let alphachars .= c
endif
if c =~ '[[:lower:]]'
let lowerchars .= c
endif
if c =~ '[[:upper:]]'
let upperchars .= c
endif
if c =~ '[[:alnum:]]'
let alnumchars .= c
endif
if c =~ '[[:backspace:]]'
let backspacechar .= c
endif
if c =~ '[[:blank:]]'
let blankchars .= c
endif
if c =~ '[[:cntrl:]]'
let cntrlchars .= c
endif
if c =~ '[[:digit:]]'
let digitchars .= c
endif
if c =~ '[[:escape:]]'
let escapechar .= c
endif
if c =~ '[[:graph:]]'
let graphchars .= c
endif
if c =~ '[[:lower:]]'
let lowerchars .= c
endif
if c =~ '[[:print:]]'
let printchars .= c
endif
if c =~ '[[:punct:]]'
let punctchars .= c
endif
if c =~ '[[:return:]]'
let returnchar .= c
endif
if c =~ '[[:space:]]'
let spacechars .= c
endif
if c =~ '[[:tab:]]'
let tabchar .= c
endif
if c =~ '[[:upper:]]'
let upperchars .= c
endif
if c =~ '[[:xdigit:]]'
let xdigitchars .= c
endif
@@ -70,11 +106,22 @@ func s:classes_test()
endwhile
call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alphachars)
call assert_equal('abcdefghijklmnopqrstuvwxyzµßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', lowerchars)
call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ', upperchars)
call assert_equal('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alnumchars)
call assert_equal("\b", backspacechar)
call assert_equal("\t ", blankchars)
" Commented out: it succeeds on Linux and Windows, but fails on macOS in Travis.
" call assert_equal("\x01\x02\x03\x04\x05\x06\x07\b\t\n\x0b\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\e\x1c\x1d\x1e\x1f\x7f", cntrlchars)
call assert_equal("0123456789", digitchars)
call assert_equal("\<Esc>", escapechar)
" Commented out: it succeeds on Linux and Windows, but fails on macOS in Travis.
" call assert_equal('!"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~', graphchars)
call assert_equal('abcdefghijklmnopqrstuvwxyzµßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', lowerchars)
call assert_equal(' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ', printchars)
call assert_equal('!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~', punctchars)
call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ', upperchars)
call assert_equal("\r", returnchar)
call assert_equal("\t\n\x0b\f\r ", spacechars)
call assert_equal("\t", tabchar)
call assert_equal('0123456789ABCDEFabcdef', xdigitchars)
endfunc