From e5667b9c157a26731d4bbcaf8a9cf69e4c73e8fb Mon Sep 17 00:00:00 2001 From: zeertzjq Date: Thu, 12 Mar 2026 07:42:31 +0800 Subject: [PATCH] vim-patch:9.2.0137: [security]: crash with composing char in collection range (#38261) Problem: Using a composing character as the end of a range inside a collection may corrupt the NFA postfix stack (Nathan Mills, after v9.1.0011) Solution: When a character is used as the endpoint of a range, do not emit its composing characters separately. Range handling only uses the base codepoint. supported by AI Github Advisory: https://github.com/vim/vim/security/advisories/GHSA-9phh-423r-778r https://github.com/vim/vim/commit/36d6e87542cf823d833e451e09a90ee429899cec Co-authored-by: Christian Brabandt --- src/nvim/regexp.c | 18 ++++++++++++++++-- test/old/testdir/test_regexp_utf8.vim | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index d547cb17e1..ba3d9cb5d6 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -10430,6 +10430,7 @@ collection: p = (uint8_t *)regparse; endp = (uint8_t *)skip_anyof((char *)p); if (*endp == ']') { + bool range_endpoint; // Try to reverse engineer character classes. For example, // recognize that [0-9] stands for \d and [A-Za-z_] for \h, // and perform the necessary substitutions in the NFA. @@ -10466,6 +10467,7 @@ collection: emit_range = false; while ((uint8_t *)regparse < endp) { int oldstartc = startc; + range_endpoint = false; startc = -1; got_coll_char = false; if (*regparse == '[') { @@ -10609,6 +10611,7 @@ collection: // Previous char was '-', so this char is end of range. if (emit_range) { int endc = startc; + range_endpoint = true; startc = oldstartc; if (startc > endc) { EMSG_RET_FAIL(_(e_reverse_range)); @@ -10673,7 +10676,14 @@ collection: } int plen; - if (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))) { + // + // If this character was consumed as the end of a range, do not emit its + // composing characters separately. Range handling only uses the base + // codepoint; emitting the composing part again would duplicate the + // character in the postfix stream and corrupt the NFA stack. + // + if (!range_endpoint + && utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))) { int i = utf_ptr2len(regparse); c = utf_ptr2char(regparse + i); @@ -11839,7 +11849,11 @@ static int nfa_max_width(nfa_state_T *startstate, int depth) // Matches some character, including composing chars. len += MB_MAXBYTES; if (state->c != NFA_ANY) { - // Skip over the characters. + // Skip over the compiled collection. + // malformed NFAs must not crash width estimation. + if (state->out1 == NULL || state->out1->out == NULL) { + return -1; + } state = state->out1->out; continue; } diff --git a/test/old/testdir/test_regexp_utf8.vim b/test/old/testdir/test_regexp_utf8.vim index 5cd9c6e967..d6284a559f 100644 --- a/test/old/testdir/test_regexp_utf8.vim +++ b/test/old/testdir/test_regexp_utf8.vim @@ -638,4 +638,22 @@ func Test_replace_multibyte_match_in_multi_lines() set ignorecase&vim re&vim endfun +func Test_regex_collection_range_with_composing_crash() + " Regression test: composing char in collection range caused NFA crash/E874 + new + call setline(1, ['00', '0ֻ', '01']) + let patterns = [ '0[0-0ֻ]\@", 'E486:') + endfor + endfor + + bwipe! +endfunc + " vim: shiftwidth=2 sts=2 expandtab