mirror of
				https://github.com/neovim/neovim.git
				synced 2025-10-26 12:27:24 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			378 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Lua
		
	
	
	
	
	
			
		
		
	
	
			378 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Lua
		
	
	
	
	
	
--- @brief Glob-to-LPeg Converter (Peglob)
--- This module converts glob patterns to LPeg patterns according to the LSP 3.17 specification:
--- https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#pattern
---
--- Glob grammar overview:
--- - `*` to match zero or more characters in a path segment
--- - `?` to match on one character in a path segment
--- - `**` to match any number of path segments, including none
--- - `{}` to group conditions (e.g. `*.{ts,js}` matches TypeScript and JavaScript files)
--- - `[]` to declare a range of characters to match in a path segment
---   (e.g., `example.[0-9]` to match on `example.0`, `example.1`, …)
--- - `[!...]` to negate a range of characters to match in a path segment
---   (e.g., `example.[!0-9]` to match on `example.a`, `example.b`, but not `example.0`)
---
--- Additional constraints:
--- - A Glob pattern must match an entire path, with partial matches
---   considered failures.
--- - The pattern only determines success or failure, without specifying
---   which parts correspond to which characters.
--- - A *path segment* is the portion of a path between two adjacent path
---   separators (`/`), or between the start/end of the path and the nearest
---   separator.
--- - The `**` (*globstar*) pattern matches zero or more path segments,
---   including intervening separators (`/`). Within pattern strings, `**`
---   must be delimited by path separators (`/`) or pattern boundaries and
---   cannot be adjacent to any characters other than `/`. If `**` is not
---   the final element, it must be followed by `/`.
--- - `{}` (*braced conditions*) contains valid Glob patterns as branches,
---   separated by commas. Commas are exclusively used for separating
---   branches and cannot appear within a branch for any other purpose.
---   Nested `{}` structures are allowed, but `{}` must contain at least two
---   branches—zero or one branch is not permitted.
--- - In `[]` or `[!...]`, a *character range* consists of character
---   intervals (e.g., `a-z`) or individual characters (e.g., `w`). A range
---   including `/` won’t match that character.
 | ||
| 
 | ||
| --- @diagnostic disable: missing-fields
 | ||
| 
 | ||
| local m = vim.lpeg
 | ||
| local mt = getmetatable(m.P(0))
 | ||
| local re = vim.re
 | ||
| local bit = require('bit')
 | ||
| 
 | ||
| local M = {}
 | ||
| 
 | ||
-- Primitive patterns the glob grammar is assembled from
local letter, slash, notslash, notcomma
do
  local any = m.P(1)
  local sep = m.P('/')

  -- One character that has no special meaning in a glob
  letter = any - m.S('*?[]{}/\\')
  -- Consumes `/` in the glob and produces a pattern matching `/` as its capture
  slash = sep * m.Cc(sep)
  -- One character that is not a path separator
  notslash = any - sep
  -- One character that is neither a comma nor a backslash
  notcomma = any - m.S(',\\')
end
 | ||
| 
 | ||
--- End-of-glob handling that depends on the `inseg` flag.
---
--- Succeeds only at the end of the glob text, reads back the `inseg` group and
--- captures the pattern to use at that point: a lookahead for `/` when the
--- flag is set, otherwise an end-of-input check.
--- @type vim.lpeg.Pattern
local eof = (m.P(-1) * m.Cb('inseg')) / function(in_segment)
  -- `#m.P('/')` is a pattern object (always truthy), so and/or is safe here
  return in_segment and #m.P('/') or m.P(-1)
end
 | ||
| 
 | ||
| ---@alias pat_table { F: string?, [1]: string, [2]: vim.lpeg.Pattern }
 | ||
| ---@alias seg_part { [string]: any, [integer]: pat_table }
 | ||
| 
 | ||
--- Opens a new segment record.
---
--- @param p pat_table Initial segment pattern data; only `p[2]` is used
--- @return seg_part Segment structure with `s` (start pattern), `e = true` and `n = 0` collected words
local function start_seg(p)
  local seg = { n = 0, e = true }
  seg.s = p[2]
  return seg
end
 | ||
| 
 | ||
--- Appends a word to the list of patterns that must be searched for, in order.
---
--- @param t seg_part Segment structure (modified in place)
--- @param p pat_table Pattern to look for
--- @return table Updated segment structure (the same table as `t`)
local function lookfor(t, p)
  local count = t.n + 1
  t.n = count
  t[count] = p
  return t
end
 | ||
| 
 | ||
--- Marks the segment as ending with "anything up to the next separator".
---
--- @param t seg_part Segment structure (modified in place)
--- @return table Segment structure with end pattern
local function to_seg_end(t)
  local rest_of_segment = notslash ^ 0
  t.e = rest_of_segment
  return t
end
 | ||
| 
 | ||
--- Constructs a segment matching pattern from collected components
---
--- Without any collected words the result is just `t.s` followed by `t.e`.
--- Otherwise a small grammar is built: each distinct word becomes a rule that
--- either matches the word at the current position or skips one non-separator
--- character and retries, and the start rule `s` chains `t.s`, those rules in
--- order, and finally `t.e`.
---
--- @param t seg_part Segment structure with patterns
--- @return vim.lpeg.Pattern Complete segment match pattern
local function end_seg(t)
  if t.n == 0 then
    local patt = t.s
    if t.e then
      patt = patt * t.e
    end
    return patt
  end

  --- @type table<any,any>
  local seg_grammar = { 's', s = t.s }
  for i = 1, t.n do
    local word = t[i]
    -- Rule names are derived from the word text and live in the same table as
    -- the start rule `s`. Namespace them so a word spelled exactly `s` (as in
    -- the glob `*s`) cannot alias the start rule and make it refer to itself.
    local rname = 'w:' .. word[1]
    if not seg_grammar[rname] then
      if word.F then
        -- Optimize search when deterministic first character is available:
        -- after a failed attempt, skip ahead to the next occurrence of it
        seg_grammar[rname] = word[2] + notslash * (notslash - m.P(word.F)) ^ 0 * m.V(rname)
      else
        seg_grammar[rname] = word[2] + notslash * m.V(rname)
      end
    end
    seg_grammar.s = seg_grammar.s * m.V(rname)
  end
  if t.e then
    seg_grammar.s = seg_grammar.s * t.e
  end
  return m.P(seg_grammar)
end
 | ||
| 
 | ||
--- Builds the matcher for a globstar prefix: try `p` here, otherwise skip one
--- whole path segment (including its `/`) and try again.
---
--- @param p vim.lpeg.Pattern Pattern directly after `**/`
--- @return vim.lpeg.Pattern LPeg pattern for `**/p`
local function dseg(p)
  local skip_segment = notslash ^ 0 * m.P('/')
  return m.P({ p + skip_segment * m.V(1) })
end
 | ||
| 
 | ||
--- Glob grammar; declared before it is defined because `expand` refers to it.
--- @type (vim.lpeg.Pattern|table)
local g
 | ||
| 
 | ||
--- Multiplies conditions for braced expansion (Cartesian product)
---
--- Strings are concatenated; a string combined with a list is applied to every
--- entry of that list in place; two lists yield a new list of all pairings.
--- A value that is neither string nor table is ignored in favour of the other.
---
--- @param a string|string[] First part
--- @param b string|string[] Second part
--- @return string|string[] Cartesian product of values
local function mul_cond(a, b)
  local kind_a, kind_b = type(a), type(b)

  if kind_a ~= 'string' and kind_a ~= 'table' then
    return b
  end
  if kind_b ~= 'string' and kind_b ~= 'table' then
    return a
  end

  if kind_a == 'string' then
    if kind_b == 'string' then
      return a .. b
    end
    -- prefix every alternative of `b`
    for i = 1, #b do
      b[i] = a .. b[i]
    end
    return b
  end

  if kind_b == 'string' then
    -- suffix every alternative of `a`
    for i = 1, #a do
      a[i] = a[i] .. b
    end
    return a
  end

  --- @type string[]
  local product = {}
  for i = 1, #a do
    local head = a[i]
    for j = 1, #b do
      product[#product + 1] = head .. b[j]
    end
  end
  return product
end
 | ||
| 
 | ||
--- Combines alternatives in braced patterns
---
--- Two strings start a new list; otherwise the existing list is extended in
--- place (a leading string is inserted at the front of `b`, anything else is
--- appended to `a`). Other type combinations produce no value.
---
--- @param a string|table First part
--- @param b string|table Second part
--- @return table #Combined alternatives
local function add_cond(a, b)
  local kind_a, kind_b = type(a), type(b)

  if kind_a == 'string' then
    if kind_b == 'string' then
      return { a, b }
    end
    if kind_b == 'table' then
      table.insert(b, 1, a)
      return b
    end
  elseif kind_a == 'table' then
    if kind_b == 'string' then
      a[#a + 1] = b
      return a
    end
    if kind_b == 'table' then
      for i = 1, #b do
        a[#a + 1] = b[i]
      end
      return a
    end
  end
  --- @diagnostic disable-next-line: missing-return
end
 | ||
| 
 | ||
--- Expands patterns handling segment boundaries
---
--- Every alternative is joined with the tail and parsed again with the glob
--- grammar; a `#` prefix is added first when `inseg` is set so that the
--- sub-grammar can detect the in-segment flag. The parsed alternatives are
--- then combined into one ordered choice. `a` is overwritten in place.
---
---@param a (any[]|vim.lpeg.Pattern[]) Array of patterns
---@param b string Tail string
---@param inseg boolean Whether inside a path segment
---@return vim.lpeg.Pattern #Expanded pattern
local function expand(a, b, inseg)
  local marker = inseg and '#' or ''

  -- First pass: turn each alternative (plus tail) into an LPeg pattern
  for i = 1, #a do
    a[i] = g:match(marker .. a[i] .. b)
  end

  -- Second pass: ordered choice over all alternatives
  local choice = a[1]
  for i = 2, #a do
    choice = choice + a[i]
  end
  return choice
end
 | ||
| 
 | ||
--- Converts a UTF-8 character to its Unicode codepoint
---
--- Only the first character of the string is decoded. The lead byte decides
--- how many continuation bytes follow; each of them contributes its low six
--- bits. No validation is performed, and an empty string yields 0.
---
--- @param utf8_str string UTF-8 character
--- @return number #Codepoint value
local function to_codepoint(utf8_str)
  local lead = utf8_str:byte(1)
  if lead == nil then
    return 0
  end
  if lead < 0x80 then
    -- single-byte (ASCII) character
    return lead
  end

  local pending, codepoint
  if lead < 0xE0 then
    pending, codepoint = 1, bit.band(lead, 0x1F)
  elseif lead < 0xF0 then
    pending, codepoint = 2, bit.band(lead, 0x0F)
  else
    pending, codepoint = 3, bit.band(lead, 0x07)
  end

  for i = 2, pending + 1 do
    local byte = utf8_str:byte(i)
    if byte == nil then
      -- string ended early: return what has been accumulated so far
      break
    end
    codepoint = bit.bor(bit.lshift(codepoint, 6), bit.band(byte, 0x3F))
  end

  return codepoint
end
 | ||
| 
 | ||
--- Pattern for matching UTF-8 characters
-- A continuation byte (0x80-0xBF)
local cont = m.R('\128\191')
-- One complete character of one to four bytes
local any_utf8
do
  local one_byte = m.R('\0\127')
  local two_bytes = m.R('\194\223') * cont
  local three_bytes = m.R('\224\239') * cont * cont
  local four_bytes = m.R('\240\244') * cont * cont * cont
  any_utf8 = one_byte + two_bytes + three_bytes + four_bytes
end
 | ||
| 
 | ||
--- Creates a character class pattern for glob ranges
---
--- An empty class (`[]` or `[!]`) is treated as literal text. Otherwise every
--- entry contributes one alternative: a two-element table is a codepoint
--- interval, a string is a single (possibly multi-byte) character. The
--- resulting class never matches the path separator `/`.
---
--- @param inv string Inversion flag ('!' or '')
--- @param ranges (string|string[])[] Character ranges
--- @return vim.lpeg.Pattern #Character class pattern
local function class(inv, ranges)
  if #ranges == 0 then
    return m.P(inv == '!' and '[!]' or '[]')
  end

  local patt = m.P(false)
  for _, v in ipairs(ranges) do
    if type(v) == 'table' then
      patt = patt + m.utfR(to_codepoint(v[1]), to_codepoint(v[2]))
    else
      patt = patt + m.P(v)
    end
  end

  if inv == '!' then
    -- The positive class works on whole characters, so the negation must
    -- consume a whole UTF-8 character as well; consuming a single byte would
    -- leave the tail of a multi-byte character behind. Bytes that do not form
    -- valid UTF-8 are still accepted one at a time.
    patt = (any_utf8 + m.P(1)) - patt --[[@as vim.lpeg.Pattern]]
  end
  return patt - m.P('/')
end
 | ||
| 
 | ||
-- Parse constraints for optimizing braced conditions

-- Matches when the text contains a `/` or a `**` anywhere
local noopt_condlist = re.compile([[
  s <- '/' / '**' / . [^/*]* s
]])

-- Matches a run without `{`, `/` or `**` that is directly followed by `/`;
-- the match result is the position of that `/`
local opt_tail = re.compile([[
  s <- (!'**' [^{/])* &'/'
]])
 | ||
| 
 | ||
| -- stylua: ignore start
 | ||
| --- @nodoc
 | ||
| --- @diagnostic disable
 | ||
--- Main grammar for glob pattern matching
do
  local P, S, V = m.P, m.S, m.V
  local C, Cc, Cf, Cg, Ct, Cmt = m.C, m.Cc, m.Cf, m.Cg, m.Ct, m.Cmt
  -- Folding function: sequence two patterns
  local seq = mt.__mul

  --- Match-time decision on how much of the glob a braced condition swallows.
  --- `condlist_text` is the string form of the condition list and
  --- `alternatives` its transformed Lua table.
  local function split_branch(subject, pos, condlist_text, alternatives)
    if noopt_condlist:match(condlist_text) then
      -- Cannot optimize, match till the end
      return #subject + 1, alternatives, subject:sub(pos)
    end
    -- Find point to cut for optimization
    local cut = opt_tail:match(subject, pos)
    if not cut then
      -- Cannot optimize
      return #subject + 1, alternatives, subject:sub(pos)
    end
    -- Can optimize: match till cut point
    -- true flag tells expand to transform EOF matches to &'/' predicates
    return cut, alternatives, subject:sub(pos, cut - 1), true
  end

  --- Yields the byte at `pos` as a capture when a plain letter starts there;
  --- fails otherwise. Consumes nothing.
  local function first_letter(subject, pos)
    if letter:match(subject, pos) then
      return true, subject:sub(pos, pos)
    end
    return false
  end

  -- `*` followed by a word: the word has to be searched for
  local star_word = (V('Star') * V('Word')) % lookfor
  -- One class member that is not the closing bracket
  local class_char = any_utf8 - P(']')

  g = {
    'Glob',
    Glob = (P('#') * Cg(Cc(true), 'inseg') + Cg(Cc(false), 'inseg'))
      * Cf(V('Element') ^ -1 * (slash * V('Element')) ^ 0 * (slash ^ -1 * eof), seq),
    -- Elements handle segments, globstar patterns
    Element = V('DSeg')
      + V('DSEnd')
      + Cf(V('Segment') * (slash * V('Segment')) ^ 0 * (slash * eof + eof ^ -1), seq),
    -- Globstar patterns
    DSeg = P('**/') * ((V('Element') + eof) / dseg),
    DSEnd = P('**') * P(-1) * Cc(P(1) ^ 0),
    -- Segment handling with word and star patterns
    Segment = (
        (V('Word') / start_seg + (Cc({ '', true }) / start_seg) * star_word)
        * star_word ^ 0
        * ((V('Star') * V('CheckBnd')) % to_seg_end) ^ -1
      ) / end_seg
      + V('Star') * V('CheckBnd') * Cc(notslash ^ 0),
    CheckBnd = #P('/') + P(-1), -- Boundary constraint

    -- Word patterns for fixed-length matching
    Word = -P('*') * Ct(V('FIRST') ^ -1 * C(V('WordAux'))),
    WordAux = V('Branch') + Cf(V('Simple') ^ 1 * V('Branch') ^ -1, seq),
    Simple = Cg(V('Token') * (V('Token') % seq) ^ 0 * (V('Boundary') % seq) ^ -1),
    Boundary = #P('/') * Cc(#P('/')) + eof,
    Token = V('Ques') + V('Class') + V('Escape') + V('Literal'),
    Star = P('*'),
    Ques = P('?') * Cc(notslash),
    Escape = (P('\\') * C(1)) / P,
    Literal = C(letter ^ 1) / P,

    -- Branch handling for braced conditions
    Branch = Cmt(C(V('CondList')), split_branch) / expand,
    -- Brace expansion handling
    CondList = Cf(P('{') * V('Cond') * (P(',') * V('Cond')) ^ 1 * P('}'), add_cond),
    Cond = Cf((C(((notcomma + P('\\') * 1) - S('{}')) ^ 1) + V('CondList')) ^ 1, mul_cond) + C(true),

    -- Character class handling
    Class = (
        P('[')
        * C(P('!') ^ -1)
        * Ct((Ct(C(any_utf8) * P('-') * C(class_char)) + C(class_char)) ^ 0)
        * P(']')
      ) / class,

    -- Deterministic first character extraction for optimization
    FIRST = Cg(P(first_letter), 'F'),
  }
end
-- stylua: ignore end
--- @diagnostic enable

--- @nodoc
g = m.P(g)
 | ||
| 
 | ||
--- Parses a raw glob into an |lua-lpeg| pattern.
---
--- Raises an error with the message `Invalid glob: <pattern>` when the glob
--- grammar does not accept the input.
---
---@param pattern string The raw glob pattern
---@return vim.lpeg.Pattern #An |lua-lpeg| representation of the pattern
function M.to_lpeg(pattern)
  local lpeg_pattern = g:match(pattern) --[[@as vim.lpeg.Pattern?]]
  if not lpeg_pattern then
    -- level 0 keeps the message free of position info, like `assert` does
    error(string.format('Invalid glob: %s', pattern), 0)
  end
  return lpeg_pattern
end
 | ||
| 
 | ||
| return M
 | 
