viml/expressions: Add lexer with some basic tests

This commit is contained in:
ZyX
2017-08-20 18:40:22 +03:00
parent ad58e50b45
commit 0300c4d109
5 changed files with 953 additions and 0 deletions

View File

@@ -0,0 +1,367 @@
// This is an open source non-commercial project. Dear PVS-Studio, please check
// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
/// VimL expression parser
#include <stdbool.h>
#include <stddef.h>
#include <assert.h>
#include <string.h>
#include "nvim/vim.h"
#include "nvim/memory.h"
#include "nvim/types.h"
#include "nvim/charset.h"
#include "nvim/ascii.h"
#include "nvim/viml/parser/expressions.h"
#include "nvim/viml/parser/parser.h"
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "viml/parser/expressions.c.generated.h"
#endif
/// Character used as a separator in autoload function/variable names.
#define AUTOLOAD_CHAR '#'
/// Get next token for the VimL expression input
///
/// Takes the not yet consumed remainder of the current line from the parser
/// state, recognizes exactly one token at its start and advances the parser
/// position past that token.
///
/// @param  pstate  Parser state. Its position is advanced by the length of
///                 the returned token, unless the line getter reported EOF:
///                 in that case the state position is left untouched.
///
/// @return Next token. A zero-length kExprLexEOC token is returned on EOF and
///         when nothing is left in the current line. Input which cannot be
///         tokenized yields kExprLexInvalid with `data.err` describing the
///         problem; the lexer itself never reports errors in another way.
LexExprToken viml_pexpr_next_token(ParserState *const pstate)
  FUNC_ATTR_WARN_UNUSED_RESULT
{
  LexExprToken ret = {
    .type = kExprLexInvalid,
    .start = pstate->pos,
  };
  ParserLine pline;
  if (!viml_parser_get_remaining_line(pstate, &pline)) {
    // EOF: nothing to advance over, ret.len is zero-initialized.
    ret.type = kExprLexEOC;
    return ret;
  }
  if (pline.size == 0) {
    ret.len = 0;
    ret.type = kExprLexEOC;
    goto viml_pexpr_next_token_adv_return;
  }
  // Every token below is at least one byte long; cases extend ret.len.
  ret.len = 1;
  const uint8_t schar = (uint8_t)pline.data[0];
  // Consume optional case comparison strategy suffix (`#` or `?`) following
  // a comparison operator.
  //
  // Characters are compared explicitly rather than via strchr("?#", …):
  // strchr() also matches the terminating NUL of the searched string, which
  // made a NUL byte after the operator be consumed as a part of the operator.
#define GET_CCS(ret, pline) \
  do { \
    if (ret.len < pline.size \
        && (pline.data[ret.len] == '?' || pline.data[ret.len] == '#')) { \
      ret.data.cmp.ccs = \
          (CaseCompareStrategy)pline.data[ret.len]; \
      ret.len++; \
    } else { \
      ret.data.cmp.ccs = kCCStrategyUseOption; \
    } \
  } while (0)
  switch (schar) {
    // Paired brackets.
#define BRACKET(typ, opning, clsing) \
    case opning: \
    case clsing: { \
      ret.type = typ; \
      ret.data.brc.closing = (schar == clsing); \
      break; \
    }
    BRACKET(kExprLexParenthesis, '(', ')')
    BRACKET(kExprLexBracket, '[', ']')
    BRACKET(kExprLexFigureBrace, '{', '}')
#undef BRACKET

    // Single character tokens without data.
#define CHAR(typ, ch) \
    case ch: { \
      ret.type = typ; \
      break; \
    }
    CHAR(kExprLexQuestion, '?')
    CHAR(kExprLexColon, ':')
    CHAR(kExprLexDot, '.')
    CHAR(kExprLexPlus, '+')
    CHAR(kExprLexComma, ',')
#undef CHAR

    // Multiplication/division/modulo.
#define MUL(mul_type, ch) \
    case ch: { \
      ret.type = kExprLexMultiplication; \
      ret.data.mul.type = mul_type; \
      break; \
    }
    MUL(kExprLexMulMul, '*')
    MUL(kExprLexMulDiv, '/')
    MUL(kExprLexMulMod, '%')
#undef MUL

    // Set token type to typ and extend the token over all following bytes
    // for which cond() is true.
#define CHARREG(typ, cond) \
    do { \
      ret.type = typ; \
      for (; (ret.len < pline.size \
              && cond(pline.data[ret.len])) \
           ; ret.len++) { \
      } \
    } while (0)

    // Whitespace.
    case ' ':
    case TAB: {
      CHARREG(kExprLexSpacing, ascii_iswhite);
      break;
    }

    // Control character, except for NUL, NL and TAB.
    case Ctrl_A: case Ctrl_B: case Ctrl_C: case Ctrl_D: case Ctrl_E:
    case Ctrl_F: case Ctrl_G: case Ctrl_H:
    case Ctrl_K: case Ctrl_L: case Ctrl_M: case Ctrl_N: case Ctrl_O:
    case Ctrl_P: case Ctrl_Q: case Ctrl_R: case Ctrl_S: case Ctrl_T:
    case Ctrl_U: case Ctrl_V: case Ctrl_W: case Ctrl_X: case Ctrl_Y:
    case Ctrl_Z: {
      // Line data is plain `char`, which may be signed: without the cast
      // bytes >= 0x80 (e.g. parts of UTF-8 sequences) compare less than ' '
      // and would be consumed as control characters on such platforms.
#define ISCTRL(c) ((uint8_t)(c) < ' ')
      CHARREG(kExprLexInvalid, ISCTRL);
      ret.data.err.type = kExprLexSpacing;
      ret.data.err.msg =
          _("E15: Invalid control character present in input: %.*s");
      break;
#undef ISCTRL
    }

    // Number.
    // Note: determining whether dot is (not) a part of a float needs more
    // context, so lexer does not do this.
    // FIXME: Resolve ambiguity by additional argument.
    case '0': case '1': case '2': case '3': case '4': case '5': case '6':
    case '7': case '8': case '9': {
      CHARREG(kExprLexNumber, ascii_isdigit);
      break;
    }

    // Environment variable.
    case '$': {
      CHARREG(kExprLexEnv, vim_isIDc);
      break;
    }

    // Normal variable/function name.
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
    case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
    case 'v': case 'w': case 'x': case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
    case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
    case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
    case 'V': case 'W': case 'X': case 'Y': case 'Z':
    case '_': {
#define ISWORD_OR_AUTOLOAD(x) \
      (ASCII_ISALNUM(x) || (x) == AUTOLOAD_CHAR || (x) == '_')
#define ISWORD(x) \
      (ASCII_ISALNUM(x) || (x) == '_')
      ret.data.var.scope = 0;
      ret.data.var.autoload = false;
      // First pass stops at the autoload character in order to make it
      // possible to detect `is#`.
      CHARREG(kExprLexPlainIdentifier, ISWORD);
      if ((ret.len == 2 && memcmp(pline.data, "is", 2) == 0)
          || (ret.len == 5 && memcmp(pline.data, "isnot", 5) == 0)) {
        // "is" and "isnot" operators.
        ret.type = kExprLexComparison;
        ret.data.cmp.type = kExprLexCmpIdentical;
        ret.data.cmp.inv = (ret.len == 5);
        GET_CCS(ret, pline);
      } else if (ret.len == 1
                 && pline.size > 1
                 && strchr("sgvbwtla", schar) != NULL
                 && pline.data[ret.len] == ':') {
        // Scope: `s:`, etc.
        ret.len++;
        ret.data.var.scope = schar;
        CHARREG(kExprLexPlainIdentifier, ISWORD_OR_AUTOLOAD);
        // Only the part after `x:` is searched for autoload characters.
        ret.data.var.autoload = (
            memchr(pline.data + 2, AUTOLOAD_CHAR, ret.len - 2)
            != NULL);
      } else if (pline.size > ret.len
                 && pline.data[ret.len] == AUTOLOAD_CHAR) {
        // Previous CHARREG stopped at autoload character in order to make it
        // possible to detect `is#`. Continue now with autoload characters
        // included.
        //
        // Warning: there is ambiguity for the lexer: `is#Foo(1)` is a call of
        // function `is#Foo()`, `1is#Foo(1)` is a comparison `1 is# Foo(1)`.
        // This needs to be resolved on the higher level where context is
        // available.
        ret.data.var.autoload = true;
        CHARREG(kExprLexPlainIdentifier, ISWORD_OR_AUTOLOAD);
      }
      break;
#undef ISWORD_OR_AUTOLOAD
#undef ISWORD
    }
#undef CHARREG

    // Option.
    case '&': {
#define OPTNAMEMISS(ret) \
      do { \
        ret.type = kExprLexInvalid; \
        ret.data.err.type = kExprLexOption; \
        ret.data.err.msg = _("E112: Option name missing: %.*s"); \
      } while (0)
      // `&&` is logical and, not an option.
      if (pline.size > 1 && pline.data[1] == '&') {
        ret.type = kExprLexAnd;
        ret.len++;
        break;
      }
      if (pline.size == 1 || !ASCII_ISALPHA(pline.data[1])) {
        OPTNAMEMISS(ret);
        break;
      }
      ret.type = kExprLexOption;
      if (pline.size > 2
          && pline.data[2] == ':'
          && strchr("gl", pline.data[1]) != NULL) {
        // `&g:` or `&l:` prefix: two more bytes belong to the token.
        ret.len += 2;
        ret.data.opt.scope = (pline.data[1] == 'g'
                              ? kExprLexOptGlobal
                              : kExprLexOptLocal);
        ret.data.opt.name = pline.data + 3;
      } else {
        ret.data.opt.scope = kExprLexOptUnspecified;
        ret.data.opt.name = pline.data + 1;
      }
      const char *p = ret.data.opt.name;
      const char *const e = pline.data + pline.size;
      if (e - p >= 4 && p[0] == 't' && p[1] == '_') {
        // `t_xx` names are always taken as four bytes, whatever the last
        // two of them are.
        ret.data.opt.len = 4;
        ret.len += 4;
      } else {
        for (; p < e && ASCII_ISALPHA(*p); p++) {
        }
        ret.data.opt.len = (size_t)(p - ret.data.opt.name);
        if (ret.data.opt.len == 0) {
          OPTNAMEMISS(ret);
        } else {
          ret.len += ret.data.opt.len;
        }
      }
      break;
#undef OPTNAMEMISS
    }

    // Register.
    case '@': {
      ret.type = kExprLexRegister;
      if (pline.size > 1) {
        ret.len++;
        ret.data.reg.name = (uint8_t)pline.data[1];
      } else {
        // `@` was the last byte of the line: no register name.
        ret.data.reg.name = -1;
      }
      break;
    }

    // Single quoted string.
    case '\'': {
      ret.type = kExprLexSingleQuotedString;
      ret.data.str.closed = false;
      for (; ret.len < pline.size && !ret.data.str.closed; ret.len++) {
        if (pline.data[ret.len] == '\'') {
          if (ret.len + 1 < pline.size && pline.data[ret.len + 1] == '\'') {
            // Doubled quote is an escaped quote: skip both.
            ret.len++;
          } else {
            ret.data.str.closed = true;
          }
        }
      }
      break;
    }

    // Double quoted string.
    case '"': {
      ret.type = kExprLexDoubleQuotedString;
      ret.data.str.closed = false;
      for (; ret.len < pline.size && !ret.data.str.closed; ret.len++) {
        if (pline.data[ret.len] == '\\') {
          // Skip the escaped byte so that `\"` does not close the string.
          if (ret.len + 1 < pline.size) {
            ret.len++;
          }
        } else if (pline.data[ret.len] == '"') {
          ret.data.str.closed = true;
        }
      }
      break;
    }

    // Unary not, (un)equality and regex (not) match comparison operators.
    case '!':
    case '=': {
      if (pline.size == 1) {
        // Also reached by goto when second character is neither `=` nor `~`:
        // lone `!` is unary not, lone `=` is an error.
viml_pexpr_next_token_invalid_comparison:
        ret.type = (schar == '!' ? kExprLexNot : kExprLexInvalid);
        if (ret.type == kExprLexInvalid) {
          ret.data.err.msg = _("E15: Expected == or =~: %.*s");
          ret.data.err.type = kExprLexComparison;
        }
        break;
      }
      ret.type = kExprLexComparison;
      ret.data.cmp.inv = (schar == '!');
      if (pline.data[1] == '=') {
        ret.data.cmp.type = kExprLexCmpEqual;
        ret.len++;
      } else if (pline.data[1] == '~') {
        ret.data.cmp.type = kExprLexCmpMatches;
        ret.len++;
      } else {
        goto viml_pexpr_next_token_invalid_comparison;
      }
      GET_CCS(ret, pline);
      break;
    }

    // Less/greater [or equal to] comparison operators.
    case '>':
    case '<': {
      ret.type = kExprLexComparison;
      const bool haseqsign = (pline.size > 1 && pline.data[1] == '=');
      if (haseqsign) {
        ret.len++;
      }
      GET_CCS(ret, pline);
      // `<` is represented as inverted `>=` and `<=` as inverted `>`.
      ret.data.cmp.inv = (schar == '<');
      ret.data.cmp.type = ((ret.data.cmp.inv ^ haseqsign)
                           ? kExprLexCmpGreaterOrEqual
                           : kExprLexCmpGreater);
      break;
    }

    // Minus sign or arrow from lambdas.
    case '-': {
      if (pline.size > 1 && pline.data[1] == '>') {
        ret.len++;
        ret.type = kExprLexArrow;
      } else {
        ret.type = kExprLexMinus;
      }
      break;
    }

    // Expression end because Ex command ended.
    case NUL:
    case NL: {
      ret.type = kExprLexEOC;
      break;
    }

    // Everything else is not valid.
    default: {
      // Take the whole (possibly multibyte, composed) character as a token.
      ret.len = (size_t)utfc_ptr2len_len((const char_u *)pline.data,
                                         (int)pline.size);
      ret.type = kExprLexInvalid;
      ret.data.err.type = kExprLexPlainIdentifier;
      ret.data.err.msg = _("E15: Unidentified character: %.*s");
      break;
    }
  }
#undef GET_CCS
viml_pexpr_next_token_adv_return:
  viml_parser_advance(pstate, ret.len);
  return ret;
}

View File

@@ -0,0 +1,118 @@
#ifndef NVIM_VIML_PARSER_EXPRESSIONS_H
#define NVIM_VIML_PARSER_EXPRESSIONS_H
#include <stddef.h>
#include <stdbool.h>
#include "nvim/types.h"
#include "nvim/viml/parser/parser.h"
// Defines whether to ignore case:
// == kCCStrategyUseOption
// ==# kCCStrategyMatchCase
// ==? kCCStrategyIgnoreCase
//
// Non-default members have the value of the character which follows the
// comparison operator: viml_pexpr_next_token() obtains the strategy by
// casting that character to CaseCompareStrategy.
typedef enum {
kCCStrategyUseOption = 0, // 0 for xcalloc: zeroed memory yields this value
kCCStrategyMatchCase = '#',
kCCStrategyIgnoreCase = '?',
} CaseCompareStrategy;
/// Lexer token type
typedef enum {
kExprLexInvalid = 0, ///< Invalid token, indicates an error.
kExprLexMissing, ///< Missing token, for use in parser.
kExprLexSpacing, ///< Spacing: run of whitespace starting with space or tab.
kExprLexEOC, ///< End of command: NUL, NL or end of input. Note: `|` is not recognized by viml_pexpr_next_token() in this revision.
kExprLexQuestion, ///< Question mark, for use in ternary.
kExprLexColon, ///< Colon, for use in ternary.
kExprLexOr, ///< Logical or operator.
kExprLexAnd, ///< Logical and operator.
kExprLexComparison, ///< One of the comparison operators.
kExprLexPlus, ///< Plus sign.
kExprLexMinus, ///< Minus sign.
kExprLexDot, ///< Dot: either concat or subscript, also part of the float.
kExprLexMultiplication, ///< Multiplication, division or modulo operator.
kExprLexNot, ///< Not: !.
kExprLexNumber, ///< Integer number literal, or part of a float.
kExprLexSingleQuotedString, ///< Single quoted string literal.
kExprLexDoubleQuotedString, ///< Double quoted string literal.
kExprLexOption, ///< &optionname option value.
kExprLexRegister, ///< @r register value.
kExprLexEnv, ///< Environment $variable value.
kExprLexPlainIdentifier, ///< Identifier, optionally with scope prefix or autoload characters: `abc`, `s:abc`, `foo#bar`.
kExprLexBracket, ///< Bracket, either opening or closing.
kExprLexFigureBrace, ///< Figure brace, either opening or closing.
kExprLexParenthesis, ///< Parenthesis, either opening or closing.
kExprLexComma, ///< Comma.
kExprLexArrow, ///< Arrow, like from lambda expressions.
} LexExprTokenType;
/// Lexer token
///
/// `type` selects which member of the `data` union (if any) is meaningful.
typedef struct {
ParserPosition start; ///< Parser position at which the token starts.
size_t len; ///< Token length in bytes.
LexExprTokenType type; ///< Token type.
union {
struct {
enum {
kExprLexCmpEqual, ///< Equality, inequality.
kExprLexCmpMatches, ///< Matches regex, not matches regex.
kExprLexCmpGreater, ///< `>` or `<=`
kExprLexCmpGreaterOrEqual, ///< `>=` or `<`.
kExprLexCmpIdentical, ///< `is` or `isnot`
} type; ///< Comparison type.
CaseCompareStrategy ccs; ///< Case comparison strategy.
bool inv; ///< True if comparison is to be inverted.
} cmp; ///< For kExprLexComparison.
struct {
enum {
kExprLexMulMul, ///< Real multiplication.
kExprLexMulDiv, ///< Division.
kExprLexMulMod, ///< Modulo.
} type; ///< Multiplication type.
} mul; ///< For kExprLexMultiplication.
struct {
bool closing; ///< True if bracket/etc is a closing one.
} brc; ///< For brackets/braces/parenthesis.
struct {
int name; ///< Register name, may be -1 if name not present.
} reg; ///< For kExprLexRegister.
struct {
bool closed; ///< True if quote was closed.
} str; ///< For kExprLexSingleQuotedString and kExprLexDoubleQuotedString.
struct {
const char *name; ///< Option name start, points into the lexed line: use len.
size_t len; ///< Option name length in bytes, without `&` and scope prefix.
enum {
kExprLexOptUnspecified = 0,
kExprLexOptGlobal = 1,
kExprLexOptLocal = 2,
} scope; ///< Option scope: &l:, &g: or not specified.
} opt; ///< For kExprLexOption.
struct {
int scope; ///< Scope character or 0 if not present.
bool autoload; ///< Has autoload characters.
} var; ///< For kExprLexPlainIdentifier
struct {
LexExprTokenType type; ///< Suggested type for parsing incorrect code.
const char *msg; ///< Error message (translated printf-like format string).
} err; ///< For kExprLexInvalid
} data; ///< Additional data, if needed.
} LexExprToken;
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "viml/parser/expressions.h.generated.h"
#endif
#endif // NVIM_VIML_PARSER_EXPRESSIONS_H

View File

@@ -0,0 +1,129 @@
#ifndef NVIM_VIML_PARSER_PARSER_H
#define NVIM_VIML_PARSER_PARSER_H
#include <stdbool.h>
#include <stddef.h>
#include <assert.h>
#include "nvim/lib/kvec.h"
#include "nvim/func_attr.h"
/// One parsed line
typedef struct {
const char *data; ///< Parsed line pointer, NULL is used to signal EOF.
size_t size; ///< Parsed line size in bytes.
} ParserLine;
/// Line getter type for parser
///
/// Line getter must return {NULL, 0} for EOF.
///
/// @param[in]  cookie  Getter data, taken from ParserInputReader.cookie.
/// @param[out]  ret_pline  Location where the obtained line is saved.
typedef void (*ParserLineGetter)(void *cookie, ParserLine *ret_pline);
/// Parser position in the input
///
/// Identifies a single byte: line first, then offset inside that line.
typedef struct {
size_t line; ///< Line index in ParserInputReader.lines.
size_t col; ///< Byte index in the line.
} ParserPosition;
/// Parser state item.
///
/// Element of the ParserState.stack vector.
typedef struct {
enum {
kPTopStateParsingCommand = 0,
kPTopStateParsingExpression,
} type; ///< What is being parsed.
union {
struct {
enum {
kExprUnknown = 0,
} type; ///< Expression kind, only a placeholder value is defined.
} expr; ///< NOTE(review): presumably for kPTopStateParsingExpression.
} data; ///< Data which depends on the type.
} ParserStateItem;
/// Structure defining input reader
typedef struct {
/// Function used to get next line.
ParserLineGetter get_line;
/// Data for get_line function, passed as its first argument.
void *cookie;
/// All lines obtained by get_line.
///
/// viml_parser_get_remaining_line() appends here every line it requests,
/// including the {NULL, 0} one which signals EOF.
kvec_withinit_t(ParserLine, 4) lines;
} ParserInputReader;
/// Highlighted region definition
///
/// Note: one chunk may highlight only one line.
typedef struct {
ParserPosition start; ///< Start of the highlight: line and column.
size_t end_col; ///< End column, points to the start of the next character.
const char *group; ///< Highlight group name.
} ParserHighlightChunk;
/// Highlighting defined by a parser
///
/// Vector of highlighted regions (see nvim/lib/kvec.h for the vector API).
typedef kvec_withinit_t(ParserHighlightChunk, 16) ParserHighlight;
/// Structure defining parser state
typedef struct {
/// Line reader.
ParserInputReader reader;
/// Position up to which input was parsed.
///
/// viml_parser_get_remaining_line() and viml_parser_advance() assert that
/// pos.line refers to the last line present in reader.lines after they have
/// fetched a line (if needed).
ParserPosition pos;
/// Parser state stack.
kvec_withinit_t(ParserStateItem, 16) stack;
/// Highlighting support.
ParserHighlight *colors;
/// True if line continuation can be used.
bool can_continuate;
} ParserState;
static inline bool viml_parser_get_remaining_line(ParserState *const pstate,
                                                  ParserLine *const ret_pline)
  REAL_FATTR_ALWAYS_INLINE REAL_FATTR_WARN_UNUSED_RESULT REAL_FATTR_NONNULL_ALL;

/// Get the not yet parsed part of the current line
///
/// Requests a new line from the reader when position points just past the
/// last stored line, otherwise reuses the last stored one. The result is
/// shifted by pstate->pos.col bytes.
///
/// @param  pstate  Parser state to operate on.
/// @param[out]  ret_pline  Location where the remaining part of the line is
///                         saved. On EOF holds what line getter returned.
///
/// @return True if there is a line, false in case of EOF.
static inline bool viml_parser_get_remaining_line(ParserState *const pstate,
                                                  ParserLine *const ret_pline)
{
  if (pstate->pos.line != kv_size(pstate->reader.lines)) {
    // Current line was fetched earlier.
    *ret_pline = kv_last(pstate->reader.lines);
  } else {
    // Position is one past the stored lines: fetch and remember a new one.
    pstate->reader.get_line(pstate->reader.cookie, ret_pline);
    kvi_push(pstate->reader.lines, *ret_pline);
  }
  assert(pstate->pos.line == kv_size(pstate->reader.lines) - 1);
  if (ret_pline->data == NULL) {
    return false;
  }
  // Skip what was already consumed.
  const size_t consumed = pstate->pos.col;
  ret_pline->data += consumed;
  ret_pline->size -= consumed;
  return true;
}
static inline void viml_parser_advance(ParserState *const pstate,
                                       const size_t len)
  REAL_FATTR_ALWAYS_INLINE REAL_FATTR_NONNULL_ALL;

/// Move parser position forward by a given number of bytes
///
/// Never moves further than to the start of the next line: this is what
/// happens when the requested amount reaches or passes the end of the
/// current line.
///
/// @param  pstate  Parser state to advance.
/// @param[in]  len  Number of bytes to advance.
static inline void viml_parser_advance(ParserState *const pstate,
                                       const size_t len)
{
  assert(pstate->pos.line == kv_size(pstate->reader.lines) - 1);
  const ParserLine cur_line = kv_last(pstate->reader.lines);
  const size_t next_col = pstate->pos.col + len;
  if (next_col < cur_line.size) {
    // Still inside the current line.
    pstate->pos.col = next_col;
    return;
  }
  // Line is exhausted: continue from the start of the next one.
  pstate->pos.line++;
  pstate->pos.col = 0;
}
#endif // NVIM_VIML_PARSER_PARSER_H