viml/parser/expressions: Add support for string parsing

2025-09-19 17:58:18 +00:00 · 2017-10-08 21:52:38 +03:00
parent c484613ce0
commit af38cea133
8 changed files with 1938 additions and 13 deletions
--- a/src/nvim/viml/parser/expressions.c
+++ b/src/nvim/viml/parser/expressions.c
@@ -915,8 +915,6 @@ static inline void viml_pexpr_debug_print_token(
 //
 // NVimIdentifierKey -> Identifier
 //
-// NVimFigureBrace -> NVimInternalError
-//
 // NVimUnaryPlus -> NVimUnaryOperator
 // NVimBinaryPlus -> NVimBinaryOperator
 // NVimConcat -> NVimBinaryOperator
@@ -929,6 +927,18 @@ static inline void viml_pexpr_debug_print_token(
 // NVimNestingParenthesis -> NVimParenthesis
 // NVimCallingParenthesis -> NVimParenthesis
 //
+// NVimString -> String
+// NVimStringSpecial -> SpecialChar
+// NVimSingleQuote -> NVimString
+// NVimSingleQuotedBody -> NVimString
+// NVimSingleQuotedQuote -> NVimStringSpecial
+// NVimDoubleQuote -> NVimString
+// NVimDoubleQuotedBody -> NVimString
+// NVimDoubleQuotedEscape -> NVimStringSpecial
+// NVimDoubleQuotedUnknownEscape -> NVimInvalid
+//
+// " Note: NVimDoubleQuotedUnknownEscape is not actually invalid
+//
 // NVimInvalidComma -> NVimInvalidDelimiter
 // NVimInvalidSpacing -> NVimInvalid
 // NVimInvalidTernary -> NVimInvalidOperator
@@ -952,6 +962,19 @@ static inline void viml_pexpr_debug_print_token(
 // NVimInvalidList -> NVimInvalidDelimiter
 // NVimInvalidSubscript -> NVimInvalidDelimiter
 // NVimInvalidSubscriptColon -> NVimInvalidSubscript
+// NVimInvalidString -> NVimInvalidValue
+// NVimInvalidStringSpecial -> NVimInvalidString
+// NVimInvalidSingleQuote -> NVimInvalidString
+// NVimInvalidSingleQuotedBody -> NVimInvalidString
+// NVimInvalidSingleQuotedQuote -> NVimInvalidStringSpecial
+// NVimInvalidDoubleQuote -> NVimInvalidString
+// NVimInvalidDoubleQuotedBody -> NVimInvalidString
+// NVimInvalidDoubleQuotedEscape -> NVimInvalidStringSpecial
+// NVimInvalidDoubleQuotedUnknownEscape -> NVimInvalid
+//
+// NVimFigureBrace -> NVimInternalError
+// NVimInvalidSingleQuotedUnknownEscape -> NVimInternalError
+// NVimSingleQuotedUnknownEscape -> NVimInternalError

 /// Allocate a new node and set some of the values
 ///
@@ -1402,6 +1425,318 @@ static inline void east_set_error(const ParserState *const pstate,
      } \
    } while (0)

+/// Structure used to define “string shifts” necessary to map string
+/// highlighting to actual strings.
+typedef struct {
+  size_t start;  ///< Where special character starts in original string.
+  size_t orig_len;  ///< Length of original string (e.g. 4 for "\x80").
+  size_t act_len;  ///< Length of resulting character(s) (e.g. 1 for "\x80").
+  bool escape_not_known;  ///< True if escape sequence in original is not known.
+} StringShift;
+
+/// Parse and highlight single- or double-quoted string
+///
+/// Function is supposed to detect and highlight regular expressions, but does
+/// not do that yet.
+///
+/// @param[in,out]  pstate  Parser state which also contains a place where
+///                         highlighting is saved.
+/// @param[out]  node  Node where string parsing results are saved.
+/// @param[in]  token  Token to highlight.
+/// @param[in]  ast_stack  Parser AST stack, to be used to detect whether
+///                        current string is a regex (not used yet).
+/// @param[in]  is_invalid  Whether currently processed token is not valid.
+static void parse_quoted_string(ParserState *const pstate,
+                                ExprASTNode *const node,
+                                const LexExprToken token,
+                                const ExprASTStack ast_stack,
+                                const bool is_invalid)
+  FUNC_ATTR_NONNULL_ALL
+{
+  const ParserLine pline = pstate->reader.lines.items[token.start.line];
+  const char *const s = pline.data + token.start.col;
+  const char *const e = s + token.len - token.data.str.closed;
+  const char *p = s + 1;
+  const bool is_double = (token.type == kExprLexDoubleQuotedString);
+  size_t size = token.len - token.data.str.closed - 1;
+  kvec_withinit_t(StringShift, 16) shifts;
+  kvi_init(shifts);
+  if (!is_double) {
+    viml_parser_highlight(pstate, token.start, 1, HL(SingleQuotedString));
+    while (p < e) {
+      const char *const chunk_e = memchr(p, '\'', (size_t)(e - p));
+      if (chunk_e == NULL) {
+        break;
+      }
+      size--;
+      p = chunk_e + 2;
+      if (pstate->colors) {
+        kvi_push(shifts, ((StringShift) {
+            .start = token.start.col + (size_t)(chunk_e - s),
+            .orig_len = 2,
+            .act_len = 1,
+            .escape_not_known = false,
+        }));
+      }
+    }
+    node->data.str.size = size;
+    if (size == 0) {
+      node->data.str.value = NULL;
+    } else {
+      char *v_p;
+      v_p = node->data.str.value = xmalloc(size);
+      p = s + 1;
+      while (p < e) {
+        const char *const chunk_e = memchr(p, '\'', (size_t)(e - p));
+        if (chunk_e == NULL) {
+          memcpy(v_p, p, (size_t)(e - p));
+          break;
+        }
+        memcpy(v_p, p, (size_t)(chunk_e - p));
+        v_p += (size_t)(chunk_e - p) + 1;
+        v_p[-1] = '\'';
+        p = chunk_e + 2;
+      }
+    }
+  } else {
+    viml_parser_highlight(pstate, token.start, 1, HL(DoubleQuotedString));
+    for (p = s + 1; p < e; p++) {
+      if (*p == '\\' && p + 1 < e) {
+        p++;
+        if (p + 1 == e) {
+          size--;
+          break;
+        }
+        switch (*p) {
+          // A "\<x>" form occupies at least 4 characters, and produces up to
+          // 6 characters: reserve space for 2 extra, but do not compute actual
+          // length just now, it would be costly.
+          case '<': {
+            size += 2;
+            break;
+          }
+          // Hexadecimal, always single byte, but at least three bytes each.
+          case 'x': case 'X': {
+            size--;
+            if (ascii_isxdigit(p[1])) {
+              size--;
+              if (p + 2 < e && ascii_isxdigit(p[2])) {
+                size--;
+              }
+            }
+            break;
+          }
+          // Unicode
+          //
+          // \uF takes 1 byte which is 2 bytes less than escape sequence.
+          // \uFF: 2 bytes, 2 bytes less.
+          // \uFFF: 3 bytes, 2 bytes less.
+          // \uFFFF: 3 bytes, 3 bytes less.
+          // \UFFFFF: 4 bytes, 3 bytes less.
+          // \UFFFFFF: 5 bytes, 3 bytes less.
+          // \UFFFFFFF: 6 bytes, 3 bytes less.
+          // \U7FFFFFFF: 6 bytes, 4 bytes less.
+          case 'u': case 'U': {
+            const char *const esc_start = p;
+            size_t n = (*p == 'u' ? 4 : 8);
+            int nr = 0;
+            p++;
+            while (n-- && ascii_isxdigit(p[1])) {
+              p++;
+              nr = (nr << 4) + hex2nr(*p);
+            }
+            // Escape length: (esc_start - 1) points to "\\", esc_start to "u"
+            // or "U", p to the byte after last byte. So escape sequence
+            // occupies p - (esc_start - 1), but it stands for utf_char2len(nr)
+            // bytes.
+            size -= (size_t)((p - (esc_start - 1)) - utf_char2len(nr));
+            p--;
+            break;
+          }
+          // Octal, always single byte, but at least two bytes each.
+          case '0': case '1': case '2': case '3': case '4': case '5': case '6':
+          case '7': {
+            size--;
+            p++;
+            if (*p >= '0' && *p <= '7') {
+              size--;
+              p++;
+              if (*p >= '0' && *p <= '7') {
+                size--;
+                p++;
+              }
+            }
+            break;
+          }
+          default: {
+            size--;
+            break;
+          }
+        }
+      }
+    }
+    if (size == 0) {
+      node->data.str.value = NULL;
+      node->data.str.size = 0;
+    } else {
+      char *v_p;
+      v_p = node->data.str.value = xmalloc(size);
+      p = s + 1;
+      while (p < e) {
+        const char *const chunk_e = memchr(p, '\\', (size_t)(e - p));
+        if (chunk_e == NULL) {
+          memcpy(v_p, p, (size_t)(e - p));
+          v_p += e - p;
+          break;
+        }
+        memcpy(v_p, p, (size_t)(chunk_e - p));
+        v_p += (size_t)(chunk_e - p);
+        p = chunk_e + 1;
+        if (p == e) {
+          *v_p++ = '\\';
+          break;
+        }
+        bool is_unknown = false;
+        const char *const v_p_start = v_p;
+        switch (*p) {
+#define SINGLE_CHAR_ESC(ch, real_ch) \
+          case ch: { \
+            *v_p++ = real_ch; \
+            p++; \
+            break; \
+          }
+          SINGLE_CHAR_ESC('b', BS)
+          SINGLE_CHAR_ESC('e', ESC)
+          SINGLE_CHAR_ESC('f', FF)
+          SINGLE_CHAR_ESC('n', NL)
+          SINGLE_CHAR_ESC('r', CAR)
+          SINGLE_CHAR_ESC('t', TAB)
+          SINGLE_CHAR_ESC('"', '"')
+          SINGLE_CHAR_ESC('\\', '\\')
+#undef SINGLE_CHAR_ESC
+
+          // Hexadecimal or unicode.
+          case 'X': case 'x': case 'u': case 'U': {
+            if (ascii_isxdigit(p[1])) {
+              size_t n;
+              int nr;
+              bool is_hex = (*p == 'x' || *p == 'X');
+
+              if (is_hex) {
+                n = 2;
+              } else if (*p == 'u') {
+                n = 4;
+              } else {
+                n = 8;
+              }
+              nr = 0;
+              while (n-- && ascii_isxdigit(p[1])) {
+                p++;
+                nr = (nr << 4) + hex2nr(*p);
+              }
+              p++;
+              if (is_hex) {
+                *v_p++ = (char)nr;
+              } else {
+                v_p += utf_char2bytes(nr, (char_u *)v_p);
+              }
+            } else {
+              is_unknown = true;
+              *v_p++ = *p;
+              p++;
+            }
+            break;
+          }
+          // Octal: "\1", "\12", "\123".
+          case '0': case '1': case '2': case '3': case '4': case '5': case '6':
+          case '7': {
+            uint8_t ch = (uint8_t)(*p++ - '0');
+            if (*p >= '0' && *p <= '7') {
+              ch = (uint8_t)((ch << 3) + *p++ - '0');
+              if (*p >= '0' && *p <= '7') {
+                ch = (uint8_t)((ch << 3) + *p++ - '0');
+              }
+            }
+            *v_p++ = (char)ch;
+            break;
+          }
+          // Special key, e.g.: "\<C-W>"
+          case '<': {
+            const size_t special_len = (
+                trans_special((const char_u **)&p, (size_t)(e - p),
+                              (char_u *)v_p, true, true));
+            if (special_len != 0) {
+              v_p += special_len;
+            } else {
+              is_unknown = true;
+              mb_copy_char((const char_u **)&p, (char_u **)&v_p);
+            }
+            break;
+          }
+          default: {
+            is_unknown = true;
+            mb_copy_char((const char_u **)&p, (char_u **)&v_p);
+            break;
+          }
+        }
+        if (pstate->colors) {
+          kvi_push(shifts, ((StringShift) {
+              .start = token.start.col + (size_t)(chunk_e - s),
+              .orig_len = (size_t)(p - chunk_e),
+              .act_len = (size_t)(v_p - (char *)v_p_start),
+              .escape_not_known = is_unknown,
+          }));
+        }
+      }
+      node->data.str.size = (size_t)(v_p - node->data.str.value);
+    }
+  }
+  if (pstate->colors) {
+    // TODO(ZyX-I): use ast_stack to determine and highlight regular expressions
+    // TODO(ZyX-I): use ast_stack to determine and highlight printf format str
+    // TODO(ZyX-I): use ast_stack to determine and highlight expression strings
+    size_t next_col = 1;
+    const char *const body_str = (is_double
+                                  ? HL(DoubleQuotedBody)
+                                  : HL(SingleQuotedBody));
+    const char *const esc_str = (is_double
+                                 ? HL(DoubleQuotedEscape)
+                                 : HL(SingleQuotedQuote));
+    const char *const ukn_esc_str = (is_double
+                                     ? HL(DoubleQuotedUnknownEscape)
+                                     : HL(SingleQuotedUnknownEscape));
+    for (size_t i = 0; i < kv_size(shifts); i++) {
+      const StringShift cur_shift = kv_A(shifts, i);
+      if (cur_shift.start > next_col) {
+        viml_parser_highlight(pstate, shifted_pos(token.start, next_col),
+                              cur_shift.start - next_col,
+                              body_str);
+      }
+      viml_parser_highlight(pstate, shifted_pos(token.start, cur_shift.start),
+                            cur_shift.orig_len,
+                            (cur_shift.escape_not_known
+                             ? ukn_esc_str
+                             : esc_str));
+      next_col = cur_shift.start + cur_shift.orig_len;
+    }
+    if (next_col < token.len - token.data.str.closed) {
+      viml_parser_highlight(pstate, shifted_pos(token.start, next_col),
+                            token.len - token.data.str.closed - next_col,
+                            body_str);
+    }
+  }
+  if (token.data.str.closed) {
+    if (is_double) {
+      viml_parser_highlight(pstate, shifted_pos(token.start, token.len - 1),
+                            1, HL(DoubleQuotedString));
+    } else {
+      viml_parser_highlight(pstate, shifted_pos(token.start, token.len - 1),
+                            1, HL(SingleQuotedString));
+    }
+  }
+  kvi_destroy(shifts);
+}
+
 /// Parse one VimL expression
 ///
 /// @param  pstate  Parser state.
@@ -1714,12 +2049,7 @@ viml_pexpr_parse_invalid_comma:
          } else if (eastnode_lvl >= kEOpLvlComma) {
            can_be_ternary = false;
          } else {
-viml_pexpr_parse_invalid_colon:
-            ERROR_FROM_TOKEN_AND_MSG(
-                cur_token,
-                _("E15: Colon outside of dictionary or ternary operator: "
-                  "%.*s"));
-            break;
+            goto viml_pexpr_parse_invalid_colon;
          }
          if (i == kv_size(ast_stack) - 1) {
            goto viml_pexpr_parse_invalid_colon;
@@ -1741,6 +2071,12 @@ viml_pexpr_parse_invalid_colon:
          ADD_OP_NODE(cur_node);
          HL_CUR_TOKEN(SubscriptColon);
        } else {
+          goto viml_pexpr_parse_valid_colon;
+viml_pexpr_parse_invalid_colon:
+          ERROR_FROM_TOKEN_AND_MSG(
+              cur_token,
+              _("E15: Colon outside of dictionary or ternary operator: %.*s"));
+viml_pexpr_parse_valid_colon:
          ADD_VALUE_IF_MISSING(_(EXP_VAL_COLON));
          NEW_NODE_WITH_CUR_POS(cur_node, kExprNodeColon);
          if (is_ternary) {
@@ -2201,6 +2537,30 @@ viml_pexpr_parse_no_paren_closing_error: {}
        kvi_push(ast_stack, &ter_val_node->children);
        break;
      }
+      case kExprLexDoubleQuotedString:
+      case kExprLexSingleQuotedString: {
+        const bool is_double = (tok_type == kExprLexDoubleQuotedString);
+        if (!cur_token.data.str.closed) {
+          // It is weird, but Vim has two identical error messages with
+          // different error numbers: "E114: Missing quote" and
+          // "E115: Missing quote".
+          ERROR_FROM_TOKEN_AND_MSG(
+              cur_token, (is_double
+                          ? _("E114: Missing double quote: %.*s")
+                          : _("E115: Missing single quote: %.*s")));
+        }
+        if (want_node == kENodeOperator) {
+          OP_MISSING;
+        }
+        NEW_NODE_WITH_CUR_POS(
+            cur_node, (is_double
+                       ? kExprNodeDoubleQuotedString
+                       : kExprNodeSingleQuotedString));
+        *top_node_p = cur_node;
+        parse_quoted_string(pstate, cur_node, cur_token, ast_stack, is_invalid);
+        want_node = kENodeOperator;
+        break;
+      }
    }
 viml_pexpr_parse_cycle_end:
    prev_token = cur_token;
--- a/src/nvim/viml/parser/expressions.h
+++ b/src/nvim/viml/parser/expressions.h
@@ -195,6 +195,8 @@ typedef enum {
  kExprNodeConcatOrSubscript = 'S',
  kExprNodeInteger = '0',  ///< Integral number.
  kExprNodeFloat = '1',  ///< Floating-point number.
+  kExprNodeSingleQuotedString = '\'',
+  kExprNodeDoubleQuotedString = '"',
 } ExprASTNodeType;

 typedef struct expr_ast_node ExprASTNode;
@@ -249,6 +251,11 @@ struct expr_ast_node {
    struct {
      float_T value;
    } flt;  ///< For kExprNodeFloat.
+    struct {
+      char *value;
+      size_t size;
+    } str;  ///< For kExprNodeSingleQuotedString and
+            ///< kExprNodeDoubleQuotedString.
  } data;
 };