fix(diff): use mmfile_t in linematch

Problem: Linematch used to use strchr to navigate a string, however strchr does not supoprt embedded NULs. Solution: Use `mmfile_t` instead of `char *` in linematch and introduce `strnchr()`. Also remove heap allocations from `matching_char_iwhite()` Fixes: #30505
2025-12-22 14:18:56 +00:00 · 2024-09-26 16:10:11 +01:00
parent 20251be15a
commit c65646c247
9 changed files with 108 additions and 79 deletions
--- a/runtime/doc/lua.txt
+++ b/runtime/doc/lua.txt
@@ -713,8 +713,8 @@ vim.diff({a}, {b}, {opts})                                        *vim.diff()*
    Parameters: ~
      • {a}     (`string`) First string to compare
      • {b}     (`string`) Second string to compare
-      • {opts}  (`table`) Optional parameters:
-                • {on_hunk}
+      • {opts}  (`table?`) Optional parameters:
+                • {on_hunk}?
                  (`fun(start_a: integer, count_a: integer, start_b: integer, count_b: integer): integer`)
                  Invoked for each hunk in the diff. Return a negative number
                  to cancel the callback for any remaining hunks. Arguments:
@@ -722,33 +722,33 @@ vim.diff({a}, {b}, {opts})                                        *vim.diff()*
                  • `count_a` (`integer`): Hunk size in {a}.
                  • `start_b` (`integer`): Start line of hunk in {b}.
                  • `count_b` (`integer`): Hunk size in {b}.
-                • {result_type} (`'unified'|'indices'`, default: `'unified'`)
+                • {result_type}? (`'unified'|'indices'`, default: `'unified'`)
                  Form of the returned diff:
                  • `unified`: String in unified format.
                  • `indices`: Array of hunk locations. Note: This option is
                    ignored if `on_hunk` is used.
-                • {linematch} (`boolean|integer`) Run linematch on the
+                • {linematch}? (`boolean|integer`) Run linematch on the
                  resulting hunks from xdiff. When integer, only hunks upto
                  this size in lines are run through linematch. Requires
                  `result_type = indices`, ignored otherwise.
-                • {algorithm} (`'myers'|'minimal'|'patience'|'histogram'`,
+                • {algorithm}? (`'myers'|'minimal'|'patience'|'histogram'`,
                  default: `'myers'`) Diff algorithm to use. Values:
                  • `myers`: the default algorithm
                  • `minimal`: spend extra time to generate the smallest
                    possible diff
                  • `patience`: patience diff algorithm
                  • `histogram`: histogram diff algorithm
-                • {ctxlen} (`integer`) Context length
-                • {interhunkctxlen} (`integer`) Inter hunk context length
-                • {ignore_whitespace} (`boolean`) Ignore whitespace
-                • {ignore_whitespace_change} (`boolean`) Ignore whitespace
+                • {ctxlen}? (`integer`) Context length
+                • {interhunkctxlen}? (`integer`) Inter hunk context length
+                • {ignore_whitespace}? (`boolean`) Ignore whitespace
+                • {ignore_whitespace_change}? (`boolean`) Ignore whitespace
                  change
-                • {ignore_whitespace_change_at_eol} (`boolean`) Ignore
+                • {ignore_whitespace_change_at_eol}? (`boolean`) Ignore
                  whitespace change at end-of-line.
-                • {ignore_cr_at_eol} (`boolean`) Ignore carriage return at
+                • {ignore_cr_at_eol}? (`boolean`) Ignore carriage return at
                  end-of-line
-                • {ignore_blank_lines} (`boolean`) Ignore blank lines
-                • {indent_heuristic} (`boolean`) Use the indent heuristic for
+                • {ignore_blank_lines}? (`boolean`) Ignore blank lines
+                • {indent_heuristic}? (`boolean`) Use the indent heuristic for
                  the internal diff library.

    Return: ~
--- a/runtime/lua/vim/_meta/diff.lua
+++ b/runtime/lua/vim/_meta/diff.lua
@@ -11,19 +11,19 @@
 ---   - `count_a` (`integer`): Hunk size in {a}.
 ---   - `start_b` (`integer`): Start line of hunk in {b}.
 ---   - `count_b` (`integer`): Hunk size in {b}.
--- @field on_hunk fun(start_a: integer, count_a: integer, start_b: integer, count_b: integer): integer
+--- @field on_hunk? fun(start_a: integer, count_a: integer, start_b: integer, count_b: integer): integer
 ---
 --- Form of the returned diff:
 ---   - `unified`: String in unified format.
 ---   - `indices`: Array of hunk locations.
 --- Note: This option is ignored if `on_hunk` is used.
 --- (default: `'unified'`)
--- @field result_type 'unified'|'indices'
+--- @field result_type? 'unified'|'indices'
 ---
 --- Run linematch on the resulting hunks from xdiff. When integer, only hunks
 --- upto this size in lines are run through linematch.
 --- Requires `result_type = indices`, ignored otherwise.
--- @field linematch boolean|integer
+--- @field linematch? boolean|integer
 ---
 --- Diff algorithm to use. Values:
 ---   - `myers`: the default algorithm
@@ -31,15 +31,15 @@
 ---   - `patience`: patience diff algorithm
 ---   - `histogram`: histogram diff algorithm
 --- (default: `'myers'`)
--- @field algorithm 'myers'|'minimal'|'patience'|'histogram'
--- @field ctxlen integer Context length
--- @field interhunkctxlen integer Inter hunk context length
--- @field ignore_whitespace boolean Ignore whitespace
--- @field ignore_whitespace_change boolean Ignore whitespace change
--- @field ignore_whitespace_change_at_eol boolean Ignore whitespace change at end-of-line.
--- @field ignore_cr_at_eol boolean Ignore carriage return at end-of-line
--- @field ignore_blank_lines boolean Ignore blank lines
--- @field indent_heuristic boolean Use the indent heuristic for the internal diff library.
+--- @field algorithm? 'myers'|'minimal'|'patience'|'histogram'
+--- @field ctxlen? integer Context length
+--- @field interhunkctxlen? integer Inter hunk context length
+--- @field ignore_whitespace? boolean Ignore whitespace
+--- @field ignore_whitespace_change? boolean Ignore whitespace change
+--- @field ignore_whitespace_change_at_eol? boolean Ignore whitespace change at end-of-line.
+--- @field ignore_cr_at_eol? boolean Ignore carriage return at end-of-line
+--- @field ignore_blank_lines? boolean Ignore blank lines
+--- @field indent_heuristic? boolean Use the indent heuristic for the internal diff library.

 -- luacheck: no unused args

@@ -65,7 +65,7 @@
 ---
 ---@param a string First string to compare
 ---@param b string Second string to compare
---@param opts vim.diff.Opts
+---@param opts? vim.diff.Opts
 ---@return string|integer[][]?
 ---     See {opts.result_type}. `nil` if {opts.on_hunk} is given.
 function vim.diff(a, b, opts) end
--- a/src/clint.py
+++ b/src/clint.py
@@ -881,6 +881,7 @@ def CheckIncludes(filename, lines, error):
            "nvim/func_attr.h",
            "termkey/termkey.h",
            "vterm/vterm.h",
+            "xdiff/xdiff.h",
            ]

    for i in check_includes_ignore:
--- a/src/nvim/diff.c
+++ b/src/nvim/diff.c
@@ -2005,7 +2005,7 @@ static void run_linematch_algorithm(diff_T *dp)
 {
  // define buffers for diff algorithm
  mmfile_t diffbufs_mm[DB_COUNT];
-  const char *diffbufs[DB_COUNT];
+  const mmfile_t *diffbufs[DB_COUNT];
  int diff_length[DB_COUNT];
  size_t ndiffs = 0;
  for (int i = 0; i < DB_COUNT; i++) {
@@ -2015,9 +2015,7 @@ static void run_linematch_algorithm(diff_T *dp)
      diff_write_buffer(curtab->tp_diffbuf[i], &diffbufs_mm[ndiffs],
                        dp->df_lnum[i], dp->df_lnum[i] + dp->df_count[i] - 1);

-      // we want to get the char* to the diff buffer that was just written
-      // we add it to the array of char*, diffbufs
-      diffbufs[ndiffs] = diffbufs_mm[ndiffs].ptr;
+      diffbufs[ndiffs] = &diffbufs_mm[ndiffs];

      // keep track of the length of this diff block to pass it to the linematch
      // algorithm
--- a/src/nvim/linematch.c
+++ b/src/nvim/linematch.c
@@ -10,6 +10,8 @@
 #include "nvim/macros_defs.h"
 #include "nvim/memory.h"
 #include "nvim/pos_defs.h"
+#include "nvim/strings.h"
+#include "xdiff/xdiff.h"

 #define LN_MAX_BUFS 8
 #define LN_DECISION_MAX 255  // pow(2, LN_MAX_BUFS(8)) - 1 = 255
@@ -29,48 +31,48 @@ struct diffcmppath_S {
 # include "linematch.c.generated.h"
 #endif

-static size_t line_len(const char *s)
+static size_t line_len(const mmfile_t *m)
 {
-  char *end = strchr(s, '\n');
+  char *s = m->ptr;
+  size_t n = (size_t)m->size;
+  char *end = strnchr(s, &n, '\n');
  if (end) {
    return (size_t)(end - s);
  }
-  return strlen(s);
+  return (size_t)m->size;
 }

+#define MATCH_CHAR_MAX_LEN 800
+
 /// Same as matching_chars but ignore whitespace
 ///
 /// @param s1
 /// @param s2
-static int matching_chars_iwhite(const char *s1, const char *s2)
+static int matching_chars_iwhite(const mmfile_t *s1, const mmfile_t *s2)
 {
  // the newly processed strings that will be compared
-  // delete the white space characters, and/or replace all upper case with lower
-  char *strsproc[2];
-  const char *strsorig[2] = { s1, s2 };
+  // delete the white space characters
+  mmfile_t sp[2];
+  char p[2][MATCH_CHAR_MAX_LEN];
  for (int k = 0; k < 2; k++) {
-    size_t d = 0;
-    size_t i = 0;
-    size_t slen = line_len(strsorig[k]);
-    strsproc[k] = xmalloc((slen + 1) * sizeof(char));
-    while (d + i < slen) {
-      char e = strsorig[k][i + d];
+    const mmfile_t *s = k == 0 ? s1 : s2;
+    size_t pi = 0;
+    size_t slen = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s));
+    for (size_t i = 0; i <= slen; i++) {
+      char e = s->ptr[i];
      if (e != ' ' && e != '\t') {
-        strsproc[k][i] = e;
-        i++;
-      } else {
-        d++;
+        p[k][pi] = e;
+        pi++;
      }
    }
-    strsproc[k][i] = NUL;
-  }
-  int matching = matching_chars(strsproc[0], strsproc[1]);
-  xfree(strsproc[0]);
-  xfree(strsproc[1]);
-  return matching;
-}

-#define MATCH_CHAR_MAX_LEN 800
+    sp[k] = (mmfile_t){
+      .ptr = p[k],
+      .size = (int)pi,
+    };
+  }
+  return matching_chars(&sp[0], &sp[1]);
+}

 /// Return matching characters between "s1" and "s2" whilst respecting sequence order.
 /// Consider the case of two strings 'AAACCC' and 'CCCAAA', the
@@ -83,12 +85,14 @@ static int matching_chars_iwhite(const char *s1, const char *s2)
 ///   matching_chars("abcdefg", "gfedcba")         -> 1  // all characters in common,
 ///                                                      // but only at most 1 in sequence
 ///
-/// @param s1
-/// @param s2
-static int matching_chars(const char *s1, const char *s2)
+/// @param m1
+/// @param m2
+static int matching_chars(const mmfile_t *m1, const mmfile_t *m2)
 {
-  size_t s1len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s1));
-  size_t s2len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s2));
+  size_t s1len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m1));
+  size_t s2len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m2));
+  char *s1 = m1->ptr;
+  char *s2 = m2->ptr;
  int matrix[2][MATCH_CHAR_MAX_LEN] = { 0 };
  bool icur = 1;  // save space by storing only two rows for i axis
  for (size_t i = 0; i < s1len; i++) {
@@ -119,13 +123,13 @@ static int matching_chars(const char *s1, const char *s2)
 /// @param sp
 /// @param fomvals
 /// @param n
-static int count_n_matched_chars(const char **sp, const size_t n, bool iwhite)
+static int count_n_matched_chars(mmfile_t **sp, const size_t n, bool iwhite)
 {
  int matched_chars = 0;
  int matched = 0;
  for (size_t i = 0; i < n; i++) {
    for (size_t j = i + 1; j < n; j++) {
-      if (sp[i] != NULL && sp[j] != NULL) {
+      if (sp[i]->ptr != NULL && sp[j]->ptr != NULL) {
        matched++;
        // TODO(lewis6991): handle whitespace ignoring higher up in the stack
        matched_chars += iwhite ? matching_chars_iwhite(sp[i], sp[j])
@@ -143,15 +147,17 @@ static int count_n_matched_chars(const char **sp, const size_t n, bool iwhite)
  return matched_chars;
 }

-void fastforward_buf_to_lnum(const char **s, linenr_T lnum)
+mmfile_t fastforward_buf_to_lnum(mmfile_t s, linenr_T lnum)
 {
  for (int i = 0; i < lnum - 1; i++) {
-    *s = strchr(*s, '\n');
-    if (!*s) {
-      return;
+    s.ptr = strnchr(s.ptr, (size_t *)&s.size, '\n');
+    if (!s.ptr) {
+      break;
    }
-    (*s)++;
+    s.ptr++;
+    s.size--;
  }
+  return s;
 }

 /// try all the different ways to compare these lines and use the one that
@@ -167,25 +173,25 @@ void fastforward_buf_to_lnum(const char **s, linenr_T lnum)
 /// @param diff_blk
 static void try_possible_paths(const int *df_iters, const size_t *paths, const int npaths,
                               const int path_idx, int *choice, diffcmppath_T *diffcmppath,
-                               const int *diff_len, const size_t ndiffs, const char **diff_blk,
+                               const int *diff_len, const size_t ndiffs, const mmfile_t **diff_blk,
                               bool iwhite)
 {
  if (path_idx == npaths) {
    if ((*choice) > 0) {
      int from_vals[LN_MAX_BUFS] = { 0 };
      const int *to_vals = df_iters;
-      const char *current_lines[LN_MAX_BUFS];
+      mmfile_t mm[LN_MAX_BUFS];  // stack memory for current_lines
+      mmfile_t *current_lines[LN_MAX_BUFS];
      for (size_t k = 0; k < ndiffs; k++) {
        from_vals[k] = df_iters[k];
        // get the index at all of the places
        if ((*choice) & (1 << k)) {
          from_vals[k]--;
-          const char *p = diff_blk[k];
-          fastforward_buf_to_lnum(&p, df_iters[k]);
-          current_lines[k] = p;
+          mm[k] = fastforward_buf_to_lnum(*diff_blk[k], df_iters[k]);
        } else {
-          current_lines[k] = NULL;
+          mm[k] = (mmfile_t){ 0 };
        }
+        current_lines[k] = &mm[k];
      }
      size_t unwrapped_idx_from = unwrap_indexes(from_vals, diff_len, ndiffs);
      size_t unwrapped_idx_to = unwrap_indexes(to_vals, diff_len, ndiffs);
@@ -244,7 +250,7 @@ static size_t unwrap_indexes(const int *values, const int *diff_len, const size_
 /// @param ndiffs
 /// @param diff_blk
 static void populate_tensor(int *df_iters, const size_t ch_dim, diffcmppath_T *diffcmppath,
-                            const int *diff_len, const size_t ndiffs, const char **diff_blk,
+                            const int *diff_len, const size_t ndiffs, const mmfile_t **diff_blk,
                            bool iwhite)
 {
  if (ch_dim == ndiffs) {
@@ -327,7 +333,7 @@ static void populate_tensor(int *df_iters, const size_t ch_dim, diffcmppath_T *d
 /// @param ndiffs
 /// @param [out] [allocated] decisions
 /// @return the length of decisions
-size_t linematch_nbuffers(const char **diff_blk, const int *diff_len, const size_t ndiffs,
+size_t linematch_nbuffers(const mmfile_t **diff_blk, const int *diff_len, const size_t ndiffs,
                          int **decisions, bool iwhite)
 {
  assert(ndiffs <= LN_MAX_BUFS);
--- a/src/nvim/linematch.h
+++ b/src/nvim/linematch.h
@@ -3,6 +3,7 @@
 #include <stddef.h>  // IWYU pragma: keep

 #include "nvim/pos_defs.h"  // IWYU pragma: keep
+#include "xdiff/xdiff.h"

 #ifdef INCLUDE_GENERATED_DECLARATIONS
 # include "linematch.h.generated.h"
--- a/src/nvim/lua/xdiff.c
+++ b/src/nvim/lua/xdiff.c
@@ -67,11 +67,11 @@ static void get_linematch_results(lua_State *lstate, mmfile_t *ma, mmfile_t *mb,
                                  int count_a, int start_b, int count_b, bool iwhite)
 {
  // get the pointer to char of the start of the diff to pass it to linematch algorithm
-  const char *diff_begin[2] = { ma->ptr, mb->ptr };
-  int diff_length[2] = { count_a, count_b };
+  mmfile_t ma0 = fastforward_buf_to_lnum(*ma, (linenr_T)start_a + 1);
+  mmfile_t mb0 = fastforward_buf_to_lnum(*mb, (linenr_T)start_b + 1);

-  fastforward_buf_to_lnum(&diff_begin[0], (linenr_T)start_a + 1);
-  fastforward_buf_to_lnum(&diff_begin[1], (linenr_T)start_b + 1);
+  const mmfile_t *diff_begin[2] = { &ma0, &mb0 };
+  int diff_length[2] = { count_a, count_b };

  int *decisions = NULL;
  size_t decisions_length = linematch_nbuffers(diff_begin, diff_length, 2, &decisions, iwhite);
--- a/src/nvim/strings.c
+++ b/src/nvim/strings.c
@@ -496,6 +496,20 @@ char *vim_strchr(const char *const string, const int c)
  }
 }

+// Sized version of strchr that can handle embedded NULs.
+// Adjusts n to the new size.
+char *strnchr(const char *p, size_t *n, int c)
+{
+  while (*n > 0) {
+    if (*p == c) {
+      return (char *)p;
+    }
+    p++;
+    (*n)--;
+  }
+  return NULL;
+}
+
 // Sort an array of strings.

 static int sort_compare(const void *s1, const void *s2)
--- a/test/functional/lua/xdiff_spec.lua
+++ b/test/functional/lua/xdiff_spec.lua
@@ -174,4 +174,13 @@ describe('xdiff bindings', function()
      pcall_err(exec_lua, [[vim.diff('a', 'b', { on_hunk = true })]])
    )
  end)
+
+  it('can handle strings with embedded NUL characters (GitHub #30305)', function()
+    eq(
+      { { 0, 0, 1, 1 }, { 1, 0, 3, 2 } },
+      exec_lua(function()
+        return vim.diff('\n', '\0\n\n\nb', { linematch = true, result_type = 'indices' })
+      end)
+    )
+  end)
 end)