vim-patch:9.2.0140: file reading performance can be improved (#38266)

Problem: Reading large files is slow because UTF-8 validation and newline scanning are performed byte-by-byte. Initial file loading also triggers listener and channel processing. Solution: Use memchr() for SIMD-optimized newline scanning, implement word-at-a-time ASCII skipping during UTF-8 validation using a bitmask, skip listener/netbeans/channel notifications when the ML_APPEND_NEW flag is set during readfile() (Yasuhiro Matsumoto). closes: vim/vim#19612 2ca96b09d7 Co-authored-by: Yasuhiro Matsumoto <mattn.jp@gmail.com> Co-authored-by: NRK <nrk@disroot.org>
2026-03-31 04:42:03 +00:00 · 2026-03-12 10:28:03 +08:00
parent 692f882052
commit f847aa6208
1 changed files with 101 additions and 57 deletions
--- a/src/nvim/fileio.c
+++ b/src/nvim/fileio.c
@@ -97,6 +97,10 @@

 static const char *e_auchangedbuf = N_("E812: Autocommands changed buffer or buffer name");

+// Bitmask with 0x80 set in each byte of a uint64_t word, used to detect
+// non-ASCII bytes (high bit set) in multiple bytes at once.
+#define NONASCII_MASK (((uint64_t)(-1) / 0xFF) * 0x80)
+
 void filemess(buf_T *buf, char *name, char *s)
 {
  int prev_msg_col = msg_col;
@@ -1333,9 +1337,24 @@ retry:
        bool incomplete_tail = false;

        // Reading UTF-8: Check if the bytes are valid UTF-8.
-        for (p = (uint8_t *)ptr;; p++) {
-          int todo = (int)(((uint8_t *)ptr + size) - p);
+        for (p = (uint8_t *)ptr;;) {
+          // Skip ASCII bytes quickly using word-at-a-time check.
+          {
+            uint8_t *ascii_end = (uint8_t *)ptr + size;
+            while (ascii_end - p >= (ptrdiff_t)sizeof(uint64_t)) {
+              uint64_t word;
+              memcpy(&word, p, sizeof(uint64_t));
+              if (word & NONASCII_MASK) {
+                break;
+              }
+              p += sizeof(uint64_t);
+            }
+            while (p < ascii_end && *p < 0x80) {
+              p++;
+            }
+          }

+          int todo = (int)(((uint8_t *)ptr + size) - p);
          if (todo <= 0) {
            break;
          }
@@ -1384,13 +1403,15 @@ retry:
              // Drop, keep or replace the bad byte.
              if (bad_char_behavior == BAD_DROP) {
                memmove(p, p + 1, (size_t)(todo - 1));
-                p--;
                size--;
-              } else if (bad_char_behavior != BAD_KEEP) {
-                *p = (uint8_t)bad_char_behavior;
+              } else {
+                if (bad_char_behavior != BAD_KEEP) {
+                  *p = (uint8_t)bad_char_behavior;
+                }
+                p++;
              }
            } else {
-              p += l - 1;
+              p += l;
            }
          }
        }
@@ -1515,60 +1536,83 @@ rewind_retry:
        }
      }
    } else {
-      ptr--;
-      while (++ptr, --size >= 0) {
-        if ((c = *ptr) != NUL && c != NL) {        // catch most common case
-          continue;
-        }
-        if (c == NUL) {
-          *ptr = NL;            // NULs are replaced by newlines!
-        } else {
-          if (skip_count == 0) {
-            *ptr = NUL;                         // end of line
-            len = (colnr_T)(ptr - line_start + 1);
-            if (fileformat == EOL_DOS) {
-              if (ptr > line_start && ptr[-1] == CAR) {
-                // remove CR before NL
-                ptr[-1] = NUL;
-                len--;
-              } else if (ff_error != EOL_DOS) {
-                // Reading in Dos format, but no CR-LF found!
-                // When 'fileformats' includes "unix", delete all
-                // the lines read so far and start all over again.
-                // Otherwise give an error message later.
-                if (try_unix
-                    && !read_stdin
-                    && (read_buffer || vim_lseek(fd, 0, SEEK_SET) == 0)) {
-                  fileformat = EOL_UNIX;
-                  if (set_options) {
-                    set_fileformat(EOL_UNIX, OPT_LOCAL);
-                  }
-                  file_rewind = true;
-                  keep_fileformat = true;
-                  goto retry;
-                }
-                ff_error = EOL_DOS;
-              }
-            }
-            if (ml_append(lnum, line_start, len, newfile) == FAIL) {
-              error = true;
-              break;
-            }
-            if (read_undo_file) {
-              sha256_update(&sha_ctx, (uint8_t *)line_start, (size_t)len);
-            }
-            lnum++;
-            if (--read_count == 0) {
-              error = true;                         // break loop
-              line_start = ptr;                 // nothing left to write
-              break;
-            }
-          } else {
-            skip_count--;
+      // Use memchr() for SIMD-optimized newline scanning instead
+      // of scanning each byte individually.
+      char *end = ptr + size;
+
+      while (ptr < end) {
+        char *nl = memchr(ptr, NL, (size_t)(end - ptr));
+        char *nul_scan;
+
+        if (nl == NULL) {
+          // No more newlines in buffer.
+          // Replace any NUL bytes with NL in remaining data.
+          while ((nul_scan = memchr(ptr, NUL, (size_t)(end - ptr))) != NULL) {
+            *nul_scan = NL;
+            ptr = nul_scan + 1;
          }
-          line_start = ptr + 1;
+          ptr = end;
+          break;
        }
+
+        // Replace NUL bytes with NL before the newline.
+        {
+          char *scan = ptr;
+          while ((nul_scan = memchr(scan, NUL, (size_t)(nl - scan))) != NULL) {
+            *nul_scan = NL;
+            scan = nul_scan + 1;
+          }
+        }
+
+        // Process the newline.
+        ptr = nl;
+        if (skip_count == 0) {
+          *ptr = NUL;                         // end of line
+          len = (colnr_T)(ptr - line_start + 1);
+          if (fileformat == EOL_DOS) {
+            if (ptr > line_start && ptr[-1] == CAR) {
+              // remove CR before NL
+              ptr[-1] = NUL;
+              len--;
+            } else if (ff_error != EOL_DOS) {
+              // Reading in Dos format, but no CR-LF found!
+              // When 'fileformats' includes "unix", delete all
+              // the lines read so far and start all over again.
+              // Otherwise give an error message later.
+              if (try_unix
+                  && !read_stdin
+                  && (read_buffer || vim_lseek(fd, 0, SEEK_SET) == 0)) {
+                fileformat = EOL_UNIX;
+                if (set_options) {
+                  set_fileformat(EOL_UNIX, OPT_LOCAL);
+                }
+                file_rewind = true;
+                keep_fileformat = true;
+                goto retry;
+              }
+              ff_error = EOL_DOS;
+            }
+          }
+          if (ml_append(lnum, line_start, len, newfile) == FAIL) {
+            error = true;
+            break;
+          }
+          if (read_undo_file) {
+            sha256_update(&sha_ctx, (uint8_t *)line_start, (size_t)len);
+          }
+          lnum++;
+          if (--read_count == 0) {
+            error = true;                         // break loop
+            line_start = ptr;                 // nothing left to write
+            break;
+          }
+        } else {
+          skip_count--;
+        }
+        line_start = ptr + 1;
+        ptr++;
      }
+      size = -1;
    }
    linerest = (ptr - line_start);
    os_breakcheck();