libghostty: Remove all libc++ and libc++ ABI dependencies (#12291)

This updates simdutf to my fork which has a SIMDUTF_NO_LIBCXX option that removes all libc++ and libc++ ABI dependencies. The plan is to open an upstream PR with this, but I want to verify it here first. From there, the hand-written simd code we have has been updated to also no longer use any libc++ features. Part of this required removing utfcpp since it depended on libc++ (`<iterator>`). libghostty-vt now only depends on libc.

## Benchmark Results

| Corpus | Current `HEAD` median | `main` median | Delta vs `main` | Notes |
| --- | ---: | ---: | ---: | --- |
| `valid-mixed-1g-seed1.bin` | `9.245s` | `9.111s` | `1.5%` slower | Near tie; `main` remains slightly faster on fully valid input |
| `malformed-mixed-1g-seed1-rate0.005.bin` | `9.251s` | `12.705s` | `37.3%` faster | Large improvement on malformed UTF-8 input |

Approximate throughput from the medians:

- Valid corpus: current `HEAD` `110.8 MiB/s`, `main` `112.4 MiB/s`
- Malformed corpus: current `HEAD` `110.7 MiB/s`, `main` `80.6 MiB/s`
2026-07-11 03:39:36 +00:00 · 2026-04-15 11:36:16 -07:00
parent efa8da6aea e51de8b58f
commit 43a05dc968
25 changed files with 48248 additions and 33886 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -419,8 +419,14 @@ jobs:
          echo "Libs:   $(pkg-config --libs libghostty-vt)"
          echo "Static: $(pkg-config --libs --static libghostty-vt)"

-          # Libs.private must include the C++ standard library
-          pkg-config --libs --static libghostty-vt | grep -q -- '-lc++'
+          # Libs.private must NOT include the C++ runtime libraries (all
+          # vendored C++ deps are built in no-libcxx mode).
+          ! pkg-config --libs --static libghostty-vt | grep -qE -- '-lc\+\+|-lc\+\+abi'
+
+      - name: Verify shared library has no libc++ dependency
+        run: |
+          ldd zig-out/lib/libghostty-vt.so.0.1.0
+          ! ldd zig-out/lib/libghostty-vt.so.0.1.0 2>/dev/null | grep -qE 'libc\+\+|libc\+\+abi'

      - name: Verify static archive contains SIMD deps
        run: |
@@ -452,16 +458,15 @@ jobs:
      - name: Test static link via pkg-config
        run: |
          export PKG_CONFIG_PATH="$PWD/zig-out/share/pkgconfig"
-          # The static library is compiled with LLVM libc++ (not GNU
-          # libstdc++), so linking requires a libc++-compatible toolchain.
-          # zig cc, clang, or gcc with libc++-dev installed all work.
-          nix develop -c zig cc -o /tmp/test_static /tmp/test_libghostty_vt.c \
+          # The static archive must link cleanly into a plain C program
+          # without any extra C++ runtime flags.
+          nix develop -c cc -o /tmp/test_static /tmp/test_libghostty_vt.c \
            $(pkg-config --cflags libghostty-vt) \
            "$PWD/zig-out/lib/libghostty-vt.a" \
            $(pkg-config --libs-only-l --static libghostty-vt | sed 's/-lghostty-vt//')
          /tmp/test_static
-          # Verify it's truly statically linked (no libghostty-vt.so dependency)
-          ! ldd /tmp/test_static 2>/dev/null | grep -q libghostty-vt
+          # Verify it doesn't depend on the shared lib or a C++ runtime.
+          ! ldd /tmp/test_static 2>/dev/null | grep -qE 'libghostty-vt|libc\+\+|libc\+\+abi'

      # Test system integration: rebuild with -Dsystem-simdutf=true so
      # simdutf comes from the system instead of being vendored. This
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -178,9 +178,8 @@ add_dependencies(ghostty-vt zig_build_lib_vt)
 # Static
 #
 # On Linux and macOS, the static library is a fat archive that bundles
-# the vendored SIMD dependencies (highway, simdutf, utfcpp). Consumers
-# only need to link libc and libc++ (LLVM's C++ runtime, not GNU
-# libstdc++). Use zig cc, clang, or any toolchain with libc++ support.
+# the vendored SIMD dependencies (highway, simdutf). Consumers
+# only need to link libc.
 #
 # On Windows, the SIMD dependencies are not bundled and must be linked
 # separately.
@@ -349,11 +348,7 @@ function(ghostty_vt_add_target)
    )
    if(_GVT_ZIG_TARGET MATCHES "windows")
        set_target_properties(${_static_target} PROPERTIES
-            INTERFACE_LINK_LIBRARIES "c++;ntdll;kernel32"
-        )
-    else()
-        set_target_properties(${_static_target} PROPERTIES
-            INTERFACE_LINK_LIBRARIES "c++"
+            INTERFACE_LINK_LIBRARIES "ntdll;kernel32"
        )
    endif()
    add_dependencies(${_static_target} ${_build_target})
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -76,7 +76,6 @@
        .opengl = .{ .path = "./pkg/opengl", .lazy = true },
        .sentry = .{ .path = "./pkg/sentry", .lazy = true },
        .simdutf = .{ .path = "./pkg/simdutf", .lazy = true },
-        .utfcpp = .{ .path = "./pkg/utfcpp", .lazy = true },
        .wuffs = .{ .path = "./pkg/wuffs", .lazy = true },
        .zlib = .{ .path = "./pkg/zlib", .lazy = true },

--- a/build.zig.zon.json
+++ b/build.zig.zon.json
@@ -109,11 +109,6 @@
    "url": "https://deps.files.ghostty.org/spirv_cross-1220fb3b5586e8be67bc3feb34cbe749cf42a60d628d2953632c2f8141302748c8da.tar.gz",
    "hash": "sha256-tStvz8Ref6abHwahNiwVVHNETizAmZVVaxVsU7pmV+M="
  },
-  "N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH": {
-    "name": "utfcpp",
-    "url": "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz",
-    "hash": "sha256-/8ZooxDndgfTk/PBizJxXyI9oerExNbgV5oR345rWc8="
-  },
  "uucode-0.1.0-ZZjBPj96QADXyt5sqwBJUnhaDYs_qBeeKijZvlRa0eqM": {
    "name": "uucode",
    "url": "git+https://github.com/jacobsandlund/uucode#5f05f8f83a75caea201f12cc8ea32a2d82ea9732",
--- a/build.zig.zon.nix
+++ b/build.zig.zon.nix
@@ -258,14 +258,6 @@ in
        hash = "sha256-tStvz8Ref6abHwahNiwVVHNETizAmZVVaxVsU7pmV+M=";
      };
    }
-    {
-      name = "N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH";
-      path = fetchZigArtifact {
-        name = "utfcpp";
-        url = "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz";
-        hash = "sha256-/8ZooxDndgfTk/PBizJxXyI9oerExNbgV5oR345rWc8=";
-      };
-    }
    {
      name = "uucode-0.1.0-ZZjBPj96QADXyt5sqwBJUnhaDYs_qBeeKijZvlRa0eqM";
      path = fetchZigArtifact {
--- a/build.zig.zon.txt
+++ b/build.zig.zon.txt
@@ -20,7 +20,6 @@ https://deps.files.ghostty.org/pixels-12207ff340169c7d40c570b4b6a97db614fe47e0d8
 https://deps.files.ghostty.org/plasma_wayland_protocols-12207e0851c12acdeee0991e893e0132fc87bb763969a585dc16ecca33e88334c566.tar.gz
 https://deps.files.ghostty.org/sentry-1220446be831adcca918167647c06c7b825849fa3fba5f22da394667974537a9c77e.tar.gz
 https://deps.files.ghostty.org/spirv_cross-1220fb3b5586e8be67bc3feb34cbe749cf42a60d628d2953632c2f8141302748c8da.tar.gz
-https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz
 https://deps.files.ghostty.org/uucode-0.2.0-ZZjBPqZVVABQepOqZHR7vV_NcaN-wats0IB6o-Exj6m9.tar.gz
 https://deps.files.ghostty.org/vaxis-7dbb9fd3122e4ffad262dd7c151d80d863b68558.tar.gz
 https://deps.files.ghostty.org/wayland-9cb3d7aa9dc995ffafdbdef7ab86a949d0fb0e7d.tar.gz
--- a/example/cpp-vt-stream/build.zig
+++ b/example/cpp-vt-stream/build.zig
@@ -14,6 +14,7 @@ pub fn build(b: *std.Build) void {
        .root = b.path("src"),
        .files = &.{"main.cpp"},
    });
+    exe_mod.link_libcpp = true;

    // You'll want to use a lazy dependency here so that ghostty is only
    // downloaded if you actually need it.
--- a/example/swift-vt-xcframework/Package.swift
+++ b/example/swift-vt-xcframework/Package.swift
@@ -8,10 +8,7 @@ let package = Package(
        .executableTarget(
            name: "swift-vt-xcframework",
            dependencies: ["GhosttyVt"],
-            path: "Sources",
-            linkerSettings: [
-                .linkedLibrary("c++"),
-            ]
+            path: "Sources"
        ),
        .binaryTarget(
            name: "GhosttyVt",
--- a/flatpak/zig-packages.json
+++ b/flatpak/zig-packages.json
@@ -131,12 +131,6 @@
    "dest": "vendor/p/N-V-__8AANb6pwD7O1WG6L5nvD_rNMvnSc9Cpg1ijSlTYywv",
    "sha256": "b52b6fcfc45e7fa69b1f06a1362c155473444e2cc09995556b156c53ba6657e3"
  },
-  {
-    "type": "archive",
-    "url": "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz",
-    "dest": "vendor/p/N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH",
-    "sha256": "ffc668a310e77607d393f3c18b32715f223da1eac4c4d6e0579a11df8e6b59cf"
-  },
  {
    "type": "git",
    "url": "https://github.com/jacobsandlund/uucode",
--- a/nix/build-support/check-zig-cache.sh
+++ b/nix/build-support/check-zig-cache.sh
@@ -79,6 +79,11 @@ elif [ "$1" != "--update" ]; then
  exit 1
 fi

+# Fetch all dependencies (including lazy ones) into the global cache
+# so that zon2nix can find them when resolving transitive dependencies.
+# Otherwise, lazy dependencies that aren't unpacked will fail below.
+zig build --fetch=all
+
 zon2nix "$BUILD_ZIG_ZON" --15 --nix "$WORK_DIR/build.zig.zon.nix" --txt "$WORK_DIR/build.zig.zon.txt" --json "$WORK_DIR/build.zig.zon.json" --flatpak "$WORK_DIR/zig-packages.json"
 alejandra --quiet "$WORK_DIR/build.zig.zon.nix"
 prettier --log-level warn --write "$WORK_DIR/build.zig.zon.json"
--- a/pkg/simdutf/build.zig
+++ b/pkg/simdutf/build.zig
@@ -3,6 +3,7 @@ const std = @import("std");
 pub fn build(b: *std.Build) !void {
    const optimize = b.standardOptimizeOption(.{});
    const target = b.standardTargetOptions(.{});
+    const no_libcxx = b.option(bool, "no_libcxx", "Set SIMDUTF_NO_LIBCXX to avoid libc++ dependency") orelse false;

    const lib = b.addLibrary(.{
        .name = "simdutf",
@@ -13,13 +14,15 @@ pub fn build(b: *std.Build) !void {
        .linkage = .static,
    });
    lib.linkLibC();
-    // On MSVC, we must not use linkLibCpp because Zig unconditionally
-    // passes -nostdinc++ and then adds its bundled libc++/libc++abi
-    // include paths, which conflict with MSVC's own C++ runtime headers.
-    // The MSVC SDK include directories (added via linkLibC) contain
-    // both C and C++ headers, so linkLibCpp is not needed.
-    if (target.result.abi != .msvc) {
-        lib.linkLibCpp();
+    if (!no_libcxx) {
+        // On MSVC, we must not use linkLibCpp because Zig unconditionally
+        // passes -nostdinc++ and then adds its bundled libc++/libc++abi
+        // include paths, which conflict with MSVC's own C++ runtime headers.
+        // The MSVC SDK include directories (added via linkLibC) contain
+        // both C and C++ headers, so linkLibCpp is not needed.
+        if (target.result.abi != .msvc) {
+            lib.linkLibCpp();
+        }
    }
    lib.addIncludePath(b.path("vendor"));

@@ -45,6 +48,13 @@ pub fn build(b: *std.Build) !void {
        "-fno-sanitize-trap=undefined",
    });

+    if (no_libcxx) {
+        try flags.append(b.allocator, "-DSIMDUTF_NO_LIBCXX");
+        try flags.append(b.allocator, "-fno-exceptions");
+        try flags.append(b.allocator, "-fno-rtti");
+        lib.root_module.addCMacro("SIMDUTF_NO_LIBCXX", "1");
+    }
+
    if (target.result.os.tag == .freebsd or target.result.abi == .musl) {
        try flags.append(b.allocator, "-fPIC");
    }
--- a/pkg/simdutf/vendor/simdutf.cpp
+++ b/pkg/simdutf/vendor/simdutf.cpp
--- a/pkg/simdutf/vendor/simdutf.h
+++ b/pkg/simdutf/vendor/simdutf.h
--- a/pkg/utfcpp/build.zig
+++ b/pkg/utfcpp/build.zig
@@ -1,61 +0,0 @@
-const std = @import("std");
-
-pub fn build(b: *std.Build) !void {
-    const target = b.standardTargetOptions(.{});
-    const optimize = b.standardOptimizeOption(.{});
-
-    const lib = b.addLibrary(.{
-        .name = "utfcpp",
-        .root_module = b.createModule(.{
-            .target = target,
-            .optimize = optimize,
-        }),
-        .linkage = .static,
-    });
-    lib.linkLibC();
-
-    if (target.result.os.tag.isDarwin()) {
-        const apple_sdk = @import("apple_sdk");
-        try apple_sdk.addPaths(b, lib);
-    }
-
-    if (target.result.abi.isAndroid()) {
-        const android_ndk = @import("android_ndk");
-        try android_ndk.addPaths(b, lib);
-    }
-
-    var flags: std.ArrayList([]const u8) = .empty;
-    defer flags.deinit(b.allocator);
-
-    lib.addCSourceFiles(.{
-        .flags = flags.items,
-        .files = &.{"empty.cc"},
-    });
-
-    if (b.lazyDependency("utfcpp", .{})) |upstream| {
-        lib.addIncludePath(upstream.path(""));
-        lib.installHeadersDirectory(
-            upstream.path("source"),
-            "",
-            .{ .include_extensions = &.{".h"} },
-        );
-    }
-
-    b.installArtifact(lib);
-
-    // {
-    //     const test_exe = b.addTest(.{
-    //         .name = "test",
-    //         .root_source_file = .{ .path = "main.zig" },
-    //         .target = target,
-    //         .optimize = optimize,
-    //     });
-    //     test_exe.linkLibrary(lib);
-    //
-    //     var it = module.import_table.iterator();
-    //     while (it.next()) |entry| test_exe.root_module.addImport(entry.key_ptr.*, entry.value_ptr.*);
-    //     const tests_run = b.addRunArtifact(test_exe);
-    //     const test_step = b.step("test", "Run tests");
-    //     test_step.dependOn(&tests_run.step);
-    // }
-}
--- a/pkg/utfcpp/build.zig.zon
+++ b/pkg/utfcpp/build.zig.zon
@@ -1,17 +0,0 @@
-.{
-    .name = .utfcpp,
-    .version = "4.0.5",
-    .fingerprint = 0xcd99aeb2334ae11a,
-    .paths = .{""},
-    .dependencies = .{
-        // nemtrif/utfcpp
-        .utfcpp = .{
-            .url = "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz",
-            .hash = "N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH",
-            .lazy = true,
-        },
-
-        .apple_sdk = .{ .path = "../apple-sdk" },
-        .android_ndk = .{ .path = "../android-ndk" },
-    },
-}
--- a/pkg/utfcpp/empty.cc
+++ b/pkg/utfcpp/empty.cc
@@ -1,2 +0,0 @@
-// Needed for Zig build to be happy
-void ghostty_utfcpp_stub() {}
--- a/src/benchmark/AGENTS.md
+++ b/src/benchmark/AGENTS.md
@@ -26,9 +26,23 @@ The benchmark tools are split into two roles:
  based on medians instead of single runs.
 - When comparing branches, keep all benchmark inputs and CLI flags the same,
  including terminal dimensions.
+- Never run multiple benchmarks in parallel on the same machine, as they will
+  interfere with each other and produce unreliable results.

 ## Building

- Build benchmark tools with `zig build -Demit-bench`.
- On macOS, prefer `zig build -Demit-bench -Demit-macos-app=false` unless the
-  macOS app itself is part of the work.
+- Build benchmark tools with `zig build -Demit-bench -Doptimize=ReleaseFast`.
+- On macOS, add `-Demit-macos-app=false` to avoid building the macOS app.
+- Make sure you specify `-Doptimize=ReleaseFast` when building benchmarks,
+  otherwise the debug build will be very slow and not representative of real
+  performance.
+
+## Comparing Branches
+
+- When comparing branches, switch to the first branch, build the binary, then
+  rename it, e.g. `zig-out/bin/ghostty-bench` to `zig-out/bin/ghostty-bench-branch1`.
+  Replace `branch1` with a descriptive name for that branch.
+- Then switch to the other branch, build it, and rename it to
+  `zig-out/bin/ghostty-bench-branch2`. Replace branch2 with something better.
+- Then run all the benchmarks with `hyperfine`, comparing every renamed
+  binary against the others.
--- a/src/build/GhosttyLibVt.zig
+++ b/src/build/GhosttyLibVt.zig
@@ -346,11 +346,8 @@ fn combineArchives(
 }

 /// Returns the Libs.private value for the pkg-config file.
-/// This includes the C++ standard library needed by SIMD code.
-///
-/// Zig compiles C++ code with LLVM's libc++ (not GNU libstdc++),
-/// so consumers linking the static library need a libc++-compatible
-/// toolchain: `zig cc`, `clang`, or GCC with `-lc++` installed.
+/// Vendored C++ dependencies are built in no-libcxx mode so consumers
+/// don't need libc++.  System-provided simdutf still requires it.
 fn libsPrivate(
    zig: *const GhosttyZig,
 ) []const u8 {
--- a/src/build/GhosttyZig.zig
+++ b/src/build/GhosttyZig.zig
@@ -119,13 +119,14 @@ fn initVt(
        .target = cfg.target,
        .optimize = cfg.optimize,

-        // SIMD require libc/libcpp (both) but otherwise we don't care.
-        // On MSVC, we must not use linkLibCpp because Zig passes
-        // -nostdinc++ and adds its bundled libc++/libc++abi headers
-        // which conflict with MSVC's C++ runtime. The MSVC SDK dirs
-        // added via link_libc contain both C and C++ headers.
+        // SIMD requires libc. Vendored C++ dependencies are built with
+        // no-libcxx mode (HWY_NO_LIBCXX / SIMDUTF_NO_LIBCXX) so we
+        // don't need libcpp. System-provided simdutf headers still
+        // use C++ stdlib headers, so we need libcpp in that case.
        .link_libc = if (cfg.simd) true else null,
-        .link_libcpp = if (cfg.simd and cfg.target.result.abi != .msvc) true else null,
+        .link_libcpp = if (cfg.simd and
+            b.systemIntegrationOption("simdutf", .{}) and
+            cfg.target.result.abi != .msvc) true else null,
    });
    vt.addOptions("build_options", general_options);
    vt_options.add(b, vt);
--- a/src/build/SharedDeps.zig
+++ b/src/build/SharedDeps.zig
@@ -762,6 +762,7 @@ pub fn addSimd(
        if (b.lazyDependency("simdutf", .{
            .target = target,
            .optimize = optimize,
+            .no_libcxx = true,
        })) |simdutf_dep| {
            m.linkLibrary(simdutf_dep.artifact("simdutf"));
            if (static_libs) |v| try v.append(
@@ -787,18 +788,6 @@ pub fn addSimd(
        }
    }

-    // utfcpp - This is used as a dependency on our hand-written C++ code
-    if (b.lazyDependency("utfcpp", .{
-        .target = target,
-        .optimize = optimize,
-    })) |utfcpp_dep| {
-        m.linkLibrary(utfcpp_dep.artifact("utfcpp"));
-        if (static_libs) |v| try v.append(
-            b.allocator,
-            utfcpp_dep.artifact("utfcpp").getEmittedBin(),
-        );
-    }
-
    // SIMD C++ files
    m.addIncludePath(b.path("src"));
    {
@@ -839,6 +828,14 @@ pub fn addSimd(
            "-DHWY_NO_LIBCXX",
        );

+        // When using the vendored simdutf, build its headers in no-libcxx
+        // mode so we don't need C++ standard library headers at all.
+        // System simdutf headers may not support this define.
+        if (!b.systemIntegrationOption("simdutf", .{})) try flags.append(
+            b.allocator,
+            "-DSIMDUTF_NO_LIBCXX",
+        );
+
        // Disable ubsan for MSVC to avoid undefined references to
        // __ubsan_handle_* symbols that require a runtime we don't link
        // and bundle. Hopefully we can fix this one day since ubsan is nice!
--- a/src/simd/codepoint_width.cpp
+++ b/src/simd/codepoint_width.cpp
@@ -4,9 +4,34 @@
 #include <hwy/foreach_target.h>  // must come before highway.h
 #include <hwy/highway.h>

-#include <algorithm>
-#include <cassert>
-#include <iterator>
+#ifndef GHOSTTY_SIMD_CPW_HELPERS_
+#define GHOSTTY_SIMD_CPW_HELPERS_
+
+#include <assert.h>
+#include <stddef.h>
+
+// Replacement for std::size() that works without libc++.
+template <typename T, size_t N>
+constexpr size_t array_size(const T (&)[N]) { return N; }
+
+// Constexpr min/max over a C array, in place of std::min_element/max_element.
+// Note: array_min skips zero entries (smallest nonzero value, or 0 if none).
+template <typename T, size_t N>
+constexpr T array_min(const T (&a)[N]) {
+  T m = a[0];
+  for (size_t i = 1; i < N; ++i)
+    if (a[i] != 0 && (m == 0 || a[i] < m)) m = a[i];
+  return m;
+}
+template <typename T, size_t N>
+constexpr T array_max(const T (&a)[N]) {
+  T m = a[0];
+  for (size_t i = 1; i < N; ++i)
+    if (a[i] > m) m = a[i];
+  return m;
+}
+
+#endif  // GHOSTTY_SIMD_CPW_HELPERS_

 HWY_BEFORE_NAMESPACE();
 namespace ghostty {
@@ -214,12 +239,12 @@ HWY_ALIGN constexpr uint16_t nsm_lte16[] = {
 };

 // All our tables must be identically sized
-static_assert(std::size(eaw_gte32) == std::size(eaw_lte32));
-static_assert(std::size(eaw_gte16) == std::size(eaw_lte16));
-static_assert(std::size(zero_gte32) == std::size(zero_lte32));
-static_assert(std::size(zero_gte16) == std::size(zero_lte16));
-static_assert(std::size(nsm_gte32) == std::size(nsm_lte32));
-static_assert(std::size(nsm_gte16) == std::size(nsm_lte16));
+static_assert(array_size(eaw_gte32) == array_size(eaw_lte32));
+static_assert(array_size(eaw_gte16) == array_size(eaw_lte16));
+static_assert(array_size(zero_gte32) == array_size(zero_lte32));
+static_assert(array_size(zero_gte16) == array_size(zero_lte16));
+static_assert(array_size(nsm_gte32) == array_size(nsm_lte32));
+static_assert(array_size(nsm_gte16) == array_size(nsm_lte16));

 /// Handles 16-bit codepoints.
 template <class D, typename T = uint16_t>
@@ -245,10 +270,10 @@ int8_t CodepointWidth16(D d, uint16_t input) {
        0,      0,      0,      0,      0,      0,      0,      0,      0, 0,
        0,      0,      0,      0,      0,      0,      0,      0,      0, 0,
    };
-    static_assert(std::size(gte_keys) == std::size(lte_keys));
-    static_assert(std::size(gte_keys) >= 32);
+    static_assert(array_size(gte_keys) == array_size(lte_keys));
+    static_assert(array_size(gte_keys) >= 32);
    size_t i = 0;
-    for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) {
+    for (; i + N <= array_size(lte_keys) && lte_keys[i] != 0; i += N) {
      const hn::Vec<D> lte_vec = hn::Load(d, lte_keys + i);
      const hn::Vec<D> gte_vec = hn::Load(d, gte_keys + i);
      const intptr_t idx = hn::FindFirstTrue(
@@ -267,12 +292,12 @@ int8_t CodepointWidth16(D d, uint16_t input) {

  {
    constexpr T zero_gte_min =
-        *std::min_element(zero_gte16, zero_gte16 + std::size(zero_gte16));
+        array_min(zero_gte16);
    constexpr T zero_lte_max =
-        *std::max_element(zero_lte16, zero_lte16 + std::size(zero_lte16));
+        array_max(zero_lte16);
    if (input >= zero_gte_min && input <= zero_lte_max) {
      size_t i = 0;
-      for (; i + N <= std::size(zero_gte16) && zero_gte16[i] != 0; i += N) {
+      for (; i + N <= array_size(zero_gte16) && zero_gte16[i] != 0; i += N) {
        const hn::Vec<D> lte_vec = hn::Load(d, zero_lte16 + i);
        const hn::Vec<D> gte_vec = hn::Load(d, zero_gte16 + i);
        const intptr_t idx = hn::FindFirstTrue(
@@ -286,12 +311,12 @@ int8_t CodepointWidth16(D d, uint16_t input) {

  {
    constexpr T eaw_gte_min =
-        *std::min_element(eaw_gte16, eaw_gte16 + std::size(eaw_gte16));
+        array_min(eaw_gte16);
    constexpr T eaw_lte_max =
-        *std::max_element(eaw_lte16, eaw_lte16 + std::size(eaw_lte16));
+        array_max(eaw_lte16);
    if (input >= eaw_gte_min && input <= eaw_lte_max) {
      size_t i = 0;
-      for (; i + N <= std::size(eaw_lte16) && eaw_lte16[i] != 0; i += N) {
+      for (; i + N <= array_size(eaw_lte16) && eaw_lte16[i] != 0; i += N) {
        const hn::Vec<D> lte_vec = hn::Load(d, eaw_lte16 + i);
        const hn::Vec<D> gte_vec = hn::Load(d, eaw_gte16 + i);
        const intptr_t idx = hn::FindFirstTrue(
@@ -305,12 +330,12 @@ int8_t CodepointWidth16(D d, uint16_t input) {

  {
    constexpr T nsm_gte_min =
-        *std::min_element(nsm_gte16, nsm_gte16 + std::size(nsm_gte16));
+        array_min(nsm_gte16);
    constexpr T nsm_lte_max =
-        *std::max_element(nsm_lte16, nsm_lte16 + std::size(nsm_lte16));
+        array_max(nsm_lte16);
    if (input >= nsm_gte_min && input <= nsm_lte_max) {
      size_t i = 0;
-      for (; i + N <= std::size(nsm_lte16) && nsm_lte16[i] != 0; i += N) {
+      for (; i + N <= array_size(nsm_lte16) && nsm_lte16[i] != 0; i += N) {
        const hn::Vec<D> lte_vec = hn::Load(d, nsm_lte16 + i);
        const hn::Vec<D> gte_vec = hn::Load(d, nsm_gte16 + i);
        const intptr_t idx = hn::FindFirstTrue(
@@ -342,10 +367,10 @@ int8_t CodepointWidth32(D d, T input) {
    HWY_ALIGN constexpr T lte_keys[] = {
        0x1f1ff, 0x2FFFD, 0x3FFFD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    };
-    static_assert(std::size(gte_keys) == std::size(lte_keys));
-    static_assert(std::size(gte_keys) >= 16);
+    static_assert(array_size(gte_keys) == array_size(lte_keys));
+    static_assert(array_size(gte_keys) >= 16);
    size_t i = 0;
-    for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) {
+    for (; i + N <= array_size(lte_keys) && lte_keys[i] != 0; i += N) {
      const hn::Vec<D> lte_vec = hn::Load(d, lte_keys + i);
      const hn::Vec<D> gte_vec = hn::Load(d, gte_keys + i);
      const intptr_t idx = hn::FindFirstTrue(
@@ -364,10 +389,10 @@ int8_t CodepointWidth32(D d, T input) {
    HWY_ALIGN constexpr T lte_keys[] = {
        0xE0FFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    };
-    static_assert(std::size(gte_keys) == std::size(lte_keys));
-    static_assert(std::size(gte_keys) >= 16);
+    static_assert(array_size(gte_keys) == array_size(lte_keys));
+    static_assert(array_size(gte_keys) >= 16);
    size_t i = 0;
-    for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) {
+    for (; i + N <= array_size(lte_keys) && lte_keys[i] != 0; i += N) {
      const hn::Vec<D> lte_vec = hn::Load(d, lte_keys + i);
      const hn::Vec<D> gte_vec = hn::Load(d, gte_keys + i);
      const intptr_t idx = hn::FindFirstTrue(
@@ -380,12 +405,12 @@ int8_t CodepointWidth32(D d, T input) {

  {
    constexpr T zero_gte_min =
-        *std::min_element(zero_gte32, zero_gte32 + std::size(zero_gte32));
+        array_min(zero_gte32);
    constexpr T zero_lte_max =
-        *std::max_element(zero_lte32, zero_lte32 + std::size(zero_lte32));
+        array_max(zero_lte32);
    if (input >= zero_gte_min && input <= zero_lte_max) {
      size_t i = 0;
-      for (; i + N <= std::size(zero_gte32) && zero_gte32[i] != 0; i += N) {
+      for (; i + N <= array_size(zero_gte32) && zero_gte32[i] != 0; i += N) {
        const hn::Vec<D> lte_vec = hn::Load(d, zero_lte32 + i);
        const hn::Vec<D> gte_vec = hn::Load(d, zero_gte32 + i);
        const intptr_t idx = hn::FindFirstTrue(
@@ -399,12 +424,12 @@ int8_t CodepointWidth32(D d, T input) {

  {
    constexpr T eaw_gte_min =
-        *std::min_element(eaw_gte32, eaw_gte32 + std::size(eaw_gte32));
+        array_min(eaw_gte32);
    constexpr T eaw_lte_max =
-        *std::max_element(eaw_lte32, eaw_lte32 + std::size(eaw_lte32));
+        array_max(eaw_lte32);
    if (input >= eaw_gte_min && input <= eaw_lte_max) {
      size_t i = 0;
-      for (; i + N <= std::size(eaw_lte32) && eaw_lte32[i] != 0; i += N) {
+      for (; i + N <= array_size(eaw_lte32) && eaw_lte32[i] != 0; i += N) {
        const hn::Vec<D> lte_vec = hn::Load(d, eaw_lte32 + i);
        const hn::Vec<D> gte_vec = hn::Load(d, eaw_gte32 + i);
        const intptr_t idx = hn::FindFirstTrue(
@@ -418,12 +443,12 @@ int8_t CodepointWidth32(D d, T input) {

  {
    constexpr T nsm_gte_min =
-        *std::min_element(nsm_gte32, nsm_gte32 + std::size(nsm_gte32));
+        array_min(nsm_gte32);
    constexpr T nsm_lte_max =
-        *std::max_element(nsm_lte32, nsm_lte32 + std::size(nsm_lte32));
+        array_max(nsm_lte32);
    if (input >= nsm_gte_min && input <= nsm_lte_max) {
      size_t i = 0;
-      for (; i + N <= std::size(nsm_lte32) && nsm_lte32[i] != 0; i += N) {
+      for (; i + N <= array_size(nsm_lte32) && nsm_lte32[i] != 0; i += N) {
        const hn::Vec<D> lte_vec = hn::Load(d, nsm_lte32 + i);
        const hn::Vec<D> gte_vec = hn::Load(d, nsm_gte32 + i);
        const intptr_t idx = hn::FindFirstTrue(
--- a/src/simd/index_of.cpp
+++ b/src/simd/index_of.cpp
@@ -6,8 +6,6 @@

 #include <simd/index_of.h>

-#include <optional>
-
 HWY_BEFORE_NAMESPACE();
 namespace ghostty {
 namespace HWY_NAMESPACE {
--- a/src/simd/index_of.h
+++ b/src/simd/index_of.h
@@ -7,8 +7,7 @@

 #include <hwy/highway.h>

-#include <cstddef>
-#include <optional>
+#include <stddef.h>

 HWY_BEFORE_NAMESPACE();
 namespace ghostty {
@@ -16,12 +15,16 @@ namespace HWY_NAMESPACE {

 namespace hn = hwy::HWY_NAMESPACE;

+// Sentinel value returned by IndexOfChunk when no match is found.
+static constexpr size_t kNotFound = static_cast<size_t>(-1);
+
 // Return the index of the first occurrence of `needle` in `input`, where
-// the input and needle are already loaded into vectors.
+// the input and needle are already loaded into vectors. Returns kNotFound
+// if no match is found.
 template <class D, typename T = hn::TFromD<D>>
-std::optional<size_t> IndexOfChunk(D d,
-                                   hn::Vec<D> needle_vec,
-                                   hn::Vec<D> input_vec) {
+size_t IndexOfChunk(D d,
+                    hn::Vec<D> needle_vec,
+                    hn::Vec<D> input_vec) {
  // Compare the input vector with the needle vector. This produces
  // a vector where each lane is 0xFF if the corresponding lane in
  // `input_vec` is equal to the corresponding lane in `needle_vec`.
@@ -32,9 +35,9 @@ std::optional<size_t> IndexOfChunk(D d,

  // If we found a match, return the index into the input.
  if (pos >= 0) {
-    return std::optional<size_t>(static_cast<size_t>(pos));
+    return static_cast<size_t>(pos);
  } else {
-    return std::nullopt;
+    return kNotFound;
  }
 }

@@ -58,8 +61,9 @@ size_t IndexOfImpl(D d, T needle, const T* HWY_RESTRICT input, size_t count) {
  for (; i + N <= count; i += N) {
    // Load the N elements from our input into a vector and check the chunk.
    const hn::Vec<D> input_vec = hn::LoadU(d, input + i);
-    if (auto pos = IndexOfChunk(d, needle_vec, input_vec)) {
-      return i + pos.value();
+    const size_t pos = IndexOfChunk(d, needle_vec, input_vec);
+    if (pos != kNotFound) {
+      return i + pos;
    }
  }

--- a/src/simd/vt.cpp
+++ b/src/simd/vt.cpp
@@ -5,8 +5,8 @@
 #include <hwy/highway.h>

 #include <simdutf.h>
-#include <utf8.h>
-#include <vector>
+#include <stdlib.h>
+#include <string.h>

 #include <simd/index_of.h>
 #include <simd/vt.h>
@@ -19,12 +19,144 @@ namespace hn = hwy::HWY_NAMESPACE;

 using T = uint8_t;

+// Compute the length of the maximal subpart of an ill-formed UTF-8
+// subsequence starting at p[0], per Unicode Table 3-7 and the W3C
+// "U+FFFD Substitution of Maximal Subparts" algorithm.
+//
+// The maximal subpart is the longest initial subsequence that is either:
+//   (a) the start of a well-formed sequence, or
+//   (b) a single byte.
+// Each maximal subpart maps to exactly one U+FFFD.
+//
+// Well-formed byte ranges checked here (Unicode Table 3-7):
+//   lead     2nd byte  3rd byte  4th byte
+//   C2..DF   80..BF
+//   E0       A0..BF    80..BF
+//   E1..EC   80..BF    80..BF
+//   ED       80..9F    80..BF
+//   EE..EF   80..BF    80..BF
+//   F0       90..BF    80..BF    80..BF
+//   F1..F3   80..BF    80..BF    80..BF
+//   F4       80..8F    80..BF    80..BF
+//
+// Parameters: `p` points at the first byte to examine; `len` is the
+// number of readable bytes starting at `p` (bytes at or past p[len] are
+// never read).
+// Returns 0 only when `len` is 0, otherwise a length in [1, min(len, 4)].
+static size_t MaximalSubpart(const unsigned char* p, size_t len) {
+  if (len == 0) return 0;
+
+  const unsigned char b0 = p[0];
+
+  // ASCII (00-7F), continuation bytes (80-BF), overlong leads (C0-C1), and
+  // invalid bytes (F5-FF) can never start a multi-byte sequence: each is
+  // its own maximal subpart of length 1.
+  if (b0 < 0xC2 || b0 > 0xF4) return 1;
+
+  // Expected total sequence length, determined by the lead byte.
+  size_t seq_len;
+  if (b0 <= 0xDF) {
+    seq_len = 2;
+  } else if (b0 <= 0xEF) {
+    seq_len = 3;
+  } else {
+    seq_len = 4;
+  }
+
+  // Allowed range for the next continuation byte. Only the byte right
+  // after the lead is ever narrower than 80-BF, and only for these leads.
+  unsigned char lo = 0x80;
+  unsigned char hi = 0xBF;
+  switch (b0) {
+    case 0xE0: lo = 0xA0; break;  // below A0 would be an overlong 3-byte form
+    case 0xED: hi = 0x9F; break;  // above 9F would encode a surrogate
+    case 0xF0: lo = 0x90; break;  // below 90 would be an overlong 4-byte form
+    case 0xF4: hi = 0x8F; break;  // above 8F would exceed U+10FFFF
+    default: break;
+  }
+
+  // Extend the subpart for as long as bytes are available and in range.
+  size_t valid = 1;  // lead byte counts
+  while (valid < seq_len && valid < len) {
+    const unsigned char cb = p[valid];
+    if (cb < lo || cb > hi) break;
+    valid++;
+
+    // Third and fourth bytes are always plain continuation bytes.
+    lo = 0x80;
+    hi = 0xBF;
+  }
+
+  // `valid` reaches seq_len only for a complete, well-formed sequence.
+  // That is not expected: this is meant to be called on an ill-formed or
+  // truncated subsequence. If it happens anyway the full length is
+  // returned as-is; no capping is applied.
+  return valid;
+}
+
+// Returns the length of `input` after dropping a trailing UTF-8 sequence
+// that is incomplete but valid so far (every byte present matches its
+// Table 3-7 range), i.e. one that future input could still complete.
+// Anything else at the tail (invalid lead bytes such as C0, C1, F5-FF, or
+// mismatched continuations) is kept so DecodeUTF8 can treat it as an error.
+static size_t TrimValidPartialUTF8(const uint8_t* input, size_t len) {
+  // Only the last 4 bytes (max UTF-8 sequence length) are inspected.
+  const size_t window = len < 4 ? len : 4;
+
+  // Walk backwards over trailing continuation bytes (10xxxxxx) until the
+  // first byte that is not one; that byte is the candidate lead. `lead`
+  // stays equal to `len` if the window holds nothing but continuations
+  // (or the input is empty).
+  size_t lead = len;
+  for (size_t back = 1; back <= window; back++) {
+    const size_t idx = len - back;
+    if ((input[idx] & 0xC0) != 0x80) {
+      lead = idx;
+      break;
+    }
+  }
+
+  // No candidate lead found: nothing to trim.
+  if (lead == len) {
+    return len;
+  }
+
+  // Only a valid multi-byte lead (C2-F4) can begin a partial sequence
+  // worth trimming. ASCII, C0, C1 and F5-FF are left for DecodeUTF8.
+  const unsigned char first = input[lead];
+  if (first < 0xC2 || first > 0xF4) {
+    return len;
+  }
+
+  // Sequence length announced by the lead byte.
+  const size_t expected = first <= 0xDF ? 2 : (first <= 0xEF ? 3 : 4);
+
+  // Number of bytes actually present from the lead to the end of input.
+  const size_t present = len - lead;
+
+  // All announced bytes are there, so the tail is not a partial sequence.
+  if (present >= expected) {
+    return len;
+  }
+
+  // The tail is short. Trim it only when MaximalSubpart accepts every
+  // trailing byte, meaning the prefix is valid so far; otherwise it is
+  // ill-formed and must stay in place for DecodeUTF8 to replace.
+  const size_t subpart = MaximalSubpart(input + lead, present);
+  if (subpart != present) {
+    return len;
+  }
+
+  return lead;
+}
+
 // Decode the UTF-8 text in input into output. Returns the number of decoded
 // characters. This function assumes output is large enough.
 //
 // This function handles malformed UTF-8 sequences by inserting a
-// replacement character (U+FFFD) and continuing to decode. This function
-// will consume the entire input no matter what.
+// replacement character (U+FFFD) following the W3C/Unicode "U+FFFD
+// Substitution of Maximal Subparts" algorithm and continuing to decode.
+// This function will consume the entire input no matter what.
 size_t DecodeUTF8(const uint8_t* HWY_RESTRICT input,
                  size_t count,
                  char32_t* output) {
@@ -34,27 +166,38 @@ size_t DecodeUTF8(const uint8_t* HWY_RESTRICT input,
    return 0;
  }

-  // Assume no errors for fast path.
-  const size_t decoded = simdutf::convert_utf8_to_utf32(
-      reinterpret_cast<const char*>(input), count, output);
-  if (decoded > 0) {
-    return decoded;
+  // Decode UTF-8 to UTF-32, replacing invalid sequences with U+FFFD.
+  const char* in = reinterpret_cast<const char*>(input);
+  size_t remaining = count;
+  char32_t* out = output;
+  while (remaining > 0) {
+    auto r = simdutf::convert_utf8_to_utf32_with_errors(in, remaining, out);
+
+    // If the decode was a full success then we're done!
+    if (r.error == simdutf::SUCCESS) {
+      out += r.count;
+      break;
+    }
+
+    // On error, r.count is the input byte position of the error.
+    // The output buffer is already written up to that point, but
+    // we need count_utf8 to find how many char32_t that produced.
+    out += simdutf::count_utf8(in, r.count);
+
+    // Compute the maximal subpart at the error position and emit
+    // a single U+FFFD for it.
+    const unsigned char* err_pos =
+        reinterpret_cast<const unsigned char*>(in + r.count);
+    size_t err_remaining = remaining - r.count;
+    size_t skip = r.count + MaximalSubpart(err_pos, err_remaining);
+
+    *out++ = 0xFFFD;
+
+    in += skip;
+    remaining -= skip;
  }

-  // Errors in the UTF input, take a slow path and do a decode with
-  // replacement (with U+FFFD). Note that simdutf doesn't have a
-  // decode with replacement API:
-  // https://github.com/simdutf/simdutf/issues/147
-  //
-  // Because of this, we use a separate library with heap allocation
-  // that is much, much slower (the allocation is slower, the algorithm
-  // is slower, etc.) This is just so we have something that works.
-  // I want to replace this.
-  std::vector<char> replacement_result;
-  utf8::replace_invalid(input, input + count,
-                        std::back_inserter(replacement_result), 0xFFFD);
-  return DecodeUTF8(reinterpret_cast<const uint8_t*>(replacement_result.data()),
-                    replacement_result.size(), output);
+  return static_cast<size_t>(out - output);
 }

 /// Decode the UTF-8 text in input into output until an escape
@@ -86,16 +229,16 @@ size_t DecodeUTF8UntilControlSeqImpl(D d,
    // If we don't have any escapes we keep going. We want to accumulate
    // the largest possible valid UTF-8 sequence before decoding.
    // TODO(mitchellh): benchmark this vs decoding every time
-    const auto esc_idx = IndexOfChunk(d, esc_vec, input_vec);
-    if (!esc_idx) {
+    const size_t esc_idx = IndexOfChunk(d, esc_vec, input_vec);
+    if (esc_idx == kNotFound) {
      continue;
    }

    // We have an ESC char, decode up to this point. We start by assuming
    // a valid UTF-8 sequence and slow-path into error handling if we find
    // an invalid sequence.
-    *output_count = DecodeUTF8(input, i + esc_idx.value(), output);
-    return i + esc_idx.value();
+    *output_count = DecodeUTF8(input, i + esc_idx, output);
+    return i + esc_idx;
  }

  // If we have leftover input then we decode it one byte at a time (slow!)
@@ -106,21 +249,27 @@ size_t DecodeUTF8UntilControlSeqImpl(D d,
    const hn::Vec<D1> esc1 = Set(d1, hn::GetLane(esc_vec));
    for (; i < count; ++i) {
      const hn::Vec<D1> input_vec = hn::LoadU(d1, input + i);
-      const auto esc_idx = IndexOfChunk(d1, esc1, input_vec);
-      if (!esc_idx) {
+      const size_t esc_idx = IndexOfChunk(d1, esc1, input_vec);
+      if (esc_idx == kNotFound) {
        continue;
      }

-      *output_count = DecodeUTF8(input, i + esc_idx.value(), output);
-      return i + esc_idx.value();
+      *output_count = DecodeUTF8(input, i + esc_idx, output);
+      return i + esc_idx;
    }
  }

  // If we reached this point, its possible for our input to have an
  // incomplete sequence because we're consuming the full input. We need
  // to trim any incomplete sequences from the end of the input.
-  const size_t trimmed_len =
-      simdutf::trim_partial_utf8(reinterpret_cast<const char*>(input), i);
+  //
+  // We use our own trim instead of simdutf::trim_partial_utf8 because
+  // we only want to trim sequences that are valid-so-far (true partial
+  // sequences that may be completed by future input). Invalid bytes
+  // like C0, C1, F5-FF should NOT be trimmed — they should be passed
+  // through to DecodeUTF8 which will replace them with U+FFFD per the
+  // maximal subpart algorithm.
+  const size_t trimmed_len = TrimValidPartialUTF8(input, i);
  *output_count = DecodeUTF8(input, trimmed_len, output);
  return trimmed_len;
 }
--- a/src/simd/vt.zig
+++ b/src/simd/vt.zig
@@ -45,36 +45,79 @@ fn utf8DecodeUntilControlSeqScalar(
    const idx = indexOf(input, 0x1B) orelse input.len;
    const decode = input[0..idx];

-    // Go through and decode one item at a time.
+    // Go through and decode one item at a time, following the W3C/Unicode
+    // "U+FFFD Substitution of Maximal Subparts" algorithm for ill-formed
+    // subsequences.
    var decode_offset: usize = 0;
    var decode_count: usize = 0;
    while (decode_offset < decode.len) {
-        const decode_rem = decode[decode_offset..];
-        const cp_len = std.unicode.utf8ByteSequenceLength(decode_rem[0]) catch {
-            // Note, this is matching our SIMD behavior, but it is admittedly
-            // a bit weird. See our "decode invalid leading byte" test too.
-            // SIMD should be our source of truth then we copy behavior here.
-            break;
-        };
+        const b0 = decode[decode_offset];

-        // If we don't have that number of bytes available. we finish. We
-        // assume this is a partial input and we defer to the future.
-        if (decode_rem.len < cp_len) break;
-
-        // We have the bytes available, so move forward
-        const cp_bytes = decode_rem[0..cp_len];
-        decode_offset += cp_len;
-        if (std.unicode.utf8Decode(cp_bytes)) |cp| {
-            output[decode_count] = @intCast(cp);
+        // ASCII fast path
+        if (b0 < 0x80) {
+            output[decode_count] = b0;
            decode_count += 1;
-        } else |_| {
-            // If decoding failed, we replace the leading byte with the
-            // replacement char and then continue decoding after that
-            // byte. This matches the SIMD behavior and is tested by the
-            // "invalid UTF-8" tests.
+            decode_offset += 1;
+            continue;
+        }
+
+        // Continuation byte (80-BF) or invalid byte (C0-C1, F5-FF)
+        // as lead: each is its own maximal subpart → one FFFD per byte.
+        if (b0 < 0xC2 or b0 > 0xF4) {
            output[decode_count] = 0xFFFD;
            decode_count += 1;
-            decode_offset -= cp_len - 1;
+            decode_offset += 1;
+            continue;
+        }
+
+        // Multi-byte sequence. Determine expected length and the valid
+        // range for each continuation byte per Unicode Table 3-7.
+        const seq = utf8SeqInfo(b0);
+
+        // Check how many continuation bytes form a valid prefix (the
+        // maximal subpart). We check each byte against its specific
+        // valid range.
+        var valid: usize = 1; // lead byte is valid
+        for (0..seq.len - 1) |ci| {
+            if (decode_offset + valid >= decode.len) {
+                // Truncated at end of buffer: treat as incomplete
+                // input that may be completed later. Stop decoding
+                // without consuming these bytes.
+                return .{
+                    .consumed = decode_offset,
+                    .decoded = decode_count,
+                };
+            }
+            const cb = decode[decode_offset + valid];
+            if (cb < seq.ranges[ci][0] or cb > seq.ranges[ci][1]) {
+                // Byte doesn't match expected range. The maximal
+                // subpart ends here.
+                break;
+            }
+            valid += 1;
+        }
+
+        if (valid == seq.len) {
+            // Full sequence present and structurally valid. Decode it.
+            // (Structural validity per Table 3-7 guarantees decode success.)
+            const cp_bytes = decode[decode_offset..][0..seq.len];
+            if (std.unicode.utf8Decode(cp_bytes)) |cp| {
+                output[decode_count] = @intCast(cp);
+                decode_count += 1;
+                decode_offset += seq.len;
+            } else |_| {
+                // Should not happen given Table 3-7 validation, but
+                // be safe: emit FFFD for the lead byte.
+                output[decode_count] = 0xFFFD;
+                decode_count += 1;
+                decode_offset += 1;
+            }
+        } else {
+            // Incomplete/ill-formed: the maximal subpart (valid bytes)
+            // maps to a single FFFD.
+            output[decode_count] = 0xFFFD;
+            decode_count += 1;
+            decode_offset += valid;
        }
    }

@@ -84,6 +127,27 @@ fn utf8DecodeUntilControlSeqScalar(
    };
 }

+/// Total byte length of a multi-byte UTF-8 sequence plus, for each of up to
+/// three continuation bytes, the inclusive `{ lo, hi }` range it must fall
+/// in. Range slots past the sequence's last continuation byte are unused.
+const Utf8SeqInfo = struct { len: u3, ranges: [3][2]u8 };
+
+/// Length and continuation-byte ranges (Unicode Table 3-7) for a lead byte in C2...F4.
+fn utf8SeqInfo(lead: u8) Utf8SeqInfo {
+    const any: [2]u8 = .{ 0x80, 0xBF }; // unrestricted continuation byte
+    const none: [2]u8 = .{ 0, 0 }; // slot not used by this sequence length
+    return switch (lead) {
+        0xC2...0xDF => .{ .len = 2, .ranges = .{ any, none, none } },
+        0xE0 => .{ .len = 3, .ranges = .{ .{ 0xA0, 0xBF }, any, none } },
+        0xE1...0xEC, 0xEE...0xEF => .{ .len = 3, .ranges = .{ any, any, none } },
+        0xED => .{ .len = 3, .ranges = .{ .{ 0x80, 0x9F }, any, none } },
+        0xF0 => .{ .len = 4, .ranges = .{ .{ 0x90, 0xBF }, any, any } },
+        0xF1...0xF3 => .{ .len = 4, .ranges = .{ any, any, any } },
+        0xF4 => .{ .len = 4, .ranges = .{ .{ 0x80, 0x8F }, any, any } },
+        else => unreachable,
+    };
+}
+
 test "decode no escape" {
    const testing = std.testing;

@@ -131,7 +195,7 @@ test "decode incomplete UTF-8" {

    var output: [64]u32 = undefined;

-    // 2-byte
+    // 2-byte truncated at end of buffer
    {
        const str = "hello\xc2";
        try testing.expectEqual(DecodeResult{
@@ -140,16 +204,18 @@ test "decode incomplete UTF-8" {
        }, utf8DecodeUntilControlSeq(str, &output));
    }

-    // 3-byte
+    // 3-byte: \xe0 expects A0-BF next, but \x00 is not in range.
+    // \xe0 is a maximal subpart of length 1 → FFFD, then \x00 is ASCII NUL.
    {
        const str = "hello\xe0\x00";
-        try testing.expectEqual(DecodeResult{
-            .consumed = 5,
-            .decoded = 5,
-        }, utf8DecodeUntilControlSeq(str, &output));
+        const result = utf8DecodeUntilControlSeq(str, &output);
+        try testing.expectEqual(@as(usize, 7), result.consumed);
+        try testing.expectEqual(@as(usize, 7), result.decoded);
+        try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
+        try testing.expectEqual(@as(u32, 0x00), output[6]);
    }

-    // 4-byte
+    // 4-byte truncated at end of buffer (F0 90 is valid so far)
    {
        const str = "hello\xf0\x90";
        try testing.expectEqual(DecodeResult{
@@ -178,19 +244,248 @@ test "decode invalid UTF-8" {
    try testing.expectEqual(@as(u32, 0x01), output[6]);
 }

-// This is testing our current behavior so that we know we have to handle
-// this case in terminal/stream.zig. If we change this behavior, we can
-// remove the special handling in terminal/stream.zig.
-test "decode invalid leading byte isn't consumed or replaced" {
+// Per the maximal subpart rule, an invalid lead byte (here FF) is consumed and replaced with U+FFFD.
+test "decode invalid leading byte is replaced" {
     const testing = std.testing;

     var output: [64]u32 = undefined;

     {
         const str = "hello\xFF";
-        try testing.expectEqual(DecodeResult{
-            .consumed = 5,
-            .decoded = 5,
-        }, utf8DecodeUntilControlSeq(str, &output));
+        const result = utf8DecodeUntilControlSeq(str, &output);
+        try testing.expectEqual(@as(usize, 6), result.consumed);
+        try testing.expectEqual(@as(usize, 6), result.decoded);
+        try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
     }
 }
+
+test "decode invalid continuation in 3-byte sequence" {
+    const expectEq = std.testing.expectEqual;
+
+    var cps: [64]u32 = undefined;
+
+    // E2 opens a 3-byte sequence, but 0x28 '(' is not a continuation byte.
+    {
+        const bytes = "hello\xe2\x28world";
+        const got = utf8DecodeUntilControlSeq(bytes, &cps);
+        // 5 ("hello") + 1 (U+FFFD) + 1 ("(") + 5 ("world") = 12 codepoints
+        try expectEq(@as(usize, 12), got.decoded);
+        try expectEq(@as(u32, 0xFFFD), cps[5]);
+        try expectEq(@as(u32, '('), cps[6]);
+        try expectEq(@as(u32, 'w'), cps[7]);
+    }
+}
+
+test "decode invalid continuation in 4-byte sequence" {
+    const expectEq = std.testing.expectEqual;
+
+    var cps: [64]u32 = undefined;
+
+    // F0 90 is a valid start of a 4-byte sequence; 0x28 '(' then breaks it.
+    // The maximal subpart F0 90 (2 bytes) becomes one U+FFFD, '(' decodes.
+    {
+        const bytes = "hello\xf0\x90\x28world";
+        const got = utf8DecodeUntilControlSeq(bytes, &cps);
+        // 5 ("hello") + 1 (U+FFFD) + 1 ("(") + 5 ("world") = 12 codepoints
+        try expectEq(@as(usize, 12), got.decoded);
+        try expectEq(@as(u32, 0xFFFD), cps[5]);
+        try expectEq(@as(u32, '('), cps[6]);
+        try expectEq(@as(u32, 'w'), cps[7]);
+    }
+}
+
+test "decode multiple consecutive invalid bytes" {
+    const expectEq = std.testing.expectEqual;
+
+    var cps: [64]u32 = undefined;
+
+    // A lone continuation byte is a maximal subpart by itself: one U+FFFD each.
+    {
+        const bytes = "a\x80\x80b";
+        const got = utf8DecodeUntilControlSeq(bytes, &cps);
+        // 'a', U+FFFD, U+FFFD, 'b' = 4 codepoints
+        try expectEq(@as(usize, 4), got.decoded);
+        try expectEq(@as(u32, 'a'), cps[0]);
+        try expectEq(@as(u32, 0xFFFD), cps[1]);
+        try expectEq(@as(u32, 0xFFFD), cps[2]);
+        try expectEq(@as(u32, 'b'), cps[3]);
+    }
+
+    // C0 can never be a lead byte (below C2), so each C0 yields its own U+FFFD.
+    {
+        const bytes = "a\xc0\xc0b";
+        const got = utf8DecodeUntilControlSeq(bytes, &cps);
+        // 'a', U+FFFD, U+FFFD, 'b' = 4 codepoints
+        try expectEq(@as(usize, 4), got.decoded);
+        try expectEq(@as(u32, 'a'), cps[0]);
+        try expectEq(@as(u32, 0xFFFD), cps[1]);
+        try expectEq(@as(u32, 0xFFFD), cps[2]);
+        try expectEq(@as(u32, 'b'), cps[3]);
+    }
+}
+
+test "decode unexpected continuation byte as lead" {
+    const expectEq = std.testing.expectEqual;
+
+    var cps: [64]u32 = undefined;
+
+    // 0x80 is a continuation byte sitting where a lead byte is expected.
+    {
+        const bytes = "a\x80b";
+        const got = utf8DecodeUntilControlSeq(bytes, &cps);
+        // 'a', U+FFFD, 'b' = 3 codepoints
+        try expectEq(@as(usize, 3), got.decoded);
+        try expectEq(@as(u32, 'a'), cps[0]);
+        try expectEq(@as(u32, 0xFFFD), cps[1]);
+        try expectEq(@as(u32, 'b'), cps[2]);
+    }
+}
+
+test "decode overlong 2-byte encoding" {
+    const expectEq = std.testing.expectEqual;
+
+    var cps: [64]u32 = undefined;
+
+    // C0 AF: C0 is not a valid lead (below C2) -> U+FFFD; AF is then a lone
+    // continuation byte -> U+FFFD. Matches Unicode Table 3-8 (C0 AF).
+    {
+        const bytes = "a\xc0\xafb";
+        const got = utf8DecodeUntilControlSeq(bytes, &cps);
+        // 'a', U+FFFD, U+FFFD, 'b' = 4 codepoints
+        try expectEq(@as(usize, 4), got.decoded);
+        try expectEq(@as(u32, 'a'), cps[0]);
+        try expectEq(@as(u32, 0xFFFD), cps[1]);
+        try expectEq(@as(u32, 0xFFFD), cps[2]);
+        try expectEq(@as(u32, 'b'), cps[3]);
+    }
+}
+
+test "decode surrogate half" {
+    const expectEq = std.testing.expectEqual;
+
+    var cps: [64]u32 = undefined;
+
+    // ED A0 80 would encode the surrogate U+D800. Table 3-7 only allows
+    // 80-9F after ED, so A0 is rejected and ED alone is a maximal subpart
+    // (one U+FFFD). A0 and 80 are then lone continuation bytes, one U+FFFD
+    // each. Matches Unicode Table 3-9 (ED A0 80 -> FFFD FFFD FFFD).
+    {
+        const bytes = "a\xed\xa0\x80b";
+        const got = utf8DecodeUntilControlSeq(bytes, &cps);
+        // 'a', U+FFFD, U+FFFD, U+FFFD, 'b' = 5 codepoints
+        try expectEq(@as(usize, 5), got.decoded);
+        try expectEq(@as(u32, 'a'), cps[0]);
+        try expectEq(@as(u32, 0xFFFD), cps[1]);
+        try expectEq(@as(u32, 0xFFFD), cps[2]);
+        try expectEq(@as(u32, 0xFFFD), cps[3]);
+        try expectEq(@as(u32, 'b'), cps[4]);
+    }
+}
+
+test "decode valid multibyte surrounded by invalid" {
+    const expectEq = std.testing.expectEqual;
+
+    var cps: [64]u32 = undefined;
+
+    // C3 A9 is U+00E9 with a stray continuation byte on either side of it.
+    {
+        const bytes = "\x80\xc3\xa9\x80";
+        const got = utf8DecodeUntilControlSeq(bytes, &cps);
+        // U+FFFD, U+00E9, U+FFFD = 3 codepoints
+        try expectEq(@as(usize, 3), got.decoded);
+        try expectEq(@as(u32, 0xFFFD), cps[0]);
+        try expectEq(@as(u32, 0x00E9), cps[1]);
+        try expectEq(@as(u32, 0xFFFD), cps[2]);
+    }
+}
+
+test "decode invalid byte before escape" {
+    const expectEq = std.testing.expectEqual;
+
+    var cps: [64]u32 = undefined;
+
+    // The stray 0x80 is replaced, then decoding stops in front of the ESC.
+    {
+        const bytes = "hi\x80\x1b[0m";
+        const got = utf8DecodeUntilControlSeq(bytes, &cps);
+        try expectEq(@as(usize, 3), got.consumed);
+        try expectEq(@as(usize, 3), got.decoded);
+        try expectEq(@as(u32, 'h'), cps[0]);
+        try expectEq(@as(u32, 'i'), cps[1]);
+        try expectEq(@as(u32, 0xFFFD), cps[2]);
+    }
+}
+
+// Unicode Table 3-8 (U+FFFD for Non-Shortest Form Sequences).
+// Input bytes:     C0   AF   E0   80   BF   F0   81   82   41
+// Expected output: FFFD FFFD FFFD FFFD FFFD FFFD FFFD FFFD 0041
+test "Table 3-8: non-shortest form sequences" {
+    const expectEq = std.testing.expectEqual;
+    var cps: [64]u32 = undefined;
+
+    const bytes = "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41";
+    const got = utf8DecodeUntilControlSeq(bytes, &cps);
+    try expectEq(@as(usize, 9), got.consumed);
+    try expectEq(@as(usize, 9), got.decoded);
+    for (cps[0..8]) |cp| {
+        try expectEq(@as(u32, 0xFFFD), cp);
+    }
+    try expectEq(@as(u32, 0x41), cps[8]);
+}
+
+// Unicode Table 3-9 (U+FFFD for Ill-Formed Sequences for Surrogates).
+// Input bytes:     ED   A0   80   ED   BF   BF   ED   AF   41
+// Expected output: FFFD FFFD FFFD FFFD FFFD FFFD FFFD FFFD 0041
+test "Table 3-9: surrogate sequences" {
+    const expectEq = std.testing.expectEqual;
+    var cps: [64]u32 = undefined;
+
+    const bytes = "\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41";
+    const got = utf8DecodeUntilControlSeq(bytes, &cps);
+    try expectEq(@as(usize, 9), got.consumed);
+    try expectEq(@as(usize, 9), got.decoded);
+    for (cps[0..8]) |cp| {
+        try expectEq(@as(u32, 0xFFFD), cp);
+    }
+    try expectEq(@as(u32, 0x41), cps[8]);
+}
+
+// Unicode Table 3-10 (U+FFFD for Other Ill-Formed Sequences).
+// Input bytes:     F4   91   92   93   FF   41   80   BF   42
+// Expected output: FFFD FFFD FFFD FFFD FFFD 0041 FFFD FFFD 0042
+test "Table 3-10: other ill-formed sequences" {
+    const expectEq = std.testing.expectEqual;
+    var cps: [64]u32 = undefined;
+
+    const bytes = "\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42";
+    const got = utf8DecodeUntilControlSeq(bytes, &cps);
+    try expectEq(@as(usize, 9), got.consumed);
+    try expectEq(@as(usize, 9), got.decoded);
+    try expectEq(@as(u32, 0xFFFD), cps[0]); // F4 (91 is outside 80-8F)
+    try expectEq(@as(u32, 0xFFFD), cps[1]); // 91 (lone continuation)
+    try expectEq(@as(u32, 0xFFFD), cps[2]); // 92 (lone continuation)
+    try expectEq(@as(u32, 0xFFFD), cps[3]); // 93 (lone continuation)
+    try expectEq(@as(u32, 0xFFFD), cps[4]); // FF (never valid)
+    try expectEq(@as(u32, 0x0041), cps[5]); // 41 'A'
+    try expectEq(@as(u32, 0xFFFD), cps[6]); // 80 (lone continuation)
+    try expectEq(@as(u32, 0xFFFD), cps[7]); // BF (lone continuation)
+    try expectEq(@as(u32, 0x0042), cps[8]); // 42 'B'
+}
+
+// Unicode Table 3-11 (U+FFFD for Truncated Sequences).
+// Input bytes:     E1 80 | E2   | F0 91 92 | F1 BF | 41
+// Expected output: FFFD  | FFFD | FFFD     | FFFD  | 0041
+test "Table 3-11: truncated sequences" {
+    const expectEq = std.testing.expectEqual;
+    var cps: [64]u32 = undefined;
+
+    const bytes = "\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41";
+    const got = utf8DecodeUntilControlSeq(bytes, &cps);
+    try expectEq(@as(usize, 9), got.consumed);
+    try expectEq(@as(usize, 5), got.decoded);
+    try expectEq(@as(u32, 0xFFFD), cps[0]); // E1 80: 3-byte sequence cut short
+    try expectEq(@as(u32, 0xFFFD), cps[1]); // E2: 3-byte lead, F0 is no continuation
+    try expectEq(@as(u32, 0xFFFD), cps[2]); // F0 91 92: 4-byte sequence cut short
+    try expectEq(@as(u32, 0xFFFD), cps[3]); // F1 BF: 4-byte prefix, 41 is no continuation
+    try expectEq(@as(u32, 0x0041), cps[4]); // 41 'A'
+}