libghostty: Remove all libc++ and libc++ ABI dependencies (#12291)

This updates simdutf to my fork which has a SIMDUTF_NO_LIBCXX option
that removes all libc++ and libc++ ABI dependencies. The plan is to open
an upstream PR with this, but I want to verify it here first.

From there, the hand-written simd code we have has been updated to also
no longer use any libc++ features. Part of this required removing utfcpp
since it depended on libc++ (`<iterator>`).

libghostty-vt now only depends on libc.

## Benchmark Results

| Corpus | Current `HEAD` median | `main` median | Delta vs `main` | Notes |
| --- | ---: | ---: | ---: | --- |
| `valid-mixed-1g-seed1.bin` | `9.245s` | `9.111s` | `1.5%` slower | Near tie; `main` remains slightly faster on fully valid input |
| `malformed-mixed-1g-seed1-rate0.005.bin` | `9.251s` | `12.705s` | `37.3%` faster | Large improvement on malformed UTF-8 input |

Approximate throughput from the medians:

- Valid corpus: current `HEAD` `110.8 MiB/s`, `main` `112.4 MiB/s`
- Malformed corpus: current `HEAD` `110.7 MiB/s`, `main` `80.6 MiB/s`
This commit is contained in:
Mitchell Hashimoto
2026-04-15 11:36:16 -07:00
committed by GitHub
25 changed files with 48248 additions and 33886 deletions

View File

@@ -419,8 +419,14 @@ jobs:
echo "Libs: $(pkg-config --libs libghostty-vt)"
echo "Static: $(pkg-config --libs --static libghostty-vt)"
# Libs.private must include the C++ standard library
pkg-config --libs --static libghostty-vt | grep -q -- '-lc++'
# Libs.private must NOT include the C++ runtime libraries (all
# vendored C++ deps are built in no-libcxx mode).
! pkg-config --libs --static libghostty-vt | grep -qE -- '-lc\+\+|-lc\+\+abi'
- name: Verify shared library has no libc++ dependency
run: |
ldd zig-out/lib/libghostty-vt.so.0.1.0
! ldd zig-out/lib/libghostty-vt.so.0.1.0 2>/dev/null | grep -qE 'libc\+\+|libc\+\+abi'
- name: Verify static archive contains SIMD deps
run: |
@@ -452,16 +458,15 @@ jobs:
- name: Test static link via pkg-config
run: |
export PKG_CONFIG_PATH="$PWD/zig-out/share/pkgconfig"
# The static library is compiled with LLVM libc++ (not GNU
# libstdc++), so linking requires a libc++-compatible toolchain.
# zig cc, clang, or gcc with libc++-dev installed all work.
nix develop -c zig cc -o /tmp/test_static /tmp/test_libghostty_vt.c \
# The static archive must link cleanly into a plain C program
# without any extra C++ runtime flags.
nix develop -c cc -o /tmp/test_static /tmp/test_libghostty_vt.c \
$(pkg-config --cflags libghostty-vt) \
"$PWD/zig-out/lib/libghostty-vt.a" \
$(pkg-config --libs-only-l --static libghostty-vt | sed 's/-lghostty-vt//')
/tmp/test_static
# Verify it's truly statically linked (no libghostty-vt.so dependency)
! ldd /tmp/test_static 2>/dev/null | grep -q libghostty-vt
# Verify it doesn't depend on the shared lib or a C++ runtime.
! ldd /tmp/test_static 2>/dev/null | grep -qE 'libghostty-vt|libc\+\+|libc\+\+abi'
# Test system integration: rebuild with -Dsystem-simdutf=true so
# simdutf comes from the system instead of being vendored. This

View File

@@ -178,9 +178,8 @@ add_dependencies(ghostty-vt zig_build_lib_vt)
# Static
#
# On Linux and macOS, the static library is a fat archive that bundles
# the vendored SIMD dependencies (highway, simdutf, utfcpp). Consumers
# only need to link libc and libc++ (LLVM's C++ runtime, not GNU
# libstdc++). Use zig cc, clang, or any toolchain with libc++ support.
# the vendored SIMD dependencies (highway, simdutf). Consumers
# only need to link libc.
#
# On Windows, the SIMD dependencies are not bundled and must be linked
# separately.
@@ -349,11 +348,7 @@ function(ghostty_vt_add_target)
)
if(_GVT_ZIG_TARGET MATCHES "windows")
set_target_properties(${_static_target} PROPERTIES
INTERFACE_LINK_LIBRARIES "c++;ntdll;kernel32"
)
else()
set_target_properties(${_static_target} PROPERTIES
INTERFACE_LINK_LIBRARIES "c++"
INTERFACE_LINK_LIBRARIES "ntdll;kernel32"
)
endif()
add_dependencies(${_static_target} ${_build_target})

View File

@@ -76,7 +76,6 @@
.opengl = .{ .path = "./pkg/opengl", .lazy = true },
.sentry = .{ .path = "./pkg/sentry", .lazy = true },
.simdutf = .{ .path = "./pkg/simdutf", .lazy = true },
.utfcpp = .{ .path = "./pkg/utfcpp", .lazy = true },
.wuffs = .{ .path = "./pkg/wuffs", .lazy = true },
.zlib = .{ .path = "./pkg/zlib", .lazy = true },

5
build.zig.zon.json generated
View File

@@ -109,11 +109,6 @@
"url": "https://deps.files.ghostty.org/spirv_cross-1220fb3b5586e8be67bc3feb34cbe749cf42a60d628d2953632c2f8141302748c8da.tar.gz",
"hash": "sha256-tStvz8Ref6abHwahNiwVVHNETizAmZVVaxVsU7pmV+M="
},
"N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH": {
"name": "utfcpp",
"url": "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz",
"hash": "sha256-/8ZooxDndgfTk/PBizJxXyI9oerExNbgV5oR345rWc8="
},
"uucode-0.1.0-ZZjBPj96QADXyt5sqwBJUnhaDYs_qBeeKijZvlRa0eqM": {
"name": "uucode",
"url": "git+https://github.com/jacobsandlund/uucode#5f05f8f83a75caea201f12cc8ea32a2d82ea9732",

8
build.zig.zon.nix generated
View File

@@ -258,14 +258,6 @@ in
hash = "sha256-tStvz8Ref6abHwahNiwVVHNETizAmZVVaxVsU7pmV+M=";
};
}
{
name = "N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH";
path = fetchZigArtifact {
name = "utfcpp";
url = "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz";
hash = "sha256-/8ZooxDndgfTk/PBizJxXyI9oerExNbgV5oR345rWc8=";
};
}
{
name = "uucode-0.1.0-ZZjBPj96QADXyt5sqwBJUnhaDYs_qBeeKijZvlRa0eqM";
path = fetchZigArtifact {

1
build.zig.zon.txt generated
View File

@@ -20,7 +20,6 @@ https://deps.files.ghostty.org/pixels-12207ff340169c7d40c570b4b6a97db614fe47e0d8
https://deps.files.ghostty.org/plasma_wayland_protocols-12207e0851c12acdeee0991e893e0132fc87bb763969a585dc16ecca33e88334c566.tar.gz
https://deps.files.ghostty.org/sentry-1220446be831adcca918167647c06c7b825849fa3fba5f22da394667974537a9c77e.tar.gz
https://deps.files.ghostty.org/spirv_cross-1220fb3b5586e8be67bc3feb34cbe749cf42a60d628d2953632c2f8141302748c8da.tar.gz
https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz
https://deps.files.ghostty.org/uucode-0.2.0-ZZjBPqZVVABQepOqZHR7vV_NcaN-wats0IB6o-Exj6m9.tar.gz
https://deps.files.ghostty.org/vaxis-7dbb9fd3122e4ffad262dd7c151d80d863b68558.tar.gz
https://deps.files.ghostty.org/wayland-9cb3d7aa9dc995ffafdbdef7ab86a949d0fb0e7d.tar.gz

View File

@@ -14,6 +14,7 @@ pub fn build(b: *std.Build) void {
.root = b.path("src"),
.files = &.{"main.cpp"},
});
exe_mod.link_libcpp = true;
// You'll want to use a lazy dependency here so that ghostty is only
// downloaded if you actually need it.

View File

@@ -8,10 +8,7 @@ let package = Package(
.executableTarget(
name: "swift-vt-xcframework",
dependencies: ["GhosttyVt"],
path: "Sources",
linkerSettings: [
.linkedLibrary("c++"),
]
path: "Sources"
),
.binaryTarget(
name: "GhosttyVt",

View File

@@ -131,12 +131,6 @@
"dest": "vendor/p/N-V-__8AANb6pwD7O1WG6L5nvD_rNMvnSc9Cpg1ijSlTYywv",
"sha256": "b52b6fcfc45e7fa69b1f06a1362c155473444e2cc09995556b156c53ba6657e3"
},
{
"type": "archive",
"url": "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz",
"dest": "vendor/p/N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH",
"sha256": "ffc668a310e77607d393f3c18b32715f223da1eac4c4d6e0579a11df8e6b59cf"
},
{
"type": "git",
"url": "https://github.com/jacobsandlund/uucode",

View File

@@ -79,6 +79,11 @@ elif [ "$1" != "--update" ]; then
exit 1
fi
# Fetch all dependencies (including lazy ones) into the global cache
# so that zon2nix can find them when resolving transitive dependencies.
# Otherwise, lazy dependencies that aren't unpacked will fail below.
zig build --fetch=all
zon2nix "$BUILD_ZIG_ZON" --15 --nix "$WORK_DIR/build.zig.zon.nix" --txt "$WORK_DIR/build.zig.zon.txt" --json "$WORK_DIR/build.zig.zon.json" --flatpak "$WORK_DIR/zig-packages.json"
alejandra --quiet "$WORK_DIR/build.zig.zon.nix"
prettier --log-level warn --write "$WORK_DIR/build.zig.zon.json"

View File

@@ -3,6 +3,7 @@ const std = @import("std");
pub fn build(b: *std.Build) !void {
const optimize = b.standardOptimizeOption(.{});
const target = b.standardTargetOptions(.{});
const no_libcxx = b.option(bool, "no_libcxx", "Set SIMDUTF_NO_LIBCXX to avoid libc++ dependency") orelse false;
const lib = b.addLibrary(.{
.name = "simdutf",
@@ -13,13 +14,15 @@ pub fn build(b: *std.Build) !void {
.linkage = .static,
});
lib.linkLibC();
// On MSVC, we must not use linkLibCpp because Zig unconditionally
// passes -nostdinc++ and then adds its bundled libc++/libc++abi
// include paths, which conflict with MSVC's own C++ runtime headers.
// The MSVC SDK include directories (added via linkLibC) contain
// both C and C++ headers, so linkLibCpp is not needed.
if (target.result.abi != .msvc) {
lib.linkLibCpp();
if (!no_libcxx) {
// On MSVC, we must not use linkLibCpp because Zig unconditionally
// passes -nostdinc++ and then adds its bundled libc++/libc++abi
// include paths, which conflict with MSVC's own C++ runtime headers.
// The MSVC SDK include directories (added via linkLibC) contain
// both C and C++ headers, so linkLibCpp is not needed.
if (target.result.abi != .msvc) {
lib.linkLibCpp();
}
}
lib.addIncludePath(b.path("vendor"));
@@ -45,6 +48,13 @@ pub fn build(b: *std.Build) !void {
"-fno-sanitize-trap=undefined",
});
if (no_libcxx) {
try flags.append(b.allocator, "-DSIMDUTF_NO_LIBCXX");
try flags.append(b.allocator, "-fno-exceptions");
try flags.append(b.allocator, "-fno-rtti");
lib.root_module.addCMacro("SIMDUTF_NO_LIBCXX", "1");
}
if (target.result.os.tag == .freebsd or target.result.abi == .musl) {
try flags.append(b.allocator, "-fPIC");
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,61 +0,0 @@
const std = @import("std");
/// Build script entry point for the vendored `utfcpp` package.
///
/// Produces a static library artifact named "utfcpp" whose only compiled
/// translation unit is `empty.cc`; the useful payload is the set of `.h`
/// headers installed from the lazily-fetched upstream dependency.
/// NOTE(review): presumably upstream utfcpp is header-only and the stub
/// source exists only so there is an artifact to link against — confirm.
///
/// Returns an error if adding the Apple SDK or Android NDK paths fails.
pub fn build(b: *std.Build) !void {
// Standard `-Dtarget` / `-Doptimize` options supplied by the caller.
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});
// The library artifact that dependents link against.
const lib = b.addLibrary(.{
.name = "utfcpp",
.root_module = b.createModule(.{
.target = target,
.optimize = optimize,
}),
.linkage = .static,
});
lib.linkLibC();
// Platform SDK paths are only added for the targets that need them.
if (target.result.os.tag.isDarwin()) {
const apple_sdk = @import("apple_sdk");
try apple_sdk.addPaths(b, lib);
}
if (target.result.abi.isAndroid()) {
const android_ndk = @import("android_ndk");
try android_ndk.addPaths(b, lib);
}
// Nothing is ever appended to `flags`, so the stub source below is
// compiled with an empty flag list.
var flags: std.ArrayList([]const u8) = .empty;
defer flags.deinit(b.allocator);
lib.addCSourceFiles(.{
.flags = flags.items,
.files = &.{"empty.cc"},
});
// The upstream sources are a lazy dependency: when they are not available
// yet this branch is skipped and no include path or headers are registered.
if (b.lazyDependency("utfcpp", .{})) |upstream| {
lib.addIncludePath(upstream.path(""));
// Install only the `.h` files from upstream's `source` directory, at the
// root of the artifact's installed header tree.
lib.installHeadersDirectory(
upstream.path("source"),
"",
.{ .include_extensions = &.{".h"} },
);
}
b.installArtifact(lib);
// Disabled test step, kept for reference only (it refers to a `module`
// that is not defined in this function).
// {
// const test_exe = b.addTest(.{
// .name = "test",
// .root_source_file = .{ .path = "main.zig" },
// .target = target,
// .optimize = optimize,
// });
// test_exe.linkLibrary(lib);
//
// var it = module.import_table.iterator();
// while (it.next()) |entry| test_exe.root_module.addImport(entry.key_ptr.*, entry.value_ptr.*);
// const tests_run = b.addRunArtifact(test_exe);
// const test_step = b.step("test", "Run tests");
// test_step.dependOn(&tests_run.step);
// }
}

View File

@@ -1,17 +0,0 @@
.{
.name = .utfcpp,
.version = "4.0.5",
.fingerprint = 0xcd99aeb2334ae11a,
.paths = .{""},
.dependencies = .{
// nemtrif/utfcpp
.utfcpp = .{
.url = "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz",
.hash = "N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH",
.lazy = true,
},
.apple_sdk = .{ .path = "../apple-sdk" },
.android_ndk = .{ .path = "../android-ndk" },
},
}

View File

@@ -1,2 +0,0 @@
// Needed for Zig build to be happy
void ghostty_utfcpp_stub() {}

View File

@@ -26,9 +26,23 @@ The benchmark tools are split into two roles:
based on medians instead of single runs.
- When comparing branches, keep all benchmark inputs and CLI flags the same,
including terminal dimensions.
- Never run multiple benchmarks in parallel on the same machine, as they will
interfere with each other and produce unreliable results.
## Building
- Build benchmark tools with `zig build -Demit-bench`.
- On macOS, prefer `zig build -Demit-bench -Demit-macos-app=false` unless the
macOS app itself is part of the work.
- Build benchmark tools with `zig build -Demit-bench -Doptimize=ReleaseFast`.
- On macOS, add `-Demit-macos-app=false` to avoid building the macOS app.
- Make sure you specify `-Doptimize=ReleaseFast` when building benchmarks,
otherwise the debug build will be very slow and not representative of real
performance.
## Comparing Branches
- To compare branches, check out the first branch, build it, and rename the
  binary, e.g. `zig-out/bin/ghostty-bench` to `zig-out/bin/ghostty-bench-branch1`
  (use a descriptive suffix in place of `branch1`).
- Then check out the other branch, build it, and rename its binary to
  `zig-out/bin/ghostty-bench-branch2` (again with a descriptive suffix).
- Finally, run all the benchmarks with `hyperfine`, comparing all N renamed
  binaries against each other.

View File

@@ -346,11 +346,8 @@ fn combineArchives(
}
/// Returns the Libs.private value for the pkg-config file.
/// This includes the C++ standard library needed by SIMD code.
///
/// Zig compiles C++ code with LLVM's libc++ (not GNU libstdc++),
/// so consumers linking the static library need a libc++-compatible
/// toolchain: `zig cc`, `clang`, or GCC with `-lc++` installed.
/// Vendored C++ dependencies are built in no-libcxx mode so consumers
/// don't need libc++. System-provided simdutf still requires it.
fn libsPrivate(
zig: *const GhosttyZig,
) []const u8 {

View File

@@ -119,13 +119,14 @@ fn initVt(
.target = cfg.target,
.optimize = cfg.optimize,
// SIMD require libc/libcpp (both) but otherwise we don't care.
// On MSVC, we must not use linkLibCpp because Zig passes
// -nostdinc++ and adds its bundled libc++/libc++abi headers
// which conflict with MSVC's C++ runtime. The MSVC SDK dirs
// added via link_libc contain both C and C++ headers.
// SIMD requires libc. Vendored C++ dependencies are built with
// no-libcxx mode (HWY_NO_LIBCXX / SIMDUTF_NO_LIBCXX) so we
// don't need libcpp. System-provided simdutf headers still
// use C++ stdlib headers, so we need libcpp in that case.
.link_libc = if (cfg.simd) true else null,
.link_libcpp = if (cfg.simd and cfg.target.result.abi != .msvc) true else null,
.link_libcpp = if (cfg.simd and
b.systemIntegrationOption("simdutf", .{}) and
cfg.target.result.abi != .msvc) true else null,
});
vt.addOptions("build_options", general_options);
vt_options.add(b, vt);

View File

@@ -762,6 +762,7 @@ pub fn addSimd(
if (b.lazyDependency("simdutf", .{
.target = target,
.optimize = optimize,
.no_libcxx = true,
})) |simdutf_dep| {
m.linkLibrary(simdutf_dep.artifact("simdutf"));
if (static_libs) |v| try v.append(
@@ -787,18 +788,6 @@ pub fn addSimd(
}
}
// utfcpp - This is used as a dependency on our hand-written C++ code
if (b.lazyDependency("utfcpp", .{
.target = target,
.optimize = optimize,
})) |utfcpp_dep| {
m.linkLibrary(utfcpp_dep.artifact("utfcpp"));
if (static_libs) |v| try v.append(
b.allocator,
utfcpp_dep.artifact("utfcpp").getEmittedBin(),
);
}
// SIMD C++ files
m.addIncludePath(b.path("src"));
{
@@ -839,6 +828,14 @@ pub fn addSimd(
"-DHWY_NO_LIBCXX",
);
// When using the vendored simdutf, build its headers in no-libcxx
// mode so we don't need C++ standard library headers at all.
// System simdutf headers may not support this define.
if (!b.systemIntegrationOption("simdutf", .{})) try flags.append(
b.allocator,
"-DSIMDUTF_NO_LIBCXX",
);
// Disable ubsan for MSVC to avoid undefined references to
// __ubsan_handle_* symbols that require a runtime we don't link
// and bundle. Hopefully we can fix this one day since ubsan is nice!

View File

@@ -4,9 +4,34 @@
#include <hwy/foreach_target.h> // must come before highway.h
#include <hwy/highway.h>
#include <algorithm>
#include <cassert>
#include <iterator>
#ifndef GHOSTTY_SIMD_CPW_HELPERS_
#define GHOSTTY_SIMD_CPW_HELPERS_
#include <assert.h>
#include <stddef.h>
// Compile-time element count of a C array. Stands in for std::size() so
// this file does not need any libc++ headers.
template <typename Elem, size_t Count>
constexpr size_t array_size(const Elem (&)[Count]) {
  return Count;
}
// Constexpr minimum over a C array (stands in for std::min_element).
//
// Zero entries after the first element are skipped, and a zero first element
// is replaced by the first non-zero entry found, so the result is the
// smallest non-zero value. If every element is zero the result is zero.
template <typename T, size_t N>
constexpr T array_min(const T (&a)[N]) {
  T smallest = a[0];
  for (size_t idx = 1; idx < N; ++idx) {
    const T candidate = a[idx];
    if (candidate == 0) continue;
    if (smallest == 0 || candidate < smallest) smallest = candidate;
  }
  return smallest;
}
// Constexpr maximum over a C array (stands in for std::max_element).
// Returns the largest element; the array must have at least one element,
// which a C array type always does.
template <typename T, size_t N>
constexpr T array_max(const T (&a)[N]) {
  T largest = a[0];
  size_t idx = 1;
  while (idx < N) {
    largest = a[idx] > largest ? a[idx] : largest;
    ++idx;
  }
  return largest;
}
#endif // GHOSTTY_SIMD_CPW_HELPERS_
HWY_BEFORE_NAMESPACE();
namespace ghostty {
@@ -214,12 +239,12 @@ HWY_ALIGN constexpr uint16_t nsm_lte16[] = {
};
// All our tables must be identically sized
static_assert(std::size(eaw_gte32) == std::size(eaw_lte32));
static_assert(std::size(eaw_gte16) == std::size(eaw_lte16));
static_assert(std::size(zero_gte32) == std::size(zero_lte32));
static_assert(std::size(zero_gte16) == std::size(zero_lte16));
static_assert(std::size(nsm_gte32) == std::size(nsm_lte32));
static_assert(std::size(nsm_gte16) == std::size(nsm_lte16));
static_assert(array_size(eaw_gte32) == array_size(eaw_lte32));
static_assert(array_size(eaw_gte16) == array_size(eaw_lte16));
static_assert(array_size(zero_gte32) == array_size(zero_lte32));
static_assert(array_size(zero_gte16) == array_size(zero_lte16));
static_assert(array_size(nsm_gte32) == array_size(nsm_lte32));
static_assert(array_size(nsm_gte16) == array_size(nsm_lte16));
/// Handles 16-bit codepoints.
template <class D, typename T = uint16_t>
@@ -245,10 +270,10 @@ int8_t CodepointWidth16(D d, uint16_t input) {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static_assert(std::size(gte_keys) == std::size(lte_keys));
static_assert(std::size(gte_keys) >= 32);
static_assert(array_size(gte_keys) == array_size(lte_keys));
static_assert(array_size(gte_keys) >= 32);
size_t i = 0;
for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) {
for (; i + N <= array_size(lte_keys) && lte_keys[i] != 0; i += N) {
const hn::Vec<D> lte_vec = hn::Load(d, lte_keys + i);
const hn::Vec<D> gte_vec = hn::Load(d, gte_keys + i);
const intptr_t idx = hn::FindFirstTrue(
@@ -267,12 +292,12 @@ int8_t CodepointWidth16(D d, uint16_t input) {
{
constexpr T zero_gte_min =
*std::min_element(zero_gte16, zero_gte16 + std::size(zero_gte16));
array_min(zero_gte16);
constexpr T zero_lte_max =
*std::max_element(zero_lte16, zero_lte16 + std::size(zero_lte16));
array_max(zero_lte16);
if (input >= zero_gte_min && input <= zero_lte_max) {
size_t i = 0;
for (; i + N <= std::size(zero_gte16) && zero_gte16[i] != 0; i += N) {
for (; i + N <= array_size(zero_gte16) && zero_gte16[i] != 0; i += N) {
const hn::Vec<D> lte_vec = hn::Load(d, zero_lte16 + i);
const hn::Vec<D> gte_vec = hn::Load(d, zero_gte16 + i);
const intptr_t idx = hn::FindFirstTrue(
@@ -286,12 +311,12 @@ int8_t CodepointWidth16(D d, uint16_t input) {
{
constexpr T eaw_gte_min =
*std::min_element(eaw_gte16, eaw_gte16 + std::size(eaw_gte16));
array_min(eaw_gte16);
constexpr T eaw_lte_max =
*std::max_element(eaw_lte16, eaw_lte16 + std::size(eaw_lte16));
array_max(eaw_lte16);
if (input >= eaw_gte_min && input <= eaw_lte_max) {
size_t i = 0;
for (; i + N <= std::size(eaw_lte16) && eaw_lte16[i] != 0; i += N) {
for (; i + N <= array_size(eaw_lte16) && eaw_lte16[i] != 0; i += N) {
const hn::Vec<D> lte_vec = hn::Load(d, eaw_lte16 + i);
const hn::Vec<D> gte_vec = hn::Load(d, eaw_gte16 + i);
const intptr_t idx = hn::FindFirstTrue(
@@ -305,12 +330,12 @@ int8_t CodepointWidth16(D d, uint16_t input) {
{
constexpr T nsm_gte_min =
*std::min_element(nsm_gte16, nsm_gte16 + std::size(nsm_gte16));
array_min(nsm_gte16);
constexpr T nsm_lte_max =
*std::max_element(nsm_lte16, nsm_lte16 + std::size(nsm_lte16));
array_max(nsm_lte16);
if (input >= nsm_gte_min && input <= nsm_lte_max) {
size_t i = 0;
for (; i + N <= std::size(nsm_lte16) && nsm_lte16[i] != 0; i += N) {
for (; i + N <= array_size(nsm_lte16) && nsm_lte16[i] != 0; i += N) {
const hn::Vec<D> lte_vec = hn::Load(d, nsm_lte16 + i);
const hn::Vec<D> gte_vec = hn::Load(d, nsm_gte16 + i);
const intptr_t idx = hn::FindFirstTrue(
@@ -342,10 +367,10 @@ int8_t CodepointWidth32(D d, T input) {
HWY_ALIGN constexpr T lte_keys[] = {
0x1f1ff, 0x2FFFD, 0x3FFFD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static_assert(std::size(gte_keys) == std::size(lte_keys));
static_assert(std::size(gte_keys) >= 16);
static_assert(array_size(gte_keys) == array_size(lte_keys));
static_assert(array_size(gte_keys) >= 16);
size_t i = 0;
for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) {
for (; i + N <= array_size(lte_keys) && lte_keys[i] != 0; i += N) {
const hn::Vec<D> lte_vec = hn::Load(d, lte_keys + i);
const hn::Vec<D> gte_vec = hn::Load(d, gte_keys + i);
const intptr_t idx = hn::FindFirstTrue(
@@ -364,10 +389,10 @@ int8_t CodepointWidth32(D d, T input) {
HWY_ALIGN constexpr T lte_keys[] = {
0xE0FFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static_assert(std::size(gte_keys) == std::size(lte_keys));
static_assert(std::size(gte_keys) >= 16);
static_assert(array_size(gte_keys) == array_size(lte_keys));
static_assert(array_size(gte_keys) >= 16);
size_t i = 0;
for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) {
for (; i + N <= array_size(lte_keys) && lte_keys[i] != 0; i += N) {
const hn::Vec<D> lte_vec = hn::Load(d, lte_keys + i);
const hn::Vec<D> gte_vec = hn::Load(d, gte_keys + i);
const intptr_t idx = hn::FindFirstTrue(
@@ -380,12 +405,12 @@ int8_t CodepointWidth32(D d, T input) {
{
constexpr T zero_gte_min =
*std::min_element(zero_gte32, zero_gte32 + std::size(zero_gte32));
array_min(zero_gte32);
constexpr T zero_lte_max =
*std::max_element(zero_lte32, zero_lte32 + std::size(zero_lte32));
array_max(zero_lte32);
if (input >= zero_gte_min && input <= zero_lte_max) {
size_t i = 0;
for (; i + N <= std::size(zero_gte32) && zero_gte32[i] != 0; i += N) {
for (; i + N <= array_size(zero_gte32) && zero_gte32[i] != 0; i += N) {
const hn::Vec<D> lte_vec = hn::Load(d, zero_lte32 + i);
const hn::Vec<D> gte_vec = hn::Load(d, zero_gte32 + i);
const intptr_t idx = hn::FindFirstTrue(
@@ -399,12 +424,12 @@ int8_t CodepointWidth32(D d, T input) {
{
constexpr T eaw_gte_min =
*std::min_element(eaw_gte32, eaw_gte32 + std::size(eaw_gte32));
array_min(eaw_gte32);
constexpr T eaw_lte_max =
*std::max_element(eaw_lte32, eaw_lte32 + std::size(eaw_lte32));
array_max(eaw_lte32);
if (input >= eaw_gte_min && input <= eaw_lte_max) {
size_t i = 0;
for (; i + N <= std::size(eaw_lte32) && eaw_lte32[i] != 0; i += N) {
for (; i + N <= array_size(eaw_lte32) && eaw_lte32[i] != 0; i += N) {
const hn::Vec<D> lte_vec = hn::Load(d, eaw_lte32 + i);
const hn::Vec<D> gte_vec = hn::Load(d, eaw_gte32 + i);
const intptr_t idx = hn::FindFirstTrue(
@@ -418,12 +443,12 @@ int8_t CodepointWidth32(D d, T input) {
{
constexpr T nsm_gte_min =
*std::min_element(nsm_gte32, nsm_gte32 + std::size(nsm_gte32));
array_min(nsm_gte32);
constexpr T nsm_lte_max =
*std::max_element(nsm_lte32, nsm_lte32 + std::size(nsm_lte32));
array_max(nsm_lte32);
if (input >= nsm_gte_min && input <= nsm_lte_max) {
size_t i = 0;
for (; i + N <= std::size(nsm_lte32) && nsm_lte32[i] != 0; i += N) {
for (; i + N <= array_size(nsm_lte32) && nsm_lte32[i] != 0; i += N) {
const hn::Vec<D> lte_vec = hn::Load(d, nsm_lte32 + i);
const hn::Vec<D> gte_vec = hn::Load(d, nsm_gte32 + i);
const intptr_t idx = hn::FindFirstTrue(

View File

@@ -6,8 +6,6 @@
#include <simd/index_of.h>
#include <optional>
HWY_BEFORE_NAMESPACE();
namespace ghostty {
namespace HWY_NAMESPACE {

View File

@@ -7,8 +7,7 @@
#include <hwy/highway.h>
#include <cstddef>
#include <optional>
#include <stddef.h>
HWY_BEFORE_NAMESPACE();
namespace ghostty {
@@ -16,12 +15,16 @@ namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;
// Sentinel value returned by IndexOfChunk when no match is found.
static constexpr size_t kNotFound = static_cast<size_t>(-1);
// Return the index of the first occurrence of `needle` in `input`, where
// the input and needle are already loaded into vectors.
// the input and needle are already loaded into vectors. Returns kNotFound
// if no match is found.
template <class D, typename T = hn::TFromD<D>>
std::optional<size_t> IndexOfChunk(D d,
hn::Vec<D> needle_vec,
hn::Vec<D> input_vec) {
size_t IndexOfChunk(D d,
hn::Vec<D> needle_vec,
hn::Vec<D> input_vec) {
// Compare the input vector with the needle vector. This produces
// a vector where each lane is 0xFF if the corresponding lane in
// `input_vec` is equal to the corresponding lane in `needle_vec`.
@@ -32,9 +35,9 @@ std::optional<size_t> IndexOfChunk(D d,
// If we found a match, return the index into the input.
if (pos >= 0) {
return std::optional<size_t>(static_cast<size_t>(pos));
return static_cast<size_t>(pos);
} else {
return std::nullopt;
return kNotFound;
}
}
@@ -58,8 +61,9 @@ size_t IndexOfImpl(D d, T needle, const T* HWY_RESTRICT input, size_t count) {
for (; i + N <= count; i += N) {
// Load the N elements from our input into a vector and check the chunk.
const hn::Vec<D> input_vec = hn::LoadU(d, input + i);
if (auto pos = IndexOfChunk(d, needle_vec, input_vec)) {
return i + pos.value();
const size_t pos = IndexOfChunk(d, needle_vec, input_vec);
if (pos != kNotFound) {
return i + pos;
}
}

View File

@@ -5,8 +5,8 @@
#include <hwy/highway.h>
#include <simdutf.h>
#include <utf8.h>
#include <vector>
#include <stdlib.h>
#include <string.h>
#include <simd/index_of.h>
#include <simd/vt.h>
@@ -19,12 +19,144 @@ namespace hn = hwy::HWY_NAMESPACE;
using T = uint8_t;
// Compute the length of the maximal subpart of an ill-formed UTF-8
// subsequence starting at p[0], per Unicode Table 3-7 and the W3C
// "U+FFFD Substitution of Maximal Subparts" algorithm.
//
// The maximal subpart is the longest initial subsequence that is either:
//   (a) the start of a well-formed sequence, or
//   (b) a single byte.
// Each maximal subpart maps to exactly one U+FFFD.
//
// Parameters:
//   p   - pointer to the first byte of the subsequence.
//   len - number of readable bytes at p.
//
// Returns 0 when len is 0. Otherwise returns a value in [1, 4] that never
// exceeds len: 1 for any byte that cannot start a multi-byte sequence
// (anything outside C2..F4), else 1 plus the number of following bytes that
// fall in the continuation range Table 3-7 allows at that position. If every
// expected byte matches, the full sequence length is returned.
static size_t MaximalSubpart(const unsigned char* p, size_t len) {
  if (len == 0) return 0;

  const unsigned char b0 = p[0];

  // Continuation bytes (80-BF), overlong leads (C0-C1), or invalid (F5-FF):
  // each is its own maximal subpart of length 1. Bytes below 0x80 take this
  // path as well.
  if (b0 < 0xC2 || b0 > 0xF4) return 1;

  // Expected total sequence length implied by the lead byte.
  const size_t seq_len = b0 <= 0xDF ? 2 : (b0 <= 0xEF ? 3 : 4);

  // Per Table 3-7 only the FIRST continuation byte ever has a restricted
  // range; every later continuation byte is always 80..BF.
  unsigned char first_lo = 0x80;
  unsigned char first_hi = 0xBF;
  switch (b0) {
    case 0xE0: first_lo = 0xA0; break;  // excludes overlong 3-byte forms
    case 0xED: first_hi = 0x9F; break;  // excludes UTF-16 surrogates
    case 0xF0: first_lo = 0x90; break;  // excludes overlong 4-byte forms
    case 0xF4: first_hi = 0x8F; break;  // excludes values above U+10FFFF
    default: break;
  }

  // Extend the subpart for as long as continuation bytes match, stopping at
  // the end of the sequence or the end of the available input.
  size_t valid = 1;  // lead byte counts
  while (valid < seq_len && valid < len) {
    const unsigned char cb = p[valid];
    const unsigned char lo = valid == 1 ? first_lo : 0x80;
    const unsigned char hi = valid == 1 ? first_hi : 0xBF;
    if (cb < lo || cb > hi) break;
    ++valid;
  }

  // valid == seq_len means the bytes form a structurally complete sequence;
  // that length is returned as-is, with no further adjustment.
  return valid;
}
// Drop a trailing valid-but-incomplete UTF-8 sequence from the input.
//
// Returns the length of `input` that should be decoded now. That is `len`
// itself unless the input ends in a sequence that is well-formed so far
// (every byte matches its Table 3-7 range) but is missing bytes; in that
// case the returned length stops just before that sequence's lead byte.
// Invalid lead bytes (C0, C1, F5-FF) or mismatched continuations are NOT
// trimmed, so they remain in the decoded range.
static size_t TrimValidPartialUTF8(const uint8_t* input, size_t len) {
  // A UTF-8 sequence is at most 4 bytes, so only the last 4 bytes (or
  // fewer, for short input) can belong to an unfinished sequence. An empty
  // input yields an empty window and falls straight through to `return len`.
  const size_t window = len < 4 ? len : 4;

  // Walk backwards from the final byte; `tail` is the number of bytes from
  // the byte being examined through the end of the input.
  for (size_t tail = 1; tail <= window; ++tail) {
    const size_t lead_idx = len - tail;
    const unsigned char lead = input[lead_idx];

    // Continuation bytes may belong to the partial sequence: keep looking
    // for the byte that starts it.
    const bool is_continuation = (lead & 0xC0) == 0x80;
    if (is_continuation) continue;

    // First non-continuation byte. Only a valid multi-byte lead (C2-F4) can
    // start a sequence worth trimming; anything else (ASCII, C0, C1, F5-FF)
    // is left in place.
    const bool is_multibyte_lead = lead >= 0xC2 && lead <= 0xF4;
    if (!is_multibyte_lead) return len;

    // Total sequence length announced by the lead byte.
    const size_t expected = lead <= 0xDF ? 2 : (lead <= 0xEF ? 3 : 4);

    // All announced bytes are present: the sequence is not partial.
    if (tail >= expected) return len;

    // Trim only when every trailing byte is part of the valid prefix;
    // an ill-formed tail is left untrimmed.
    return MaximalSubpart(input + lead_idx, tail) == tail ? lead_idx : len;
  }

  // The whole window was continuation bytes; nothing to trim.
  return len;
}
// Decode the UTF-8 text in input into output. Returns the number of decoded
// characters. This function assumes output is large enough.
//
// This function handles malformed UTF-8 sequences by inserting a
// replacement character (U+FFFD) and continuing to decode. This function
// will consume the entire input no matter what.
// replacement character (U+FFFD) following the W3C/Unicode "U+FFFD
// Substitution of Maximal Subparts" algorithm and continuing to decode.
// This function will consume the entire input no matter what.
size_t DecodeUTF8(const uint8_t* HWY_RESTRICT input,
size_t count,
char32_t* output) {
@@ -34,27 +166,38 @@ size_t DecodeUTF8(const uint8_t* HWY_RESTRICT input,
return 0;
}
// Assume no errors for fast path.
const size_t decoded = simdutf::convert_utf8_to_utf32(
reinterpret_cast<const char*>(input), count, output);
if (decoded > 0) {
return decoded;
// Decode UTF-8 to UTF-32, replacing invalid sequences with U+FFFD.
const char* in = reinterpret_cast<const char*>(input);
size_t remaining = count;
char32_t* out = output;
while (remaining > 0) {
auto r = simdutf::convert_utf8_to_utf32_with_errors(in, remaining, out);
// If the decode was a full success then we're done!
if (r.error == simdutf::SUCCESS) {
out += r.count;
break;
}
// On error, r.count is the input byte position of the error.
// The output buffer is already written up to that point, but
// we need count_utf8 to find how many char32_t that produced.
out += simdutf::count_utf8(in, r.count);
// Compute the maximal subpart at the error position and emit
// a single U+FFFD for it.
const unsigned char* err_pos =
reinterpret_cast<const unsigned char*>(in + r.count);
size_t err_remaining = remaining - r.count;
size_t skip = r.count + MaximalSubpart(err_pos, err_remaining);
*out++ = 0xFFFD;
in += skip;
remaining -= skip;
}
// Errors in the UTF input, take a slow path and do a decode with
// replacement (with U+FFFD). Note that simdutf doesn't have a
// decode with replacement API:
// https://github.com/simdutf/simdutf/issues/147
//
// Because of this, we use a separate library with heap allocation
// that is much, much slower (the allocation is slower, the algorithm
// is slower, etc.) This is just so we have something that works.
// I want to replace this.
std::vector<char> replacement_result;
utf8::replace_invalid(input, input + count,
std::back_inserter(replacement_result), 0xFFFD);
return DecodeUTF8(reinterpret_cast<const uint8_t*>(replacement_result.data()),
replacement_result.size(), output);
return static_cast<size_t>(out - output);
}
/// Decode the UTF-8 text in input into output until an escape
@@ -86,16 +229,16 @@ size_t DecodeUTF8UntilControlSeqImpl(D d,
// If we don't have any escapes we keep going. We want to accumulate
// the largest possible valid UTF-8 sequence before decoding.
// TODO(mitchellh): benchmark this vs decoding every time
const auto esc_idx = IndexOfChunk(d, esc_vec, input_vec);
if (!esc_idx) {
const size_t esc_idx = IndexOfChunk(d, esc_vec, input_vec);
if (esc_idx == kNotFound) {
continue;
}
// We have an ESC char, decode up to this point. We start by assuming
// a valid UTF-8 sequence and slow-path into error handling if we find
// an invalid sequence.
*output_count = DecodeUTF8(input, i + esc_idx.value(), output);
return i + esc_idx.value();
*output_count = DecodeUTF8(input, i + esc_idx, output);
return i + esc_idx;
}
// If we have leftover input then we decode it one byte at a time (slow!)
@@ -106,21 +249,27 @@ size_t DecodeUTF8UntilControlSeqImpl(D d,
const hn::Vec<D1> esc1 = Set(d1, hn::GetLane(esc_vec));
for (; i < count; ++i) {
const hn::Vec<D1> input_vec = hn::LoadU(d1, input + i);
const auto esc_idx = IndexOfChunk(d1, esc1, input_vec);
if (!esc_idx) {
const size_t esc_idx = IndexOfChunk(d1, esc1, input_vec);
if (esc_idx == kNotFound) {
continue;
}
*output_count = DecodeUTF8(input, i + esc_idx.value(), output);
return i + esc_idx.value();
*output_count = DecodeUTF8(input, i + esc_idx, output);
return i + esc_idx;
}
}
// If we reached this point, it's possible for our input to have an
// incomplete sequence because we're consuming the full input. We need
// to trim any incomplete sequences from the end of the input.
const size_t trimmed_len =
simdutf::trim_partial_utf8(reinterpret_cast<const char*>(input), i);
//
// We use our own trim instead of simdutf::trim_partial_utf8 because
// we only want to trim sequences that are valid-so-far (true partial
// sequences that may be completed by future input). Invalid bytes
// like C0, C1, F5-FF should NOT be trimmed — they should be passed
// through to DecodeUTF8 which will replace them with U+FFFD per the
// maximal subpart algorithm.
const size_t trimmed_len = TrimValidPartialUTF8(input, i);
*output_count = DecodeUTF8(input, trimmed_len, output);
return trimmed_len;
}

View File

@@ -45,36 +45,79 @@ fn utf8DecodeUntilControlSeqScalar(
const idx = indexOf(input, 0x1B) orelse input.len;
const decode = input[0..idx];
// Go through and decode one item at a time.
// Go through and decode one item at a time, following the W3C/Unicode
// "U+FFFD Substitution of Maximal Subparts" algorithm for ill-formed
// subsequences.
var decode_offset: usize = 0;
var decode_count: usize = 0;
while (decode_offset < decode.len) {
const decode_rem = decode[decode_offset..];
const cp_len = std.unicode.utf8ByteSequenceLength(decode_rem[0]) catch {
// Note, this is matching our SIMD behavior, but it is admittedly
// a bit weird. See our "decode invalid leading byte" test too.
// SIMD should be our source of truth then we copy behavior here.
break;
};
const b0 = decode[decode_offset];
// If we don't have that number of bytes available. we finish. We
// assume this is a partial input and we defer to the future.
if (decode_rem.len < cp_len) break;
// We have the bytes available, so move forward
const cp_bytes = decode_rem[0..cp_len];
decode_offset += cp_len;
if (std.unicode.utf8Decode(cp_bytes)) |cp| {
output[decode_count] = @intCast(cp);
// ASCII fast path
if (b0 < 0x80) {
output[decode_count] = b0;
decode_count += 1;
} else |_| {
// If decoding failed, we replace the leading byte with the
// replacement char and then continue decoding after that
// byte. This matches the SIMD behavior and is tested by the
// "invalid UTF-8" tests.
decode_offset += 1;
continue;
}
// Continuation byte (80-BF) or invalid byte (C0-C1, F5-FF)
// as lead: each is its own maximal subpart → one FFFD per byte.
if (b0 < 0xC2 or b0 > 0xF4) {
output[decode_count] = 0xFFFD;
decode_count += 1;
decode_offset -= cp_len - 1;
decode_offset += 1;
continue;
}
// Multi-byte sequence. Determine expected length and the valid
// range for each continuation byte per Unicode Table 3-7.
const seq = utf8SeqInfo(b0);
// Check how many continuation bytes form a valid prefix (the
// maximal subpart). We check each byte against its specific
// valid range.
var valid: usize = 1; // lead byte is valid
for (0..seq.len - 1) |ci| {
if (decode_offset + valid >= decode.len) {
// Truncated at end of buffer: treat as incomplete
// input that may be completed later. Stop decoding
// without consuming these bytes.
return .{
.consumed = decode_offset,
.decoded = decode_count,
};
}
const cb = decode[decode_offset + valid];
if (cb < seq.ranges[ci][0] or cb > seq.ranges[ci][1]) {
// Byte doesn't match expected range. The maximal
// subpart ends here.
break;
}
valid += 1;
}
if (valid == seq.len) {
// Full sequence present and structurally valid. Decode it.
// (Structural validity per Table 3-7 guarantees decode success.)
const cp_bytes = decode[decode_offset..][0..seq.len];
if (std.unicode.utf8Decode(cp_bytes)) |cp| {
output[decode_count] = @intCast(cp);
decode_count += 1;
decode_offset += seq.len;
} else |_| {
// Should not happen given Table 3-7 validation, but
// be safe: emit FFFD for the lead byte.
output[decode_count] = 0xFFFD;
decode_count += 1;
decode_offset += 1;
}
} else {
// Incomplete/ill-formed: the maximal subpart (valid bytes)
// maps to a single FFFD.
output[decode_count] = 0xFFFD;
decode_count += 1;
decode_offset += valid;
}
}
@@ -84,6 +127,27 @@ fn utf8DecodeUntilControlSeqScalar(
};
}
/// Shape of a multi-byte UTF-8 sequence as selected by its lead byte.
const Utf8SeqInfo = struct {
    /// Total number of bytes in the sequence, lead byte included.
    len: u3,
    /// Inclusive `{ low, high }` bounds for each continuation byte, in
    /// order. Slots past the last continuation byte hold `{ 0, 0 }`.
    ranges: [3][2]u8,
};

/// Returns the expected byte count and valid continuation byte ranges
/// for a UTF-8 sequence based on its lead byte, per Unicode Table 3-7.
///
/// `lead` must be in 0xC2...0xF4; any other value hits `unreachable`.
fn utf8SeqInfo(lead: u8) Utf8SeqInfo {
    // Generic continuation range, and the filler for unused slots.
    const cont: [2]u8 = .{ 0x80, 0xBF };
    const none: [2]u8 = .{ 0, 0 };

    // The lead byte alone fixes the sequence length.
    const len: u3 = switch (lead) {
        0xC2...0xDF => 2,
        0xE0...0xEF => 3,
        0xF0...0xF4 => 4,
        else => unreachable,
    };

    // Only the first continuation byte ever has a narrowed range, and
    // only for these four lead bytes.
    const first: [2]u8 = switch (lead) {
        0xE0 => .{ 0xA0, 0xBF },
        0xED => .{ 0x80, 0x9F },
        0xF0 => .{ 0x90, 0xBF },
        0xF4 => .{ 0x80, 0x8F },
        else => cont,
    };

    return .{
        .len = len,
        .ranges = .{
            first,
            if (len >= 3) cont else none,
            if (len >= 4) cont else none,
        },
    };
}
test "decode no escape" {
const testing = std.testing;
@@ -131,7 +195,7 @@ test "decode incomplete UTF-8" {
var output: [64]u32 = undefined;
// 2-byte
// 2-byte truncated at end of buffer
{
const str = "hello\xc2";
try testing.expectEqual(DecodeResult{
@@ -140,16 +204,18 @@ test "decode incomplete UTF-8" {
}, utf8DecodeUntilControlSeq(str, &output));
}
// 3-byte
// 3-byte: \xe0 expects A0-BF next, but \x00 is not in range.
// \xe0 is a maximal subpart of length 1 → FFFD, then \x00 is ASCII NUL.
{
const str = "hello\xe0\x00";
try testing.expectEqual(DecodeResult{
.consumed = 5,
.decoded = 5,
}, utf8DecodeUntilControlSeq(str, &output));
const result = utf8DecodeUntilControlSeq(str, &output);
try testing.expectEqual(@as(usize, 7), result.consumed);
try testing.expectEqual(@as(usize, 7), result.decoded);
try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
try testing.expectEqual(@as(u32, 0x00), output[6]);
}
// 4-byte
// 4-byte truncated at end of buffer (F0 90 is valid so far)
{
const str = "hello\xf0\x90";
try testing.expectEqual(DecodeResult{
@@ -178,19 +244,248 @@ test "decode invalid UTF-8" {
try testing.expectEqual(@as(u32, 0x01), output[6]);
}
// This is testing our current behavior so that we know we have to handle
// this case in terminal/stream.zig. If we change this behavior, we can
// remove the special handling in terminal/stream.zig.
test "decode invalid leading byte isn't consumed or replaced" {
// Per the maximal subpart spec, bytes F5-FF are each replaced with FFFD.
test "decode invalid leading byte is replaced" {
const testing = std.testing;
var output: [64]u32 = undefined;
{
const str = "hello\xFF";
try testing.expectEqual(DecodeResult{
.consumed = 5,
.decoded = 5,
}, utf8DecodeUntilControlSeq(str, &output));
const result = utf8DecodeUntilControlSeq(str, &output);
try testing.expectEqual(@as(usize, 6), result.consumed);
try testing.expectEqual(@as(usize, 6), result.decoded);
try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
}
}
test "decode invalid continuation in 3-byte sequence" {
    const testing = std.testing;
    var output: [64]u32 = undefined;

    // 0xE2 opens a 3-byte sequence but 0x28 ('(') is not a continuation
    // byte, so the lead byte is replaced and '(' decodes on its own.
    const result = utf8DecodeUntilControlSeq("hello\xe2\x28world", &output);

    // "hello" + U+FFFD + "(" + "world" = 12 codepoints
    try testing.expectEqual(@as(usize, 12), result.decoded);

    // Spot-check the codepoints right after "hello".
    const expected = [_]u32{ 0xFFFD, '(', 'w' };
    for (expected, output[5..8]) |want, got| {
        try testing.expectEqual(want, got);
    }
}
test "decode invalid continuation in 4-byte sequence" {
    const testing = std.testing;
    var output: [64]u32 = undefined;

    // F0 90 is a valid prefix of a 4-byte sequence, but 0x28 breaks it.
    // The maximal subpart F0 90 (length 2) becomes a single U+FFFD and
    // '(' then decodes normally.
    const result = utf8DecodeUntilControlSeq("hello\xf0\x90\x28world", &output);

    // "hello" + U+FFFD + "(" + "world" = 12 codepoints
    try testing.expectEqual(@as(usize, 12), result.decoded);

    // Spot-check the codepoints right after "hello".
    const expected = [_]u32{ 0xFFFD, '(', 'w' };
    for (expected, output[5..8]) |want, got| {
        try testing.expectEqual(want, got);
    }
}
test "decode multiple consecutive invalid bytes" {
    const testing = std.testing;
    var output: [64]u32 = undefined;

    // Both inputs wrap two invalid bytes between 'a' and 'b':
    //   - 80 80: each lone continuation byte is its own maximal subpart.
    //   - C0 C0: C0 is an invalid lead byte (< C2), one U+FFFD per byte.
    const inputs = [_][]const u8{ "a\x80\x80b", "a\xc0\xc0b" };

    // "a" + U+FFFD + U+FFFD + "b" = 4 codepoints in both cases.
    const expected = [_]u32{ 'a', 0xFFFD, 0xFFFD, 'b' };

    for (inputs) |str| {
        const result = utf8DecodeUntilControlSeq(str, &output);
        try testing.expectEqual(@as(usize, 4), result.decoded);
        try testing.expectEqualSlices(u32, &expected, output[0..4]);
    }
}
test "decode unexpected continuation byte as lead" {
    const testing = std.testing;
    var output: [64]u32 = undefined;

    // 0x80 is a continuation byte sitting where a lead byte is expected.
    const result = utf8DecodeUntilControlSeq("a\x80b", &output);

    // "a" + U+FFFD + "b" = 3 codepoints
    try testing.expectEqual(@as(usize, 3), result.decoded);

    const expected = [_]u32{ 'a', 0xFFFD, 'b' };
    try testing.expectEqualSlices(u32, &expected, output[0..3]);
}
test "decode overlong 2-byte encoding" {
    const testing = std.testing;
    var output: [64]u32 = undefined;

    // C0 AF: C0 is an invalid lead byte (< C2) and AF is then a lone
    // continuation byte. Per Table 3-8: C0 AF → FFFD FFFD.
    const result = utf8DecodeUntilControlSeq("a\xc0\xafb", &output);

    // "a" + U+FFFD + U+FFFD + "b" = 4 codepoints
    try testing.expectEqual(@as(usize, 4), result.decoded);

    const expected = [_]u32{ 'a', 0xFFFD, 0xFFFD, 'b' };
    try testing.expectEqualSlices(u32, &expected, output[0..4]);
}
test "decode surrogate half" {
    const testing = std.testing;
    var output: [64]u32 = undefined;

    // ED A0 80 would encode U+D800, a surrogate. Per Table 3-7 the byte
    // after ED must be 80-9F; A0 is out of range, so ED alone is a
    // maximal subpart of length 1. A0 and 80 are then lone continuation
    // bytes. Per Table 3-9: ED A0 80 → FFFD FFFD FFFD.
    const result = utf8DecodeUntilControlSeq("a\xed\xa0\x80b", &output);

    // "a" + 3 × U+FFFD + "b" = 5 codepoints
    try testing.expectEqual(@as(usize, 5), result.decoded);

    const expected = [_]u32{ 'a', 0xFFFD, 0xFFFD, 0xFFFD, 'b' };
    try testing.expectEqualSlices(u32, &expected, output[0..5]);
}
test "decode valid multibyte surrounded by invalid" {
    const testing = std.testing;
    var output: [64]u32 = undefined;

    // C3 A9 is é (U+00E9), with a stray continuation byte on each side.
    const result = utf8DecodeUntilControlSeq("\x80\xc3\xa9\x80", &output);

    // U+FFFD + é + U+FFFD = 3 codepoints
    try testing.expectEqual(@as(usize, 3), result.decoded);

    const expected = [_]u32{ 0xFFFD, 0x00E9, 0xFFFD };
    try testing.expectEqualSlices(u32, &expected, output[0..3]);
}
test "decode invalid byte before escape" {
    const testing = std.testing;
    var output: [64]u32 = undefined;

    // An invalid byte directly before ESC: it must be replaced, and
    // decoding must stop at the ESC without consuming it.
    const result = utf8DecodeUntilControlSeq("hi\x80\x1b[0m", &output);
    try testing.expectEqual(@as(usize, 3), result.consumed);
    try testing.expectEqual(@as(usize, 3), result.decoded);

    const expected = [_]u32{ 'h', 'i', 0xFFFD };
    try testing.expectEqualSlices(u32, &expected, output[0..3]);
}
// Unicode Table 3-8: U+FFFD for Non-Shortest Form Sequences
//   Bytes:  C0   AF   E0   80   BF   F0   81   82   41
//   Output: FFFD FFFD FFFD FFFD FFFD FFFD FFFD FFFD 0041
test "Table 3-8: non-shortest form sequences" {
    const testing = std.testing;
    var output: [64]u32 = undefined;

    const result = utf8DecodeUntilControlSeq(
        "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41",
        &output,
    );
    try testing.expectEqual(@as(usize, 9), result.consumed);
    try testing.expectEqual(@as(usize, 9), result.decoded);

    // Eight replacement characters, then the trailing 'A'.
    const expected = ([_]u32{0xFFFD} ** 8) ++ [_]u32{0x41};
    try testing.expectEqualSlices(u32, &expected, output[0..9]);
}
// Unicode Table 3-9: U+FFFD for Ill-Formed Sequences for Surrogates
//   Bytes:  ED   A0   80   ED   BF   BF   ED   AF   41
//   Output: FFFD FFFD FFFD FFFD FFFD FFFD FFFD FFFD 0041
test "Table 3-9: surrogate sequences" {
    const testing = std.testing;
    var output: [64]u32 = undefined;

    const result = utf8DecodeUntilControlSeq(
        "\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41",
        &output,
    );
    try testing.expectEqual(@as(usize, 9), result.consumed);
    try testing.expectEqual(@as(usize, 9), result.decoded);

    // Eight replacement characters, then the trailing 'A'.
    const expected = ([_]u32{0xFFFD} ** 8) ++ [_]u32{0x41};
    try testing.expectEqualSlices(u32, &expected, output[0..9]);
}
// Unicode Table 3-10: U+FFFD for Other Ill-Formed Sequences
//   Bytes:  F4   91   92   93   FF   41   80   BF   42
//   Output: FFFD FFFD FFFD FFFD FFFD 0041 FFFD FFFD 0042
test "Table 3-10: other ill-formed sequences" {
    const testing = std.testing;
    var output: [64]u32 = undefined;

    const result = utf8DecodeUntilControlSeq(
        "\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42",
        &output,
    );
    try testing.expectEqual(@as(usize, 9), result.consumed);
    try testing.expectEqual(@as(usize, 9), result.decoded);

    // One output codepoint per input byte.
    const expected = [_]u32{
        0xFFFD, // F4
        0xFFFD, // 91
        0xFFFD, // 92
        0xFFFD, // 93
        0xFFFD, // FF
        0x0041, // 41
        0xFFFD, // 80
        0xFFFD, // BF
        0x0042, // 42
    };
    try testing.expectEqualSlices(u32, &expected, output[0..9]);
}
// Unicode Table 3-11: U+FFFD for Truncated Sequences
//   Bytes:  E1 80 | E2 | F0 91 92 | F1 BF | 41
//   Output: FFFD    FFFD FFFD       FFFD    0041
test "Table 3-11: truncated sequences" {
    const testing = std.testing;
    var output: [64]u32 = undefined;

    const result = utf8DecodeUntilControlSeq(
        "\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41",
        &output,
    );
    try testing.expectEqual(@as(usize, 9), result.consumed);
    try testing.expectEqual(@as(usize, 5), result.decoded);

    // Each truncated prefix collapses into a single replacement.
    const expected = [_]u32{
        0xFFFD, // E1 80 (truncated 3-byte)
        0xFFFD, // E2 (truncated 3-byte, next byte F0 not continuation)
        0xFFFD, // F0 91 92 (truncated 4-byte)
        0xFFFD, // F1 BF (truncated 4-byte, next byte 41 not continuation)
        0x0041, // 41
    };
    try testing.expectEqualSlices(u32, &expected, output[0..5]);
}