mirror of
https://github.com/ghostty-org/ghostty.git
synced 2026-05-24 05:40:15 +00:00
libghostty: Remove all libc++ and libc++ ABI dependencies (#12291)
This updates simdutf to my fork which has a SIMDUTF_NO_LIBCXX option that removes all libc++ and libc++ ABI dependencies. The plan is to open an upstream PR with this, but I want to verify it here first.

From there, the hand-written simd code we have has been updated to also no longer use any libc++ features. Part of this required removing utfcpp since it depended on libc++ (`<iterator>`).

libghostty-vt now only depends on libc.

## Benchmark Results

| Corpus | Current `HEAD` median | `main` median | Delta vs `main` | Notes |
| --- | ---: | ---: | ---: | --- |
| `valid-mixed-1g-seed1.bin` | `9.245s` | `9.111s` | `1.5%` slower | Near tie; `main` remains slightly faster on fully valid input |
| `malformed-mixed-1g-seed1-rate0.005.bin` | `9.251s` | `12.705s` | `37.3%` faster | Large improvement on malformed UTF-8 input |

Approximate throughput from the medians:

- Valid corpus: current `HEAD` `110.8 MiB/s`, `main` `112.4 MiB/s`
- Malformed corpus: current `HEAD` `110.7 MiB/s`, `main` `80.6 MiB/s`
This commit is contained in:
21
.github/workflows/test.yml
vendored
21
.github/workflows/test.yml
vendored
@@ -419,8 +419,14 @@ jobs:
|
||||
echo "Libs: $(pkg-config --libs libghostty-vt)"
|
||||
echo "Static: $(pkg-config --libs --static libghostty-vt)"
|
||||
|
||||
# Libs.private must include the C++ standard library
|
||||
pkg-config --libs --static libghostty-vt | grep -q -- '-lc++'
|
||||
# Libs.private must NOT include the C++ runtime libraries (all
|
||||
# vendored C++ deps are built in no-libcxx mode).
|
||||
! pkg-config --libs --static libghostty-vt | grep -qE -- '-lc\+\+|-lc\+\+abi'
|
||||
|
||||
- name: Verify shared library has no libc++ dependency
|
||||
run: |
|
||||
ldd zig-out/lib/libghostty-vt.so.0.1.0
|
||||
! ldd zig-out/lib/libghostty-vt.so.0.1.0 2>/dev/null | grep -qE 'libc\+\+|libc\+\+abi'
|
||||
|
||||
- name: Verify static archive contains SIMD deps
|
||||
run: |
|
||||
@@ -452,16 +458,15 @@ jobs:
|
||||
- name: Test static link via pkg-config
|
||||
run: |
|
||||
export PKG_CONFIG_PATH="$PWD/zig-out/share/pkgconfig"
|
||||
# The static library is compiled with LLVM libc++ (not GNU
|
||||
# libstdc++), so linking requires a libc++-compatible toolchain.
|
||||
# zig cc, clang, or gcc with libc++-dev installed all work.
|
||||
nix develop -c zig cc -o /tmp/test_static /tmp/test_libghostty_vt.c \
|
||||
# The static archive must link cleanly into a plain C program
|
||||
# without any extra C++ runtime flags.
|
||||
nix develop -c cc -o /tmp/test_static /tmp/test_libghostty_vt.c \
|
||||
$(pkg-config --cflags libghostty-vt) \
|
||||
"$PWD/zig-out/lib/libghostty-vt.a" \
|
||||
$(pkg-config --libs-only-l --static libghostty-vt | sed 's/-lghostty-vt//')
|
||||
/tmp/test_static
|
||||
# Verify it's truly statically linked (no libghostty-vt.so dependency)
|
||||
! ldd /tmp/test_static 2>/dev/null | grep -q libghostty-vt
|
||||
# Verify it doesn't depend on the shared lib or a C++ runtime.
|
||||
! ldd /tmp/test_static 2>/dev/null | grep -qE 'libghostty-vt|libc\+\+|libc\+\+abi'
|
||||
|
||||
# Test system integration: rebuild with -Dsystem-simdutf=true so
|
||||
# simdutf comes from the system instead of being vendored. This
|
||||
|
||||
@@ -178,9 +178,8 @@ add_dependencies(ghostty-vt zig_build_lib_vt)
|
||||
# Static
|
||||
#
|
||||
# On Linux and macOS, the static library is a fat archive that bundles
|
||||
# the vendored SIMD dependencies (highway, simdutf, utfcpp). Consumers
|
||||
# only need to link libc and libc++ (LLVM's C++ runtime, not GNU
|
||||
# libstdc++). Use zig cc, clang, or any toolchain with libc++ support.
|
||||
# the vendored SIMD dependencies (highway, simdutf). Consumers
|
||||
# only need to link libc.
|
||||
#
|
||||
# On Windows, the SIMD dependencies are not bundled and must be linked
|
||||
# separately.
|
||||
@@ -349,11 +348,7 @@ function(ghostty_vt_add_target)
|
||||
)
|
||||
if(_GVT_ZIG_TARGET MATCHES "windows")
|
||||
set_target_properties(${_static_target} PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "c++;ntdll;kernel32"
|
||||
)
|
||||
else()
|
||||
set_target_properties(${_static_target} PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "c++"
|
||||
INTERFACE_LINK_LIBRARIES "ntdll;kernel32"
|
||||
)
|
||||
endif()
|
||||
add_dependencies(${_static_target} ${_build_target})
|
||||
|
||||
@@ -76,7 +76,6 @@
|
||||
.opengl = .{ .path = "./pkg/opengl", .lazy = true },
|
||||
.sentry = .{ .path = "./pkg/sentry", .lazy = true },
|
||||
.simdutf = .{ .path = "./pkg/simdutf", .lazy = true },
|
||||
.utfcpp = .{ .path = "./pkg/utfcpp", .lazy = true },
|
||||
.wuffs = .{ .path = "./pkg/wuffs", .lazy = true },
|
||||
.zlib = .{ .path = "./pkg/zlib", .lazy = true },
|
||||
|
||||
|
||||
5
build.zig.zon.json
generated
5
build.zig.zon.json
generated
@@ -109,11 +109,6 @@
|
||||
"url": "https://deps.files.ghostty.org/spirv_cross-1220fb3b5586e8be67bc3feb34cbe749cf42a60d628d2953632c2f8141302748c8da.tar.gz",
|
||||
"hash": "sha256-tStvz8Ref6abHwahNiwVVHNETizAmZVVaxVsU7pmV+M="
|
||||
},
|
||||
"N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH": {
|
||||
"name": "utfcpp",
|
||||
"url": "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz",
|
||||
"hash": "sha256-/8ZooxDndgfTk/PBizJxXyI9oerExNbgV5oR345rWc8="
|
||||
},
|
||||
"uucode-0.1.0-ZZjBPj96QADXyt5sqwBJUnhaDYs_qBeeKijZvlRa0eqM": {
|
||||
"name": "uucode",
|
||||
"url": "git+https://github.com/jacobsandlund/uucode#5f05f8f83a75caea201f12cc8ea32a2d82ea9732",
|
||||
|
||||
8
build.zig.zon.nix
generated
8
build.zig.zon.nix
generated
@@ -258,14 +258,6 @@ in
|
||||
hash = "sha256-tStvz8Ref6abHwahNiwVVHNETizAmZVVaxVsU7pmV+M=";
|
||||
};
|
||||
}
|
||||
{
|
||||
name = "N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH";
|
||||
path = fetchZigArtifact {
|
||||
name = "utfcpp";
|
||||
url = "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz";
|
||||
hash = "sha256-/8ZooxDndgfTk/PBizJxXyI9oerExNbgV5oR345rWc8=";
|
||||
};
|
||||
}
|
||||
{
|
||||
name = "uucode-0.1.0-ZZjBPj96QADXyt5sqwBJUnhaDYs_qBeeKijZvlRa0eqM";
|
||||
path = fetchZigArtifact {
|
||||
|
||||
1
build.zig.zon.txt
generated
1
build.zig.zon.txt
generated
@@ -20,7 +20,6 @@ https://deps.files.ghostty.org/pixels-12207ff340169c7d40c570b4b6a97db614fe47e0d8
|
||||
https://deps.files.ghostty.org/plasma_wayland_protocols-12207e0851c12acdeee0991e893e0132fc87bb763969a585dc16ecca33e88334c566.tar.gz
|
||||
https://deps.files.ghostty.org/sentry-1220446be831adcca918167647c06c7b825849fa3fba5f22da394667974537a9c77e.tar.gz
|
||||
https://deps.files.ghostty.org/spirv_cross-1220fb3b5586e8be67bc3feb34cbe749cf42a60d628d2953632c2f8141302748c8da.tar.gz
|
||||
https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz
|
||||
https://deps.files.ghostty.org/uucode-0.2.0-ZZjBPqZVVABQepOqZHR7vV_NcaN-wats0IB6o-Exj6m9.tar.gz
|
||||
https://deps.files.ghostty.org/vaxis-7dbb9fd3122e4ffad262dd7c151d80d863b68558.tar.gz
|
||||
https://deps.files.ghostty.org/wayland-9cb3d7aa9dc995ffafdbdef7ab86a949d0fb0e7d.tar.gz
|
||||
|
||||
@@ -14,6 +14,7 @@ pub fn build(b: *std.Build) void {
|
||||
.root = b.path("src"),
|
||||
.files = &.{"main.cpp"},
|
||||
});
|
||||
exe_mod.link_libcpp = true;
|
||||
|
||||
// You'll want to use a lazy dependency here so that ghostty is only
|
||||
// downloaded if you actually need it.
|
||||
|
||||
@@ -8,10 +8,7 @@ let package = Package(
|
||||
.executableTarget(
|
||||
name: "swift-vt-xcframework",
|
||||
dependencies: ["GhosttyVt"],
|
||||
path: "Sources",
|
||||
linkerSettings: [
|
||||
.linkedLibrary("c++"),
|
||||
]
|
||||
path: "Sources"
|
||||
),
|
||||
.binaryTarget(
|
||||
name: "GhosttyVt",
|
||||
|
||||
@@ -131,12 +131,6 @@
|
||||
"dest": "vendor/p/N-V-__8AANb6pwD7O1WG6L5nvD_rNMvnSc9Cpg1ijSlTYywv",
|
||||
"sha256": "b52b6fcfc45e7fa69b1f06a1362c155473444e2cc09995556b156c53ba6657e3"
|
||||
},
|
||||
{
|
||||
"type": "archive",
|
||||
"url": "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz",
|
||||
"dest": "vendor/p/N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH",
|
||||
"sha256": "ffc668a310e77607d393f3c18b32715f223da1eac4c4d6e0579a11df8e6b59cf"
|
||||
},
|
||||
{
|
||||
"type": "git",
|
||||
"url": "https://github.com/jacobsandlund/uucode",
|
||||
|
||||
@@ -79,6 +79,11 @@ elif [ "$1" != "--update" ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Fetch all dependencies (including lazy ones) into the global cache
|
||||
# so that zon2nix can find them when resolving transitive dependencies.
|
||||
# Otherwise, lazy dependencies that aren't unpacked will fail below.
|
||||
zig build --fetch=all
|
||||
|
||||
zon2nix "$BUILD_ZIG_ZON" --15 --nix "$WORK_DIR/build.zig.zon.nix" --txt "$WORK_DIR/build.zig.zon.txt" --json "$WORK_DIR/build.zig.zon.json" --flatpak "$WORK_DIR/zig-packages.json"
|
||||
alejandra --quiet "$WORK_DIR/build.zig.zon.nix"
|
||||
prettier --log-level warn --write "$WORK_DIR/build.zig.zon.json"
|
||||
|
||||
@@ -3,6 +3,7 @@ const std = @import("std");
|
||||
pub fn build(b: *std.Build) !void {
|
||||
const optimize = b.standardOptimizeOption(.{});
|
||||
const target = b.standardTargetOptions(.{});
|
||||
const no_libcxx = b.option(bool, "no_libcxx", "Set SIMDUTF_NO_LIBCXX to avoid libc++ dependency") orelse false;
|
||||
|
||||
const lib = b.addLibrary(.{
|
||||
.name = "simdutf",
|
||||
@@ -13,13 +14,15 @@ pub fn build(b: *std.Build) !void {
|
||||
.linkage = .static,
|
||||
});
|
||||
lib.linkLibC();
|
||||
// On MSVC, we must not use linkLibCpp because Zig unconditionally
|
||||
// passes -nostdinc++ and then adds its bundled libc++/libc++abi
|
||||
// include paths, which conflict with MSVC's own C++ runtime headers.
|
||||
// The MSVC SDK include directories (added via linkLibC) contain
|
||||
// both C and C++ headers, so linkLibCpp is not needed.
|
||||
if (target.result.abi != .msvc) {
|
||||
lib.linkLibCpp();
|
||||
if (!no_libcxx) {
|
||||
// On MSVC, we must not use linkLibCpp because Zig unconditionally
|
||||
// passes -nostdinc++ and then adds its bundled libc++/libc++abi
|
||||
// include paths, which conflict with MSVC's own C++ runtime headers.
|
||||
// The MSVC SDK include directories (added via linkLibC) contain
|
||||
// both C and C++ headers, so linkLibCpp is not needed.
|
||||
if (target.result.abi != .msvc) {
|
||||
lib.linkLibCpp();
|
||||
}
|
||||
}
|
||||
lib.addIncludePath(b.path("vendor"));
|
||||
|
||||
@@ -45,6 +48,13 @@ pub fn build(b: *std.Build) !void {
|
||||
"-fno-sanitize-trap=undefined",
|
||||
});
|
||||
|
||||
if (no_libcxx) {
|
||||
try flags.append(b.allocator, "-DSIMDUTF_NO_LIBCXX");
|
||||
try flags.append(b.allocator, "-fno-exceptions");
|
||||
try flags.append(b.allocator, "-fno-rtti");
|
||||
lib.root_module.addCMacro("SIMDUTF_NO_LIBCXX", "1");
|
||||
}
|
||||
|
||||
if (target.result.os.tag == .freebsd or target.result.abi == .musl) {
|
||||
try flags.append(b.allocator, "-fPIC");
|
||||
}
|
||||
|
||||
69481
pkg/simdutf/vendor/simdutf.cpp
vendored
69481
pkg/simdutf/vendor/simdutf.cpp
vendored
File diff suppressed because it is too large
Load Diff
11715
pkg/simdutf/vendor/simdutf.h
vendored
11715
pkg/simdutf/vendor/simdutf.h
vendored
File diff suppressed because it is too large
Load Diff
@@ -1,61 +0,0 @@
|
||||
const std = @import("std");
|
||||
|
||||
pub fn build(b: *std.Build) !void {
|
||||
const target = b.standardTargetOptions(.{});
|
||||
const optimize = b.standardOptimizeOption(.{});
|
||||
|
||||
const lib = b.addLibrary(.{
|
||||
.name = "utfcpp",
|
||||
.root_module = b.createModule(.{
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
}),
|
||||
.linkage = .static,
|
||||
});
|
||||
lib.linkLibC();
|
||||
|
||||
if (target.result.os.tag.isDarwin()) {
|
||||
const apple_sdk = @import("apple_sdk");
|
||||
try apple_sdk.addPaths(b, lib);
|
||||
}
|
||||
|
||||
if (target.result.abi.isAndroid()) {
|
||||
const android_ndk = @import("android_ndk");
|
||||
try android_ndk.addPaths(b, lib);
|
||||
}
|
||||
|
||||
var flags: std.ArrayList([]const u8) = .empty;
|
||||
defer flags.deinit(b.allocator);
|
||||
|
||||
lib.addCSourceFiles(.{
|
||||
.flags = flags.items,
|
||||
.files = &.{"empty.cc"},
|
||||
});
|
||||
|
||||
if (b.lazyDependency("utfcpp", .{})) |upstream| {
|
||||
lib.addIncludePath(upstream.path(""));
|
||||
lib.installHeadersDirectory(
|
||||
upstream.path("source"),
|
||||
"",
|
||||
.{ .include_extensions = &.{".h"} },
|
||||
);
|
||||
}
|
||||
|
||||
b.installArtifact(lib);
|
||||
|
||||
// {
|
||||
// const test_exe = b.addTest(.{
|
||||
// .name = "test",
|
||||
// .root_source_file = .{ .path = "main.zig" },
|
||||
// .target = target,
|
||||
// .optimize = optimize,
|
||||
// });
|
||||
// test_exe.linkLibrary(lib);
|
||||
//
|
||||
// var it = module.import_table.iterator();
|
||||
// while (it.next()) |entry| test_exe.root_module.addImport(entry.key_ptr.*, entry.value_ptr.*);
|
||||
// const tests_run = b.addRunArtifact(test_exe);
|
||||
// const test_step = b.step("test", "Run tests");
|
||||
// test_step.dependOn(&tests_run.step);
|
||||
// }
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
.{
|
||||
.name = .utfcpp,
|
||||
.version = "4.0.5",
|
||||
.fingerprint = 0xcd99aeb2334ae11a,
|
||||
.paths = .{""},
|
||||
.dependencies = .{
|
||||
// nemtrif/utfcpp
|
||||
.utfcpp = .{
|
||||
.url = "https://deps.files.ghostty.org/utfcpp-1220d4d18426ca72fc2b7e56ce47273149815501d0d2395c2a98c726b31ba931e641.tar.gz",
|
||||
.hash = "N-V-__8AAHffAgDU0YQmynL8K35WzkcnMUmBVQHQ0jlcKpjH",
|
||||
.lazy = true,
|
||||
},
|
||||
|
||||
.apple_sdk = .{ .path = "../apple-sdk" },
|
||||
.android_ndk = .{ .path = "../android-ndk" },
|
||||
},
|
||||
}
|
||||
@@ -1,2 +0,0 @@
|
||||
// Needed for Zig build to be happy
|
||||
void ghostty_utfcpp_stub() {}
|
||||
@@ -26,9 +26,23 @@ The benchmark tools are split into two roles:
|
||||
based on medians instead of single runs.
|
||||
- When comparing branches, keep all benchmark inputs and CLI flags the same,
|
||||
including terminal dimensions.
|
||||
- Never run multiple benchmarks in parallel on the same machine, as they will
|
||||
interfere with each other and produce unreliable results.
|
||||
|
||||
## Building
|
||||
|
||||
- Build benchmark tools with `zig build -Demit-bench`.
|
||||
- On macOS, prefer `zig build -Demit-bench -Demit-macos-app=false` unless the
|
||||
macOS app itself is part of the work.
|
||||
- Build benchmark tools with `zig build -Demit-bench -Doptimize=ReleaseFast`.
|
||||
- On macOS, add `-Demit-macos-app=false` to avoid building the macOS app.
|
||||
- Make sure you specify `-Doptimize=ReleaseFast` when building benchmarks,
|
||||
otherwise the debug build will be very slow and not representative of real
|
||||
performance.
|
||||
|
||||
## Comparing Branches
|
||||
|
||||
- When comparing branches, switch to the first branch, build the binary, then
|
||||
rename it, e.g. `zig-out/bin/ghostty-bench` to `zig-out/bin/ghostty-bench-branch1`.
|
||||
Replace `branch1` with a more descriptive name.
|
||||
- Then switch to the other branch, build it, and rename it to
|
||||
`zig-out/bin/ghostty-bench-branch2`. Replace `branch2` with a more descriptive name.
|
||||
- Then run all the benchmarks with `hyperfine`, comparing all of the binaries
|
||||
built this way.
|
||||
|
||||
@@ -346,11 +346,8 @@ fn combineArchives(
|
||||
}
|
||||
|
||||
/// Returns the Libs.private value for the pkg-config file.
|
||||
/// This includes the C++ standard library needed by SIMD code.
|
||||
///
|
||||
/// Zig compiles C++ code with LLVM's libc++ (not GNU libstdc++),
|
||||
/// so consumers linking the static library need a libc++-compatible
|
||||
/// toolchain: `zig cc`, `clang`, or GCC with `-lc++` installed.
|
||||
/// Vendored C++ dependencies are built in no-libcxx mode so consumers
|
||||
/// don't need libc++. System-provided simdutf still requires it.
|
||||
fn libsPrivate(
|
||||
zig: *const GhosttyZig,
|
||||
) []const u8 {
|
||||
|
||||
@@ -119,13 +119,14 @@ fn initVt(
|
||||
.target = cfg.target,
|
||||
.optimize = cfg.optimize,
|
||||
|
||||
// SIMD require libc/libcpp (both) but otherwise we don't care.
|
||||
// On MSVC, we must not use linkLibCpp because Zig passes
|
||||
// -nostdinc++ and adds its bundled libc++/libc++abi headers
|
||||
// which conflict with MSVC's C++ runtime. The MSVC SDK dirs
|
||||
// added via link_libc contain both C and C++ headers.
|
||||
// SIMD requires libc. Vendored C++ dependencies are built with
|
||||
// no-libcxx mode (HWY_NO_LIBCXX / SIMDUTF_NO_LIBCXX) so we
|
||||
// don't need libcpp. System-provided simdutf headers still
|
||||
// use C++ stdlib headers, so we need libcpp in that case.
|
||||
.link_libc = if (cfg.simd) true else null,
|
||||
.link_libcpp = if (cfg.simd and cfg.target.result.abi != .msvc) true else null,
|
||||
.link_libcpp = if (cfg.simd and
|
||||
b.systemIntegrationOption("simdutf", .{}) and
|
||||
cfg.target.result.abi != .msvc) true else null,
|
||||
});
|
||||
vt.addOptions("build_options", general_options);
|
||||
vt_options.add(b, vt);
|
||||
|
||||
@@ -762,6 +762,7 @@ pub fn addSimd(
|
||||
if (b.lazyDependency("simdutf", .{
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
.no_libcxx = true,
|
||||
})) |simdutf_dep| {
|
||||
m.linkLibrary(simdutf_dep.artifact("simdutf"));
|
||||
if (static_libs) |v| try v.append(
|
||||
@@ -787,18 +788,6 @@ pub fn addSimd(
|
||||
}
|
||||
}
|
||||
|
||||
// utfcpp - This is used as a dependency on our hand-written C++ code
|
||||
if (b.lazyDependency("utfcpp", .{
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
})) |utfcpp_dep| {
|
||||
m.linkLibrary(utfcpp_dep.artifact("utfcpp"));
|
||||
if (static_libs) |v| try v.append(
|
||||
b.allocator,
|
||||
utfcpp_dep.artifact("utfcpp").getEmittedBin(),
|
||||
);
|
||||
}
|
||||
|
||||
// SIMD C++ files
|
||||
m.addIncludePath(b.path("src"));
|
||||
{
|
||||
@@ -839,6 +828,14 @@ pub fn addSimd(
|
||||
"-DHWY_NO_LIBCXX",
|
||||
);
|
||||
|
||||
// When using the vendored simdutf, build its headers in no-libcxx
|
||||
// mode so we don't need C++ standard library headers at all.
|
||||
// System simdutf headers may not support this define.
|
||||
if (!b.systemIntegrationOption("simdutf", .{})) try flags.append(
|
||||
b.allocator,
|
||||
"-DSIMDUTF_NO_LIBCXX",
|
||||
);
|
||||
|
||||
// Disable ubsan for MSVC to avoid undefined references to
|
||||
// __ubsan_handle_* symbols that require a runtime we don't link
|
||||
// and bundle. Hopefully we can fix this one day since ubsan is nice!
|
||||
|
||||
@@ -4,9 +4,34 @@
|
||||
#include <hwy/foreach_target.h> // must come before highway.h
|
||||
#include <hwy/highway.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <iterator>
|
||||
#ifndef GHOSTTY_SIMD_CPW_HELPERS_
|
||||
#define GHOSTTY_SIMD_CPW_HELPERS_
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
|
||||
// Replacement for std::size() that works without libc++.
// Returns the element count N of a built-in array, deduced from the type of
// the array reference argument. The argument itself is never read, so the
// result is a constant expression (it is used in static_assert below).
|
||||
template <typename T, size_t N>
|
||||
constexpr size_t array_size(const T (&)[N]) { return N; }
|
||||
|
||||
// Constexpr min/max element over a C array (replaces std::min_element/
|
||||
// std::max_element).
//
// array_min returns the smallest NON-ZERO element of `a`. Unlike
// std::min_element, entries equal to 0 are skipped, so 0 is returned only
// when every element of the array is 0.
// NOTE(review): presumably zeros are skipped because the lookup tables are
// zero-padded (the scan loops over them stop at the first 0 entry) - confirm.
|
||||
template <typename T, size_t N>
|
||||
constexpr T array_min(const T (&a)[N]) {
|
||||
// Start from the first element; it may be 0, which the loop treats as
// "no minimum found yet".
T m = a[0];
|
||||
for (size_t i = 1; i < N; ++i)
|
||||
// Take a[i] if it is non-zero and either nothing non-zero has been seen
// so far (m == 0) or it is smaller than the current minimum.
if (a[i] != 0 && (m == 0 || a[i] < m)) m = a[i];
|
||||
return m;
|
||||
}
|
||||
// Returns the largest element of the C array `a`. This is a plain maximum
// with no special handling of zero entries; usable in constant expressions.
template <typename T, size_t N>
|
||||
constexpr T array_max(const T (&a)[N]) {
|
||||
// Seed with the first element, then keep the larger value on each step.
T m = a[0];
|
||||
for (size_t i = 1; i < N; ++i)
|
||||
if (a[i] > m) m = a[i];
|
||||
return m;
|
||||
}
|
||||
|
||||
#endif // GHOSTTY_SIMD_CPW_HELPERS_
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace ghostty {
|
||||
@@ -214,12 +239,12 @@ HWY_ALIGN constexpr uint16_t nsm_lte16[] = {
|
||||
};
|
||||
|
||||
// All our tables must be identically sized
|
||||
static_assert(std::size(eaw_gte32) == std::size(eaw_lte32));
|
||||
static_assert(std::size(eaw_gte16) == std::size(eaw_lte16));
|
||||
static_assert(std::size(zero_gte32) == std::size(zero_lte32));
|
||||
static_assert(std::size(zero_gte16) == std::size(zero_lte16));
|
||||
static_assert(std::size(nsm_gte32) == std::size(nsm_lte32));
|
||||
static_assert(std::size(nsm_gte16) == std::size(nsm_lte16));
|
||||
static_assert(array_size(eaw_gte32) == array_size(eaw_lte32));
|
||||
static_assert(array_size(eaw_gte16) == array_size(eaw_lte16));
|
||||
static_assert(array_size(zero_gte32) == array_size(zero_lte32));
|
||||
static_assert(array_size(zero_gte16) == array_size(zero_lte16));
|
||||
static_assert(array_size(nsm_gte32) == array_size(nsm_lte32));
|
||||
static_assert(array_size(nsm_gte16) == array_size(nsm_lte16));
|
||||
|
||||
/// Handles 16-bit codepoints.
|
||||
template <class D, typename T = uint16_t>
|
||||
@@ -245,10 +270,10 @@ int8_t CodepointWidth16(D d, uint16_t input) {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
static_assert(std::size(gte_keys) == std::size(lte_keys));
|
||||
static_assert(std::size(gte_keys) >= 32);
|
||||
static_assert(array_size(gte_keys) == array_size(lte_keys));
|
||||
static_assert(array_size(gte_keys) >= 32);
|
||||
size_t i = 0;
|
||||
for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) {
|
||||
for (; i + N <= array_size(lte_keys) && lte_keys[i] != 0; i += N) {
|
||||
const hn::Vec<D> lte_vec = hn::Load(d, lte_keys + i);
|
||||
const hn::Vec<D> gte_vec = hn::Load(d, gte_keys + i);
|
||||
const intptr_t idx = hn::FindFirstTrue(
|
||||
@@ -267,12 +292,12 @@ int8_t CodepointWidth16(D d, uint16_t input) {
|
||||
|
||||
{
|
||||
constexpr T zero_gte_min =
|
||||
*std::min_element(zero_gte16, zero_gte16 + std::size(zero_gte16));
|
||||
array_min(zero_gte16);
|
||||
constexpr T zero_lte_max =
|
||||
*std::max_element(zero_lte16, zero_lte16 + std::size(zero_lte16));
|
||||
array_max(zero_lte16);
|
||||
if (input >= zero_gte_min && input <= zero_lte_max) {
|
||||
size_t i = 0;
|
||||
for (; i + N <= std::size(zero_gte16) && zero_gte16[i] != 0; i += N) {
|
||||
for (; i + N <= array_size(zero_gte16) && zero_gte16[i] != 0; i += N) {
|
||||
const hn::Vec<D> lte_vec = hn::Load(d, zero_lte16 + i);
|
||||
const hn::Vec<D> gte_vec = hn::Load(d, zero_gte16 + i);
|
||||
const intptr_t idx = hn::FindFirstTrue(
|
||||
@@ -286,12 +311,12 @@ int8_t CodepointWidth16(D d, uint16_t input) {
|
||||
|
||||
{
|
||||
constexpr T eaw_gte_min =
|
||||
*std::min_element(eaw_gte16, eaw_gte16 + std::size(eaw_gte16));
|
||||
array_min(eaw_gte16);
|
||||
constexpr T eaw_lte_max =
|
||||
*std::max_element(eaw_lte16, eaw_lte16 + std::size(eaw_lte16));
|
||||
array_max(eaw_lte16);
|
||||
if (input >= eaw_gte_min && input <= eaw_lte_max) {
|
||||
size_t i = 0;
|
||||
for (; i + N <= std::size(eaw_lte16) && eaw_lte16[i] != 0; i += N) {
|
||||
for (; i + N <= array_size(eaw_lte16) && eaw_lte16[i] != 0; i += N) {
|
||||
const hn::Vec<D> lte_vec = hn::Load(d, eaw_lte16 + i);
|
||||
const hn::Vec<D> gte_vec = hn::Load(d, eaw_gte16 + i);
|
||||
const intptr_t idx = hn::FindFirstTrue(
|
||||
@@ -305,12 +330,12 @@ int8_t CodepointWidth16(D d, uint16_t input) {
|
||||
|
||||
{
|
||||
constexpr T nsm_gte_min =
|
||||
*std::min_element(nsm_gte16, nsm_gte16 + std::size(nsm_gte16));
|
||||
array_min(nsm_gte16);
|
||||
constexpr T nsm_lte_max =
|
||||
*std::max_element(nsm_lte16, nsm_lte16 + std::size(nsm_lte16));
|
||||
array_max(nsm_lte16);
|
||||
if (input >= nsm_gte_min && input <= nsm_lte_max) {
|
||||
size_t i = 0;
|
||||
for (; i + N <= std::size(nsm_lte16) && nsm_lte16[i] != 0; i += N) {
|
||||
for (; i + N <= array_size(nsm_lte16) && nsm_lte16[i] != 0; i += N) {
|
||||
const hn::Vec<D> lte_vec = hn::Load(d, nsm_lte16 + i);
|
||||
const hn::Vec<D> gte_vec = hn::Load(d, nsm_gte16 + i);
|
||||
const intptr_t idx = hn::FindFirstTrue(
|
||||
@@ -342,10 +367,10 @@ int8_t CodepointWidth32(D d, T input) {
|
||||
HWY_ALIGN constexpr T lte_keys[] = {
|
||||
0x1f1ff, 0x2FFFD, 0x3FFFD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
static_assert(std::size(gte_keys) == std::size(lte_keys));
|
||||
static_assert(std::size(gte_keys) >= 16);
|
||||
static_assert(array_size(gte_keys) == array_size(lte_keys));
|
||||
static_assert(array_size(gte_keys) >= 16);
|
||||
size_t i = 0;
|
||||
for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) {
|
||||
for (; i + N <= array_size(lte_keys) && lte_keys[i] != 0; i += N) {
|
||||
const hn::Vec<D> lte_vec = hn::Load(d, lte_keys + i);
|
||||
const hn::Vec<D> gte_vec = hn::Load(d, gte_keys + i);
|
||||
const intptr_t idx = hn::FindFirstTrue(
|
||||
@@ -364,10 +389,10 @@ int8_t CodepointWidth32(D d, T input) {
|
||||
HWY_ALIGN constexpr T lte_keys[] = {
|
||||
0xE0FFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
static_assert(std::size(gte_keys) == std::size(lte_keys));
|
||||
static_assert(std::size(gte_keys) >= 16);
|
||||
static_assert(array_size(gte_keys) == array_size(lte_keys));
|
||||
static_assert(array_size(gte_keys) >= 16);
|
||||
size_t i = 0;
|
||||
for (; i + N <= std::size(lte_keys) && lte_keys[i] != 0; i += N) {
|
||||
for (; i + N <= array_size(lte_keys) && lte_keys[i] != 0; i += N) {
|
||||
const hn::Vec<D> lte_vec = hn::Load(d, lte_keys + i);
|
||||
const hn::Vec<D> gte_vec = hn::Load(d, gte_keys + i);
|
||||
const intptr_t idx = hn::FindFirstTrue(
|
||||
@@ -380,12 +405,12 @@ int8_t CodepointWidth32(D d, T input) {
|
||||
|
||||
{
|
||||
constexpr T zero_gte_min =
|
||||
*std::min_element(zero_gte32, zero_gte32 + std::size(zero_gte32));
|
||||
array_min(zero_gte32);
|
||||
constexpr T zero_lte_max =
|
||||
*std::max_element(zero_lte32, zero_lte32 + std::size(zero_lte32));
|
||||
array_max(zero_lte32);
|
||||
if (input >= zero_gte_min && input <= zero_lte_max) {
|
||||
size_t i = 0;
|
||||
for (; i + N <= std::size(zero_gte32) && zero_gte32[i] != 0; i += N) {
|
||||
for (; i + N <= array_size(zero_gte32) && zero_gte32[i] != 0; i += N) {
|
||||
const hn::Vec<D> lte_vec = hn::Load(d, zero_lte32 + i);
|
||||
const hn::Vec<D> gte_vec = hn::Load(d, zero_gte32 + i);
|
||||
const intptr_t idx = hn::FindFirstTrue(
|
||||
@@ -399,12 +424,12 @@ int8_t CodepointWidth32(D d, T input) {
|
||||
|
||||
{
|
||||
constexpr T eaw_gte_min =
|
||||
*std::min_element(eaw_gte32, eaw_gte32 + std::size(eaw_gte32));
|
||||
array_min(eaw_gte32);
|
||||
constexpr T eaw_lte_max =
|
||||
*std::max_element(eaw_lte32, eaw_lte32 + std::size(eaw_lte32));
|
||||
array_max(eaw_lte32);
|
||||
if (input >= eaw_gte_min && input <= eaw_lte_max) {
|
||||
size_t i = 0;
|
||||
for (; i + N <= std::size(eaw_lte32) && eaw_lte32[i] != 0; i += N) {
|
||||
for (; i + N <= array_size(eaw_lte32) && eaw_lte32[i] != 0; i += N) {
|
||||
const hn::Vec<D> lte_vec = hn::Load(d, eaw_lte32 + i);
|
||||
const hn::Vec<D> gte_vec = hn::Load(d, eaw_gte32 + i);
|
||||
const intptr_t idx = hn::FindFirstTrue(
|
||||
@@ -418,12 +443,12 @@ int8_t CodepointWidth32(D d, T input) {
|
||||
|
||||
{
|
||||
constexpr T nsm_gte_min =
|
||||
*std::min_element(nsm_gte32, nsm_gte32 + std::size(nsm_gte32));
|
||||
array_min(nsm_gte32);
|
||||
constexpr T nsm_lte_max =
|
||||
*std::max_element(nsm_lte32, nsm_lte32 + std::size(nsm_lte32));
|
||||
array_max(nsm_lte32);
|
||||
if (input >= nsm_gte_min && input <= nsm_lte_max) {
|
||||
size_t i = 0;
|
||||
for (; i + N <= std::size(nsm_lte32) && nsm_lte32[i] != 0; i += N) {
|
||||
for (; i + N <= array_size(nsm_lte32) && nsm_lte32[i] != 0; i += N) {
|
||||
const hn::Vec<D> lte_vec = hn::Load(d, nsm_lte32 + i);
|
||||
const hn::Vec<D> gte_vec = hn::Load(d, nsm_gte32 + i);
|
||||
const intptr_t idx = hn::FindFirstTrue(
|
||||
|
||||
@@ -6,8 +6,6 @@
|
||||
|
||||
#include <simd/index_of.h>
|
||||
|
||||
#include <optional>
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace ghostty {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
@@ -7,8 +7,7 @@
|
||||
|
||||
#include <hwy/highway.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <optional>
|
||||
#include <stddef.h>
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace ghostty {
|
||||
@@ -16,12 +15,16 @@ namespace HWY_NAMESPACE {
|
||||
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
|
||||
// Sentinel value returned by IndexOfChunk when no match is found.
|
||||
static constexpr size_t kNotFound = static_cast<size_t>(-1);
|
||||
|
||||
// Return the index of the first occurrence of `needle` in `input`, where
|
||||
// the input and needle are already loaded into vectors.
|
||||
// the input and needle are already loaded into vectors. Returns kNotFound
|
||||
// if no match is found.
|
||||
template <class D, typename T = hn::TFromD<D>>
|
||||
std::optional<size_t> IndexOfChunk(D d,
|
||||
hn::Vec<D> needle_vec,
|
||||
hn::Vec<D> input_vec) {
|
||||
size_t IndexOfChunk(D d,
|
||||
hn::Vec<D> needle_vec,
|
||||
hn::Vec<D> input_vec) {
|
||||
// Compare the input vector with the needle vector. This produces
|
||||
// a vector where each lane is 0xFF if the corresponding lane in
|
||||
// `input_vec` is equal to the corresponding lane in `needle_vec`.
|
||||
@@ -32,9 +35,9 @@ std::optional<size_t> IndexOfChunk(D d,
|
||||
|
||||
// If we found a match, return the index into the input.
|
||||
if (pos >= 0) {
|
||||
return std::optional<size_t>(static_cast<size_t>(pos));
|
||||
return static_cast<size_t>(pos);
|
||||
} else {
|
||||
return std::nullopt;
|
||||
return kNotFound;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,8 +61,9 @@ size_t IndexOfImpl(D d, T needle, const T* HWY_RESTRICT input, size_t count) {
|
||||
for (; i + N <= count; i += N) {
|
||||
// Load the N elements from our input into a vector and check the chunk.
|
||||
const hn::Vec<D> input_vec = hn::LoadU(d, input + i);
|
||||
if (auto pos = IndexOfChunk(d, needle_vec, input_vec)) {
|
||||
return i + pos.value();
|
||||
const size_t pos = IndexOfChunk(d, needle_vec, input_vec);
|
||||
if (pos != kNotFound) {
|
||||
return i + pos;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
215
src/simd/vt.cpp
215
src/simd/vt.cpp
@@ -5,8 +5,8 @@
|
||||
#include <hwy/highway.h>
|
||||
|
||||
#include <simdutf.h>
|
||||
#include <utf8.h>
|
||||
#include <vector>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <simd/index_of.h>
|
||||
#include <simd/vt.h>
|
||||
@@ -19,12 +19,144 @@ namespace hn = hwy::HWY_NAMESPACE;
|
||||
|
||||
using T = uint8_t;
|
||||
|
||||
// Compute the length of the maximal subpart of an ill-formed UTF-8
// subsequence starting at p[0], per Unicode Table 3-7 and the W3C
// "U+FFFD Substitution of Maximal Subparts" algorithm.
//
// The maximal subpart is the longest initial subsequence that is either:
//   (a) the start of a well-formed sequence, or
//   (b) a single byte.
// Each maximal subpart maps to exactly one U+FFFD.
//
// Parameters:
//   p   - pointer to the first byte to examine; only p[0..len) is read.
//   len - number of readable bytes at p.
//
// Returns 0 when len is 0; otherwise a value in [1, 4] that never exceeds
// len. If every byte of the sequence announced by the lead byte is present
// and in range, the full sequence length is returned (callers are expected
// to invoke this only at an error position, so that case should not arise).
static size_t MaximalSubpart(const unsigned char* p, size_t len) {
  if (len == 0) return 0;

  const unsigned char b0 = p[0];

  // Anything that cannot start a multi-byte sequence is its own maximal
  // subpart of length 1: ASCII (00-7F), continuation bytes (80-BF),
  // overlong leads (C0-C1), and invalid leads (F5-FF).
  if (b0 < 0xC2 || b0 > 0xF4) return 1;

  // Per Table 3-7 only the SECOND byte has a lead-dependent range; every
  // later continuation byte must be in 80-BF.
  size_t seq_len = 0;
  unsigned char second_lo = 0x80;
  unsigned char second_hi = 0xBF;

  if (b0 <= 0xDF) {
    seq_len = 2;
  } else if (b0 <= 0xEF) {
    seq_len = 3;
    if (b0 == 0xE0) {
      second_lo = 0xA0;  // excludes overlong 3-byte encodings
    } else if (b0 == 0xED) {
      second_hi = 0x9F;  // excludes UTF-16 surrogates (D800-DFFF)
    }
  } else {
    seq_len = 4;
    if (b0 == 0xF0) {
      second_lo = 0x90;  // excludes overlong 4-byte encodings
    } else if (b0 == 0xF4) {
      second_hi = 0x8F;  // excludes code points above U+10FFFF
    }
  }

  // Extend the subpart for as long as the available bytes match their
  // allowed range. `valid` counts accepted bytes, lead byte included.
  size_t valid = 1;
  while (valid < seq_len && valid < len) {
    const unsigned char cb = p[valid];
    const unsigned char lo = (valid == 1) ? second_lo : 0x80;
    const unsigned char hi = (valid == 1) ? second_hi : 0xBF;
    if (cb < lo || cb > hi) break;
    ++valid;
  }

  return valid;
}
|
||||
|
||||
// Trim trailing bytes that form a valid-but-incomplete UTF-8 sequence.
|
||||
// Only trims sequences whose bytes so far match Table 3-7 ranges (i.e.,
|
||||
// truly partial sequences that could be completed by future input).
|
||||
// Invalid lead bytes (C0, C1, F5-FF) or mismatched continuations are NOT
|
||||
// trimmed — they will be handled as errors by DecodeUTF8.
|
||||
static size_t TrimValidPartialUTF8(const uint8_t* input, size_t len) {
|
||||
if (len == 0) return 0;
|
||||
|
||||
// Find the start of a potential trailing partial sequence by scanning
|
||||
// backwards from the end. We look for a lead byte (C2-F4) that could
|
||||
// start a multi-byte sequence, possibly followed by continuation bytes.
|
||||
//
|
||||
// We check up to the last 4 bytes (max UTF-8 sequence length).
|
||||
size_t check_start = len > 4 ? len - 4 : 0;
|
||||
for (size_t pos = len; pos > check_start; pos--) {
|
||||
unsigned char b = input[pos - 1];
|
||||
|
||||
// Skip continuation bytes — they might belong to the partial sequence.
|
||||
if ((b & 0xC0) == 0x80) continue;
|
||||
|
||||
// Found a non-continuation byte. Only valid multi-byte leads (C2-F4)
|
||||
// can start a partial sequence worth trimming. Anything else (ASCII,
|
||||
// C0, C1, F5-FF) should be consumed by DecodeUTF8.
|
||||
if (b < 0xC2 || b > 0xF4) return len;
|
||||
|
||||
// Determine expected sequence length from the lead byte.
|
||||
size_t expected;
|
||||
if (b <= 0xDF)
|
||||
expected = 2;
|
||||
else if (b <= 0xEF)
|
||||
expected = 3;
|
||||
else
|
||||
expected = 4;
|
||||
|
||||
size_t seq_remaining = len - (pos - 1);
|
||||
|
||||
// If we have all expected bytes, the sequence is complete (not partial).
|
||||
if (seq_remaining >= expected) return len;
|
||||
|
||||
// Check if the trailing bytes form a valid prefix using MaximalSubpart.
|
||||
const unsigned char* seq_start = input + pos - 1;
|
||||
size_t subpart = MaximalSubpart(seq_start, seq_remaining);
|
||||
|
||||
// Only trim if ALL trailing bytes are part of the valid prefix
|
||||
// (the sequence is valid-so-far but incomplete).
|
||||
if (subpart == seq_remaining) {
|
||||
return pos - 1;
|
||||
}
|
||||
|
||||
// The sequence is ill-formed, don't trim — let DecodeUTF8 handle it.
|
||||
return len;
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
// Decode the UTF-8 text in input into output. Returns the number of decoded
|
||||
// characters. This function assumes output is large enough.
|
||||
//
|
||||
// This function handles malformed UTF-8 sequences by inserting a
|
||||
// replacement character (U+FFFD) and continuing to decode. This function
|
||||
// will consume the entire input no matter what.
|
||||
// replacement character (U+FFFD) following the W3C/Unicode "U+FFFD
|
||||
// Substitution of Maximal Subparts" algorithm and continuing to decode.
|
||||
// This function will consume the entire input no matter what.
|
||||
size_t DecodeUTF8(const uint8_t* HWY_RESTRICT input,
|
||||
size_t count,
|
||||
char32_t* output) {
|
||||
@@ -34,27 +166,38 @@ size_t DecodeUTF8(const uint8_t* HWY_RESTRICT input,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Assume no errors for fast path.
|
||||
const size_t decoded = simdutf::convert_utf8_to_utf32(
|
||||
reinterpret_cast<const char*>(input), count, output);
|
||||
if (decoded > 0) {
|
||||
return decoded;
|
||||
// Decode UTF-8 to UTF-32, replacing invalid sequences with U+FFFD.
|
||||
const char* in = reinterpret_cast<const char*>(input);
|
||||
size_t remaining = count;
|
||||
char32_t* out = output;
|
||||
while (remaining > 0) {
|
||||
auto r = simdutf::convert_utf8_to_utf32_with_errors(in, remaining, out);
|
||||
|
||||
// If the decode was a full success then we're done!
|
||||
if (r.error == simdutf::SUCCESS) {
|
||||
out += r.count;
|
||||
break;
|
||||
}
|
||||
|
||||
// On error, r.count is the input byte position of the error.
|
||||
// The output buffer is already written up to that point, but
|
||||
// we need count_utf8 to find how many char32_t that produced.
|
||||
out += simdutf::count_utf8(in, r.count);
|
||||
|
||||
// Compute the maximal subpart at the error position and emit
|
||||
// a single U+FFFD for it.
|
||||
const unsigned char* err_pos =
|
||||
reinterpret_cast<const unsigned char*>(in + r.count);
|
||||
size_t err_remaining = remaining - r.count;
|
||||
size_t skip = r.count + MaximalSubpart(err_pos, err_remaining);
|
||||
|
||||
*out++ = 0xFFFD;
|
||||
|
||||
in += skip;
|
||||
remaining -= skip;
|
||||
}
|
||||
|
||||
// Errors in the UTF input, take a slow path and do a decode with
|
||||
// replacement (with U+FFFD). Note that simdutf doesn't have a
|
||||
// decode with replacement API:
|
||||
// https://github.com/simdutf/simdutf/issues/147
|
||||
//
|
||||
// Because of this, we use a separate library with heap allocation
|
||||
// that is much, much slower (the allocation is slower, the algorithm
|
||||
// is slower, etc.) This is just so we have something that works.
|
||||
// I want to replace this.
|
||||
std::vector<char> replacement_result;
|
||||
utf8::replace_invalid(input, input + count,
|
||||
std::back_inserter(replacement_result), 0xFFFD);
|
||||
return DecodeUTF8(reinterpret_cast<const uint8_t*>(replacement_result.data()),
|
||||
replacement_result.size(), output);
|
||||
return static_cast<size_t>(out - output);
|
||||
}
|
||||
|
||||
/// Decode the UTF-8 text in input into output until an escape
|
||||
@@ -86,16 +229,16 @@ size_t DecodeUTF8UntilControlSeqImpl(D d,
|
||||
// If we don't have any escapes we keep going. We want to accumulate
|
||||
// the largest possible valid UTF-8 sequence before decoding.
|
||||
// TODO(mitchellh): benchmark this vs decoding every time
|
||||
const auto esc_idx = IndexOfChunk(d, esc_vec, input_vec);
|
||||
if (!esc_idx) {
|
||||
const size_t esc_idx = IndexOfChunk(d, esc_vec, input_vec);
|
||||
if (esc_idx == kNotFound) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// We have an ESC char, decode up to this point. We start by assuming
|
||||
// a valid UTF-8 sequence and slow-path into error handling if we find
|
||||
// an invalid sequence.
|
||||
*output_count = DecodeUTF8(input, i + esc_idx.value(), output);
|
||||
return i + esc_idx.value();
|
||||
*output_count = DecodeUTF8(input, i + esc_idx, output);
|
||||
return i + esc_idx;
|
||||
}
|
||||
|
||||
// If we have leftover input then we decode it one byte at a time (slow!)
|
||||
@@ -106,21 +249,27 @@ size_t DecodeUTF8UntilControlSeqImpl(D d,
|
||||
const hn::Vec<D1> esc1 = Set(d1, hn::GetLane(esc_vec));
|
||||
for (; i < count; ++i) {
|
||||
const hn::Vec<D1> input_vec = hn::LoadU(d1, input + i);
|
||||
const auto esc_idx = IndexOfChunk(d1, esc1, input_vec);
|
||||
if (!esc_idx) {
|
||||
const size_t esc_idx = IndexOfChunk(d1, esc1, input_vec);
|
||||
if (esc_idx == kNotFound) {
|
||||
continue;
|
||||
}
|
||||
|
||||
*output_count = DecodeUTF8(input, i + esc_idx.value(), output);
|
||||
return i + esc_idx.value();
|
||||
*output_count = DecodeUTF8(input, i + esc_idx, output);
|
||||
return i + esc_idx;
|
||||
}
|
||||
}
|
||||
|
||||
// If we reached this point, its possible for our input to have an
|
||||
// incomplete sequence because we're consuming the full input. We need
|
||||
// to trim any incomplete sequences from the end of the input.
|
||||
const size_t trimmed_len =
|
||||
simdutf::trim_partial_utf8(reinterpret_cast<const char*>(input), i);
|
||||
//
|
||||
// We use our own trim instead of simdutf::trim_partial_utf8 because
|
||||
// we only want to trim sequences that are valid-so-far (true partial
|
||||
// sequences that may be completed by future input). Invalid bytes
|
||||
// like C0, C1, F5-FF should NOT be trimmed — they should be passed
|
||||
// through to DecodeUTF8 which will replace them with U+FFFD per the
|
||||
// maximal subpart algorithm.
|
||||
const size_t trimmed_len = TrimValidPartialUTF8(input, i);
|
||||
*output_count = DecodeUTF8(input, trimmed_len, output);
|
||||
return trimmed_len;
|
||||
}
|
||||
|
||||
371
src/simd/vt.zig
371
src/simd/vt.zig
@@ -45,36 +45,79 @@ fn utf8DecodeUntilControlSeqScalar(
|
||||
const idx = indexOf(input, 0x1B) orelse input.len;
|
||||
const decode = input[0..idx];
|
||||
|
||||
// Go through and decode one item at a time.
|
||||
// Go through and decode one item at a time, following the W3C/Unicode
|
||||
// "U+FFFD Substitution of Maximal Subparts" algorithm for ill-formed
|
||||
// subsequences.
|
||||
var decode_offset: usize = 0;
|
||||
var decode_count: usize = 0;
|
||||
while (decode_offset < decode.len) {
|
||||
const decode_rem = decode[decode_offset..];
|
||||
const cp_len = std.unicode.utf8ByteSequenceLength(decode_rem[0]) catch {
|
||||
// Note, this is matching our SIMD behavior, but it is admittedly
|
||||
// a bit weird. See our "decode invalid leading byte" test too.
|
||||
// SIMD should be our source of truth then we copy behavior here.
|
||||
break;
|
||||
};
|
||||
const b0 = decode[decode_offset];
|
||||
|
||||
// If we don't have that number of bytes available. we finish. We
|
||||
// assume this is a partial input and we defer to the future.
|
||||
if (decode_rem.len < cp_len) break;
|
||||
|
||||
// We have the bytes available, so move forward
|
||||
const cp_bytes = decode_rem[0..cp_len];
|
||||
decode_offset += cp_len;
|
||||
if (std.unicode.utf8Decode(cp_bytes)) |cp| {
|
||||
output[decode_count] = @intCast(cp);
|
||||
// ASCII fast path
|
||||
if (b0 < 0x80) {
|
||||
output[decode_count] = b0;
|
||||
decode_count += 1;
|
||||
} else |_| {
|
||||
// If decoding failed, we replace the leading byte with the
|
||||
// replacement char and then continue decoding after that
|
||||
// byte. This matches the SIMD behavior and is tested by the
|
||||
// "invalid UTF-8" tests.
|
||||
decode_offset += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Continuation byte (80-BF) or invalid byte (C0-C1, F5-FF)
|
||||
// as lead: each is its own maximal subpart → one FFFD per byte.
|
||||
if (b0 < 0xC2 or b0 > 0xF4) {
|
||||
output[decode_count] = 0xFFFD;
|
||||
decode_count += 1;
|
||||
decode_offset -= cp_len - 1;
|
||||
decode_offset += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Multi-byte sequence. Determine expected length and the valid
|
||||
// range for each continuation byte per Unicode Table 3-7.
|
||||
const seq = utf8SeqInfo(b0);
|
||||
|
||||
// Check how many continuation bytes form a valid prefix (the
|
||||
// maximal subpart). We check each byte against its specific
|
||||
// valid range.
|
||||
var valid: usize = 1; // lead byte is valid
|
||||
for (0..seq.len - 1) |ci| {
|
||||
if (decode_offset + valid >= decode.len) {
|
||||
// Truncated at end of buffer: treat as incomplete
|
||||
// input that may be completed later. Stop decoding
|
||||
// without consuming these bytes.
|
||||
return .{
|
||||
.consumed = decode_offset,
|
||||
.decoded = decode_count,
|
||||
};
|
||||
}
|
||||
const cb = decode[decode_offset + valid];
|
||||
if (cb < seq.ranges[ci][0] or cb > seq.ranges[ci][1]) {
|
||||
// Byte doesn't match expected range. The maximal
|
||||
// subpart ends here.
|
||||
break;
|
||||
}
|
||||
valid += 1;
|
||||
}
|
||||
|
||||
if (valid == seq.len) {
|
||||
// Full sequence present and structurally valid. Decode it.
|
||||
// (Structural validity per Table 3-7 guarantees decode success.)
|
||||
const cp_bytes = decode[decode_offset..][0..seq.len];
|
||||
if (std.unicode.utf8Decode(cp_bytes)) |cp| {
|
||||
output[decode_count] = @intCast(cp);
|
||||
decode_count += 1;
|
||||
decode_offset += seq.len;
|
||||
} else |_| {
|
||||
// Should not happen given Table 3-7 validation, but
|
||||
// be safe: emit FFFD for the lead byte.
|
||||
output[decode_count] = 0xFFFD;
|
||||
decode_count += 1;
|
||||
decode_offset += 1;
|
||||
}
|
||||
} else {
|
||||
// Incomplete/ill-formed: the maximal subpart (valid bytes)
|
||||
// maps to a single FFFD.
|
||||
output[decode_count] = 0xFFFD;
|
||||
decode_count += 1;
|
||||
decode_offset += valid;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -84,6 +127,27 @@ fn utf8DecodeUntilControlSeqScalar(
|
||||
};
|
||||
}
|
||||
|
||||
/// Shape of a UTF-8 sequence as implied by its lead byte.
const Utf8SeqInfo = struct {
    /// Total sequence length in bytes, lead byte included.
    len: u3,
    /// Inclusive `{ low, high }` bounds for each continuation byte, in order.
    /// Entries beyond `len - 1` are zero-filled.
    ranges: [3][2]u8,
};

/// Returns the expected byte count and valid continuation byte ranges
/// for a UTF-8 sequence based on its lead byte, per Unicode Table 3-7.
///
/// `lead` must be in 0xC2...0xF4; any other value reaches `unreachable`.
fn utf8SeqInfo(lead: u8) Utf8SeqInfo {
    // Generic continuation range, and the filler for unused slots.
    const any: [2]u8 = .{ 0x80, 0xBF };
    const none: [2]u8 = .{ 0, 0 };

    // Only the first continuation byte has a lead-dependent range.
    const first: [2]u8 = switch (lead) {
        0xE0 => .{ 0xA0, 0xBF },
        0xED => .{ 0x80, 0x9F },
        0xF0 => .{ 0x90, 0xBF },
        0xF4 => .{ 0x80, 0x8F },
        else => any,
    };

    return switch (lead) {
        0xC2...0xDF => .{ .len = 2, .ranges = .{ first, none, none } },
        0xE0...0xEF => .{ .len = 3, .ranges = .{ first, any, none } },
        0xF0...0xF4 => .{ .len = 4, .ranges = .{ first, any, any } },
        else => unreachable,
    };
}
|
||||
|
||||
test "decode no escape" {
|
||||
const testing = std.testing;
|
||||
|
||||
@@ -131,7 +195,7 @@ test "decode incomplete UTF-8" {
|
||||
|
||||
var output: [64]u32 = undefined;
|
||||
|
||||
// 2-byte
|
||||
// 2-byte truncated at end of buffer
|
||||
{
|
||||
const str = "hello\xc2";
|
||||
try testing.expectEqual(DecodeResult{
|
||||
@@ -140,16 +204,18 @@ test "decode incomplete UTF-8" {
|
||||
}, utf8DecodeUntilControlSeq(str, &output));
|
||||
}
|
||||
|
||||
// 3-byte
|
||||
// 3-byte: \xe0 expects A0-BF next, but \x00 is not in range.
|
||||
// \xe0 is a maximal subpart of length 1 → FFFD, then \x00 is ASCII NUL.
|
||||
{
|
||||
const str = "hello\xe0\x00";
|
||||
try testing.expectEqual(DecodeResult{
|
||||
.consumed = 5,
|
||||
.decoded = 5,
|
||||
}, utf8DecodeUntilControlSeq(str, &output));
|
||||
const result = utf8DecodeUntilControlSeq(str, &output);
|
||||
try testing.expectEqual(@as(usize, 7), result.consumed);
|
||||
try testing.expectEqual(@as(usize, 7), result.decoded);
|
||||
try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
|
||||
try testing.expectEqual(@as(u32, 0x00), output[6]);
|
||||
}
|
||||
|
||||
// 4-byte
|
||||
// 4-byte truncated at end of buffer (F0 90 is valid so far)
|
||||
{
|
||||
const str = "hello\xf0\x90";
|
||||
try testing.expectEqual(DecodeResult{
|
||||
@@ -178,19 +244,248 @@ test "decode invalid UTF-8" {
|
||||
try testing.expectEqual(@as(u32, 0x01), output[6]);
|
||||
}
|
||||
|
||||
// This is testing our current behavior so that we know we have to handle
|
||||
// this case in terminal/stream.zig. If we change this behavior, we can
|
||||
// remove the special handling in terminal/stream.zig.
|
||||
test "decode invalid leading byte isn't consumed or replaced" {
|
||||
// Per the maximal subpart spec, bytes F5-FF are each replaced with FFFD.
|
||||
test "decode invalid leading byte is replaced" {
|
||||
const testing = std.testing;
|
||||
|
||||
var output: [64]u32 = undefined;
|
||||
|
||||
{
|
||||
const str = "hello\xFF";
|
||||
try testing.expectEqual(DecodeResult{
|
||||
.consumed = 5,
|
||||
.decoded = 5,
|
||||
}, utf8DecodeUntilControlSeq(str, &output));
|
||||
const result = utf8DecodeUntilControlSeq(str, &output);
|
||||
try testing.expectEqual(@as(usize, 6), result.consumed);
|
||||
try testing.expectEqual(@as(usize, 6), result.decoded);
|
||||
try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
|
||||
}
|
||||
}
|
||||
|
||||
test "decode invalid continuation in 3-byte sequence" {
    const testing = std.testing;

    var buf: [64]u32 = undefined;

    // E2 announces a 3-byte sequence, but 0x28 ('(') is not a continuation
    // byte, so E2 alone is replaced and '(' is decoded normally.
    const r = utf8DecodeUntilControlSeq("hello\xe2\x28world", &buf);

    // 5 ("hello") + 1 (U+FFFD) + 1 ("(") + 5 ("world") = 12 codepoints
    try testing.expectEqual(@as(usize, 12), r.decoded);
    try testing.expectEqualSlices(u32, &[_]u32{ 0xFFFD, '(', 'w' }, buf[5..8]);
}
|
||||
|
||||
test "decode invalid continuation in 4-byte sequence" {
    const testing = std.testing;

    var buf: [64]u32 = undefined;

    // F0 90 is a valid prefix of a 4-byte sequence that 0x28 then breaks.
    // The maximal subpart F0 90 (length 2) becomes a single U+FFFD and '('
    // is decoded normally.
    const r = utf8DecodeUntilControlSeq("hello\xf0\x90\x28world", &buf);

    // 5 ("hello") + 1 (U+FFFD) + 1 ("(") + 5 ("world") = 12 codepoints
    try testing.expectEqual(@as(usize, 12), r.decoded);
    try testing.expectEqualSlices(u32, &[_]u32{ 0xFFFD, '(', 'w' }, buf[5..8]);
}
|
||||
|
||||
test "decode multiple consecutive invalid bytes" {
    const testing = std.testing;

    var buf: [64]u32 = undefined;

    // Each ill-formed byte below is its own maximal subpart, so each one
    // yields exactly one U+FFFD.
    const inputs = [_][]const u8{
        "a\x80\x80b", // two lone continuation bytes
        "a\xc0\xc0b", // two C0 bytes; C0 is an invalid lead byte (< C2)
    };

    // "a" + FFFD + FFFD + "b" = 4 codepoints
    const want = [_]u32{ 'a', 0xFFFD, 0xFFFD, 'b' };

    for (inputs) |str| {
        const r = utf8DecodeUntilControlSeq(str, &buf);
        try testing.expectEqual(@as(usize, 4), r.decoded);
        try testing.expectEqualSlices(u32, &want, buf[0..4]);
    }
}
|
||||
|
||||
test "decode unexpected continuation byte as lead" {
    const testing = std.testing;

    var buf: [64]u32 = undefined;

    // 0x80 is a continuation byte sitting where a lead byte is expected.
    const r = utf8DecodeUntilControlSeq("a\x80b", &buf);

    // "a" + replacement + "b" = 3 codepoints
    try testing.expectEqual(@as(usize, 3), r.decoded);
    try testing.expectEqualSlices(u32, &[_]u32{ 'a', 0xFFFD, 'b' }, buf[0..3]);
}
|
||||
|
||||
test "decode overlong 2-byte encoding" {
    const testing = std.testing;

    var buf: [64]u32 = undefined;

    // C0 is an invalid lead byte (< C2) and AF is then a lone continuation
    // byte, so each gets its own U+FFFD. Per Table 3-8: C0 AF -> FFFD FFFD.
    const r = utf8DecodeUntilControlSeq("a\xc0\xafb", &buf);

    // "a" + FFFD + FFFD + "b" = 4 codepoints
    try testing.expectEqual(@as(usize, 4), r.decoded);
    try testing.expectEqualSlices(
        u32,
        &[_]u32{ 'a', 0xFFFD, 0xFFFD, 'b' },
        buf[0..4],
    );
}
|
||||
|
||||
test "decode surrogate half" {
    const testing = std.testing;

    var buf: [64]u32 = undefined;

    // ED A0 80 would encode U+D800, a surrogate. Table 3-7 only allows
    // 80-9F after ED, so A0 is out of range and ED is a maximal subpart of
    // length 1. A0 and 80 are then lone continuation bytes.
    // Per Table 3-9: ED A0 80 -> FFFD FFFD FFFD.
    const r = utf8DecodeUntilControlSeq("a\xed\xa0\x80b", &buf);

    // "a" + FFFD + FFFD + FFFD + "b" = 5 codepoints
    try testing.expectEqual(@as(usize, 5), r.decoded);
    try testing.expectEqualSlices(
        u32,
        &[_]u32{ 'a', 0xFFFD, 0xFFFD, 0xFFFD, 'b' },
        buf[0..5],
    );
}
|
||||
|
||||
test "decode valid multibyte surrounded by invalid" {
    const testing = std.testing;

    var buf: [64]u32 = undefined;

    // C3 A9 is U+00E9; it sits between two lone continuation bytes.
    const r = utf8DecodeUntilControlSeq("\x80\xc3\xa9\x80", &buf);

    // replacement + U+00E9 + replacement = 3 codepoints
    try testing.expectEqual(@as(usize, 3), r.decoded);
    try testing.expectEqualSlices(
        u32,
        &[_]u32{ 0xFFFD, 0x00E9, 0xFFFD },
        buf[0..3],
    );
}
|
||||
|
||||
test "decode invalid byte before escape" {
    const testing = std.testing;

    var buf: [64]u32 = undefined;

    // An invalid byte directly before ESC: it is replaced, then decoding
    // stops at the ESC.
    const r = utf8DecodeUntilControlSeq("hi\x80\x1b[0m", &buf);

    try testing.expectEqual(@as(usize, 3), r.consumed);
    try testing.expectEqual(@as(usize, 3), r.decoded);
    try testing.expectEqualSlices(u32, &[_]u32{ 'h', 'i', 0xFFFD }, buf[0..3]);
}
|
||||
|
||||
// Unicode Table 3-8: U+FFFD for Non-Shortest Form Sequences
//   Bytes:  C0   AF   E0   80   BF   F0   81   82   41
//   Output: FFFD FFFD FFFD FFFD FFFD FFFD FFFD FFFD 0041
test "Table 3-8: non-shortest form sequences" {
    const testing = std.testing;
    var buf: [64]u32 = undefined;

    const r = utf8DecodeUntilControlSeq(
        "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41",
        &buf,
    );
    try testing.expectEqual(@as(usize, 9), r.consumed);
    try testing.expectEqual(@as(usize, 9), r.decoded);

    // The first eight bytes are each replaced individually; only the final
    // 'A' decodes.
    for (buf[0..8]) |cp| {
        try testing.expectEqual(@as(u32, 0xFFFD), cp);
    }
    try testing.expectEqual(@as(u32, 0x41), buf[8]);
}
|
||||
|
||||
// Unicode Table 3-9: U+FFFD for Ill-Formed Sequences for Surrogates
//   Bytes:  ED   A0   80   ED   BF   BF   ED   AF   41
//   Output: FFFD FFFD FFFD FFFD FFFD FFFD FFFD FFFD 0041
test "Table 3-9: surrogate sequences" {
    const testing = std.testing;
    var buf: [64]u32 = undefined;

    const r = utf8DecodeUntilControlSeq(
        "\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41",
        &buf,
    );
    try testing.expectEqual(@as(usize, 9), r.consumed);
    try testing.expectEqual(@as(usize, 9), r.decoded);

    // The first eight bytes are each replaced individually; only the final
    // 'A' decodes.
    for (buf[0..8]) |cp| {
        try testing.expectEqual(@as(u32, 0xFFFD), cp);
    }
    try testing.expectEqual(@as(u32, 0x41), buf[8]);
}
|
||||
|
||||
// Unicode Table 3-10: U+FFFD for Other Ill-Formed Sequences
//   Bytes:  F4   91   92   93   FF   41   80   BF   42
//   Output: FFFD FFFD FFFD FFFD FFFD 0041 FFFD FFFD 0042
test "Table 3-10: other ill-formed sequences" {
    const testing = std.testing;
    var buf: [64]u32 = undefined;

    const r = utf8DecodeUntilControlSeq(
        "\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42",
        &buf,
    );
    try testing.expectEqual(@as(usize, 9), r.consumed);
    try testing.expectEqual(@as(usize, 9), r.decoded);

    // One expected codepoint per input byte, in input order.
    const want = [_]u32{
        0xFFFD, // F4
        0xFFFD, // 91
        0xFFFD, // 92
        0xFFFD, // 93
        0xFFFD, // FF
        0x0041, // 41
        0xFFFD, // 80
        0xFFFD, // BF
        0x0042, // 42
    };
    try testing.expectEqualSlices(u32, &want, buf[0..9]);
}
|
||||
|
||||
// Unicode Table 3-11: U+FFFD for Truncated Sequences
//   Bytes:  E1 80   E2   F0 91 92   F1 BF   41
//   Output: FFFD    FFFD FFFD       FFFD    0041
test "Table 3-11: truncated sequences" {
    const testing = std.testing;
    var buf: [64]u32 = undefined;

    const r = utf8DecodeUntilControlSeq(
        "\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41",
        &buf,
    );
    try testing.expectEqual(@as(usize, 9), r.consumed);
    try testing.expectEqual(@as(usize, 5), r.decoded);

    const want = [_]u32{
        0xFFFD, // E1 80 (truncated 3-byte)
        0xFFFD, // E2 (truncated 3-byte, next byte F0 not continuation)
        0xFFFD, // F0 91 92 (truncated 4-byte)
        0xFFFD, // F1 BF (truncated 4-byte, next byte 41 not continuation)
        0x0041, // 41
    };
    try testing.expectEqualSlices(u32, &want, buf[0..5]);
}
|
||||
|
||||
Reference in New Issue
Block a user