mirror of
https://github.com/ghostty-org/ghostty.git
synced 2026-05-24 13:50:11 +00:00
This updates simdutf to my fork which has a SIMDUTF_NO_LIBCXX option that removes all libc++ and libc++ ABI dependencies. From there, the hand-written simd code we have has been updated to also no longer use any libc++ features. Part of this required removing utfcpp since it depended on libc++ (`<iterator>`). libghostty-vt now only depends on libc.
492 lines
17 KiB
Zig
492 lines
17 KiB
Zig
const std = @import("std");
|
|
const options = @import("build_options");
|
|
const assert = @import("../quirks.zig").inlineAssert;
|
|
const indexOf = @import("index_of.zig").indexOf;
|
|
|
|
// vt.cpp
|
|
extern "c" fn ghostty_simd_decode_utf8_until_control_seq(
|
|
input: [*]const u8,
|
|
count: usize,
|
|
output: [*]u32,
|
|
output_count: *usize,
|
|
) usize;
|
|
|
|
const DecodeResult = struct {
|
|
consumed: usize,
|
|
decoded: usize,
|
|
};
|
|
|
|
pub fn utf8DecodeUntilControlSeq(
|
|
input: []const u8,
|
|
output: []u32,
|
|
) DecodeResult {
|
|
assert(output.len >= input.len);
|
|
|
|
if (comptime options.simd) {
|
|
var decoded: usize = 0;
|
|
const consumed = ghostty_simd_decode_utf8_until_control_seq(
|
|
input.ptr,
|
|
input.len,
|
|
output.ptr,
|
|
&decoded,
|
|
);
|
|
|
|
return .{ .consumed = consumed, .decoded = decoded };
|
|
}
|
|
|
|
return utf8DecodeUntilControlSeqScalar(input, output);
|
|
}
|
|
|
|
fn utf8DecodeUntilControlSeqScalar(
|
|
input: []const u8,
|
|
output: []u32,
|
|
) DecodeResult {
|
|
// Find our escape
|
|
const idx = indexOf(input, 0x1B) orelse input.len;
|
|
const decode = input[0..idx];
|
|
|
|
// Go through and decode one item at a time, following the W3C/Unicode
|
|
// "U+FFFD Substitution of Maximal Subparts" algorithm for ill-formed
|
|
// subsequences.
|
|
var decode_offset: usize = 0;
|
|
var decode_count: usize = 0;
|
|
while (decode_offset < decode.len) {
|
|
const b0 = decode[decode_offset];
|
|
|
|
// ASCII fast path
|
|
if (b0 < 0x80) {
|
|
output[decode_count] = b0;
|
|
decode_count += 1;
|
|
decode_offset += 1;
|
|
continue;
|
|
}
|
|
|
|
// Continuation byte (80-BF) or invalid byte (C0-C1, F5-FF)
|
|
// as lead: each is its own maximal subpart → one FFFD per byte.
|
|
if (b0 < 0xC2 or b0 > 0xF4) {
|
|
output[decode_count] = 0xFFFD;
|
|
decode_count += 1;
|
|
decode_offset += 1;
|
|
continue;
|
|
}
|
|
|
|
// Multi-byte sequence. Determine expected length and the valid
|
|
// range for each continuation byte per Unicode Table 3-7.
|
|
const seq = utf8SeqInfo(b0);
|
|
|
|
// Check how many continuation bytes form a valid prefix (the
|
|
// maximal subpart). We check each byte against its specific
|
|
// valid range.
|
|
var valid: usize = 1; // lead byte is valid
|
|
for (0..seq.len - 1) |ci| {
|
|
if (decode_offset + valid >= decode.len) {
|
|
// Truncated at end of buffer: treat as incomplete
|
|
// input that may be completed later. Stop decoding
|
|
// without consuming these bytes.
|
|
return .{
|
|
.consumed = decode_offset,
|
|
.decoded = decode_count,
|
|
};
|
|
}
|
|
const cb = decode[decode_offset + valid];
|
|
if (cb < seq.ranges[ci][0] or cb > seq.ranges[ci][1]) {
|
|
// Byte doesn't match expected range. The maximal
|
|
// subpart ends here.
|
|
break;
|
|
}
|
|
valid += 1;
|
|
}
|
|
|
|
if (valid == seq.len) {
|
|
// Full sequence present and structurally valid. Decode it.
|
|
// (Structural validity per Table 3-7 guarantees decode success.)
|
|
const cp_bytes = decode[decode_offset..][0..seq.len];
|
|
if (std.unicode.utf8Decode(cp_bytes)) |cp| {
|
|
output[decode_count] = @intCast(cp);
|
|
decode_count += 1;
|
|
decode_offset += seq.len;
|
|
} else |_| {
|
|
// Should not happen given Table 3-7 validation, but
|
|
// be safe: emit FFFD for the lead byte.
|
|
output[decode_count] = 0xFFFD;
|
|
decode_count += 1;
|
|
decode_offset += 1;
|
|
}
|
|
} else {
|
|
// Incomplete/ill-formed: the maximal subpart (valid bytes)
|
|
// maps to a single FFFD.
|
|
output[decode_count] = 0xFFFD;
|
|
decode_count += 1;
|
|
decode_offset += valid;
|
|
}
|
|
}
|
|
|
|
return .{
|
|
.consumed = decode_offset,
|
|
.decoded = decode_count,
|
|
};
|
|
}
|
|
|
|
const Utf8SeqInfo = struct {
|
|
len: u3,
|
|
ranges: [3][2]u8,
|
|
};
|
|
|
|
/// Returns the expected byte count and valid continuation byte ranges
|
|
/// for a UTF-8 sequence based on its lead byte, per Unicode Table 3-7.
|
|
fn utf8SeqInfo(lead: u8) Utf8SeqInfo {
|
|
return switch (lead) {
|
|
0xC2...0xDF => .{ .len = 2, .ranges = .{ .{ 0x80, 0xBF }, .{ 0, 0 }, .{ 0, 0 } } },
|
|
0xE0 => .{ .len = 3, .ranges = .{ .{ 0xA0, 0xBF }, .{ 0x80, 0xBF }, .{ 0, 0 } } },
|
|
0xE1...0xEC => .{ .len = 3, .ranges = .{ .{ 0x80, 0xBF }, .{ 0x80, 0xBF }, .{ 0, 0 } } },
|
|
0xED => .{ .len = 3, .ranges = .{ .{ 0x80, 0x9F }, .{ 0x80, 0xBF }, .{ 0, 0 } } },
|
|
0xEE...0xEF => .{ .len = 3, .ranges = .{ .{ 0x80, 0xBF }, .{ 0x80, 0xBF }, .{ 0, 0 } } },
|
|
0xF0 => .{ .len = 4, .ranges = .{ .{ 0x90, 0xBF }, .{ 0x80, 0xBF }, .{ 0x80, 0xBF } } },
|
|
0xF1...0xF3 => .{ .len = 4, .ranges = .{ .{ 0x80, 0xBF }, .{ 0x80, 0xBF }, .{ 0x80, 0xBF } } },
|
|
0xF4 => .{ .len = 4, .ranges = .{ .{ 0x80, 0x8F }, .{ 0x80, 0xBF }, .{ 0x80, 0xBF } } },
|
|
else => unreachable,
|
|
};
|
|
}
|
|
|
|
test "decode no escape" {
|
|
const testing = std.testing;
|
|
|
|
var output: [1024]u32 = undefined;
|
|
|
|
// TODO: many more test cases
|
|
{
|
|
const str = "hello" ** 128;
|
|
try testing.expectEqual(DecodeResult{
|
|
.consumed = str.len,
|
|
.decoded = str.len,
|
|
}, utf8DecodeUntilControlSeq(str, &output));
|
|
}
|
|
}
|
|
|
|
test "decode ASCII to escape" {
|
|
const testing = std.testing;
|
|
|
|
var output: [1024]u32 = undefined;
|
|
|
|
// TODO: many more test cases
|
|
{
|
|
const prefix = "hello" ** 64;
|
|
const str = prefix ++ "\x1b" ++ ("world" ** 64);
|
|
try testing.expectEqual(DecodeResult{
|
|
.consumed = prefix.len,
|
|
.decoded = prefix.len,
|
|
}, utf8DecodeUntilControlSeq(str, &output));
|
|
}
|
|
}
|
|
|
|
test "decode immediate esc sequence" {
|
|
const testing = std.testing;
|
|
|
|
var output: [64]u32 = undefined;
|
|
const str = "\x1b[?5s";
|
|
try testing.expectEqual(DecodeResult{
|
|
.consumed = 0,
|
|
.decoded = 0,
|
|
}, utf8DecodeUntilControlSeq(str, &output));
|
|
}
|
|
|
|
test "decode incomplete UTF-8" {
|
|
const testing = std.testing;
|
|
|
|
var output: [64]u32 = undefined;
|
|
|
|
// 2-byte truncated at end of buffer
|
|
{
|
|
const str = "hello\xc2";
|
|
try testing.expectEqual(DecodeResult{
|
|
.consumed = 5,
|
|
.decoded = 5,
|
|
}, utf8DecodeUntilControlSeq(str, &output));
|
|
}
|
|
|
|
// 3-byte: \xe0 expects A0-BF next, but \x00 is not in range.
|
|
// \xe0 is a maximal subpart of length 1 → FFFD, then \x00 is ASCII NUL.
|
|
{
|
|
const str = "hello\xe0\x00";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
try testing.expectEqual(@as(usize, 7), result.consumed);
|
|
try testing.expectEqual(@as(usize, 7), result.decoded);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
|
|
try testing.expectEqual(@as(u32, 0x00), output[6]);
|
|
}
|
|
|
|
// 4-byte truncated at end of buffer (F0 90 is valid so far)
|
|
{
|
|
const str = "hello\xf0\x90";
|
|
try testing.expectEqual(DecodeResult{
|
|
.consumed = 5,
|
|
.decoded = 5,
|
|
}, utf8DecodeUntilControlSeq(str, &output));
|
|
}
|
|
}
|
|
|
|
test "decode invalid UTF-8" {
|
|
const testing = std.testing;
|
|
|
|
var output: [64]u32 = undefined;
|
|
|
|
// Invalid leading 2-byte sequence
|
|
{
|
|
const str = "hello\xc2\x01";
|
|
try testing.expectEqual(DecodeResult{
|
|
.consumed = 7,
|
|
.decoded = 7,
|
|
}, utf8DecodeUntilControlSeq(str, &output));
|
|
}
|
|
|
|
// Replacement will only replace the invalid leading byte.
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
|
|
try testing.expectEqual(@as(u32, 0x01), output[6]);
|
|
}
|
|
|
|
// Per the maximal subpart spec, bytes F5-FF are each replaced with FFFD.
|
|
test "decode invalid leading byte is replaced" {
|
|
const testing = std.testing;
|
|
|
|
var output: [64]u32 = undefined;
|
|
|
|
{
|
|
const str = "hello\xFF";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
try testing.expectEqual(@as(usize, 6), result.consumed);
|
|
try testing.expectEqual(@as(usize, 6), result.decoded);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
|
|
}
|
|
}
|
|
|
|
test "decode invalid continuation in 3-byte sequence" {
|
|
const testing = std.testing;
|
|
|
|
var output: [64]u32 = undefined;
|
|
|
|
// \xe2 expects two continuation bytes, \x28 is not one
|
|
{
|
|
const str = "hello\xe2\x28world";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
// "hello" + replacement + "(" + "world" = 12 codepoints
|
|
try testing.expectEqual(@as(usize, 12), result.decoded);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
|
|
try testing.expectEqual(@as(u32, '('), output[6]);
|
|
try testing.expectEqual(@as(u32, 'w'), output[7]);
|
|
}
|
|
}
|
|
|
|
test "decode invalid continuation in 4-byte sequence" {
|
|
const testing = std.testing;
|
|
|
|
var output: [64]u32 = undefined;
|
|
|
|
// \xf0\x90 is a valid prefix of a 4-byte sequence, but \x28 breaks it.
|
|
// Maximal subpart is F0 90 (length 2) → single FFFD, then '(' proceeds.
|
|
{
|
|
const str = "hello\xf0\x90\x28world";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
// "hello" + FFFD + "(" + "world" = 12 codepoints
|
|
try testing.expectEqual(@as(usize, 12), result.decoded);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
|
|
try testing.expectEqual(@as(u32, '('), output[6]);
|
|
try testing.expectEqual(@as(u32, 'w'), output[7]);
|
|
}
|
|
}
|
|
|
|
test "decode multiple consecutive invalid bytes" {
|
|
const testing = std.testing;
|
|
|
|
var output: [64]u32 = undefined;
|
|
|
|
// Each lone continuation byte is its own maximal subpart → one FFFD each.
|
|
{
|
|
const str = "a\x80\x80b";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
// "a" + FFFD + FFFD + "b" = 4 codepoints
|
|
try testing.expectEqual(@as(usize, 4), result.decoded);
|
|
try testing.expectEqual(@as(u32, 'a'), output[0]);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[1]);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[2]);
|
|
try testing.expectEqual(@as(u32, 'b'), output[3]);
|
|
}
|
|
|
|
// C0 is an invalid lead byte (< C2), each byte gets its own FFFD.
|
|
{
|
|
const str = "a\xc0\xc0b";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
// "a" + FFFD + FFFD + "b" = 4 codepoints
|
|
try testing.expectEqual(@as(usize, 4), result.decoded);
|
|
try testing.expectEqual(@as(u32, 'a'), output[0]);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[1]);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[2]);
|
|
try testing.expectEqual(@as(u32, 'b'), output[3]);
|
|
}
|
|
}
|
|
|
|
test "decode unexpected continuation byte as lead" {
|
|
const testing = std.testing;
|
|
|
|
var output: [64]u32 = undefined;
|
|
|
|
// 0x80 is a continuation byte appearing as a lead byte
|
|
{
|
|
const str = "a\x80b";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
// "a" + replacement + "b" = 3 codepoints
|
|
try testing.expectEqual(@as(usize, 3), result.decoded);
|
|
try testing.expectEqual(@as(u32, 'a'), output[0]);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[1]);
|
|
try testing.expectEqual(@as(u32, 'b'), output[2]);
|
|
}
|
|
}
|
|
|
|
test "decode overlong 2-byte encoding" {
|
|
const testing = std.testing;
|
|
|
|
var output: [64]u32 = undefined;
|
|
|
|
// \xc0\xaf: C0 is invalid lead (< C2) → FFFD, AF is lone continuation → FFFD
|
|
// Per Table 3-8: C0 AF → FFFD FFFD
|
|
{
|
|
const str = "a\xc0\xafb";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
// "a" + FFFD + FFFD + "b" = 4 codepoints
|
|
try testing.expectEqual(@as(usize, 4), result.decoded);
|
|
try testing.expectEqual(@as(u32, 'a'), output[0]);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[1]);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[2]);
|
|
try testing.expectEqual(@as(u32, 'b'), output[3]);
|
|
}
|
|
}
|
|
|
|
test "decode surrogate half" {
|
|
const testing = std.testing;
|
|
|
|
var output: [64]u32 = undefined;
|
|
|
|
// \xed\xa0\x80 encodes U+D800 (a surrogate). Per Table 3-7, after ED
|
|
// the next byte must be 80-9F. A0 is out of range, so ED is a maximal
|
|
// subpart of length 1 → FFFD. Then A0 and 80 are lone continuations
|
|
// → FFFD each. Per Table 3-9: ED A0 80 → FFFD FFFD FFFD
|
|
{
|
|
const str = "a\xed\xa0\x80b";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
// "a" + FFFD + FFFD + FFFD + "b" = 5 codepoints
|
|
try testing.expectEqual(@as(usize, 5), result.decoded);
|
|
try testing.expectEqual(@as(u32, 'a'), output[0]);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[1]);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[2]);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[3]);
|
|
try testing.expectEqual(@as(u32, 'b'), output[4]);
|
|
}
|
|
}
|
|
|
|
test "decode valid multibyte surrounded by invalid" {
|
|
const testing = std.testing;
|
|
|
|
var output: [64]u32 = undefined;
|
|
|
|
// \xc3\xa9 = é (U+00E9), surrounded by invalid continuation bytes
|
|
{
|
|
const str = "\x80\xc3\xa9\x80";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
// replacement + é + replacement = 3 codepoints
|
|
try testing.expectEqual(@as(usize, 3), result.decoded);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[0]);
|
|
try testing.expectEqual(@as(u32, 0x00E9), output[1]);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[2]);
|
|
}
|
|
}
|
|
|
|
test "decode invalid byte before escape" {
|
|
const testing = std.testing;
|
|
|
|
var output: [64]u32 = undefined;
|
|
|
|
// Invalid byte followed by ESC - should replace then stop
|
|
{
|
|
const str = "hi\x80\x1b[0m";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
try testing.expectEqual(@as(usize, 3), result.consumed);
|
|
try testing.expectEqual(@as(usize, 3), result.decoded);
|
|
try testing.expectEqual(@as(u32, 'h'), output[0]);
|
|
try testing.expectEqual(@as(u32, 'i'), output[1]);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[2]);
|
|
}
|
|
}
|
|
|
|
// Unicode Table 3-8: U+FFFD for Non-Shortest Form Sequences
|
|
// Bytes: C0 AF E0 80 BF F0 81 82 41
|
|
// Output: FFFD FFFD FFFD FFFD FFFD FFFD FFFD FFFD 0041
|
|
test "Table 3-8: non-shortest form sequences" {
|
|
const testing = std.testing;
|
|
var output: [64]u32 = undefined;
|
|
|
|
const str = "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
try testing.expectEqual(@as(usize, 9), result.consumed);
|
|
try testing.expectEqual(@as(usize, 9), result.decoded);
|
|
for (0..8) |i| {
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[i]);
|
|
}
|
|
try testing.expectEqual(@as(u32, 0x41), output[8]);
|
|
}
|
|
|
|
// Unicode Table 3-9: U+FFFD for Ill-Formed Sequences for Surrogates
|
|
// Bytes: ED A0 80 ED BF BF ED AF 41
|
|
// Output: FFFD FFFD FFFD FFFD FFFD FFFD FFFD FFFD 0041
|
|
test "Table 3-9: surrogate sequences" {
|
|
const testing = std.testing;
|
|
var output: [64]u32 = undefined;
|
|
|
|
const str = "\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
try testing.expectEqual(@as(usize, 9), result.consumed);
|
|
try testing.expectEqual(@as(usize, 9), result.decoded);
|
|
for (0..8) |i| {
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[i]);
|
|
}
|
|
try testing.expectEqual(@as(u32, 0x41), output[8]);
|
|
}
|
|
|
|
// Unicode Table 3-10: U+FFFD for Other Ill-Formed Sequences
|
|
// Bytes: F4 91 92 93 FF 41 80 BF 42
|
|
// Output: FFFD FFFD FFFD FFFD FFFD 0041 FFFD FFFD 0042
|
|
test "Table 3-10: other ill-formed sequences" {
|
|
const testing = std.testing;
|
|
var output: [64]u32 = undefined;
|
|
|
|
const str = "\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
try testing.expectEqual(@as(usize, 9), result.consumed);
|
|
try testing.expectEqual(@as(usize, 9), result.decoded);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[0]); // F4
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[1]); // 91
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[2]); // 92
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[3]); // 93
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[4]); // FF
|
|
try testing.expectEqual(@as(u32, 0x0041), output[5]); // 41
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[6]); // 80
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[7]); // BF
|
|
try testing.expectEqual(@as(u32, 0x0042), output[8]); // 42
|
|
}
|
|
|
|
// Unicode Table 3-11: U+FFFD for Truncated Sequences
|
|
// Bytes: E1 80 E2 F0 91 92 F1 BF 41
|
|
// Output: FFFD FFFD FFFD FFFD 0041
|
|
test "Table 3-11: truncated sequences" {
|
|
const testing = std.testing;
|
|
var output: [64]u32 = undefined;
|
|
|
|
const str = "\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41";
|
|
const result = utf8DecodeUntilControlSeq(str, &output);
|
|
try testing.expectEqual(@as(usize, 9), result.consumed);
|
|
try testing.expectEqual(@as(usize, 5), result.decoded);
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[0]); // E1 80 (truncated 3-byte)
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[1]); // E2 (truncated 3-byte, next byte F0 not continuation)
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[2]); // F0 91 92 (truncated 4-byte)
|
|
try testing.expectEqual(@as(u32, 0xFFFD), output[3]); // F1 BF (truncated 4-byte, next byte 41 not continuation)
|
|
try testing.expectEqual(@as(u32, 0x0041), output[4]); // 41
|
|
}
|