// ghostty/src/simd/vt.zig

const std = @import("std");
const options = @import("build_options");
const assert = @import("../quirks.zig").inlineAssert;
const indexOf = @import("index_of.zig").indexOf;

// vt.cpp
/// C-ABI entry point for the SIMD decoder (implemented in vt.cpp).
///
/// As used by `utf8DecodeUntilControlSeq` in this file:
///   - `input` / `count` are the pointer and length of the byte slice to
///     decode.
///   - `output` is the start of the `u32` buffer that receives decoded
///     values. No capacity is passed; the Zig wrapper asserts that the
///     output slice is at least as long as the input before calling.
///   - `output_count` is written with the number of values decoded.
///   - The return value is reported to callers as the number of input
///     bytes consumed.
extern "c" fn ghostty_simd_decode_utf8_until_control_seq(
    input: [*]const u8,
    count: usize,
    output: [*]u32,
    output_count: *usize,
) usize;

/// Outcome of a decode call: how far into the input the decoder got and
/// how many values it produced.
const DecodeResult = struct {
    /// Number of input bytes that were processed. The caller should resume
    /// from this offset; it can be less than the input length when decoding
    /// stopped at an ESC byte or at a sequence truncated by the end of the
    /// decodable region.
    consumed: usize,

    /// Number of `u32` codepoints written to the front of the output buffer.
    decoded: usize,
};

/// Decodes UTF-8 text from `input` into `u32` codepoints stored at the
/// start of `output`, and reports how many bytes were consumed and how
/// many codepoints were produced.
///
/// `output` must be at least as long as `input` (asserted), since every
/// decoded codepoint accounts for at least one input byte.
///
/// When the build enables SIMD this forwards to the C implementation in
/// vt.cpp; otherwise the scalar decoder in this file is used. The scalar
/// decoder stops ahead of the first ESC (0x1B), substitutes U+FFFD for
/// ill-formed bytes, and leaves a sequence that is cut off at the end of
/// the decodable region unconsumed. The SIMD path is expected to behave
/// the same way (the tests in this file run against whichever is built).
pub fn utf8DecodeUntilControlSeq(
    input: []const u8,
    output: []u32,
) DecodeResult {
    assert(output.len >= input.len);

    // No SIMD in this build: the portable decoder does all the work.
    if (comptime !options.simd) {
        return utf8DecodeUntilControlSeqScalar(input, output);
    }

    // The C side returns the consumed byte count and reports the number
    // of decoded codepoints through the out-pointer.
    var result: DecodeResult = .{ .consumed = 0, .decoded = 0 };
    result.consumed = ghostty_simd_decode_utf8_until_control_seq(
        input.ptr,
        input.len,
        output.ptr,
        &result.decoded,
    );
    return result;
}

/// Portable (non-SIMD) decoder behind `utf8DecodeUntilControlSeq`.
///
/// Only the bytes ahead of the first ESC (0x1B) are considered; if there
/// is no ESC the whole input is. Ill-formed input is handled with the
/// Unicode "U+FFFD Substitution of Maximal Subparts" practice: each
/// maximal ill-formed subpart becomes a single U+FFFD.
///
/// If a multi-byte sequence is still well-formed when the decodable
/// region runs out, decoding stops *before* its lead byte so the caller
/// can retry once more bytes are available.
///
/// Returns the number of bytes consumed and the number of codepoints
/// written to `output`.
fn utf8DecodeUntilControlSeqScalar(
    input: []const u8,
    output: []u32,
) DecodeResult {
    // Everything from the first ESC onward is left for the caller.
    const text = input[0 .. indexOf(input, 0x1B) orelse input.len];

    var read: usize = 0;
    var written: usize = 0;
    scan: while (read < text.len) {
        const lead = text[read];

        // Each iteration emits exactly one codepoint and advances by
        // `step` bytes. The defaults describe the "single bad byte" case:
        // lone continuation bytes (80-BF) and bytes that can never start
        // a sequence (C0-C1, F5-FF).
        var cp: u32 = 0xFFFD;
        var step: usize = 1;

        if (lead < 0x80) {
            // ASCII maps to itself.
            cp = lead;
        } else if (lead >= 0xC2 and lead <= 0xF4) {
            // Legitimate lead byte: look up the sequence length and the
            // allowed range for each continuation byte (Table 3-7).
            const info = utf8SeqInfo(lead);
            const total: usize = info.len;

            // Measure the maximal subpart: the lead plus however many
            // following bytes fit their expected range.
            var matched: usize = 1;
            while (matched < total) : (matched += 1) {
                const at = read + matched;

                // Ran out of bytes while the prefix was still valid: this
                // may be completed by later input, so leave it unconsumed.
                if (at >= text.len) break :scan;

                const cb = text[at];
                const bounds = info.ranges[matched - 1];
                if (cb < bounds[0] or cb > bounds[1]) break;
            }

            if (matched == total) {
                // Complete and structurally valid, so decoding should
                // always succeed here.
                if (std.unicode.utf8Decode(text[read..][0..total])) |decoded| {
                    cp = decoded;
                    step = total;
                } else |_| {
                    // Not expected after the range checks above. If it
                    // happens anyway, the defaults apply: replace only
                    // the lead byte with U+FFFD.
                }
            } else {
                // Ill-formed: the whole matched prefix collapses into a
                // single U+FFFD.
                step = matched;
            }
        }

        output[written] = cp;
        written += 1;
        read += step;
    }

    return .{ .consumed = read, .decoded = written };
}

/// Describes the multi-byte UTF-8 sequence that a given lead byte starts.
const Utf8SeqInfo = struct {
    /// Total length of the sequence in bytes, lead byte included
    /// (`utf8SeqInfo` produces 2, 3 or 4).
    len: u3,

    /// Inclusive `{ min, max }` bounds for each continuation byte, in
    /// order. Only the first `len - 1` entries are meaningful; the rest
    /// are filled with `{ 0, 0 }`.
    ranges: [3][2]u8,
};

/// Looks up the shape of the multi-byte UTF-8 sequence introduced by
/// `lead`: its total length and the inclusive range that each
/// continuation byte must fall in, following Unicode Table 3-7.
///
/// `lead` must be in 0xC2...0xF4; any other value reaches `unreachable`.
fn utf8SeqInfo(lead: u8) Utf8SeqInfo {
    // The ordinary continuation-byte range, and the filler used for
    // slots past the end of shorter sequences.
    const tail = [2]u8{ 0x80, 0xBF };
    const unused = [2]u8{ 0, 0 };

    return switch (lead) {
        // Two-byte sequences.
        0xC2...0xDF => .{ .len = 2, .ranges = .{ tail, unused, unused } },

        // Three-byte sequences. E0 and ED restrict their first
        // continuation byte (rejecting non-shortest forms and surrogates
        // respectively); every other lead accepts the ordinary range.
        0xE0 => .{ .len = 3, .ranges = .{ [2]u8{ 0xA0, 0xBF }, tail, unused } },
        0xED => .{ .len = 3, .ranges = .{ [2]u8{ 0x80, 0x9F }, tail, unused } },
        0xE1...0xEC, 0xEE...0xEF => .{ .len = 3, .ranges = .{ tail, tail, unused } },

        // Four-byte sequences. F0 and F4 restrict their first
        // continuation byte (rejecting non-shortest forms and values
        // above U+10FFFF respectively).
        0xF0 => .{ .len = 4, .ranges = .{ [2]u8{ 0x90, 0xBF }, tail, tail } },
        0xF4 => .{ .len = 4, .ranges = .{ [2]u8{ 0x80, 0x8F }, tail, tail } },
        0xF1...0xF3 => .{ .len = 4, .ranges = .{ tail, tail, tail } },

        else => unreachable,
    };
}

test "decode no escape" {
    const expectEqual = std.testing.expectEqual;

    var codepoints: [1024]u32 = undefined;

    // TODO: many more test cases

    // Plain ASCII with no ESC anywhere: every byte is consumed and each
    // byte produces one codepoint.
    const text = "hello" ** 128;
    const got = utf8DecodeUntilControlSeq(text, &codepoints);
    try expectEqual(@as(usize, text.len), got.consumed);
    try expectEqual(@as(usize, text.len), got.decoded);
}

test "decode ASCII to escape" {
    const expectEqual = std.testing.expectEqual;

    var codepoints: [1024]u32 = undefined;

    // TODO: many more test cases

    // ASCII, then an ESC, then more ASCII: decoding must stop right
    // before the ESC and leave it (and what follows) unconsumed.
    const before_esc = "hello" ** 64;
    const text = before_esc ++ "\x1b" ++ ("world" ** 64);
    const got = utf8DecodeUntilControlSeq(text, &codepoints);
    try expectEqual(@as(usize, before_esc.len), got.consumed);
    try expectEqual(@as(usize, before_esc.len), got.decoded);
}

test "decode immediate esc sequence" {
    const expectEqual = std.testing.expectEqual;

    var codepoints: [64]u32 = undefined;

    // The very first byte is ESC, so nothing is consumed or decoded.
    const got = utf8DecodeUntilControlSeq("\x1b[?5s", &codepoints);
    try expectEqual(@as(usize, 0), got.consumed);
    try expectEqual(@as(usize, 0), got.decoded);
}

test "decode incomplete UTF-8" {
    const testing = std.testing;

    var codepoints: [64]u32 = undefined;

    // A 2-byte lead as the final byte of the buffer: only "hello" is
    // consumed, the dangling lead is left for later.
    {
        const got = utf8DecodeUntilControlSeq("hello\xc2", &codepoints);
        try testing.expectEqual(@as(usize, 5), got.consumed);
        try testing.expectEqual(@as(usize, 5), got.decoded);
    }

    // E0 requires A0-BF next and 00 is outside that range, so E0 alone
    // is a maximal subpart (→ FFFD) and the NUL decodes as plain ASCII.
    {
        const got = utf8DecodeUntilControlSeq("hello\xe0\x00", &codepoints);
        try testing.expectEqual(DecodeResult{ .consumed = 7, .decoded = 7 }, got);
        try testing.expectEqualSlices(u32, &[_]u32{ 0xFFFD, 0x00 }, codepoints[5..7]);
    }

    // F0 90 is a valid start of a 4-byte sequence that the buffer cuts
    // short: again only "hello" is consumed.
    {
        const got = utf8DecodeUntilControlSeq("hello\xf0\x90", &codepoints);
        try testing.expectEqual(@as(usize, 5), got.consumed);
        try testing.expectEqual(@as(usize, 5), got.decoded);
    }
}

test "decode invalid UTF-8" {
    const testing = std.testing;

    var codepoints: [64]u32 = undefined;

    // C2 starts a 2-byte sequence but 01 is not a continuation byte.
    const got = utf8DecodeUntilControlSeq("hello\xc2\x01", &codepoints);
    try testing.expectEqual(@as(usize, 7), got.consumed);
    try testing.expectEqual(@as(usize, 7), got.decoded);

    // Only the bad lead byte is replaced; the byte after it survives.
    try testing.expectEqualSlices(u32, &[_]u32{ 0xFFFD, 0x01 }, codepoints[5..7]);
}

// Per the maximal subpart spec, bytes F5-FF are each replaced with FFFD.
test "decode invalid leading byte is replaced" {
    const testing = std.testing;

    var codepoints: [64]u32 = undefined;

    // FF can never start a sequence: it is consumed and turned into FFFD.
    const got = utf8DecodeUntilControlSeq("hello\xFF", &codepoints);
    try testing.expectEqual(DecodeResult{ .consumed = 6, .decoded = 6 }, got);
    try testing.expectEqual(@as(u32, 0xFFFD), codepoints[5]);
}

test "decode invalid continuation in 3-byte sequence" {
    const testing = std.testing;

    var codepoints: [64]u32 = undefined;

    // E2 wants two continuation bytes but is followed by '(' (0x28).
    const got = utf8DecodeUntilControlSeq("hello\xe2\x28world", &codepoints);

    // "hello" + FFFD + "(" + "world" = 12 codepoints
    try testing.expectEqual(@as(usize, 12), got.decoded);
    try testing.expectEqualSlices(
        u32,
        &[_]u32{ 0xFFFD, '(', 'w' },
        codepoints[5..8],
    );
}

test "decode invalid continuation in 4-byte sequence" {
    const testing = std.testing;

    var codepoints: [64]u32 = undefined;

    // F0 90 is a valid prefix of a 4-byte sequence until '(' (0x28)
    // breaks it. The two-byte maximal subpart becomes one FFFD and the
    // '(' is decoded normally.
    const got = utf8DecodeUntilControlSeq("hello\xf0\x90\x28world", &codepoints);

    // "hello" + FFFD + "(" + "world" = 12 codepoints
    try testing.expectEqual(@as(usize, 12), got.decoded);
    try testing.expectEqualSlices(
        u32,
        &[_]u32{ 0xFFFD, '(', 'w' },
        codepoints[5..8],
    );
}

test "decode multiple consecutive invalid bytes" {
    const testing = std.testing;

    var codepoints: [64]u32 = undefined;

    // Two inputs with the same expected shape:
    //   - 80 80: each lone continuation byte is its own maximal subpart.
    //   - C0 C0: C0 is never a valid lead (< C2), so each gets its own FFFD.
    const inputs = [_][]const u8{ "a\x80\x80b", "a\xc0\xc0b" };

    // "a" + FFFD + FFFD + "b" = 4 codepoints
    const want = [_]u32{ 'a', 0xFFFD, 0xFFFD, 'b' };

    for (inputs) |text| {
        const got = utf8DecodeUntilControlSeq(text, &codepoints);
        try testing.expectEqual(@as(usize, 4), got.decoded);
        try testing.expectEqualSlices(u32, &want, codepoints[0..4]);
    }
}

test "decode unexpected continuation byte as lead" {
    const testing = std.testing;

    var codepoints: [64]u32 = undefined;

    // 80 is a continuation byte sitting where a lead byte belongs.
    const got = utf8DecodeUntilControlSeq("a\x80b", &codepoints);

    // "a" + FFFD + "b" = 3 codepoints
    try testing.expectEqual(@as(usize, 3), got.decoded);
    try testing.expectEqualSlices(
        u32,
        &[_]u32{ 'a', 0xFFFD, 'b' },
        codepoints[0..3],
    );
}

test "decode overlong 2-byte encoding" {
    const testing = std.testing;

    var codepoints: [64]u32 = undefined;

    // C0 AF: C0 is an invalid lead (< C2) → FFFD, and AF is then a lone
    // continuation byte → FFFD. Matches Table 3-8 (C0 AF → FFFD FFFD).
    const got = utf8DecodeUntilControlSeq("a\xc0\xafb", &codepoints);

    // "a" + FFFD + FFFD + "b" = 4 codepoints
    try testing.expectEqual(@as(usize, 4), got.decoded);
    try testing.expectEqualSlices(
        u32,
        &[_]u32{ 'a', 0xFFFD, 0xFFFD, 'b' },
        codepoints[0..4],
    );
}

test "decode surrogate half" {
    const testing = std.testing;

    var codepoints: [64]u32 = undefined;

    // ED A0 80 would encode the surrogate U+D800. Table 3-7 only allows
    // 80-9F after ED, so A0 is rejected and ED is a one-byte maximal
    // subpart → FFFD. A0 and 80 are then lone continuation bytes, one
    // FFFD each. Matches Table 3-9 (ED A0 80 → FFFD FFFD FFFD).
    const got = utf8DecodeUntilControlSeq("a\xed\xa0\x80b", &codepoints);

    // "a" + FFFD + FFFD + FFFD + "b" = 5 codepoints
    try testing.expectEqual(@as(usize, 5), got.decoded);
    try testing.expectEqualSlices(
        u32,
        &[_]u32{ 'a', 0xFFFD, 0xFFFD, 0xFFFD, 'b' },
        codepoints[0..5],
    );
}

test "decode valid multibyte surrounded by invalid" {
    const testing = std.testing;

    var codepoints: [64]u32 = undefined;

    // C3 A9 is é (U+00E9), with a stray continuation byte on each side.
    const got = utf8DecodeUntilControlSeq("\x80\xc3\xa9\x80", &codepoints);

    // FFFD + é + FFFD = 3 codepoints
    try testing.expectEqual(@as(usize, 3), got.decoded);
    try testing.expectEqualSlices(
        u32,
        &[_]u32{ 0xFFFD, 0x00E9, 0xFFFD },
        codepoints[0..3],
    );
}

test "decode invalid byte before escape" {
    const testing = std.testing;

    var codepoints: [64]u32 = undefined;

    // A bad byte directly ahead of ESC: it is replaced, and decoding
    // then stops at the ESC.
    const got = utf8DecodeUntilControlSeq("hi\x80\x1b[0m", &codepoints);
    try testing.expectEqual(DecodeResult{ .consumed = 3, .decoded = 3 }, got);
    try testing.expectEqualSlices(
        u32,
        &[_]u32{ 'h', 'i', 0xFFFD },
        codepoints[0..3],
    );
}

// Unicode Table 3-8: U+FFFD for Non-Shortest Form Sequences
// Bytes:  C0  AF  E0  80  BF  F0  81  82  41
// Output: FFFD FFFD FFFD FFFD FFFD FFFD FFFD FFFD 0041
test "Table 3-8: non-shortest form sequences" {
    const testing = std.testing;
    var codepoints: [64]u32 = undefined;

    const got = utf8DecodeUntilControlSeq(
        "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41",
        &codepoints,
    );
    try testing.expectEqual(DecodeResult{ .consumed = 9, .decoded = 9 }, got);

    // Eight replacement characters followed by 'A'.
    const want = ([_]u32{0xFFFD} ** 8) ++ [_]u32{0x41};
    try testing.expectEqualSlices(u32, &want, codepoints[0..9]);
}

// Unicode Table 3-9: U+FFFD for Ill-Formed Sequences for Surrogates
// Bytes:  ED  A0  80  ED  BF  BF  ED  AF  41
// Output: FFFD FFFD FFFD FFFD FFFD FFFD FFFD FFFD 0041
test "Table 3-9: surrogate sequences" {
    const testing = std.testing;
    var codepoints: [64]u32 = undefined;

    const got = utf8DecodeUntilControlSeq(
        "\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41",
        &codepoints,
    );
    try testing.expectEqual(DecodeResult{ .consumed = 9, .decoded = 9 }, got);

    // Eight replacement characters followed by 'A'.
    const want = ([_]u32{0xFFFD} ** 8) ++ [_]u32{0x41};
    try testing.expectEqualSlices(u32, &want, codepoints[0..9]);
}

// Unicode Table 3-10: U+FFFD for Other Ill-Formed Sequences
// Bytes:  F4  91  92  93  FF  41  80  BF  42
// Output: FFFD FFFD FFFD FFFD FFFD 0041 FFFD FFFD 0042
test "Table 3-10: other ill-formed sequences" {
    const testing = std.testing;
    var codepoints: [64]u32 = undefined;

    const got = utf8DecodeUntilControlSeq(
        "\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42",
        &codepoints,
    );
    try testing.expectEqual(DecodeResult{ .consumed = 9, .decoded = 9 }, got);

    // One output value per input byte, in input order.
    const want = [_]u32{
        0xFFFD, // F4
        0xFFFD, // 91
        0xFFFD, // 92
        0xFFFD, // 93
        0xFFFD, // FF
        0x0041, // 41
        0xFFFD, // 80
        0xFFFD, // BF
        0x0042, // 42
    };
    try testing.expectEqualSlices(u32, &want, codepoints[0..9]);
}

// Unicode Table 3-11: U+FFFD for Truncated Sequences
// Bytes:  E1  80  E2  F0  91  92  F1  BF  41
// Output: FFFD     FFFD    FFFD         FFFD     0041
test "Table 3-11: truncated sequences" {
    const testing = std.testing;
    var codepoints: [64]u32 = undefined;

    const got = utf8DecodeUntilControlSeq(
        "\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41",
        &codepoints,
    );
    try testing.expectEqual(DecodeResult{ .consumed = 9, .decoded = 5 }, got);

    // Each truncated prefix collapses into a single replacement.
    const want = [_]u32{
        0xFFFD, // E1 80 (truncated 3-byte)
        0xFFFD, // E2 (truncated 3-byte, next byte F0 not continuation)
        0xFFFD, // F0 91 92 (truncated 4-byte)
        0xFFFD, // F1 BF (truncated 4-byte, next byte 41 not continuation)
        0x0041, // 41
    };
    try testing.expectEqualSlices(u32, &want, codepoints[0..5]);
}