benchmark: add AGENTS, improve UTF-8 synthetic data (#12297)

This updates our synthetic generator for UTF-8 to expose:

- Flags to change the 1/2/3/4-byte UTF-8 character distribution.
- Flags to have only printable characters so we can benchmark pure UTF-8
  vs our control sequence finder.
- Flags to have invalid characters so we can benchmark our error handling.

This also adds an AGENTS.md to src/benchmark so agents can do the right
thing more easily. These are necessary to robustly benchmark our libc++
removal PR.
2026-07-12 04:09:40 +00:00 · 2026-04-15 09:09:11 -07:00
parent 49a43bf560 9c49c34356
commit 29f92c0c8b
3 changed files with 230 additions and 22 deletions
--- a/src/benchmark/AGENTS.md
+++ b/src/benchmark/AGENTS.md
@@ -0,0 +1,34 @@
+# Benchmarking
+
+The benchmark tools are split into two roles:
+
+- `ghostty-gen` generates synthetic input data.
+- `ghostty-bench` consumes existing input data and runs a benchmark.
+
+## Workflow
+
+- For timing comparisons, generate data first and benchmark it later.
+- Do not pipe `ghostty-gen` directly into `ghostty-bench` when comparing
+  performance. That mixes generation cost into the measurement and makes
+  branch-to-branch comparisons noisy.
+- Reuse the exact same generated files when comparing revisions.
+- Prefer deterministic generation inputs such as fixed seeds when the
+  generator supports them.
+- Keep large generated benchmark corpora outside the repository unless the
+  change explicitly requires checked-in test data.
+
+## Running Benchmarks
+
+- Prefer `hyperfine` to compare benchmark timings.
+- Benchmark the `ghostty-bench` command line, not the generator.
+- Use `ghostty-bench ... --data <path>` with pre-generated files.
+- Run multiple warmups and repeated measurements so branch comparisons are
+  based on medians instead of single runs.
+- When comparing branches, keep all benchmark inputs and CLI flags the same,
+  including terminal dimensions.
+
+## Building
+
+- Build benchmark tools with `zig build -Demit-bench`.
+- On macOS, prefer `zig build -Demit-bench -Demit-macos-app=false` unless the
+  macOS app itself is part of the work.
--- a/src/synthetic/Utf8.zig
+++ b/src/synthetic/Utf8.zig
@@ -17,6 +17,27 @@ pub const Utf8Len = enum(u3) {
    four = 4,
 };

+/// A fixed malformed UTF-8 byte sequence. Only the first `len` bytes of
+/// `bytes` belong to the sequence; any remaining bytes are padding.
+const InvalidSequence = struct {
+    /// Number of leading bytes in `bytes` that make up the sequence.
+    len: u3,
+    /// Backing storage for the sequence bytes.
+    bytes: [4]u8,
+
+    /// Returns the first `len` bytes of `bytes` as a slice.
+    fn slice(self: *const InvalidSequence) []const u8 {
+        const used: usize = self.len;
+        return self.bytes[0..used];
+    }
+};
+
+const invalid_sequences = [_]InvalidSequence{ // malformed UTF-8 samples; `slice()` exposes only the first `len` bytes
+    .{ .len = 1, .bytes = .{ 0x80, 0x00, 0x00, 0x00 } }, // lone continuation byte
+    .{ .len = 1, .bytes = .{ 0xC0, 0x00, 0x00, 0x00 } }, // 0xC0 is never a valid UTF-8 byte
+    .{ .len = 1, .bytes = .{ 0xFF, 0x00, 0x00, 0x00 } }, // 0xFF is never a valid UTF-8 byte
+    .{ .len = 2, .bytes = .{ 0xC2, 0x20, 0x00, 0x00 } }, // 2-byte lead followed by ASCII space
+    .{ .len = 2, .bytes = .{ 0xC0, 0xAF, 0x00, 0x00 } }, // overlong encoding of '/'
+    .{ .len = 2, .bytes = .{ 0x80, 0x80, 0x00, 0x00 } }, // two continuation bytes with no lead byte
+    .{ .len = 3, .bytes = .{ 0xED, 0xA0, 0x80, 0x00 } }, // encoded surrogate U+D800
+    .{ .len = 3, .bytes = .{ 0xE2, 0x28, 0x7A, 0x00 } }, // 3-byte lead followed by ASCII bytes
+    .{ .len = 4, .bytes = .{ 0xF0, 0x90, 0x28, 0x7A } }, // 4-byte lead, one continuation, then ASCII bytes
+};
+
 /// Random number generator.
 rand: std.Random,

@@ -37,6 +58,13 @@ max_len: usize = std.math.maxInt(usize),
 /// skew the distribution of lengths.
 p_length: std.enums.EnumArray(Utf8Len, f64) = .initFill(1.0),

+/// If true, ASCII codepoints are limited to printable ASCII.
+ascii_printable_only: bool = false,
+
+/// Probability that the next generated sequence is malformed UTF-8.
+/// This is checked for each emitted sequence while filling a buffer.
+invalid_rate: f64 = 0,
+
 pub fn generator(self: *Utf8) Generator {
    return .init(self, next);
 }
@@ -49,23 +77,44 @@ pub fn next(self: *Utf8, writer: *std.Io.Writer, max_len: usize) Generator.Error

    var rem: usize = len;
    while (rem > 0) {
-        // Pick a utf8 byte count to generate.
-        const utf8_len: Utf8Len = len: {
-            const Indexer = @TypeOf(self.p_length).Indexer;
-            const idx = self.rand.weightedIndex(f64, &self.p_length.values);
-            var utf8_len = Indexer.keyForIndex(idx);
-            assert(rem > 0);
-            while (@intFromEnum(utf8_len) > rem) {
-                // If the chosen length can't fit into the remaining buffer,
-                // choose a smaller length.
-                utf8_len = @enumFromInt(@intFromEnum(utf8_len) - 1);
-            }
-            break :len utf8_len;
-        };
+        if (try self.writeInvalid(writer, rem)) |written| {
+            rem -= written;
+            continue;
+        }
+
+        const written = try self.writeValid(writer, rem);
+        rem -= written;
+    }
+}
+
+/// Possibly writes one malformed UTF-8 sequence of at most `rem` bytes.
+///
+/// Returns the number of bytes written, or null when nothing was written
+/// (the rate is non-positive, the random roll did not select the malformed
+/// path, or no malformed sequence fits in `rem`).
+fn writeInvalid(
+    self: *Utf8,
+    writer: *std.Io.Writer,
+    rem: usize,
+) Generator.Error!?usize {
+    // A non-positive rate disables malformed output. Return before rolling
+    // so the random stream is not consumed in that case.
+    if (self.invalid_rate <= 0) return null;
+
+    // Roll once per call; only rolls strictly below the rate emit.
+    const roll = self.rand.float(f64);
+    if (roll >= self.invalid_rate) return null;
+
+    if (self.invalidSequence(rem)) |seq| {
+        try writer.writeAll(seq.slice());
+        return seq.len;
+    }
+    return null;
+}
+
+fn writeValid(
+    self: *Utf8,
+    writer: *std.Io.Writer,
+    rem: usize,
+) Generator.Error!usize {
+    while (true) {
+        const utf8_len = self.utf8Len(rem);

        // Generate a UTF-8 sequence that encodes to this length.
        const cp: u21 = switch (utf8_len) {
-            .one => self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
+            .one => if (self.ascii_printable_only)
+                self.rand.intRangeAtMostBiased(u21, 0x20, 0x7E)
+            else
+                self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
            .two => self.rand.intRangeAtMostBiased(u21, 0x80, 0x7FF),
            .three => self.rand.intRangeAtMostBiased(u21, 0x800, 0xFFFF),
            .four => self.rand.intRangeAtMostBiased(u21, 0x10000, 0x10FFFF),
@@ -87,11 +136,40 @@ pub fn next(self: *Utf8, writer: *std.Io.Writer, max_len: usize) Generator.Error
            // Possible, in which case we redo the loop and encode nothing.
            error.Utf8CannotEncodeSurrogateHalf => continue,
        };
+
        try writer.writeAll(buf[0..l]);
-        rem -= l;
+        return l;
    }
 }

+/// Picks a UTF-8 sequence length using the `p_length` weights, then shrinks
+/// it as needed so that it never exceeds `rem`. Asserts `rem > 0`.
+///
+/// Note that the shrink step does not consult the weights: a length with
+/// zero weight can still be returned when nothing longer fits.
+fn utf8Len(self: *Utf8, rem: usize) Utf8Len {
+    const Indexer = @TypeOf(self.p_length).Indexer;
+    const picked = Indexer.keyForIndex(
+        self.rand.weightedIndex(f64, &self.p_length.values),
+    );
+    assert(rem > 0);
+
+    // Step down one byte at a time until the length fits the remaining
+    // space in the buffer.
+    var fitted = picked;
+    while (@intFromEnum(fitted) > rem) {
+        fitted = @enumFromInt(@intFromEnum(fitted) - 1);
+    }
+    return fitted;
+}
+
+/// Picks, uniformly at random, one entry of `invalid_sequences` whose
+/// length does not exceed `rem`. Returns null when no entry fits.
+fn invalidSequence(self: *Utf8, rem: usize) ?InvalidSequence {
+    // First pass: count the entries that fit in the remaining space.
+    var fitting: usize = 0;
+    for (invalid_sequences) |candidate| {
+        if (candidate.len <= rem) fitting += 1;
+    }
+    if (fitting == 0) return null;
+
+    // Second pass: walk to the randomly chosen entry among those that fit.
+    var skip = self.rand.uintLessThan(usize, fitting);
+    for (invalid_sequences) |candidate| {
+        if (candidate.len > rem) continue;
+        if (skip == 0) return candidate;
+        skip -= 1;
+    }
+
+    // `skip` starts below `fitting`, so the loop above always returns.
+    unreachable;
+}
+
 test "utf8" {
    const testing = std.testing;
    var prng = std.Random.DefaultPrng.init(0);
@@ -106,3 +184,46 @@ test "utf8" {
    try testing.expectEqual(256, result.len);
    try testing.expect(std.unicode.utf8ValidateSlice(result));
 }
+
+test "utf8 printable ascii only" {
+    const testing = std.testing;
+    var prng = std.Random.DefaultPrng.init(0); // fixed seed
+    var buf: [256]u8 = undefined;
+    var writer: std.Io.Writer = .fixed(&buf);
+    var v: Utf8 = .{
+        .rand = prng.random(),
+        .ascii_printable_only = true, // option under test
+    };
+    v.min_len = buf.len; // min/max pinned to the buffer size; the length check below relies on it
+    v.max_len = buf.len;
+    v.p_length.set(.one, 1.0); // only 1-byte sequences carry weight
+    v.p_length.set(.two, 0.0);
+    v.p_length.set(.three, 0.0);
+    v.p_length.set(.four, 0.0);
+
+    const gen = v.generator();
+    try gen.next(&writer, buf.len);
+    const result = writer.buffered();
+    try testing.expectEqual(256, result.len);
+    try testing.expect(std.unicode.utf8ValidateSlice(result));
+    for (result) |c| try testing.expect(std.ascii.isPrint(c)); // every byte must be printable ASCII
+}
+
+test "utf8 malformed output" {
+    const testing = std.testing;
+    var prng = std.Random.DefaultPrng.init(0); // fixed seed
+    var buf: [256]u8 = undefined;
+    var writer: std.Io.Writer = .fixed(&buf);
+    var v: Utf8 = .{
+        .rand = prng.random(),
+        .invalid_rate = 1.0, // maximum rate, so the malformed path is exercised
+    };
+    v.min_len = buf.len; // min/max pinned to the buffer size; the length check below relies on it
+    v.max_len = buf.len;
+
+    const gen = v.generator();
+    try gen.next(&writer, buf.len);
+    const result = writer.buffered();
+    try testing.expectEqual(256, result.len);
+    try testing.expect(!std.unicode.utf8ValidateSlice(result)); // output must fail UTF-8 validation
+}
--- a/src/synthetic/cli/Utf8.zig
+++ b/src/synthetic/cli/Utf8.zig
@@ -1,21 +1,60 @@
 const Utf8 = @This();

 const std = @import("std");
-const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
 const synthetic = @import("../main.zig");

-const log = std.log.scoped(.@"terminal-stream-bench");
+pub const Options = struct {
+    /// Seed to use for deterministic generation. If unset, a time-based
+    /// seed is used by the generic synthetic CLI.
+    seed: ?u64 = null,

-pub const Options = struct {};
+    /// Relative, non-negative weight for choosing 1-byte UTF-8 sequences.
+    @"weight-one": f64 = 1.0,
+
+    /// Relative, non-negative weight for choosing 2-byte UTF-8 sequences.
+    @"weight-two": f64 = 1.0,
+
+    /// Relative, non-negative weight for choosing 3-byte UTF-8 sequences.
+    @"weight-three": f64 = 1.0,
+
+    /// Relative, non-negative weight for choosing 4-byte UTF-8 sequences.
+    @"weight-four": f64 = 1.0,
+
+    /// Restrict 1-byte (ASCII) codepoints to printable characters.
+    @"ascii-printable-only": bool = false,
+
+    /// Probability in [0, 1] that an emitted sequence is malformed UTF-8.
+    @"invalid-rate": f64 = 0.0,
+};
+
+opts: Options,

 /// Create a new terminal stream handler for the given arguments.
+///
+/// Returns `error.InvalidValue` if `invalid-rate` is not within [0, 1]
+/// (NaN included), if any weight is negative or non-finite, or if the
+/// weights do not sum to a positive finite value. Allocation failures
+/// from `alloc` are propagated.
 pub fn create(
    alloc: Allocator,
-    _: Options,
+    opts: Options,
 ) !*Utf8 {
+    // The negated range check also rejects NaN, which would slip past
+    // separate `< 0` and `> 1` comparisons because both are false for NaN.
+    const invalid_rate = opts.@"invalid-rate";
+    if (!(invalid_rate >= 0 and invalid_rate <= 1)) {
+        return error.InvalidValue;
+    }
+
+    const weights = [_]f64{
+        opts.@"weight-one",
+        opts.@"weight-two",
+        opts.@"weight-three",
+        opts.@"weight-four",
+    };
+    var weight_sum: f64 = 0;
+    for (weights) |weight| {
+        // NaN and infinite weights would poison the weighted length
+        // selection, so only finite, non-negative values are accepted.
+        if (!std.math.isFinite(weight) or weight < 0) {
+            return error.InvalidValue;
+        }
+        weight_sum += weight;
+    }
+    // At least one length must be selectable, and the total must not have
+    // overflowed to infinity.
+    if (weight_sum <= 0 or !std.math.isFinite(weight_sum)) {
+        return error.InvalidValue;
+    }
+
    const ptr = try alloc.create(Utf8);
    errdefer alloc.destroy(ptr);
+    ptr.* = .{ .opts = opts };
    return ptr;
 }

@@ -24,11 +63,22 @@ pub fn destroy(self: *Utf8, alloc: Allocator) void {
 }

 pub fn run(self: *Utf8, writer: *std.Io.Writer, rand: std.Random) !void {
-    _ = self;
+    var prng: ?std.Random.DefaultPrng = null;
+    var gen_rand = rand;
+    if (self.opts.seed) |seed| {
+        prng = std.Random.DefaultPrng.init(seed);
+        gen_rand = prng.?.random();
+    }

    var gen: synthetic.Utf8 = .{
-        .rand = rand,
+        .rand = gen_rand,
+        .ascii_printable_only = self.opts.@"ascii-printable-only",
+        .invalid_rate = self.opts.@"invalid-rate",
    };
+    gen.p_length.set(.one, self.opts.@"weight-one");
+    gen.p_length.set(.two, self.opts.@"weight-two");
+    gen.p_length.set(.three, self.opts.@"weight-three");
+    gen.p_length.set(.four, self.opts.@"weight-four");

    while (true) {
        gen.next(writer, 1024) catch |err| {
@@ -46,7 +96,9 @@ test Utf8 {
    const testing = std.testing;
    const alloc = testing.allocator;

-    const impl: *Utf8 = try .create(alloc, .{});
+    const impl: *Utf8 = try .create(alloc, .{
+        .seed = 1,
+    });
    defer impl.destroy(alloc);

    var prng = std.Random.DefaultPrng.init(1);
@@ -55,4 +107,5 @@ test Utf8 {
    var buf: [1024]u8 = undefined;
    var writer: std.Io.Writer = .fixed(&buf);
    try impl.run(&writer, rand);
+    try testing.expectEqual(@as(usize, 1024), writer.buffered().len);
 }