From 9c49c343569791071603d63138aa1a6f7d9dd2dc Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <m@mitchellh.com>
Date: Wed, 15 Apr 2026 08:23:42 -0700
Subject: [PATCH] benchmark: add AGENTS, improve UTF-8 synthetic data

This updates our synthetic generator for UTF-8 to expose:

  - Flags to change 1/2/3/4-byte UTF-8 character distribution
  - Flags to have only printable characters so we can benchmark
    pure UTF-8 vs our control sequence finder.
  - Flags to have invalid characters so we can benchmark our error
    handling.

This also adds an AGENTS.md to src/benchmark so agents can do the right
thing more easily.
---
 src/benchmark/AGENTS.md    |  34 +++++++++
 src/synthetic/Utf8.zig     | 151 +++++++++++++++++++++++++++++++++----
 src/synthetic/cli/Utf8.zig |  67 ++++++++++++++--
 3 files changed, 230 insertions(+), 22 deletions(-)
 create mode 100644 src/benchmark/AGENTS.md
diff --git a/src/benchmark/AGENTS.md b/src/benchmark/AGENTS.md
new file mode 100644
index 000000000..ba62500d4
--- /dev/null
+++ b/src/benchmark/AGENTS.md
@@ -0,0 +1,34 @@
+# Benchmarking
+
+The benchmark tools are split into two roles:
+
+- `ghostty-gen` generates synthetic input data.
+- `ghostty-bench` consumes existing input data and runs a benchmark.
+
+## Workflow
+
+- For timing comparisons, generate data first and benchmark it later.
+- Do not pipe `ghostty-gen` directly into `ghostty-bench` when comparing
+  performance. That mixes generation cost into the measurement and makes
+  branch-to-branch comparisons noisy.
+- Reuse the exact same generated files when comparing revisions.
+- Prefer deterministic generation inputs such as fixed seeds when the
+  generator supports them.
+- Keep large generated benchmark corpora outside the repository unless the
+  change explicitly requires checked-in test data.
+
+## Running Benchmarks
+
+- Prefer `hyperfine` to compare benchmark timings.
+- Benchmark the `ghostty-bench` command line, not the generator.
+- Use `ghostty-bench ... --data <path>` with pre-generated files.
+- Run multiple warmups and repeated measurements so branch comparisons are
+  based on medians instead of single runs.
+- When comparing branches, keep all benchmark inputs and CLI flags the same,
+  including terminal dimensions.
+
+## Building
+
+- Build benchmark tools with `zig build -Demit-bench`.
+- On macOS, prefer `zig build -Demit-bench -Demit-macos-app=false` unless the
+  macOS app itself is part of the work.
diff --git a/src/synthetic/Utf8.zig b/src/synthetic/Utf8.zig
index 0d72a8bb2..282532dc4 100644
--- a/src/synthetic/Utf8.zig
+++ b/src/synthetic/Utf8.zig
@@ -17,6 +17,27 @@ pub const Utf8Len = enum(u3) {
     four = 4,
 };
 
+const InvalidSequence = struct {
+    len: u3, // Number of leading entries of `bytes` that make up the sequence.
+    bytes: [4]u8, // Fixed-size storage; entries past `len` are padding.
+
+    fn slice(self: *const InvalidSequence) []const u8 {
+        return self.bytes[0..self.len]; // Only the in-use prefix.
+    }
+};
+
+const invalid_sequences = [_]InvalidSequence{
+    .{ .len = 1, .bytes = .{ 0x80, 0x00, 0x00, 0x00 } }, // lone continuation byte
+    .{ .len = 1, .bytes = .{ 0xC0, 0x00, 0x00, 0x00 } }, // 0xC0 is never valid in UTF-8
+    .{ .len = 1, .bytes = .{ 0xFF, 0x00, 0x00, 0x00 } }, // 0xFF is never valid in UTF-8
+    .{ .len = 2, .bytes = .{ 0xC2, 0x20, 0x00, 0x00 } }, // 2-byte lead followed by a space
+    .{ .len = 2, .bytes = .{ 0xC0, 0xAF, 0x00, 0x00 } }, // overlong encoding of '/'
+    .{ .len = 2, .bytes = .{ 0x80, 0x80, 0x00, 0x00 } }, // two stray continuation bytes
+    .{ .len = 3, .bytes = .{ 0xED, 0xA0, 0x80, 0x00 } }, // encoded surrogate U+D800
+    .{ .len = 3, .bytes = .{ 0xE2, 0x28, 0x7A, 0x00 } }, // 3-byte lead, no continuation bytes
+    .{ .len = 4, .bytes = .{ 0xF0, 0x90, 0x28, 0x7A } }, // 4-byte lead, broken after one continuation
+};
+
 /// Random number generator.
 rand: std.Random,
 
@@ -37,6 +58,13 @@ max_len: usize = std.math.maxInt(usize),
 /// skew the distribution of lengths.
 p_length: std.enums.EnumArray(Utf8Len, f64) = .initFill(1.0),
 
+/// If true, 1-byte sequences are limited to printable ASCII (0x20-0x7E).
+ascii_printable_only: bool = false,
+
+/// Probability in [0, 1] that the next generated sequence is malformed
+/// UTF-8. The roll is made once per emitted sequence; <= 0 disables it.
+invalid_rate: f64 = 0,
+
 pub fn generator(self: *Utf8) Generator {
     return .init(self, next);
 }
@@ -49,23 +77,44 @@ pub fn next(self: *Utf8, writer: *std.Io.Writer, max_len: usize) Generator.Error
 
     var rem: usize = len;
     while (rem > 0) {
-        // Pick a utf8 byte count to generate.
-        const utf8_len: Utf8Len = len: {
-            const Indexer = @TypeOf(self.p_length).Indexer;
-            const idx = self.rand.weightedIndex(f64, &self.p_length.values);
-            var utf8_len = Indexer.keyForIndex(idx);
-            assert(rem > 0);
-            while (@intFromEnum(utf8_len) > rem) {
-                // If the chosen length can't fit into the remaining buffer,
-                // choose a smaller length.
-                utf8_len = @enumFromInt(@intFromEnum(utf8_len) - 1);
-            }
-            break :len utf8_len;
-        };
+        if (try self.writeInvalid(writer, rem)) |written| {
+            rem -= written;
+            continue;
+        }
+
+        const written = try self.writeValid(writer, rem);
+        rem -= written;
+    }
+}
+
+fn writeInvalid(
+    self: *Utf8,
+    writer: *std.Io.Writer,
+    rem: usize, // Upper bound on the number of bytes this call may write.
+) Generator.Error!?usize {
+    if (self.invalid_rate <= 0 or self.rand.float(f64) >= self.invalid_rate) {
+        return null; // Rate disabled (no RNG draw) or roll missed: nothing written.
+    }
+    // Choose a malformed sequence of at most `rem` bytes and report its length.
+    const seq = self.invalidSequence(rem) orelse return null;
+    try writer.writeAll(seq.slice());
+    return seq.len;
+}
+
+fn writeValid(
+    self: *Utf8,
+    writer: *std.Io.Writer,
+    rem: usize,
+) Generator.Error!usize {
+    while (true) {
+        const utf8_len = self.utf8Len(rem);
 
         // Generate a UTF-8 sequence that encodes to this length.
         const cp: u21 = switch (utf8_len) {
-            .one => self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
+            .one => if (self.ascii_printable_only)
+                self.rand.intRangeAtMostBiased(u21, 0x20, 0x7E)
+            else
+                self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
             .two => self.rand.intRangeAtMostBiased(u21, 0x80, 0x7FF),
             .three => self.rand.intRangeAtMostBiased(u21, 0x800, 0xFFFF),
             .four => self.rand.intRangeAtMostBiased(u21, 0x10000, 0x10FFFF),
@@ -87,11 +136,40 @@ pub fn next(self: *Utf8, writer: *std.Io.Writer, max_len: usize) Generator.Error
             // Possible, in which case we redo the loop and encode nothing.
             error.Utf8CannotEncodeSurrogateHalf => continue,
         };
+
         try writer.writeAll(buf[0..l]);
-        rem -= l;
+        return l;
     }
 }
 
+fn utf8Len(self: *Utf8, rem: usize) Utf8Len {
+    const Indexer = @TypeOf(self.p_length).Indexer;
+    const idx = self.rand.weightedIndex(f64, &self.p_length.values); // Weighted by p_length.
+    var utf8_len = Indexer.keyForIndex(idx);
+    assert(rem > 0); // Callers must leave room for at least one byte.
+    while (@intFromEnum(utf8_len) > rem) {
+        // If the chosen length can't fit into the remaining buffer, step down
+        // one byte at a time (regardless of weight) until it does.
+        utf8_len = @enumFromInt(@intFromEnum(utf8_len) - 1);
+    }
+    return utf8_len;
+}
+
+fn invalidSequence(self: *Utf8, rem: usize) ?InvalidSequence {
+    const candidates = &invalid_sequences;
+    // Collect the indices of every candidate short enough to fit in `rem`.
+    var valid_idx: [candidates.len]usize = undefined;
+    var valid_len: usize = 0;
+    for (candidates, 0..) |candidate, i| {
+        if (candidate.len > rem) continue;
+        valid_idx[valid_len] = i;
+        valid_len += 1;
+    }
+
+    if (valid_len == 0) return null; // Nothing fits in `rem` bytes.
+    return candidates[valid_idx[self.rand.uintLessThan(usize, valid_len)]];
+}
+
 test "utf8" {
     const testing = std.testing;
     var prng = std.Random.DefaultPrng.init(0);
@@ -106,3 +184,46 @@ test "utf8" {
     try testing.expectEqual(256, result.len);
     try testing.expect(std.unicode.utf8ValidateSlice(result));
 }
+
+test "utf8 printable ascii only" {
+    const testing = std.testing;
+    var prng = std.Random.DefaultPrng.init(0); // Fixed seed keeps the test deterministic.
+    var buf: [256]u8 = undefined;
+    var writer: std.Io.Writer = .fixed(&buf);
+    var v: Utf8 = .{
+        .rand = prng.random(),
+        .ascii_printable_only = true,
+    };
+    v.min_len = buf.len;
+    v.max_len = buf.len;
+    v.p_length.set(.one, 1.0); // Only 1-byte sequences carry weight; the rest are zeroed.
+    v.p_length.set(.two, 0.0);
+    v.p_length.set(.three, 0.0);
+    v.p_length.set(.four, 0.0);
+
+    const gen = v.generator();
+    try gen.next(&writer, buf.len);
+    const result = writer.buffered();
+    try testing.expectEqual(256, result.len);
+    try testing.expect(std.unicode.utf8ValidateSlice(result));
+    for (result) |c| try testing.expect(std.ascii.isPrint(c)); // No control bytes allowed.
+}
+
+test "utf8 malformed output" {
+    const testing = std.testing;
+    var prng = std.Random.DefaultPrng.init(0); // Fixed seed keeps the test deterministic.
+    var buf: [256]u8 = undefined;
+    var writer: std.Io.Writer = .fixed(&buf);
+    var v: Utf8 = .{
+        .rand = prng.random(),
+        .invalid_rate = 1.0, // rand.float() is < 1, so every sequence is malformed.
+    };
+    v.min_len = buf.len;
+    v.max_len = buf.len;
+
+    const gen = v.generator();
+    try gen.next(&writer, buf.len);
+    const result = writer.buffered();
+    try testing.expectEqual(256, result.len);
+    try testing.expect(!std.unicode.utf8ValidateSlice(result)); // Output must not validate.
+}
diff --git a/src/synthetic/cli/Utf8.zig b/src/synthetic/cli/Utf8.zig
index 635704755..021dbe516 100644
--- a/src/synthetic/cli/Utf8.zig
+++ b/src/synthetic/cli/Utf8.zig
@@ -1,21 +1,60 @@
 const Utf8 = @This();
 
 const std = @import("std");
-const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
 const synthetic = @import("../main.zig");
 
-const log = std.log.scoped(.@"terminal-stream-bench");
+pub const Options = struct {
+    /// Seed to use for deterministic generation. If unset, a time-based
+    /// seed is used by the generic synthetic CLI.
+    seed: ?u64 = null,
 
-pub const Options = struct {};
+    /// Relative weight (must be >= 0) for choosing 1-byte UTF-8 sequences.
+    @"weight-one": f64 = 1.0,
+
+    /// Relative weight (must be >= 0) for choosing 2-byte UTF-8 sequences.
+    @"weight-two": f64 = 1.0,
+
+    /// Relative weight (must be >= 0) for choosing 3-byte UTF-8 sequences.
+    @"weight-three": f64 = 1.0,
+
+    /// Relative weight (must be >= 0) for choosing 4-byte UTF-8 sequences.
+    @"weight-four": f64 = 1.0,
+
+    /// Restrict 1-byte (ASCII) codepoints to printable characters.
+    @"ascii-printable-only": bool = false,
+
+    /// Probability in [0, 1] that an emitted sequence is malformed UTF-8.
+    @"invalid-rate": f64 = 0.0,
+};
+
+opts: Options,
 
 /// Create a new terminal stream handler for the given arguments.
 pub fn create(
     alloc: Allocator,
-    _: Options,
+    opts: Options,
 ) !*Utf8 {
+    // Reject NaN/infinite input too: NaN fails every ordered comparison, and
+    // the weighted length selection needs a finite, positive weight sum.
+    const rate = opts.@"invalid-rate";
+    if (!(rate >= 0 and rate <= 1)) return error.InvalidValue;
+    const weights = [_]f64{
+        opts.@"weight-one",
+        opts.@"weight-two",
+        opts.@"weight-three",
+        opts.@"weight-four",
+    };
+    var weight_sum: f64 = 0;
+    for (weights) |weight| {
+        if (!std.math.isFinite(weight) or weight < 0) return error.InvalidValue;
+        weight_sum += weight;
+    }
+    if (!std.math.isFinite(weight_sum) or weight_sum <= 0) return error.InvalidValue;
+
     const ptr = try alloc.create(Utf8);
     errdefer alloc.destroy(ptr);
+    ptr.* = .{ .opts = opts };
     return ptr;
 }
 
@@ -24,11 +63,22 @@ pub fn destroy(self: *Utf8, alloc: Allocator) void {
 }
 
 pub fn run(self: *Utf8, writer: *std.Io.Writer, rand: std.Random) !void {
-    _ = self;
+    var prng: ?std.Random.DefaultPrng = null;
+    var gen_rand = rand;
+    if (self.opts.seed) |seed| {
+        prng = std.Random.DefaultPrng.init(seed);
+        gen_rand = prng.?.random();
+    }
 
     var gen: synthetic.Utf8 = .{
-        .rand = rand,
+        .rand = gen_rand,
+        .ascii_printable_only = self.opts.@"ascii-printable-only",
+        .invalid_rate = self.opts.@"invalid-rate",
     };
+    gen.p_length.set(.one, self.opts.@"weight-one");
+    gen.p_length.set(.two, self.opts.@"weight-two");
+    gen.p_length.set(.three, self.opts.@"weight-three");
+    gen.p_length.set(.four, self.opts.@"weight-four");
 
     while (true) {
         gen.next(writer, 1024) catch |err| {
@@ -46,7 +96,9 @@ test Utf8 {
     const testing = std.testing;
     const alloc = testing.allocator;
 
-    const impl: *Utf8 = try .create(alloc, .{});
+    const impl: *Utf8 = try .create(alloc, .{
+        .seed = 1,
+    });
     defer impl.destroy(alloc);
 
     var prng = std.Random.DefaultPrng.init(1);
@@ -55,4 +107,5 @@ test Utf8 {
     var buf: [1024]u8 = undefined;
     var writer: std.Io.Writer = .fixed(&buf);
     try impl.run(&writer, rand);
+    try testing.expectEqual(@as(usize, 1024), writer.buffered().len);
 }