From 9c49c343569791071603d63138aa1a6f7d9dd2dc Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Wed, 15 Apr 2026 08:23:42 -0700 Subject: [PATCH] benchmark: add AGENTS, improve UTF-8 synthetic data This updates our synthetic generator for UTF-8 to expose: - Flags to change 1/2/3/4-byte UTF-8 character distribution - Flags to have only printable characters so we can benchmark pure UTF-8 vs our control sequence finder. - Flags to have invalid characters so we can benchmark our error handling. This also adds an AGENTS.md to src/benchmark so agents can do the right thing more easily. --- src/benchmark/AGENTS.md | 34 +++++++++ src/synthetic/Utf8.zig | 151 +++++++++++++++++++++++++++++++++---- src/synthetic/cli/Utf8.zig | 67 ++++++++++++++-- 3 files changed, 230 insertions(+), 22 deletions(-) create mode 100644 src/benchmark/AGENTS.md diff --git a/src/benchmark/AGENTS.md b/src/benchmark/AGENTS.md new file mode 100644 index 000000000..ba62500d4 --- /dev/null +++ b/src/benchmark/AGENTS.md @@ -0,0 +1,34 @@ +# Benchmarking + +The benchmark tools are split into two roles: + +- `ghostty-gen` generates synthetic input data. +- `ghostty-bench` consumes existing input data and runs a benchmark. + +## Workflow + +- For timing comparisons, generate data first and benchmark it later. +- Do not pipe `ghostty-gen` directly into `ghostty-bench` when comparing + performance. That mixes generation cost into the measurement and makes + branch-to-branch comparisons noisy. +- Reuse the exact same generated files when comparing revisions. +- Prefer deterministic generation inputs such as fixed seeds when the + generator supports them. +- Keep large generated benchmark corpora outside the repository unless the + change explicitly requires checked-in test data. + +## Running Benchmarks + +- Prefer `hyperfine` to compare benchmark timings. +- Benchmark the `ghostty-bench` command line, not the generator. +- Use `ghostty-bench ... --data <path>` with pre-generated files. 
+- Run multiple warmups and repeated measurements so branch comparisons are + based on medians instead of single runs. +- When comparing branches, keep all benchmark inputs and CLI flags the same, + including terminal dimensions. + +## Building + +- Build benchmark tools with `zig build -Demit-bench`. +- On macOS, prefer `zig build -Demit-bench -Demit-macos-app=false` unless the + macOS app itself is part of the work. diff --git a/src/synthetic/Utf8.zig b/src/synthetic/Utf8.zig index 0d72a8bb2..282532dc4 100644 --- a/src/synthetic/Utf8.zig +++ b/src/synthetic/Utf8.zig @@ -17,6 +17,27 @@ pub const Utf8Len = enum(u3) { four = 4, }; +const InvalidSequence = struct { + len: u3, + bytes: [4]u8, + + fn slice(self: *const InvalidSequence) []const u8 { + return self.bytes[0..self.len]; + } +}; + +const invalid_sequences = [_]InvalidSequence{ + .{ .len = 1, .bytes = .{ 0x80, 0x00, 0x00, 0x00 } }, + .{ .len = 1, .bytes = .{ 0xC0, 0x00, 0x00, 0x00 } }, + .{ .len = 1, .bytes = .{ 0xFF, 0x00, 0x00, 0x00 } }, + .{ .len = 2, .bytes = .{ 0xC2, 0x20, 0x00, 0x00 } }, + .{ .len = 2, .bytes = .{ 0xC0, 0xAF, 0x00, 0x00 } }, + .{ .len = 2, .bytes = .{ 0x80, 0x80, 0x00, 0x00 } }, + .{ .len = 3, .bytes = .{ 0xED, 0xA0, 0x80, 0x00 } }, + .{ .len = 3, .bytes = .{ 0xE2, 0x28, 0x7A, 0x00 } }, + .{ .len = 4, .bytes = .{ 0xF0, 0x90, 0x28, 0x7A } }, +}; + /// Random number generator. rand: std.Random, @@ -37,6 +58,13 @@ max_len: usize = std.math.maxInt(usize), /// skew the distribution of lengths. p_length: std.enums.EnumArray(Utf8Len, f64) = .initFill(1.0), +/// If true, ASCII codepoints are limited to printable ASCII. +ascii_printable_only: bool = false, + +/// Probability that the next generated sequence is malformed UTF-8. +/// This is checked for each emitted sequence while filling a buffer. 
+invalid_rate: f64 = 0, + pub fn generator(self: *Utf8) Generator { return .init(self, next); } @@ -49,23 +77,44 @@ pub fn next(self: *Utf8, writer: *std.Io.Writer, max_len: usize) Generator.Error var rem: usize = len; while (rem > 0) { - // Pick a utf8 byte count to generate. - const utf8_len: Utf8Len = len: { - const Indexer = @TypeOf(self.p_length).Indexer; - const idx = self.rand.weightedIndex(f64, &self.p_length.values); - var utf8_len = Indexer.keyForIndex(idx); - assert(rem > 0); - while (@intFromEnum(utf8_len) > rem) { - // If the chosen length can't fit into the remaining buffer, - // choose a smaller length. - utf8_len = @enumFromInt(@intFromEnum(utf8_len) - 1); - } - break :len utf8_len; - }; + if (try self.writeInvalid(writer, rem)) |written| { + rem -= written; + continue; + } + + const written = try self.writeValid(writer, rem); + rem -= written; + } +} + +fn writeInvalid( + self: *Utf8, + writer: *std.Io.Writer, + rem: usize, +) Generator.Error!?usize { + if (self.invalid_rate <= 0 or self.rand.float(f64) >= self.invalid_rate) { + return null; + } + + const seq = self.invalidSequence(rem) orelse return null; + try writer.writeAll(seq.slice()); + return seq.len; +} + +fn writeValid( + self: *Utf8, + writer: *std.Io.Writer, + rem: usize, +) Generator.Error!usize { + while (true) { + const utf8_len = self.utf8Len(rem); // Generate a UTF-8 sequence that encodes to this length. 
const cp: u21 = switch (utf8_len) { - .one => self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F), + .one => if (self.ascii_printable_only) + self.rand.intRangeAtMostBiased(u21, 0x20, 0x7E) + else + self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F), .two => self.rand.intRangeAtMostBiased(u21, 0x80, 0x7FF), .three => self.rand.intRangeAtMostBiased(u21, 0x800, 0xFFFF), .four => self.rand.intRangeAtMostBiased(u21, 0x10000, 0x10FFFF), @@ -87,11 +136,40 @@ pub fn next(self: *Utf8, writer: *std.Io.Writer, max_len: usize) Generator.Error // Possible, in which case we redo the loop and encode nothing. error.Utf8CannotEncodeSurrogateHalf => continue, }; + try writer.writeAll(buf[0..l]); - rem -= l; + return l; } } +fn utf8Len(self: *Utf8, rem: usize) Utf8Len { + const Indexer = @TypeOf(self.p_length).Indexer; + const idx = self.rand.weightedIndex(f64, &self.p_length.values); + var utf8_len = Indexer.keyForIndex(idx); + assert(rem > 0); + while (@intFromEnum(utf8_len) > rem) { + // If the chosen length can't fit into the remaining buffer, + // choose a smaller length. + utf8_len = @enumFromInt(@intFromEnum(utf8_len) - 1); + } + return utf8_len; +} + +fn invalidSequence(self: *Utf8, rem: usize) ?InvalidSequence { + const candidates = &invalid_sequences; + + var valid_idx: [candidates.len]usize = undefined; + var valid_len: usize = 0; + for (candidates, 0..) 
|candidate, i| { + if (candidate.len > rem) continue; + valid_idx[valid_len] = i; + valid_len += 1; + } + + if (valid_len == 0) return null; + return candidates[valid_idx[self.rand.uintLessThan(usize, valid_len)]]; +} + test "utf8" { const testing = std.testing; var prng = std.Random.DefaultPrng.init(0); @@ -106,3 +184,46 @@ test "utf8" { try testing.expectEqual(256, result.len); try testing.expect(std.unicode.utf8ValidateSlice(result)); } + +test "utf8 printable ascii only" { + const testing = std.testing; + var prng = std.Random.DefaultPrng.init(0); + var buf: [256]u8 = undefined; + var writer: std.Io.Writer = .fixed(&buf); + var v: Utf8 = .{ + .rand = prng.random(), + .ascii_printable_only = true, + }; + v.min_len = buf.len; + v.max_len = buf.len; + v.p_length.set(.one, 1.0); + v.p_length.set(.two, 0.0); + v.p_length.set(.three, 0.0); + v.p_length.set(.four, 0.0); + + const gen = v.generator(); + try gen.next(&writer, buf.len); + const result = writer.buffered(); + try testing.expectEqual(256, result.len); + try testing.expect(std.unicode.utf8ValidateSlice(result)); + for (result) |c| try testing.expect(std.ascii.isPrint(c)); +} + +test "utf8 malformed output" { + const testing = std.testing; + var prng = std.Random.DefaultPrng.init(0); + var buf: [256]u8 = undefined; + var writer: std.Io.Writer = .fixed(&buf); + var v: Utf8 = .{ + .rand = prng.random(), + .invalid_rate = 1.0, + }; + v.min_len = buf.len; + v.max_len = buf.len; + + const gen = v.generator(); + try gen.next(&writer, buf.len); + const result = writer.buffered(); + try testing.expectEqual(256, result.len); + try testing.expect(!std.unicode.utf8ValidateSlice(result)); +} diff --git a/src/synthetic/cli/Utf8.zig b/src/synthetic/cli/Utf8.zig index 635704755..021dbe516 100644 --- a/src/synthetic/cli/Utf8.zig +++ b/src/synthetic/cli/Utf8.zig @@ -1,21 +1,60 @@ const Utf8 = @This(); const std = @import("std"); -const assert = std.debug.assert; const Allocator = std.mem.Allocator; const synthetic = 
@import("../main.zig"); -const log = std.log.scoped(.@"terminal-stream-bench"); +pub const Options = struct { + /// Seed to use for deterministic generation. If unset, a time-based + /// seed is used by the generic synthetic CLI. + seed: ?u64 = null, -pub const Options = struct {}; + /// Relative weight for choosing 1-byte UTF-8 sequences. + @"weight-one": f64 = 1.0, + + /// Relative weight for choosing 2-byte UTF-8 sequences. + @"weight-two": f64 = 1.0, + + /// Relative weight for choosing 3-byte UTF-8 sequences. + @"weight-three": f64 = 1.0, + + /// Relative weight for choosing 4-byte UTF-8 sequences. + @"weight-four": f64 = 1.0, + + /// Restrict ASCII codepoints to printable characters. + @"ascii-printable-only": bool = false, + + /// Probability that an emitted sequence is malformed UTF-8. + @"invalid-rate": f64 = 0.0, +}; + +opts: Options, /// Create a new terminal stream handler for the given arguments. pub fn create( alloc: Allocator, - _: Options, + opts: Options, ) !*Utf8 { + if (opts.@"invalid-rate" < 0 or opts.@"invalid-rate" > 1) { + return error.InvalidValue; + } + + const weights = [_]f64{ + opts.@"weight-one", + opts.@"weight-two", + opts.@"weight-three", + opts.@"weight-four", + }; + var weight_sum: f64 = 0; + for (weights) |weight| { + if (weight < 0) return error.InvalidValue; + weight_sum += weight; + } + if (weight_sum <= 0) return error.InvalidValue; + const ptr = try alloc.create(Utf8); errdefer alloc.destroy(ptr); + ptr.* = .{ .opts = opts }; return ptr; } @@ -24,11 +63,22 @@ pub fn destroy(self: *Utf8, alloc: Allocator) void { } pub fn run(self: *Utf8, writer: *std.Io.Writer, rand: std.Random) !void { - _ = self; + var prng: ?std.Random.DefaultPrng = null; + var gen_rand = rand; + if (self.opts.seed) |seed| { + prng = std.Random.DefaultPrng.init(seed); + gen_rand = prng.?.random(); + } var gen: synthetic.Utf8 = .{ - .rand = rand, + .rand = gen_rand, + .ascii_printable_only = self.opts.@"ascii-printable-only", + .invalid_rate = 
self.opts.@"invalid-rate", }; + gen.p_length.set(.one, self.opts.@"weight-one"); + gen.p_length.set(.two, self.opts.@"weight-two"); + gen.p_length.set(.three, self.opts.@"weight-three"); + gen.p_length.set(.four, self.opts.@"weight-four"); while (true) { gen.next(writer, 1024) catch |err| { @@ -46,7 +96,9 @@ test Utf8 { const testing = std.testing; const alloc = testing.allocator; - const impl: *Utf8 = try .create(alloc, .{}); + const impl: *Utf8 = try .create(alloc, .{ + .seed = 1, + }); defer impl.destroy(alloc); var prng = std.Random.DefaultPrng.init(1); @@ -55,4 +107,5 @@ test Utf8 { var buf: [1024]u8 = undefined; var writer: std.Io.Writer = .fixed(&buf); try impl.run(&writer, rand); + try testing.expectEqual(@as(usize, 1024), writer.buffered().len); }