benchmark: add AGENTS, improve UTF-8 synthetic data (#12297)

This updates our synthetic generator for UTF-8 to expose:

- Flags to change the 1/2/3/4-byte UTF-8 character distribution.
- Flags to emit only printable characters so we can benchmark pure UTF-8
  decoding vs. our control sequence finder.
- Flags to emit invalid characters so we can benchmark our error
  handling.

This also adds an AGENTS.md to src/benchmark so agents can do the right
thing more easily.

These are necessary to robustly benchmark our libc++ removal PR.
This commit is contained in:
Mitchell Hashimoto
2026-04-15 09:09:11 -07:00
committed by GitHub
3 changed files with 230 additions and 22 deletions

34
src/benchmark/AGENTS.md Normal file
View File

@@ -0,0 +1,34 @@
# Benchmarking
The benchmark tools are split into two roles:
- `ghostty-gen` generates synthetic input data.
- `ghostty-bench` consumes existing input data and runs a benchmark.
## Workflow
- For timing comparisons, generate data first and benchmark it later.
- Do not pipe `ghostty-gen` directly into `ghostty-bench` when comparing
performance. That mixes generation cost into the measurement and makes
branch-to-branch comparisons noisy.
- Reuse the exact same generated files when comparing revisions.
- Prefer deterministic generation inputs such as fixed seeds when the
generator supports them.
- Keep large generated benchmark corpora outside the repository unless the
change explicitly requires checked-in test data.
## Running Benchmarks
- Prefer `hyperfine` to compare benchmark timings.
- Benchmark the `ghostty-bench` command line, not the generator.
- Use `ghostty-bench ... --data <path>` with pre-generated files.
- Run multiple warmups and repeated measurements so branch comparisons are
based on medians instead of single runs.
- When comparing branches, keep all benchmark inputs and CLI flags the same,
including terminal dimensions.
## Building
- Build benchmark tools with `zig build -Demit-bench`.
- On macOS, prefer `zig build -Demit-bench -Demit-macos-app=false` unless the
macOS app itself is part of the work.

View File

@@ -17,6 +17,27 @@ pub const Utf8Len = enum(u3) {
four = 4,
};
/// A short byte pattern stored inline in a fixed-size array.
///
/// Only the first `len` entries of `bytes` belong to the pattern; whatever
/// follows is padding and is never exposed through `slice`.
const InvalidSequence = struct {
    /// Number of leading entries of `bytes` that make up the pattern.
    len: u3,

    /// Backing storage for the pattern.
    bytes: [4]u8,

    /// Borrow the meaningful prefix of `bytes`.
    ///
    /// The returned slice points into `self`, so it is only valid for as
    /// long as `self` is.
    fn slice(self: *const InvalidSequence) []const u8 {
        const used: usize = self.len;
        return self.bytes[0..used];
    }
};
/// Malformed UTF-8 byte patterns that `invalidSequence` picks from.
/// Only the first `len` bytes of each entry are meaningful; the remaining
/// bytes are zero padding.
const invalid_sequences = [_]InvalidSequence{
// Lone continuation byte with no lead byte.
.{ .len = 1, .bytes = .{ 0x80, 0x00, 0x00, 0x00 } },
// 0xC0 could only start an overlong 2-byte form, so it is never valid.
.{ .len = 1, .bytes = .{ 0xC0, 0x00, 0x00, 0x00 } },
// 0xFF never appears in well-formed UTF-8.
.{ .len = 1, .bytes = .{ 0xFF, 0x00, 0x00, 0x00 } },
// 2-byte lead followed by a non-continuation byte (0x20).
.{ .len = 2, .bytes = .{ 0xC2, 0x20, 0x00, 0x00 } },
// Overlong 2-byte encoding of U+002F.
.{ .len = 2, .bytes = .{ 0xC0, 0xAF, 0x00, 0x00 } },
// Two continuation bytes with no lead byte.
.{ .len = 2, .bytes = .{ 0x80, 0x80, 0x00, 0x00 } },
// 3-byte form of the surrogate U+D800, which UTF-8 does not allow.
.{ .len = 3, .bytes = .{ 0xED, 0xA0, 0x80, 0x00 } },
// 3-byte lead followed by non-continuation bytes.
.{ .len = 3, .bytes = .{ 0xE2, 0x28, 0x7A, 0x00 } },
// 4-byte lead and one continuation byte, then non-continuation bytes.
.{ .len = 4, .bytes = .{ 0xF0, 0x90, 0x28, 0x7A } },
};
/// Random number generator.
rand: std.Random,
@@ -37,6 +58,13 @@ max_len: usize = std.math.maxInt(usize),
/// skew the distribution of lengths.
p_length: std.enums.EnumArray(Utf8Len, f64) = .initFill(1.0),
/// If true, ASCII codepoints are limited to printable ASCII.
ascii_printable_only: bool = false,
/// Probability that the next generated sequence is malformed UTF-8.
/// This is checked for each emitted sequence while filling a buffer.
invalid_rate: f64 = 0,
/// Wrap this value in the `Generator` interface, using `next` as the
/// callback that produces output.
pub fn generator(self: *Utf8) Generator {
    return Generator.init(self, next);
}
@@ -49,23 +77,44 @@ pub fn next(self: *Utf8, writer: *std.Io.Writer, max_len: usize) Generator.Error
var rem: usize = len;
while (rem > 0) {
// Pick a utf8 byte count to generate.
const utf8_len: Utf8Len = len: {
const Indexer = @TypeOf(self.p_length).Indexer;
const idx = self.rand.weightedIndex(f64, &self.p_length.values);
var utf8_len = Indexer.keyForIndex(idx);
assert(rem > 0);
while (@intFromEnum(utf8_len) > rem) {
// If the chosen length can't fit into the remaining buffer,
// choose a smaller length.
utf8_len = @enumFromInt(@intFromEnum(utf8_len) - 1);
}
break :len utf8_len;
};
if (try self.writeInvalid(writer, rem)) |written| {
rem -= written;
continue;
}
const written = try self.writeValid(writer, rem);
rem -= written;
}
}
/// Possibly emit one malformed UTF-8 sequence.
///
/// With probability `invalid_rate`, picks a malformed sequence of at most
/// `rem` bytes, writes it to `writer`, and returns the number of bytes
/// written. Returns `null` without writing anything when this call does not
/// roll a malformed sequence or when no candidate fits in `rem` bytes.
/// Errors from `writer` are propagated.
fn writeInvalid(
    self: *Utf8,
    writer: *std.Io.Writer,
    rem: usize,
) Generator.Error!?usize {
    // Phrased as a negated `>` on purpose: every ordered comparison with NaN
    // is false, so `rate <= 0` would let a NaN rate through and the check
    // below would then emit malformed output on every call. This form
    // treats NaN like zero. No random number is drawn when disabled.
    if (!(self.invalid_rate > 0)) return null;

    // Roll once per emitted sequence; a draw at or above the rate means
    // "emit a valid sequence instead".
    if (self.rand.float(f64) >= self.invalid_rate) return null;

    const seq = self.invalidSequence(rem) orelse return null;
    try writer.writeAll(seq.slice());
    return seq.len;
}
fn writeValid(
self: *Utf8,
writer: *std.Io.Writer,
rem: usize,
) Generator.Error!usize {
while (true) {
const utf8_len = self.utf8Len(rem);
// Generate a UTF-8 sequence that encodes to this length.
const cp: u21 = switch (utf8_len) {
.one => self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
.one => if (self.ascii_printable_only)
self.rand.intRangeAtMostBiased(u21, 0x20, 0x7E)
else
self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
.two => self.rand.intRangeAtMostBiased(u21, 0x80, 0x7FF),
.three => self.rand.intRangeAtMostBiased(u21, 0x800, 0xFFFF),
.four => self.rand.intRangeAtMostBiased(u21, 0x10000, 0x10FFFF),
@@ -87,11 +136,40 @@ pub fn next(self: *Utf8, writer: *std.Io.Writer, max_len: usize) Generator.Error
// Possible, in which case we redo the loop and encode nothing.
error.Utf8CannotEncodeSurrogateHalf => continue,
};
try writer.writeAll(buf[0..l]);
rem -= l;
return l;
}
}
/// Pick the encoded byte length for the next codepoint.
///
/// A length is drawn using the `p_length` weights and then capped so that it
/// never exceeds `rem`, the number of bytes still available. `rem` must be
/// non-zero.
fn utf8Len(self: *Utf8, rem: usize) Utf8Len {
    const Indexer = @TypeOf(self.p_length).Indexer;
    const drawn = Indexer.keyForIndex(
        self.rand.weightedIndex(f64, &self.p_length.values),
    );
    assert(rem > 0);

    // The drawn length already fits in the remaining space.
    if (@intFromEnum(drawn) <= rem) return drawn;

    // Otherwise `rem` is smaller than the drawn length, so the longest
    // length that still fits is `rem` itself.
    const capped: u3 = @intCast(rem);
    return @enumFromInt(capped);
}
/// Choose, uniformly at random, one entry of `invalid_sequences` whose
/// length does not exceed `rem`.
///
/// Returns `null` when no entry fits. Exactly one random draw is made when
/// at least one entry fits, and none otherwise.
fn invalidSequence(self: *Utf8, rem: usize) ?InvalidSequence {
    // First pass: count how many table entries fit.
    var fitting: usize = 0;
    for (invalid_sequences) |candidate| {
        if (candidate.len <= rem) fitting += 1;
    }
    if (fitting == 0) return null;

    // Second pass: walk the fitting entries in table order and stop at the
    // randomly chosen one.
    var skip = self.rand.uintLessThan(usize, fitting);
    for (invalid_sequences) |candidate| {
        if (candidate.len > rem) continue;
        if (skip == 0) return candidate;
        skip -= 1;
    }

    // `skip` starts below `fitting`, so the loop above always returns.
    unreachable;
}
test "utf8" {
const testing = std.testing;
var prng = std.Random.DefaultPrng.init(0);
@@ -106,3 +184,46 @@ test "utf8" {
try testing.expectEqual(256, result.len);
try testing.expect(std.unicode.utf8ValidateSlice(result));
}
// With only 1-byte sequences enabled and `ascii_printable_only` set, every
// generated byte must be printable ASCII.
test "utf8 printable ascii only" {
    const testing = std.testing;

    var out_buf: [256]u8 = undefined;
    var out: std.Io.Writer = .fixed(&out_buf);

    var prng = std.Random.DefaultPrng.init(0);
    var cfg: Utf8 = .{
        .rand = prng.random(),
        .ascii_printable_only = true,
    };

    // Fill the whole buffer in a single call.
    cfg.min_len = out_buf.len;
    cfg.max_len = out_buf.len;

    // Allow 1-byte sequences only.
    cfg.p_length.set(.one, 1.0);
    for ([_]Utf8Len{ .two, .three, .four }) |disabled| {
        cfg.p_length.set(disabled, 0.0);
    }

    const gen = cfg.generator();
    try gen.next(&out, out_buf.len);

    const produced = out.buffered();
    try testing.expectEqual(out_buf.len, produced.len);
    try testing.expect(std.unicode.utf8ValidateSlice(produced));
    for (produced) |byte| {
        try testing.expect(std.ascii.isPrint(byte));
    }
}
// With `invalid_rate` at 1.0 the buffer is still filled completely, but the
// result must fail UTF-8 validation.
test "utf8 malformed output" {
    const testing = std.testing;

    var out_buf: [256]u8 = undefined;
    var out: std.Io.Writer = .fixed(&out_buf);

    var prng = std.Random.DefaultPrng.init(0);
    var cfg: Utf8 = .{
        .rand = prng.random(),
        .invalid_rate = 1.0,
    };

    // Fill the whole buffer in a single call.
    cfg.min_len = out_buf.len;
    cfg.max_len = out_buf.len;

    const gen = cfg.generator();
    try gen.next(&out, out_buf.len);

    const produced = out.buffered();
    try testing.expectEqual(out_buf.len, produced.len);
    try testing.expect(!std.unicode.utf8ValidateSlice(produced));
}

View File

@@ -1,21 +1,60 @@
const Utf8 = @This();
const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const synthetic = @import("../main.zig");
const log = std.log.scoped(.@"terminal-stream-bench");
/// Options accepted by this UTF-8 synthetic data implementation.
/// `create` validates the weights and `invalid-rate` before storing them.
pub const Options = struct {
/// Seed to use for deterministic generation. If unset, a time-based
/// seed is used by the generic synthetic CLI.
seed: ?u64 = null,
pub const Options = struct {};
/// Relative weight for choosing 1-byte UTF-8 sequences.
/// Weights are relative to each other; `create` rejects negative weights
/// and a set of weights that does not sum to a positive value.
@"weight-one": f64 = 1.0,
/// Relative weight for choosing 2-byte UTF-8 sequences.
@"weight-two": f64 = 1.0,
/// Relative weight for choosing 3-byte UTF-8 sequences.
@"weight-three": f64 = 1.0,
/// Relative weight for choosing 4-byte UTF-8 sequences.
@"weight-four": f64 = 1.0,
/// Restrict ASCII codepoints to printable characters.
@"ascii-printable-only": bool = false,
/// Probability that an emitted sequence is malformed UTF-8.
/// `create` rejects values below 0 or above 1.
@"invalid-rate": f64 = 0.0,
};
opts: Options,
/// Validate `opts` and allocate a `Utf8` that stores them.
///
/// Returns `error.InvalidValue` when `invalid-rate` is below 0 or above 1,
/// when any weight is negative, or when the weights do not sum to a
/// positive value. A failure of `alloc.create` is propagated. On success
/// the returned pointer was allocated with `alloc`.
pub fn create(
alloc: Allocator,
_: Options,
opts: Options,
) !*Utf8 {
// NOTE(review): a NaN rate compares false on both sides of this check and
// is therefore not rejected; confirm whether the CLI parser can produce one.
if (opts.@"invalid-rate" < 0 or opts.@"invalid-rate" > 1) {
return error.InvalidValue;
}
const weights = [_]f64{
opts.@"weight-one",
opts.@"weight-two",
opts.@"weight-three",
opts.@"weight-four",
};
// Reject negative weights and require at least one weight to be positive.
var weight_sum: f64 = 0;
for (weights) |weight| {
if (weight < 0) return error.InvalidValue;
weight_sum += weight;
}
if (weight_sum <= 0) return error.InvalidValue;
const ptr = try alloc.create(Utf8);
errdefer alloc.destroy(ptr);
ptr.* = .{ .opts = opts };
return ptr;
}
@@ -24,11 +63,22 @@ pub fn destroy(self: *Utf8, alloc: Allocator) void {
}
pub fn run(self: *Utf8, writer: *std.Io.Writer, rand: std.Random) !void {
_ = self;
var prng: ?std.Random.DefaultPrng = null;
var gen_rand = rand;
if (self.opts.seed) |seed| {
prng = std.Random.DefaultPrng.init(seed);
gen_rand = prng.?.random();
}
var gen: synthetic.Utf8 = .{
.rand = rand,
.rand = gen_rand,
.ascii_printable_only = self.opts.@"ascii-printable-only",
.invalid_rate = self.opts.@"invalid-rate",
};
gen.p_length.set(.one, self.opts.@"weight-one");
gen.p_length.set(.two, self.opts.@"weight-two");
gen.p_length.set(.three, self.opts.@"weight-three");
gen.p_length.set(.four, self.opts.@"weight-four");
while (true) {
gen.next(writer, 1024) catch |err| {
@@ -46,7 +96,9 @@ test Utf8 {
const testing = std.testing;
const alloc = testing.allocator;
const impl: *Utf8 = try .create(alloc, .{});
const impl: *Utf8 = try .create(alloc, .{
.seed = 1,
});
defer impl.destroy(alloc);
var prng = std.Random.DefaultPrng.init(1);
@@ -55,4 +107,5 @@ test Utf8 {
var buf: [1024]u8 = undefined;
var writer: std.Io.Writer = .fixed(&buf);
try impl.run(&writer, rand);
try testing.expectEqual(@as(usize, 1024), writer.buffered().len);
}