mirror of
https://github.com/ghostty-org/ghostty.git
synced 2026-05-24 05:40:15 +00:00
benchmark: add AGENTS, improve UTF-8 synthetic data (#12297)
This updates our synthetic generator for UTF-8 to expose:

- Flags to change the 1/2/3/4-byte UTF-8 character distribution.
- Flags to emit only printable characters, so we can benchmark pure UTF-8 vs our control sequence finder.
- Flags to emit invalid characters, so we can benchmark our error handling.

This also adds an AGENTS.md to src/benchmark so agents can do the right thing more easily. These are necessary to robustly benchmark our libc++ removal PR.
This commit is contained in:
34
src/benchmark/AGENTS.md
Normal file
34
src/benchmark/AGENTS.md
Normal file
@@ -0,0 +1,34 @@
|
||||
# Benchmarking
|
||||
|
||||
The benchmark tools are split into two roles:
|
||||
|
||||
- `ghostty-gen` generates synthetic input data.
|
||||
- `ghostty-bench` consumes existing input data and runs a benchmark.
|
||||
|
||||
## Workflow
|
||||
|
||||
- For timing comparisons, generate data first and benchmark it later.
|
||||
- Do not pipe `ghostty-gen` directly into `ghostty-bench` when comparing
  performance. That mixes generation cost into the measurement and makes
  branch-to-branch comparisons noisy.
|
||||
- Reuse the exact same generated files when comparing revisions.
|
||||
- Prefer deterministic generation inputs such as fixed seeds when the
  generator supports them.
|
||||
- Keep large generated benchmark corpora outside the repository unless the
  change explicitly requires checked-in test data.
|
||||
|
||||
## Running Benchmarks
|
||||
|
||||
- Prefer `hyperfine` to compare benchmark timings.
|
||||
- Benchmark the `ghostty-bench` command line, not the generator.
|
||||
- Use `ghostty-bench ... --data <path>` with pre-generated files.
|
||||
- Run multiple warmups and repeated measurements so branch comparisons are
  based on medians instead of single runs.
|
||||
- When comparing branches, keep all benchmark inputs and CLI flags the same,
  including terminal dimensions.
|
||||
|
||||
## Building
|
||||
|
||||
- Build benchmark tools with `zig build -Demit-bench`.
|
||||
- On macOS, prefer `zig build -Demit-bench -Demit-macos-app=false` unless the
  macOS app itself is part of the work.
|
||||
@@ -17,6 +17,27 @@ pub const Utf8Len = enum(u3) {
|
||||
four = 4,
|
||||
};
|
||||
|
||||
/// A short byte sequence stored in a fixed-size buffer.
///
/// Only the first `len` entries of `bytes` belong to the sequence; the
/// remaining entries are padding so that every value has the same size.
const InvalidSequence = struct {
    /// Number of meaningful bytes at the start of `bytes`.
    len: u3,

    /// Backing storage. Entries at index `len` and beyond are padding.
    bytes: [4]u8,

    /// Returns the meaningful prefix of `bytes`.
    ///
    /// The returned slice points into `self`, so it is only valid for as
    /// long as the value `self` points at stays alive.
    fn slice(self: *const InvalidSequence) []const u8 {
        return self.bytes[0..self.len];
    }
};
|
||||
|
||||
/// Table of malformed UTF-8 byte sequences to inject into generated data.
///
/// Entries are grouped by length so that at least one candidate exists for
/// any remaining buffer size of one byte or more.
const invalid_sequences = [_]InvalidSequence{
    // Lone continuation byte with no lead byte.
    .{ .len = 1, .bytes = .{ 0x80, 0x00, 0x00, 0x00 } },
    // 0xC0 can only start an overlong encoding and never appears in valid UTF-8.
    .{ .len = 1, .bytes = .{ 0xC0, 0x00, 0x00, 0x00 } },
    // 0xFF never appears in valid UTF-8.
    .{ .len = 1, .bytes = .{ 0xFF, 0x00, 0x00, 0x00 } },
    // Two-byte lead followed by a non-continuation byte (space).
    .{ .len = 2, .bytes = .{ 0xC2, 0x20, 0x00, 0x00 } },
    // Overlong two-byte encoding of U+002F ('/').
    .{ .len = 2, .bytes = .{ 0xC0, 0xAF, 0x00, 0x00 } },
    // Two continuation bytes with no lead byte.
    .{ .len = 2, .bytes = .{ 0x80, 0x80, 0x00, 0x00 } },
    // Encoding of the surrogate U+D800, which UTF-8 forbids.
    .{ .len = 3, .bytes = .{ 0xED, 0xA0, 0x80, 0x00 } },
    // Three-byte lead followed by non-continuation bytes.
    .{ .len = 3, .bytes = .{ 0xE2, 0x28, 0x7A, 0x00 } },
    // Four-byte lead and one valid continuation, then non-continuation bytes.
    .{ .len = 4, .bytes = .{ 0xF0, 0x90, 0x28, 0x7A } },
};
|
||||
|
||||
/// Random number generator.
|
||||
rand: std.Random,
|
||||
|
||||
@@ -37,6 +58,13 @@ max_len: usize = std.math.maxInt(usize),
|
||||
/// skew the distribution of lengths.
|
||||
p_length: std.enums.EnumArray(Utf8Len, f64) = .initFill(1.0),
|
||||
|
||||
/// If true, ASCII codepoints are limited to printable ASCII.
|
||||
ascii_printable_only: bool = false,
|
||||
|
||||
/// Probability that the next generated sequence is malformed UTF-8.
|
||||
/// This is checked for each emitted sequence while filling a buffer.
|
||||
invalid_rate: f64 = 0,
|
||||
|
||||
pub fn generator(self: *Utf8) Generator {
|
||||
return .init(self, next);
|
||||
}
|
||||
@@ -49,23 +77,44 @@ pub fn next(self: *Utf8, writer: *std.Io.Writer, max_len: usize) Generator.Error
|
||||
|
||||
var rem: usize = len;
|
||||
while (rem > 0) {
|
||||
// Pick a utf8 byte count to generate.
|
||||
const utf8_len: Utf8Len = len: {
|
||||
const Indexer = @TypeOf(self.p_length).Indexer;
|
||||
const idx = self.rand.weightedIndex(f64, &self.p_length.values);
|
||||
var utf8_len = Indexer.keyForIndex(idx);
|
||||
assert(rem > 0);
|
||||
while (@intFromEnum(utf8_len) > rem) {
|
||||
// If the chosen length can't fit into the remaining buffer,
|
||||
// choose a smaller length.
|
||||
utf8_len = @enumFromInt(@intFromEnum(utf8_len) - 1);
|
||||
}
|
||||
break :len utf8_len;
|
||||
};
|
||||
if (try self.writeInvalid(writer, rem)) |written| {
|
||||
rem -= written;
|
||||
continue;
|
||||
}
|
||||
|
||||
const written = try self.writeValid(writer, rem);
|
||||
rem -= written;
|
||||
}
|
||||
}
|
||||
|
||||
/// Possibly writes one malformed UTF-8 sequence to `writer`.
///
/// A sequence is written only when `invalid_rate` is positive and a random
/// draw falls below it. `rem` is the number of bytes still available; the
/// chosen sequence never exceeds it.
///
/// Returns the number of bytes written, or `null` when nothing was written
/// (injection disabled, the draw did not hit, or no candidate fits in
/// `rem`). Errors from the writer are propagated.
fn writeInvalid(
    self: *Utf8,
    writer: *std.Io.Writer,
    rem: usize,
) Generator.Error!?usize {
    // Written as a negated `>` so that a NaN rate, which compares false
    // against everything, disables injection instead of forcing it on
    // every call. No randomness is consumed on this path.
    if (!(self.invalid_rate > 0)) return null;
    if (self.rand.float(f64) >= self.invalid_rate) return null;

    const seq = self.invalidSequence(rem) orelse return null;
    try writer.writeAll(seq.slice());
    return seq.len;
}
|
||||
|
||||
fn writeValid(
|
||||
self: *Utf8,
|
||||
writer: *std.Io.Writer,
|
||||
rem: usize,
|
||||
) Generator.Error!usize {
|
||||
while (true) {
|
||||
const utf8_len = self.utf8Len(rem);
|
||||
|
||||
// Generate a UTF-8 sequence that encodes to this length.
|
||||
const cp: u21 = switch (utf8_len) {
|
||||
.one => self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
|
||||
.one => if (self.ascii_printable_only)
|
||||
self.rand.intRangeAtMostBiased(u21, 0x20, 0x7E)
|
||||
else
|
||||
self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
|
||||
.two => self.rand.intRangeAtMostBiased(u21, 0x80, 0x7FF),
|
||||
.three => self.rand.intRangeAtMostBiased(u21, 0x800, 0xFFFF),
|
||||
.four => self.rand.intRangeAtMostBiased(u21, 0x10000, 0x10FFFF),
|
||||
@@ -87,11 +136,40 @@ pub fn next(self: *Utf8, writer: *std.Io.Writer, max_len: usize) Generator.Error
|
||||
// Possible, in which case we redo the loop and encode nothing.
|
||||
error.Utf8CannotEncodeSurrogateHalf => continue,
|
||||
};
|
||||
|
||||
try writer.writeAll(buf[0..l]);
|
||||
rem -= l;
|
||||
return l;
|
||||
}
|
||||
}
|
||||
|
||||
/// Picks the encoded length for the next valid UTF-8 sequence.
///
/// The length is drawn from the `p_length` weights and then reduced until
/// it fits in `rem` bytes. `rem` must be greater than zero.
fn utf8Len(self: *Utf8, rem: usize) Utf8Len {
    const Indexer = @TypeOf(self.p_length).Indexer;
    const idx = self.rand.weightedIndex(f64, &self.p_length.values);
    var utf8_len = Indexer.keyForIndex(idx);

    assert(rem > 0);
    while (@intFromEnum(utf8_len) > rem) {
        // The chosen length can't fit into the remaining buffer, so step
        // down to the next smaller length. This step does not consult
        // `p_length`, so a length whose weight is zero can still be
        // returned when the buffer is nearly full.
        utf8_len = @enumFromInt(@intFromEnum(utf8_len) - 1);
    }

    return utf8_len;
}
|
||||
|
||||
/// Picks a random entry of `invalid_sequences` that fits in `rem` bytes.
///
/// Every fitting entry is equally likely. Returns `null` when no entry
/// fits; in that case no randomness is consumed.
fn invalidSequence(self: *Utf8, rem: usize) ?InvalidSequence {
    const candidates = &invalid_sequences;

    // Collect the table indices of all entries short enough for `rem`.
    // Only the first `fitting_len` slots of `fitting` are initialized.
    var fitting: [candidates.len]usize = undefined;
    var fitting_len: usize = 0;
    for (candidates, 0..) |candidate, i| {
        if (candidate.len > rem) continue;
        fitting[fitting_len] = i;
        fitting_len += 1;
    }

    if (fitting_len == 0) return null;

    const pick = self.rand.uintLessThan(usize, fitting_len);
    return candidates[fitting[pick]];
}
|
||||
|
||||
test "utf8" {
|
||||
const testing = std.testing;
|
||||
var prng = std.Random.DefaultPrng.init(0);
|
||||
@@ -106,3 +184,46 @@ test "utf8" {
|
||||
try testing.expectEqual(256, result.len);
|
||||
try testing.expect(std.unicode.utf8ValidateSlice(result));
|
||||
}
|
||||
|
||||
test "utf8 printable ascii only" {
    const testing = std.testing;
    var prng = std.Random.DefaultPrng.init(0);
    var buf: [256]u8 = undefined;
    var writer: std.Io.Writer = .fixed(&buf);

    var v: Utf8 = .{
        .rand = prng.random(),
        .ascii_printable_only = true,
    };
    // Force the generator to fill the buffer exactly.
    v.min_len = buf.len;
    v.max_len = buf.len;
    // Only one-byte sequences, so every output byte is an ASCII codepoint.
    v.p_length.set(.one, 1.0);
    v.p_length.set(.two, 0.0);
    v.p_length.set(.three, 0.0);
    v.p_length.set(.four, 0.0);

    const gen = v.generator();
    try gen.next(&writer, buf.len);

    const result = writer.buffered();
    try testing.expectEqual(buf.len, result.len);
    try testing.expect(std.unicode.utf8ValidateSlice(result));
    for (result) |c| try testing.expect(std.ascii.isPrint(c));
}
|
||||
|
||||
test "utf8 malformed output" {
    const testing = std.testing;
    var prng = std.Random.DefaultPrng.init(0);
    var buf: [256]u8 = undefined;
    var writer: std.Io.Writer = .fixed(&buf);

    var v: Utf8 = .{
        .rand = prng.random(),
        // A rate of 1.0 requests a malformed sequence at every step.
        .invalid_rate = 1.0,
    };
    // Force the generator to fill the buffer exactly.
    v.min_len = buf.len;
    v.max_len = buf.len;

    const gen = v.generator();
    try gen.next(&writer, buf.len);

    const result = writer.buffered();
    try testing.expectEqual(buf.len, result.len);
    try testing.expect(!std.unicode.utf8ValidateSlice(result));
}
|
||||
|
||||
@@ -1,21 +1,60 @@
|
||||
const Utf8 = @This();
|
||||
|
||||
const std = @import("std");
|
||||
const assert = std.debug.assert;
|
||||
const Allocator = std.mem.Allocator;
|
||||
const synthetic = @import("../main.zig");
|
||||
|
||||
const log = std.log.scoped(.@"terminal-stream-bench");
|
||||
/// Options controlling the UTF-8 synthetic data generator.
pub const Options = struct {
    /// Seed to use for deterministic generation. If unset, a time-based
    /// seed is used by the generic synthetic CLI.
    seed: ?u64 = null,

    /// Relative weight for choosing 1-byte UTF-8 sequences.
    @"weight-one": f64 = 1.0,

    /// Relative weight for choosing 2-byte UTF-8 sequences.
    @"weight-two": f64 = 1.0,

    /// Relative weight for choosing 3-byte UTF-8 sequences.
    @"weight-three": f64 = 1.0,

    /// Relative weight for choosing 4-byte UTF-8 sequences.
    @"weight-four": f64 = 1.0,

    /// Restrict ASCII codepoints to printable characters.
    @"ascii-printable-only": bool = false,

    /// Probability that an emitted sequence is malformed UTF-8.
    @"invalid-rate": f64 = 0.0,
};
|
||||
|
||||
opts: Options,
|
||||
|
||||
/// Create a UTF-8 synthetic data generator configured by `opts`.
///
/// Returns `error.InvalidValue` when `invalid-rate` is not a number in
/// the range [0, 1], when any weight is negative or not finite, or when
/// the weights do not add up to a positive finite total. Allocation
/// failure is propagated.
///
/// The returned pointer is allocated with `alloc`; release it by calling
/// `destroy` with the same allocator.
pub fn create(
    alloc: Allocator,
    opts: Options,
) !*Utf8 {
    // Written as a negated range test so that NaN, which compares false
    // against everything, is rejected along with out-of-range values.
    const rate = opts.@"invalid-rate";
    if (!(rate >= 0 and rate <= 1)) return error.InvalidValue;

    const weights = [_]f64{
        opts.@"weight-one",
        opts.@"weight-two",
        opts.@"weight-three",
        opts.@"weight-four",
    };
    var weight_sum: f64 = 0;
    for (weights) |weight| {
        // NaN and infinite weights cannot describe a usable distribution.
        if (!std.math.isFinite(weight) or weight < 0) return error.InvalidValue;
        weight_sum += weight;
    }
    // At least one length must be selectable, and the total must not have
    // overflowed to infinity.
    if (weight_sum <= 0 or !std.math.isFinite(weight_sum)) return error.InvalidValue;

    const ptr = try alloc.create(Utf8);
    errdefer alloc.destroy(ptr);
    ptr.* = .{ .opts = opts };
    return ptr;
}
|
||||
|
||||
@@ -24,11 +63,22 @@ pub fn destroy(self: *Utf8, alloc: Allocator) void {
|
||||
}
|
||||
|
||||
pub fn run(self: *Utf8, writer: *std.Io.Writer, rand: std.Random) !void {
|
||||
_ = self;
|
||||
var prng: ?std.Random.DefaultPrng = null;
|
||||
var gen_rand = rand;
|
||||
if (self.opts.seed) |seed| {
|
||||
prng = std.Random.DefaultPrng.init(seed);
|
||||
gen_rand = prng.?.random();
|
||||
}
|
||||
|
||||
var gen: synthetic.Utf8 = .{
|
||||
.rand = rand,
|
||||
.rand = gen_rand,
|
||||
.ascii_printable_only = self.opts.@"ascii-printable-only",
|
||||
.invalid_rate = self.opts.@"invalid-rate",
|
||||
};
|
||||
gen.p_length.set(.one, self.opts.@"weight-one");
|
||||
gen.p_length.set(.two, self.opts.@"weight-two");
|
||||
gen.p_length.set(.three, self.opts.@"weight-three");
|
||||
gen.p_length.set(.four, self.opts.@"weight-four");
|
||||
|
||||
while (true) {
|
||||
gen.next(writer, 1024) catch |err| {
|
||||
@@ -46,7 +96,9 @@ test Utf8 {
|
||||
const testing = std.testing;
|
||||
const alloc = testing.allocator;
|
||||
|
||||
const impl: *Utf8 = try .create(alloc, .{});
|
||||
const impl: *Utf8 = try .create(alloc, .{
|
||||
.seed = 1,
|
||||
});
|
||||
defer impl.destroy(alloc);
|
||||
|
||||
var prng = std.Random.DefaultPrng.init(1);
|
||||
@@ -55,4 +107,5 @@ test Utf8 {
|
||||
var buf: [1024]u8 = undefined;
|
||||
var writer: std.Io.Writer = .fixed(&buf);
|
||||
try impl.run(&writer, rand);
|
||||
try testing.expectEqual(@as(usize, 1024), writer.buffered().len);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user