mirror of
https://github.com/ghostty-org/ghostty.git
synced 2026-05-25 06:18:37 +00:00
Extract the tight per-byte parsing loop from TerminalParser.step into a separate noinline function (parseAll). This eliminates a ~20% benchmark regression that appeared after the highway vendor changes despite zero changes to the parser source code. The root cause: the parser benchmark processes 50 MB of input through a byte-at-a-time DFA loop that is highly sensitive to instruction cache-line placement on Apple Silicon. The M-series cores fetch aligned 16-byte blocks; when the loop head lands near the end of a 64-byte cache line (offset 60), only one instruction fits in the first fetch versus four when aligned to offset 48. This causes ~29% more cycles for identical instruction counts. Previously the loop was inlined into the large step() function, so any code change anywhere in the binary (like the highway vendor restructuring) could shift the loop across a cache-line boundary. By making parseAll noinline, the loop gets its own function placement that is stable regardless of surrounding code changes.
115 lines
3.4 KiB
Zig
115 lines
3.4 KiB
Zig
//! This benchmark tests the throughput of the terminal escape code parser.
|
|
const TerminalParser = @This();
|
|
|
|
const std = @import("std");
|
|
const assert = std.debug.assert;
|
|
const Allocator = std.mem.Allocator;
|
|
const terminalpkg = @import("../terminal/main.zig");
|
|
const Benchmark = @import("Benchmark.zig");
|
|
const options = @import("options.zig");
|
|
|
|
const log = std.log.scoped(.@"terminal-stream-bench");
|
|
|
|
opts: Options,
|
|
|
|
/// The file, opened in the setup function.
|
|
data_f: ?std.fs.File = null,
|
|
|
|
pub const Options = struct {
|
|
/// The data to read as a filepath. If this is "-" then
|
|
/// we will read stdin. If this is unset, then we will
|
|
/// do nothing (benchmark is a noop). It'd be more unixy to
|
|
/// use stdin by default but I find that a hanging CLI command
|
|
/// with no interaction is a bit annoying.
|
|
data: ?[]const u8 = null,
|
|
};
|
|
|
|
pub fn create(
|
|
alloc: Allocator,
|
|
opts: Options,
|
|
) !*TerminalParser {
|
|
const ptr = try alloc.create(TerminalParser);
|
|
errdefer alloc.destroy(ptr);
|
|
ptr.* = .{ .opts = opts };
|
|
return ptr;
|
|
}
|
|
|
|
pub fn destroy(self: *TerminalParser, alloc: Allocator) void {
|
|
alloc.destroy(self);
|
|
}
|
|
|
|
pub fn benchmark(self: *TerminalParser) Benchmark {
|
|
return .init(self, .{
|
|
.stepFn = step,
|
|
.setupFn = setup,
|
|
.teardownFn = teardown,
|
|
});
|
|
}
|
|
|
|
fn setup(ptr: *anyopaque) Benchmark.Error!void {
|
|
const self: *TerminalParser = @ptrCast(@alignCast(ptr));
|
|
|
|
// Open our data file to prepare for reading. We can do more
|
|
// validation here eventually.
|
|
assert(self.data_f == null);
|
|
self.data_f = options.dataFile(self.opts.data) catch |err| {
|
|
log.warn("error opening data file err={}", .{err});
|
|
return error.BenchmarkFailed;
|
|
};
|
|
}
|
|
|
|
fn teardown(ptr: *anyopaque) void {
|
|
const self: *TerminalParser = @ptrCast(@alignCast(ptr));
|
|
if (self.data_f) |f| {
|
|
f.close();
|
|
self.data_f = null;
|
|
}
|
|
}
|
|
|
|
fn step(ptr: *anyopaque) Benchmark.Error!void {
|
|
const self: *TerminalParser = @ptrCast(@alignCast(ptr));
|
|
|
|
// Get our buffered reader so we're not predominantly
|
|
// waiting on file IO. It'd be better to move this fully into
|
|
// memory. If we're IO bound though that should show up on
|
|
// the benchmark results and... I know writing this that we
|
|
// aren't currently IO bound.
|
|
const f = self.data_f orelse return;
|
|
var read_buf: [4096]u8 align(std.atomic.cache_line) = undefined;
|
|
var f_reader = f.reader(&read_buf);
|
|
var r = &f_reader.interface;
|
|
|
|
var p: terminalpkg.Parser = .init();
|
|
|
|
var buf: [4096]u8 = undefined;
|
|
while (true) {
|
|
const n = r.readSliceShort(&buf) catch {
|
|
log.warn("error reading data file err={?}", .{f_reader.err});
|
|
return error.BenchmarkFailed;
|
|
};
|
|
if (n == 0) break; // EOF reached
|
|
parseAll(&p, buf[0..n]);
|
|
}
|
|
}
|
|
|
|
/// Separated from `step` so that the tight per-byte loop gets its own
|
|
/// function alignment, insulating it from code-layout changes elsewhere
|
|
/// in the binary that would otherwise shift its cache-line placement.
|
|
noinline fn parseAll(p: *terminalpkg.Parser, data: []const u8) void {
|
|
for (data) |c| {
|
|
const actions = p.next(c);
|
|
_ = actions;
|
|
}
|
|
}
|
|
|
|
test TerminalParser {
|
|
const testing = std.testing;
|
|
const alloc = testing.allocator;
|
|
|
|
const impl: *TerminalParser = try .create(alloc, .{});
|
|
defer impl.destroy(alloc);
|
|
|
|
const bench = impl.benchmark();
|
|
_ = try bench.run(.once);
|
|
}
|