drop the new LUT type as no performance advantage detected

This commit is contained in:
Jeffrey C. Ollie
2025-09-04 23:15:29 -05:00
parent a7da96faee
commit e024b77ad5
7 changed files with 22 additions and 339 deletions

View File

@@ -10,8 +10,7 @@ const Allocator = std.mem.Allocator;
const Benchmark = @import("Benchmark.zig");
const options = @import("options.zig");
const UTF8Decoder = @import("../terminal/UTF8Decoder.zig");
const symbols1 = @import("../unicode/symbols1.zig");
const symbols2 = @import("../unicode/symbols2.zig");
const symbols = @import("../unicode/symbols.zig");
const log = std.log.scoped(.@"is-symbol-bench");
@@ -37,8 +36,7 @@ pub const Mode = enum {
ziglyph,
/// Ghostty's table-based approach.
table1,
table2,
table,
};
/// Create a new terminal stream handler for the given arguments.
@@ -60,8 +58,7 @@ pub fn benchmark(self: *IsSymbol) Benchmark {
return .init(self, .{
.stepFn = switch (self.opts.mode) {
.ziglyph => stepZiglyph,
.table1 => stepTable1,
.table2 => stepTable1,
.table => stepTable,
},
.setupFn = setup,
.teardownFn = teardown,
@@ -106,13 +103,13 @@ fn stepZiglyph(ptr: *anyopaque) Benchmark.Error!void {
const cp_, const consumed = d.next(c);
assert(consumed);
if (cp_) |cp| {
std.mem.doNotOptimizeAway(symbols1.isSymbol(cp));
std.mem.doNotOptimizeAway(symbols.isSymbol(cp));
}
}
}
}
fn stepTable1(ptr: *anyopaque) Benchmark.Error!void {
fn stepTable(ptr: *anyopaque) Benchmark.Error!void {
const self: *IsSymbol = @ptrCast(@alignCast(ptr));
const f = self.data_f orelse return;
@@ -130,31 +127,7 @@ fn stepTable1(ptr: *anyopaque) Benchmark.Error!void {
const cp_, const consumed = d.next(c);
assert(consumed);
if (cp_) |cp| {
std.mem.doNotOptimizeAway(symbols1.table.get(cp));
}
}
}
}
fn stepTable2(ptr: *anyopaque) Benchmark.Error!void {
const self: *IsSymbol = @ptrCast(@alignCast(ptr));
const f = self.data_f orelse return;
var r = std.io.bufferedReader(f.reader());
var d: UTF8Decoder = .{};
var buf: [4096]u8 = undefined;
while (true) {
const n = r.read(&buf) catch |err| {
log.warn("error reading data file err={}", .{err});
return error.BenchmarkFailed;
};
if (n == 0) break; // EOF reached
for (buf[0..n]) |c| {
const cp_, const consumed = d.next(c);
assert(consumed);
if (cp_) |cp| {
std.mem.doNotOptimizeAway(symbols2.table.get(cp));
std.mem.doNotOptimizeAway(symbols.table.get(cp));
}
}
}

View File

@@ -5,13 +5,11 @@ const Config = @import("Config.zig");
/// The exe.
props_exe: *std.Build.Step.Compile,
symbols1_exe: *std.Build.Step.Compile,
symbols2_exe: *std.Build.Step.Compile,
symbols_exe: *std.Build.Step.Compile,
/// The output path for the unicode tables
props_output: std.Build.LazyPath,
symbols1_output: std.Build.LazyPath,
symbols2_output: std.Build.LazyPath,
symbols_output: std.Build.LazyPath,
pub fn init(b: *std.Build) !UnicodeTables {
const props_exe = b.addExecutable(.{
@@ -25,21 +23,10 @@ pub fn init(b: *std.Build) !UnicodeTables {
}),
});
const symbols1_exe = b.addExecutable(.{
.name = "symbols1-unigen",
const symbols_exe = b.addExecutable(.{
.name = "symbols-unigen",
.root_module = b.createModule(.{
.root_source_file = b.path("src/unicode/symbols1.zig"),
.target = b.graph.host,
.strip = false,
.omit_frame_pointer = false,
.unwind_tables = .sync,
}),
});
const symbols2_exe = b.addExecutable(.{
.name = "symbols2-unigen",
.root_module = b.createModule(.{
.root_source_file = b.path("src/unicode/symbols2.zig"),
.root_source_file = b.path("src/unicode/symbols.zig"),
.target = b.graph.host,
.strip = false,
.omit_frame_pointer = false,
@@ -50,7 +37,7 @@ pub fn init(b: *std.Build) !UnicodeTables {
if (b.lazyDependency("ziglyph", .{
.target = b.graph.host,
})) |ziglyph_dep| {
inline for (&.{ props_exe, symbols1_exe, symbols2_exe }) |exe| {
inline for (&.{ props_exe, symbols_exe }) |exe| {
exe.root_module.addImport(
"ziglyph",
ziglyph_dep.module("ziglyph"),
@@ -59,16 +46,13 @@ pub fn init(b: *std.Build) !UnicodeTables {
}
const props_run = b.addRunArtifact(props_exe);
const symbols1_run = b.addRunArtifact(symbols1_exe);
const symbols2_run = b.addRunArtifact(symbols2_exe);
const symbols_run = b.addRunArtifact(symbols_exe);
return .{
.props_exe = props_exe,
.symbols1_exe = symbols1_exe,
.symbols2_exe = symbols2_exe,
.symbols_exe = symbols_exe,
.props_output = props_run.captureStdOut(),
.symbols1_output = symbols1_run.captureStdOut(),
.symbols2_output = symbols2_run.captureStdOut(),
.symbols_output = symbols_run.captureStdOut(),
};
}
@@ -78,19 +62,14 @@ pub fn addImport(self: *const UnicodeTables, step: *std.Build.Step.Compile) void
step.root_module.addAnonymousImport("unicode_tables", .{
.root_source_file = self.props_output,
});
self.symbols1_output.addStepDependencies(&step.step);
step.root_module.addAnonymousImport("symbols1_tables", .{
.root_source_file = self.symbols1_output,
});
self.symbols2_output.addStepDependencies(&step.step);
step.root_module.addAnonymousImport("symbols2_tables", .{
.root_source_file = self.symbols2_output,
self.symbols_output.addStepDependencies(&step.step);
step.root_module.addAnonymousImport("symbols_tables", .{
.root_source_file = self.symbols_output,
});
}
/// Install the exe
pub fn install(self: *const UnicodeTables, b: *std.Build) void {
b.installArtifact(self.props_exe);
b.installArtifact(self.symbols1_exe);
b.installArtifact(self.symbols2_exe);
b.installArtifact(self.symbols_exe);
}

View File

@@ -6,7 +6,7 @@ const terminal = @import("../terminal/main.zig");
const renderer = @import("../renderer.zig");
const shaderpkg = renderer.Renderer.API.shaders;
const ArrayListCollection = @import("../datastruct/array_list_collection.zig").ArrayListCollection;
const symbols = @import("../unicode/symbols1.zig").table;
const symbols = @import("../unicode/symbols.zig").table;
/// The possible cell content keys that exist.
pub const Key = enum {

View File

@@ -1,183 +0,0 @@
const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
// This whole file is based on the algorithm described here:
// https://here-be-braces.com/fast-lookup-of-unicode-properties/
const set_size = @typeInfo(usize).int.bits;
// const Set = std.bit_set.ArrayBitSet(usize, set_size);
const Set = std.bit_set.IntegerBitSet(set_size);
const cp_shift = std.math.log2_int(u21, set_size);
const cp_mask = set_size - 1;
/// Creates a type that is able to generate a 2-level lookup table
/// from a Unicode codepoint to a mapping of type bool. The lookup table
/// generally is expected to be codegen'd and then reloaded, although it
/// can in theory be generated at runtime.
///
/// The first level (stage1) is indexed by the high bits of the codepoint
/// (`cp >> cp_shift`) and stores an index into the second level (stage2).
/// Each stage2 entry is a `Set` holding `set_size` bits and is indexed by
/// the low bits of the codepoint (`cp & cp_mask`). Identical sets are
/// stored only once and shared between stage1 entries.
///
/// Context must have one function:
/// - `get(Context, u21) bool`: returns the mapping for a given codepoint
pub fn Generator(
comptime Context: type,
) type {
return struct {
const Self = @This();
/// Mapping of a block (a `Set` of bits) to its index in the stage2
/// array. Used to deduplicate identical blocks while generating.
const SetMap = std.HashMap(
Set,
u16,
struct {
pub fn hash(ctx: @This(), k: Set) u64 {
_ = ctx;
var hasher = std.hash.Wyhash.init(0);
std.hash.autoHashStrat(&hasher, k, .DeepRecursive);
return hasher.final();
}
pub fn eql(ctx: @This(), a: Set, b: Set) bool {
_ = ctx;
return a.eql(b);
}
},
std.hash_map.default_max_load_percentage,
);
/// The user-supplied context whose `get` decides the value of
/// every codepoint.
ctx: Context = undefined,
/// Generate the lookup tables. The arrays in the return value
/// are owned by the caller and must be freed.
///
/// Returns `error.Stage2TooLarge` if the number of distinct blocks
/// does not fit in a u16, and propagates allocation failures.
pub fn generate(self: *const Self, alloc: Allocator) !Tables {
// Smallest and largest codepoint for which the context returned
// true. They start inverted so the first hit initializes both.
var min: u21 = std.math.maxInt(u21);
var max: u21 = std.math.minInt(u21);
// Maps block => stage2 index
var set_map = SetMap.init(alloc);
defer set_map.deinit();
// Our stages
var stage1 = std.ArrayList(u16).init(alloc);
defer stage1.deinit();
var stage2 = std.ArrayList(Set).init(alloc);
defer stage2.deinit();
// The block currently being built up.
var set: Set = .initEmpty();
// ensure that the 1st entry is always all false
try stage2.append(set);
try set_map.putNoClobber(set, 0);
// Walk every value representable in a u21 (inclusive of the max).
for (0..std.math.maxInt(u21) + 1) |cp_| {
const cp: u21 = @intCast(cp_);
const high = cp >> cp_shift;
const low = cp & cp_mask;
if (self.ctx.get(cp)) {
if (cp < min) min = cp;
if (cp > max) max = cp;
set.set(low);
}
// If we still have space and we're not done with codepoints,
// we keep building up the block. Conversely: we finalize this
// block if we've filled it or are out of codepoints.
if (low + 1 < set_size and cp != std.math.maxInt(u21)) continue;
// Look for the stage2 index for this block. If it doesn't exist
// we add it to stage2 and update the mapping.
const gop = try set_map.getOrPut(set);
if (!gop.found_existing) {
gop.value_ptr.* = std.math.cast(
u16,
stage2.items.len,
) orelse return error.Stage2TooLarge;
try stage2.append(set);
}
// Map stage1 => stage2 and reset our block
try stage1.append(gop.value_ptr.*);
set = .initEmpty();
// The entry we just appended must sit at index `high`.
assert(stage1.items.len - 1 == high);
}
// All of our lengths must fit in a u16 for this to work
assert(stage1.items.len <= std.math.maxInt(u16));
assert(stage2.items.len <= std.math.maxInt(u16));
// Hand ownership of both arrays to the caller; if the second
// conversion fails the first slice is released again.
const stage1_owned = try stage1.toOwnedSlice();
errdefer alloc.free(stage1_owned);
const stage2_owned = try stage2.toOwnedSlice();
errdefer alloc.free(stage2_owned);
return .{
.min = min,
.max = max,
.stage1 = stage1_owned,
.stage2 = stage2_owned,
};
}
};
}
/// Holds a 2-level lookup table and can be used to look up the mapping
/// for a given codepoint, encode the table out to Zig, etc.
pub const Tables = struct {
const Self = @This();
/// Smallest codepoint that maps to true.
min: u21,
/// Largest codepoint that maps to true.
max: u21,
/// Indexed by `cp >> cp_shift`; each entry is an index into `stage2`.
stage1: []const u16,
/// Bit sets indexed by `cp & cp_mask`.
stage2: []const Set,
/// Given a codepoint, returns the mapping for that codepoint.
///
/// Codepoints outside of `[min, max]` return false without touching
/// the tables.
pub fn get(self: *const Self, cp: u21) bool {
if (cp < self.min) return false;
if (cp > self.max) return false;
const high = cp >> cp_shift;
const stage2 = self.stage1[high];
// take advantage of the fact that the first entry is always all false
if (stage2 == 0) return false;
const low = cp & cp_mask;
return self.stage2[stage2].isSet(low);
}
/// Writes the lookup table as Zig to the given writer. The
/// written file exports the constants `min`, `max`, `stage1`, `Set`
/// and `stage2`. These can be used to rebuild the lookup table in Zig.
pub fn writeZig(self: *const Self, writer: anytype) !void {
try writer.print(
\\//! This file is auto-generated. Do not edit.
\\const std = @import("std");
\\
\\pub const min: u21 = {};
\\pub const max: u21 = {};
\\
\\pub const stage1: [{}]u16 = .{{
, .{ self.min, self.max, self.stage1.len });
for (self.stage1) |entry| try writer.print("{},", .{entry});
try writer.print(
\\
\\}};
\\
\\pub const Set = std.bit_set.IntegerBitSet({d});
\\pub const stage2: [{d}]Set = .{{
\\
, .{ set_size, self.stage2.len });
// Alternative serialization that writes a `.masks` array instead of a
// single `.mask`; it appears to belong to the commented-out
// ArrayBitSet definition of `Set`. Kept for reference only.
// for (self.stage2) |entry| {
// try writer.print(" .{{\n", .{});
// try writer.print(" .masks = [{d}]{s}{{\n", .{ entry.masks.len, @typeName(Set.MaskInt) });
// for (entry.masks) |mask| {
// try writer.print(" {d},\n", .{mask});
// }
// try writer.print(" }},\n", .{});
// try writer.print(" }},\n", .{});
// }
// Each set is written as its single backing integer mask.
for (self.stage2) |entry| {
try writer.print(" .{{ .mask = {d} }},\n", .{entry.mask});
}
try writer.writeAll("};\n");
}
};

View File

@@ -9,7 +9,6 @@ pub const graphemeBreak = grapheme.graphemeBreak;
pub const GraphemeBreakState = grapheme.BreakState;
test {
_ = @import("symbols1.zig");
_ = @import("symbols2.zig");
_ = @import("symbols.zig");
@import("std").testing.refAllDecls(@This());
}

View File

@@ -8,7 +8,7 @@ const lut = @import("lut.zig");
pub const table = table: {
// This is only available after running main() below as part of the Ghostty
// build.zig, but due to Zig's lazy analysis we can still reference it here.
const generated = @import("symbols1_tables").Tables(bool);
const generated = @import("symbols_tables").Tables(bool);
const Tables = lut.Tables(bool);
break :table Tables{
.stage1 = &generated.stage1,

View File

@@ -1,85 +0,0 @@
const props = @This();
const std = @import("std");
const assert = std.debug.assert;
const ziglyph = @import("ziglyph");
const lut2 = @import("lut2.zig");
/// The lookup tables for Ghostty.
///
/// Assembled from the constants (`min`, `max`, `stage1`, `stage2`) exported
/// by the generated `symbols2_tables` module.
pub const table = table: {
// This is only available after running main() below as part of the Ghostty
// build.zig, but due to Zig's lazy analysis we can still reference it here.
const generated = @import("symbols2_tables");
break :table lut2.Tables{
.min = generated.min,
.max = generated.max,
// The generated stages are arrays; the table refers to them by pointer.
.stage1 = &generated.stage1,
.stage2 = &generated.stage2,
};
};
/// Reports whether the codepoint is a "symbol-like" character. For now
/// that means any private use codepoint, plus every codepoint that
/// belongs to one of these Unicode blocks:
/// - Dingbats
/// - Emoticons
/// - Miscellaneous Symbols
/// - Enclosed Alphanumerics
/// - Enclosed Alphanumeric Supplement
/// - Miscellaneous Symbols and Pictographs
/// - Transport and Map Symbols
///
/// In the future it may be prudent to expand this to encompass more
/// symbol-like characters, and/or exclude some PUA sections.
pub fn isSymbol(cp: u21) bool {
    // Private use areas are checked first, then each block in turn. The
    // first match wins, exactly like a short-circuiting `or` chain.
    if (ziglyph.general_category.isPrivateUse(cp)) return true;

    const blocks = ziglyph.blocks;
    if (blocks.isDingbats(cp)) return true;
    if (blocks.isEmoticons(cp)) return true;
    if (blocks.isMiscellaneousSymbols(cp)) return true;
    if (blocks.isEnclosedAlphanumerics(cp)) return true;
    if (blocks.isEnclosedAlphanumericSupplement(cp)) return true;
    if (blocks.isMiscellaneousSymbolsAndPictographs(cp)) return true;
    return blocks.isTransportAndMapSymbols(cp);
}
/// Runnable binary to generate the lookup tables and output to stdout.
///
/// The table is produced by asking `isSymbol` about every codepoint and is
/// then serialized as Zig source. All memory comes from an arena that is
/// torn down when this function returns.
pub fn main() !void {
    var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_state.deinit();
    const alloc = arena_state.allocator();

    // Stateless context that simply forwards to `isSymbol`.
    const gen: lut2.Generator(
        struct {
            pub fn get(ctx: @This(), cp: u21) bool {
                _ = ctx;
                return isSymbol(cp);
            }
        },
    ) = .{};

    const t = try gen.generate(alloc);
    defer alloc.free(t.stage1);
    defer alloc.free(t.stage2);

    // `writeZig` issues one small print per table entry, so buffer stdout
    // rather than turning each of those into its own write. The explicit
    // flush is required so the tail of the output is not lost.
    var buffered = std.io.bufferedWriter(std.io.getStdOut().writer());
    try t.writeZig(buffered.writer());
    try buffered.flush();

    // Uncomment when manually debugging to see our table sizes.
    // std.log.warn("stage1={} stage2={}", .{
    //     t.stage1.len,
    //     t.stage2.len,
    // });
}
// Exhaustively compares the generated table against `isSymbol`. This walks
// the whole u21 range (including the maximum value, matching what the
// generator covers), so it is not very fast in debug modes.
// IMPORTANT: KEEP THIS ENABLED WHENEVER MAKING CHANGES.
test "unicode symbols2: tables match ziglyph" {
    const testing = std.testing;

    for (0..std.math.maxInt(u21) + 1) |cp_| {
        const cp: u21 = @intCast(cp_);
        const t1 = table.get(cp);
        const zg = isSymbol(cp);
        if (t1 != zg) {
            // Log the offending codepoint before failing so the mismatch
            // can be identified from the test output.
            std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t1, zg });
            try testing.expect(false);
        }
    }
}