From a7da96faeea305d5ff1758a98557a42afe0fed32 Mon Sep 17 00:00:00 2001 From: "Jeffrey C. Ollie" Date: Thu, 4 Sep 2025 23:04:08 -0500 Subject: [PATCH] add two LUT-based implementations of isSymbol --- src/benchmark/IsSymbol.zig | 172 +++++++++++++++++++++++++++++++++ src/benchmark/cli.zig | 2 + src/benchmark/main.zig | 1 + src/build/Config.zig | 7 ++ src/build/SharedDeps.zig | 1 + src/build/UnicodeTables.zig | 73 +++++++++++--- src/renderer/cell.zig | 12 +-- src/unicode/lut.zig | 26 +++++ src/unicode/lut2.zig | 183 ++++++++++++++++++++++++++++++++++++ src/unicode/main.zig | 2 + src/unicode/props.zig | 2 +- src/unicode/symbols1.zig | 93 ++++++++++++++++++ src/unicode/symbols2.zig | 85 +++++++++++++++++ 13 files changed, 634 insertions(+), 25 deletions(-) create mode 100644 src/benchmark/IsSymbol.zig create mode 100644 src/unicode/lut2.zig create mode 100644 src/unicode/symbols1.zig create mode 100644 src/unicode/symbols2.zig diff --git a/src/benchmark/IsSymbol.zig b/src/benchmark/IsSymbol.zig new file mode 100644 index 000000000..46ebb8c66 --- /dev/null +++ b/src/benchmark/IsSymbol.zig @@ -0,0 +1,172 @@ +//! This benchmark tests the throughput of the isSymbol codepoint check. +//! It compares the ziglyph-based implementation with the two generated +//! lookup table (LUT) implementations. +const IsSymbol = @This(); + +const std = @import("std"); +const builtin = @import("builtin"); +const assert = std.debug.assert; +const Allocator = std.mem.Allocator; +const Benchmark = @import("Benchmark.zig"); +const options = @import("options.zig"); +const UTF8Decoder = @import("../terminal/UTF8Decoder.zig"); +const symbols1 = @import("../unicode/symbols1.zig"); +const symbols2 = @import("../unicode/symbols2.zig"); + +const log = std.log.scoped(.@"is-symbol-bench"); + +opts: Options, + +/// The file, opened in the setup function. +data_f: ?std.fs.File = null, + +pub const Options = struct { + /// Which test to run. + mode: Mode = .ziglyph, + + /// The data to read as a filepath. 
If this is "-" then + /// we will read stdin. If this is unset, then we will + /// do nothing (benchmark is a noop). It'd be more unixy to + /// use stdin by default but I find that a hanging CLI command + /// with no interaction is a bit annoying. + data: ?[]const u8 = null, +}; + +pub const Mode = enum { + /// "Naive" ziglyph implementation. + ziglyph, + + /// Ghostty's table-based approaches (symbols1 and symbols2). + table1, + table2, +}; + +/// Create a new isSymbol benchmark for the given options. +pub fn create( + alloc: Allocator, + opts: Options, +) !*IsSymbol { + const ptr = try alloc.create(IsSymbol); + errdefer alloc.destroy(ptr); + ptr.* = .{ .opts = opts }; + return ptr; +} + +pub fn destroy(self: *IsSymbol, alloc: Allocator) void { + alloc.destroy(self); +} + +pub fn benchmark(self: *IsSymbol) Benchmark { + return .init(self, .{ + .stepFn = switch (self.opts.mode) { + .ziglyph => stepZiglyph, + .table1 => stepTable1, + .table2 => stepTable2, + }, + .setupFn = setup, + .teardownFn = teardown, + }); +} + +fn setup(ptr: *anyopaque) Benchmark.Error!void { + const self: *IsSymbol = @ptrCast(@alignCast(ptr)); + + // Open our data file to prepare for reading. We can do more + // validation here eventually. 
+ assert(self.data_f == null); + self.data_f = options.dataFile(self.opts.data) catch |err| { + log.warn("error opening data file err={}", .{err}); + return error.BenchmarkFailed; + }; +} + +fn teardown(ptr: *anyopaque) void { + const self: *IsSymbol = @ptrCast(@alignCast(ptr)); + if (self.data_f) |f| { + f.close(); + self.data_f = null; + } +} + +fn stepZiglyph(ptr: *anyopaque) Benchmark.Error!void { + const self: *IsSymbol = @ptrCast(@alignCast(ptr)); + + const f = self.data_f orelse return; + var r = std.io.bufferedReader(f.reader()); + var d: UTF8Decoder = .{}; + var buf: [4096]u8 = undefined; + while (true) { + const n = r.read(&buf) catch |err| { + log.warn("error reading data file err={}", .{err}); + return error.BenchmarkFailed; + }; + if (n == 0) break; // EOF reached + + for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + if (cp_) |cp| { + std.mem.doNotOptimizeAway(symbols1.isSymbol(cp)); + } + } + } +} + +fn stepTable1(ptr: *anyopaque) Benchmark.Error!void { + const self: *IsSymbol = @ptrCast(@alignCast(ptr)); + + const f = self.data_f orelse return; + var r = std.io.bufferedReader(f.reader()); + var d: UTF8Decoder = .{}; + var buf: [4096]u8 = undefined; + while (true) { + const n = r.read(&buf) catch |err| { + log.warn("error reading data file err={}", .{err}); + return error.BenchmarkFailed; + }; + if (n == 0) break; // EOF reached + + for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + if (cp_) |cp| { + std.mem.doNotOptimizeAway(symbols1.table.get(cp)); + } + } + } +} + +fn stepTable2(ptr: *anyopaque) Benchmark.Error!void { + const self: *IsSymbol = @ptrCast(@alignCast(ptr)); + + const f = self.data_f orelse return; + var r = std.io.bufferedReader(f.reader()); + var d: UTF8Decoder = .{}; + var buf: [4096]u8 = undefined; + while (true) { + const n = r.read(&buf) catch |err| { + log.warn("error reading data file err={}", .{err}); + return error.BenchmarkFailed; + }; + if (n == 0) break; 
// EOF reached + + for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + if (cp_) |cp| { + std.mem.doNotOptimizeAway(symbols2.table.get(cp)); + } + } + } +} + +test IsSymbol { + const testing = std.testing; + const alloc = testing.allocator; + + const impl: *IsSymbol = try .create(alloc, .{}); + defer impl.destroy(alloc); + + const bench = impl.benchmark(); + _ = try bench.run(.once); +} diff --git a/src/benchmark/cli.zig b/src/benchmark/cli.zig index 97bb9c683..3b1c905eb 100644 --- a/src/benchmark/cli.zig +++ b/src/benchmark/cli.zig @@ -10,6 +10,7 @@ pub const Action = enum { @"grapheme-break", @"terminal-parser", @"terminal-stream", + @"is-symbol", /// Returns the struct associated with the action. The struct /// should have a few decls: @@ -25,6 +26,7 @@ pub const Action = enum { .@"codepoint-width" => @import("CodepointWidth.zig"), .@"grapheme-break" => @import("GraphemeBreak.zig"), .@"terminal-parser" => @import("TerminalParser.zig"), + .@"is-symbol" => @import("IsSymbol.zig"), }; } }; diff --git a/src/benchmark/main.zig b/src/benchmark/main.zig index 49bb17289..3a59125fc 100644 --- a/src/benchmark/main.zig +++ b/src/benchmark/main.zig @@ -5,6 +5,7 @@ pub const TerminalStream = @import("TerminalStream.zig"); pub const CodepointWidth = @import("CodepointWidth.zig"); pub const GraphemeBreak = @import("GraphemeBreak.zig"); pub const TerminalParser = @import("TerminalParser.zig"); +pub const IsSymbol = @import("IsSymbol.zig"); test { @import("std").testing.refAllDecls(@This()); diff --git a/src/build/Config.zig b/src/build/Config.zig index fd892f16c..b11e8850d 100644 --- a/src/build/Config.zig +++ b/src/build/Config.zig @@ -61,6 +61,7 @@ emit_termcap: bool = false, emit_test_exe: bool = false, emit_xcframework: bool = false, emit_webdata: bool = false, +emit_unicode_table_gen: bool = false, /// Environmental properties env: std.process.EnvMap, @@ -299,6 +300,12 @@ pub fn init(b: *std.Build) !Config { "Build and install test 
executables with 'build'", ) orelse false; + config.emit_unicode_table_gen = b.option( + bool, + "emit-unicode-table-gen", + "Build and install executables that generate unicode tables with 'build'", + ) orelse false; + config.emit_bench = b.option( bool, "emit-bench", diff --git a/src/build/SharedDeps.zig b/src/build/SharedDeps.zig index 86390a496..af826d964 100644 --- a/src/build/SharedDeps.zig +++ b/src/build/SharedDeps.zig @@ -31,6 +31,7 @@ pub fn init(b: *std.Build, cfg: *const Config) !SharedDeps { .metallib = undefined, }; try result.initTarget(b, cfg.target); + if (cfg.emit_unicode_table_gen) result.unicode_tables.install(b); return result; } diff --git a/src/build/UnicodeTables.zig b/src/build/UnicodeTables.zig index 5bba2341b..dd9a6bdf2 100644 --- a/src/build/UnicodeTables.zig +++ b/src/build/UnicodeTables.zig @@ -4,14 +4,18 @@ const std = @import("std"); const Config = @import("Config.zig"); /// The exe. -exe: *std.Build.Step.Compile, +props_exe: *std.Build.Step.Compile, +symbols1_exe: *std.Build.Step.Compile, +symbols2_exe: *std.Build.Step.Compile, /// The output path for the unicode tables -output: std.Build.LazyPath, +props_output: std.Build.LazyPath, +symbols1_output: std.Build.LazyPath, +symbols2_output: std.Build.LazyPath, pub fn init(b: *std.Build) !UnicodeTables { - const exe = b.addExecutable(.{ - .name = "unigen", + const props_exe = b.addExecutable(.{ + .name = "props-unigen", .root_module = b.createModule(.{ .root_source_file = b.path("src/unicode/props.zig"), .target = b.graph.host, @@ -21,31 +25,72 @@ pub fn init(b: *std.Build) !UnicodeTables { }), }); + const symbols1_exe = b.addExecutable(.{ + .name = "symbols1-unigen", + .root_module = b.createModule(.{ + .root_source_file = b.path("src/unicode/symbols1.zig"), + .target = b.graph.host, + .strip = false, + .omit_frame_pointer = false, + .unwind_tables = .sync, + }), + }); + + const symbols2_exe = b.addExecutable(.{ + .name = "symbols2-unigen", + .root_module = b.createModule(.{ + 
.root_source_file = b.path("src/unicode/symbols2.zig"), + .target = b.graph.host, + .strip = false, + .omit_frame_pointer = false, + .unwind_tables = .sync, + }), + }); + if (b.lazyDependency("ziglyph", .{ .target = b.graph.host, })) |ziglyph_dep| { - exe.root_module.addImport( - "ziglyph", - ziglyph_dep.module("ziglyph"), - ); + inline for (&.{ props_exe, symbols1_exe, symbols2_exe }) |exe| { + exe.root_module.addImport( + "ziglyph", + ziglyph_dep.module("ziglyph"), + ); + } } - const run = b.addRunArtifact(exe); + const props_run = b.addRunArtifact(props_exe); + const symbols1_run = b.addRunArtifact(symbols1_exe); + const symbols2_run = b.addRunArtifact(symbols2_exe); + return .{ - .exe = exe, - .output = run.captureStdOut(), + .props_exe = props_exe, + .symbols1_exe = symbols1_exe, + .symbols2_exe = symbols2_exe, + .props_output = props_run.captureStdOut(), + .symbols1_output = symbols1_run.captureStdOut(), + .symbols2_output = symbols2_run.captureStdOut(), }; } /// Add the "unicode_tables" import. 
pub fn addImport(self: *const UnicodeTables, step: *std.Build.Step.Compile) void { - self.output.addStepDependencies(&step.step); + self.props_output.addStepDependencies(&step.step); step.root_module.addAnonymousImport("unicode_tables", .{ - .root_source_file = self.output, + .root_source_file = self.props_output, + }); + self.symbols1_output.addStepDependencies(&step.step); + step.root_module.addAnonymousImport("symbols1_tables", .{ + .root_source_file = self.symbols1_output, + }); + self.symbols2_output.addStepDependencies(&step.step); + step.root_module.addAnonymousImport("symbols2_tables", .{ + .root_source_file = self.symbols2_output, }); } /// Install the exe pub fn install(self: *const UnicodeTables, b: *std.Build) void { - b.installArtifact(self.exe); + b.installArtifact(self.props_exe); + b.installArtifact(self.symbols1_exe); + b.installArtifact(self.symbols2_exe); } diff --git a/src/renderer/cell.zig b/src/renderer/cell.zig index ec13b8953..a75fddf52 100644 --- a/src/renderer/cell.zig +++ b/src/renderer/cell.zig @@ -1,12 +1,12 @@ const std = @import("std"); const Allocator = std.mem.Allocator; const assert = std.debug.assert; -const ziglyph = @import("ziglyph"); const font = @import("../font/main.zig"); const terminal = @import("../terminal/main.zig"); const renderer = @import("../renderer.zig"); const shaderpkg = renderer.Renderer.API.shaders; const ArrayListCollection = @import("../datastruct/array_list_collection.zig").ArrayListCollection; +const symbols = @import("../unicode/symbols1.zig").table; /// The possible cell content keys that exist. pub const Key = enum { @@ -249,15 +249,7 @@ pub fn isCovering(cp: u21) bool { /// In the future it may be prudent to expand this to encompass more /// symbol-like characters, and/or exclude some PUA sections. 
pub fn isSymbol(cp: u21) bool { - // TODO: This should probably become a codegen'd LUT - return ziglyph.general_category.isPrivateUse(cp) or - ziglyph.blocks.isDingbats(cp) or - ziglyph.blocks.isEmoticons(cp) or - ziglyph.blocks.isMiscellaneousSymbols(cp) or - ziglyph.blocks.isEnclosedAlphanumerics(cp) or - ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or - ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or - ziglyph.blocks.isTransportAndMapSymbols(cp); + return symbols.get(cp); } /// Returns the appropriate `constraint_width` for diff --git a/src/unicode/lut.zig b/src/unicode/lut.zig index 95c6a3688..e709bf1fe 100644 --- a/src/unicode/lut.zig +++ b/src/unicode/lut.zig @@ -142,6 +142,32 @@ pub fn Tables(comptime Elem: type) type { return self.stage3[self.stage2[self.stage1[high] + low]]; } + pub inline fn getInline(self: *const Self, cp: u21) Elem { + const high = cp >> 8; + const low = cp & 0xFF; + return self.stage3[self.stage2[self.stage1[high] + low]]; + } + + pub fn getBool(self: *const Self, cp: u21) bool { + assert(Elem == bool); + assert(self.stage3.len == 2); + assert(self.stage3[0] == false); + assert(self.stage3[1] == true); + const high = cp >> 8; + const low = cp & 0xFF; + return self.stage2[self.stage1[high] + low] != 0; + } + + pub inline fn getBoolInline(self: *const Self, cp: u21) bool { + assert(Elem == bool); + assert(self.stage3.len == 2); + assert(self.stage3[0] == false); + assert(self.stage3[1] == true); + const high = cp >> 8; + const low = cp & 0xFF; + return self.stage2[self.stage1[high] + low] != 0; + } + /// Writes the lookup table as Zig to the given writer. The /// written file exports three constants: stage1, stage2, and /// stage3. These can be used to rebuild the lookup table in Zig. 
diff --git a/src/unicode/lut2.zig b/src/unicode/lut2.zig new file mode 100644 index 000000000..ef5c886a2 --- /dev/null +++ b/src/unicode/lut2.zig @@ -0,0 +1,183 @@ +const std = @import("std"); +const assert = std.debug.assert; +const Allocator = std.mem.Allocator; + +// This whole file is based on the algorithm described here: +// https://here-be-braces.com/fast-lookup-of-unicode-properties/ + +const set_size: u16 = 64; +// Fixed at 64 (not usize bits) so host-generated tables match any target. +const Set = std.bit_set.IntegerBitSet(set_size); +const cp_shift = std.math.log2_int(u21, set_size); +const cp_mask = set_size - 1; + +/// Creates a type that is able to generate a 2-level lookup table +/// from a Unicode codepoint to a mapping of type bool. The lookup table +/// generally is expected to be codegen'd and then reloaded, although it +/// can in theory be generated at runtime. +/// +/// Context must have one function: +/// - `get(Context, u21) bool`: returns the mapping for a given codepoint +/// +pub fn Generator( + comptime Context: type, +) type { + return struct { + const Self = @This(); + + /// Mapping of a block to its index in the stage2 array. + const SetMap = std.HashMap( + Set, + u16, + struct { + pub fn hash(ctx: @This(), k: Set) u64 { + _ = ctx; + var hasher = std.hash.Wyhash.init(0); + std.hash.autoHashStrat(&hasher, k, .DeepRecursive); + return hasher.final(); + } + + pub fn eql(ctx: @This(), a: Set, b: Set) bool { + _ = ctx; + return a.eql(b); + } + }, + std.hash_map.default_max_load_percentage, + ); + + ctx: Context = undefined, + + /// Generate the lookup tables. The arrays in the return value + /// are owned by the caller and must be freed. 
+ pub fn generate(self: *const Self, alloc: Allocator) !Tables { + var min: u21 = std.math.maxInt(u21); + var max: u21 = std.math.minInt(u21); + + // Maps block => stage2 index + var set_map = SetMap.init(alloc); + defer set_map.deinit(); + + // Our stages + var stage1 = std.ArrayList(u16).init(alloc); + defer stage1.deinit(); + var stage2 = std.ArrayList(Set).init(alloc); + defer stage2.deinit(); + + var set: Set = .initEmpty(); + + // ensure that the 1st entry is always all false + try stage2.append(set); + try set_map.putNoClobber(set, 0); + + for (0..std.math.maxInt(u21) + 1) |cp_| { + const cp: u21 = @intCast(cp_); + const high = cp >> cp_shift; + const low = cp & cp_mask; + + if (self.ctx.get(cp)) { + if (cp < min) min = cp; + if (cp > max) max = cp; + set.set(low); + } + + // If we still have space and we're not done with codepoints, + // we keep building up the block. Conversely: we finalize this + // block if we've filled it or are out of codepoints. + if (low + 1 < set_size and cp != std.math.maxInt(u21)) continue; + + // Look for the stage2 index for this block. If it doesn't exist + // we add it to stage2 and update the mapping. 
+ const gop = try set_map.getOrPut(set); + if (!gop.found_existing) { + gop.value_ptr.* = std.math.cast( + u16, + stage2.items.len, + ) orelse return error.Stage2TooLarge; + try stage2.append(set); + } + + // Map stage1 => stage2 and reset our block + try stage1.append(gop.value_ptr.*); + set = .initEmpty(); + assert(stage1.items.len - 1 == high); + } + + // All of our lengths must fit in a u16 for this to work + assert(stage1.items.len <= std.math.maxInt(u16)); + assert(stage2.items.len <= std.math.maxInt(u16)); + + const stage1_owned = try stage1.toOwnedSlice(); + errdefer alloc.free(stage1_owned); + const stage2_owned = try stage2.toOwnedSlice(); + errdefer alloc.free(stage2_owned); + + return .{ + .min = min, + .max = max, + .stage1 = stage1_owned, + .stage2 = stage2_owned, + }; + } + }; +} + +/// A 2-level lookup table (stage1 indexes into stage2) that can be used to +/// look up a mapping for a given codepoint, encode it out to Zig, etc. +pub const Tables = struct { + const Self = @This(); + + min: u21, + max: u21, + stage1: []const u16, + stage2: []const Set, + + /// Given a codepoint, returns the mapping for that codepoint. + pub fn get(self: *const Self, cp: u21) bool { + if (cp < self.min) return false; + if (cp > self.max) return false; + const high = cp >> cp_shift; + const stage2 = self.stage1[high]; + // take advantage of the fact that the first entry is always all false + if (stage2 == 0) return false; + const low = cp & cp_mask; + return self.stage2[stage2].isSet(low); + } + + /// Writes the lookup table as Zig to the given writer. The + /// written file exports five constants: min, max, stage1, Set, + /// and stage2. These can be used to rebuild the lookup table in Zig. + pub fn writeZig(self: *const Self, writer: anytype) !void { + try writer.print( + \\//! This file is auto-generated. Do not edit. 
+ \\const std = @import("std"); + \\ + \\pub const min: u21 = {}; + \\pub const max: u21 = {}; + \\ + \\pub const stage1: [{}]u16 = .{{ + , .{ self.min, self.max, self.stage1.len }); + for (self.stage1) |entry| try writer.print("{},", .{entry}); + + try writer.print( + \\ + \\}}; + \\ + \\pub const Set = std.bit_set.IntegerBitSet({d}); + \\pub const stage2: [{d}]Set = .{{ + \\ + , .{ set_size, self.stage2.len }); + // for (self.stage2) |entry| { + // try writer.print(" .{{\n", .{}); + // try writer.print(" .masks = [{d}]{s}{{\n", .{ entry.masks.len, @typeName(Set.MaskInt) }); + // for (entry.masks) |mask| { + // try writer.print(" {d},\n", .{mask}); + // } + // try writer.print(" }},\n", .{}); + // try writer.print(" }},\n", .{}); + // } + for (self.stage2) |entry| { + try writer.print(" .{{ .mask = {d} }},\n", .{entry.mask}); + } + try writer.writeAll("};\n"); + } +}; diff --git a/src/unicode/main.zig b/src/unicode/main.zig index f5b911948..91dfd482c 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -9,5 +9,7 @@ pub const graphemeBreak = grapheme.graphemeBreak; pub const GraphemeBreakState = grapheme.BreakState; test { + _ = @import("symbols1.zig"); + _ = @import("symbols2.zig"); @import("std").testing.refAllDecls(@This()); } diff --git a/src/unicode/props.zig b/src/unicode/props.zig index 99c57aa0a..7edb3761c 100644 --- a/src/unicode/props.zig +++ b/src/unicode/props.zig @@ -166,7 +166,7 @@ pub fn main() !void { // This is not very fast in debug modes, so its commented by default. // IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES. 
-// test "tables match ziglyph" { +// test "unicode props: tables match ziglyph" { // const testing = std.testing; // // const min = 0xFF + 1; // start outside ascii diff --git a/src/unicode/symbols1.zig b/src/unicode/symbols1.zig new file mode 100644 index 000000000..e5b8cc22a --- /dev/null +++ b/src/unicode/symbols1.zig @@ -0,0 +1,93 @@ +const props = @This(); +const std = @import("std"); +const assert = std.debug.assert; +const ziglyph = @import("ziglyph"); +const lut = @import("lut.zig"); + +/// The lookup tables for Ghostty. +pub const table = table: { + // This is only available after running main() below as part of the Ghostty + // build.zig, but due to Zig's lazy analysis we can still reference it here. + const generated = @import("symbols1_tables").Tables(bool); + const Tables = lut.Tables(bool); + break :table Tables{ + .stage1 = &generated.stage1, + .stage2 = &generated.stage2, + .stage3 = &generated.stage3, + }; +}; + +/// Returns true if the codepoint is a "symbol-like" character, which +/// for now we define as anything in a private use area and anything +/// in several unicode blocks: +/// - Dingbats +/// - Emoticons +/// - Miscellaneous Symbols +/// - Enclosed Alphanumerics +/// - Enclosed Alphanumeric Supplement +/// - Miscellaneous Symbols and Pictographs +/// - Transport and Map Symbols +/// +/// In the future it may be prudent to expand this to encompass more +/// symbol-like characters, and/or exclude some PUA sections. +pub fn isSymbol(cp: u21) bool { + return ziglyph.general_category.isPrivateUse(cp) or + ziglyph.blocks.isDingbats(cp) or + ziglyph.blocks.isEmoticons(cp) or + ziglyph.blocks.isMiscellaneousSymbols(cp) or + ziglyph.blocks.isEnclosedAlphanumerics(cp) or + ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or + ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or + ziglyph.blocks.isTransportAndMapSymbols(cp); +} + +/// Runnable binary to generate the lookup tables and output to stdout. 
+pub fn main() !void { + var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator); + defer arena_state.deinit(); + const alloc = arena_state.allocator(); + + const gen: lut.Generator( + bool, + struct { + pub fn get(ctx: @This(), cp: u21) !bool { + _ = ctx; + return isSymbol(cp); + } + + pub fn eql(ctx: @This(), a: bool, b: bool) bool { + _ = ctx; + return a == b; + } + }, + ) = .{}; + + const t = try gen.generate(alloc); + defer alloc.free(t.stage1); + defer alloc.free(t.stage2); + defer alloc.free(t.stage3); + try t.writeZig(std.io.getStdOut().writer()); + + // Uncomment when manually debugging to see our table sizes. + // std.log.warn("stage1={} stage2={} stage3={}", .{ + // t.stage1.len, + // t.stage2.len, + // t.stage3.len, + // }); +} + +// This is not very fast in debug modes because it checks every u21 value. +// IMPORTANT: RUN THIS WHENEVER MAKING CHANGES. +test "unicode symbols1: tables match ziglyph" { + const testing = std.testing; + + for (0..std.math.maxInt(u21) + 1) |cp| { + const t = table.get(@intCast(cp)); + const zg = isSymbol(@intCast(cp)); + + if (t != zg) { + std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg }); + try testing.expect(false); + } + } +} diff --git a/src/unicode/symbols2.zig b/src/unicode/symbols2.zig new file mode 100644 index 000000000..1d23c51be --- /dev/null +++ b/src/unicode/symbols2.zig @@ -0,0 +1,85 @@ +const props = @This(); +const std = @import("std"); +const assert = std.debug.assert; +const ziglyph = @import("ziglyph"); +const lut2 = @import("lut2.zig"); + +/// The lookup tables for Ghostty. +pub const table = table: { + // This is only available after running main() below as part of the Ghostty + // build.zig, but due to Zig's lazy analysis we can still reference it here. 
+ const generated = @import("symbols2_tables"); + break :table lut2.Tables{ + .min = generated.min, + .max = generated.max, + .stage1 = &generated.stage1, + .stage2 = &generated.stage2, + }; +}; + +/// Returns true if the codepoint is a "symbol-like" character, which +/// for now we define as anything in a private use area and anything +/// in several unicode blocks: +/// - Dingbats +/// - Emoticons +/// - Miscellaneous Symbols +/// - Enclosed Alphanumerics +/// - Enclosed Alphanumeric Supplement +/// - Miscellaneous Symbols and Pictographs +/// - Transport and Map Symbols +/// +/// In the future it may be prudent to expand this to encompass more +/// symbol-like characters, and/or exclude some PUA sections. +pub fn isSymbol(cp: u21) bool { + return ziglyph.general_category.isPrivateUse(cp) or + ziglyph.blocks.isDingbats(cp) or + ziglyph.blocks.isEmoticons(cp) or + ziglyph.blocks.isMiscellaneousSymbols(cp) or + ziglyph.blocks.isEnclosedAlphanumerics(cp) or + ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or + ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or + ziglyph.blocks.isTransportAndMapSymbols(cp); +} + +/// Runnable binary to generate the lookup tables and output to stdout. +pub fn main() !void { + var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator); + defer arena_state.deinit(); + const alloc = arena_state.allocator(); + + const gen: lut2.Generator( + struct { + pub fn get(ctx: @This(), cp: u21) bool { + _ = ctx; + return isSymbol(cp); + } + }, + ) = .{}; + + const t = try gen.generate(alloc); + defer alloc.free(t.stage1); + defer alloc.free(t.stage2); + try t.writeZig(std.io.getStdOut().writer()); + + // Uncomment when manually debugging to see our table sizes. + // std.log.warn("stage1={} stage2={}", .{ + // t.stage1.len, + // t.stage2.len, + // }); +} + +// This is not very fast in debug modes because it scans the u21 range. +// IMPORTANT: RUN THIS WHENEVER MAKING CHANGES. 
+test "unicode symbols2: tables match ziglyph" { + const testing = std.testing; + + for (0..std.math.maxInt(u21) + 1) |cp| { + const t1 = table.get(@intCast(cp)); + const zg = isSymbol(@intCast(cp)); + + if (t1 != zg) { + std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t1, zg }); + try testing.expect(false); + } + } +}