From bf1278deff54ec70bc0a8bbf6ff55638b993c0af Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto
Date: Sat, 20 Sep 2025 14:57:37 -0700
Subject: [PATCH] unicode: isolate properties, tables, and ziglyph into separate files

This makes it cleaner to add new sources of table generation and also
avoids inadvertently depending on different modules (despite Zig's lazy
analysis).

This also fixes up terminal to only use our look up tables which avoids
bringing ziglyph in for the terminal module.
---
 src/terminal/Terminal.zig     |  4 +-
 src/unicode/Properties.zig    | 75 +++++++++++++++++++++++++++
 src/unicode/main.zig          |  7 ++-
 src/unicode/props_table.zig   | 18 +++++++
 src/unicode/props_ziglyph.zig | 96 +++++++++++++++++++++++++++++++++++
 5 files changed, 194 insertions(+), 6 deletions(-)
 create mode 100644 src/unicode/Properties.zig
 create mode 100644 src/unicode/props_table.zig
 create mode 100644 src/unicode/props_ziglyph.zig

diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig
index 162cfde38..0d659eb9a 100644
--- a/src/terminal/Terminal.zig
+++ b/src/terminal/Terminal.zig
@@ -344,7 +344,7 @@ pub fn print(self: *Terminal, c: u21) !void {
             // VS15 makes it narrow.
             if (c == 0xFE0F or c == 0xFE0E) {
                 // This only applies to emoji
-                const prev_props = unicode.getProperties(prev.cell.content.codepoint);
+                const prev_props = unicode.table.get(prev.cell.content.codepoint);
                 const emoji = prev_props.grapheme_boundary_class.isExtendedPictographic();
                 if (!emoji) return;
 
@@ -470,7 +470,7 @@ pub fn print(self: *Terminal, c: u21) !void {
 
         // If this is a emoji variation selector, prev must be an emoji
         if (c == 0xFE0F or c == 0xFE0E) {
-            const prev_props = unicode.getProperties(prev.content.codepoint);
+            const prev_props = unicode.table.get(prev.content.codepoint);
             const emoji = prev_props.grapheme_boundary_class == .extended_pictographic;
             if (!emoji) return;
         }
diff --git a/src/unicode/Properties.zig b/src/unicode/Properties.zig
new file mode 100644
index 000000000..b7840743a
--- /dev/null
+++ b/src/unicode/Properties.zig
@@ -0,0 +1,75 @@
+//! Property set per codepoint that Ghostty cares about.
+//!
+//! Adding to this lets you find new properties but also potentially makes
+//! our lookup tables less efficient. Any changes to this should run the
+//! benchmarks in src/bench to verify that we haven't regressed.
+const Properties = @This();
+
+const std = @import("std");
+
+/// Codepoint width. We clamp to [0, 2] since Ghostty handles control
+/// characters and we max out at 2 for wide characters (i.e. 3-em dash
+/// becomes a 2-em dash).
+width: u2 = 0,
+
+/// Grapheme boundary class.
+grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
+
+// Needed for lut.Generator
+pub fn eql(a: Properties, b: Properties) bool {
+    return a.width == b.width and
+        a.grapheme_boundary_class == b.grapheme_boundary_class;
+}
+
+// Needed for lut.Generator
+pub fn format(
+    self: Properties,
+    comptime layout: []const u8,
+    opts: std.fmt.FormatOptions,
+    writer: anytype,
+) !void {
+    _ = layout;
+    _ = opts;
+    try std.fmt.format(writer,
+        \\.{{
+        \\ .width= {},
+        \\ .grapheme_boundary_class= .{s},
+        \\}}
+    , .{
+        self.width,
+        @tagName(self.grapheme_boundary_class),
+    });
+}
+
+/// Possible grapheme boundary classes. This isn't an exhaustive list:
+/// we omit control, CR, LF, etc. because in Ghostty's usage they are
+/// impossible because they're handled by the terminal.
+pub const GraphemeBoundaryClass = enum(u4) {
+    invalid,
+    L,
+    V,
+    T,
+    LV,
+    LVT,
+    prepend,
+    extend,
+    zwj,
+    spacing_mark,
+    regional_indicator,
+    extended_pictographic,
+    extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
+    emoji_modifier, // \p{Emoji_Modifier}
+
+    /// Returns true if this is an extended pictographic type. This
+    /// should be used instead of comparing the enum value directly
+    /// because we classify multiple values as extended pictographic.
+    pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
+        return switch (self) {
+            .extended_pictographic,
+            .extended_pictographic_base,
+            => true,
+
+            else => false,
+        };
+    }
+};
diff --git a/src/unicode/main.zig b/src/unicode/main.zig
index 17c86deca..ae50075ff 100644
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@@ -1,14 +1,13 @@
 pub const lut = @import("lut.zig");
 
 const grapheme = @import("grapheme.zig");
-const props = @import("props.zig");
-pub const table = props.table;
-pub const Properties = props.Properties;
-pub const getProperties = props.get;
+pub const table = @import("props_table.zig").table;
+pub const Properties = @import("Properties.zig");
 pub const graphemeBreak = grapheme.graphemeBreak;
 pub const GraphemeBreakState = grapheme.BreakState;
 
 test {
+    _ = @import("props_ziglyph.zig");
     _ = @import("symbols.zig");
     @import("std").testing.refAllDecls(@This());
 }
diff --git a/src/unicode/props_table.zig b/src/unicode/props_table.zig
new file mode 100644
index 000000000..80492346c
--- /dev/null
+++ b/src/unicode/props_table.zig
@@ -0,0 +1,18 @@
+const Properties = @import("Properties.zig");
+const lut = @import("lut.zig");
+
+/// The lookup tables for Ghostty.
+pub const table = table: {
+    // This is only available after running a generator as part of the Ghostty
+    // build.zig process, but due to Zig's lazy analysis we can still reference
+    // it here.
+    //
+    // An example process is the `main` in `props_ziglyph.zig`
+    const generated = @import("unicode_tables").Tables(Properties);
+    const Tables = lut.Tables(Properties);
+    break :table Tables{
+        .stage1 = &generated.stage1,
+        .stage2 = &generated.stage2,
+        .stage3 = &generated.stage3,
+    };
+};
diff --git a/src/unicode/props_ziglyph.zig b/src/unicode/props_ziglyph.zig
new file mode 100644
index 000000000..fd123f3b5
--- /dev/null
+++ b/src/unicode/props_ziglyph.zig
@@ -0,0 +1,96 @@
+const props = @This();
+
+const std = @import("std");
+const assert = std.debug.assert;
+const ziglyph = @import("ziglyph");
+const lut = @import("lut.zig");
+const Properties = @import("Properties.zig");
+const GraphemeBoundaryClass = Properties.GraphemeBoundaryClass;
+
+/// Gets the grapheme boundary class for a codepoint. This is VERY
+/// SLOW. The use case for this is only in generating lookup tables.
+fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
+    // We special-case modifier bases because we should not break
+    // if a modifier isn't next to a base.
+    if (ziglyph.emoji.isEmojiModifierBase(cp)) {
+        assert(ziglyph.emoji.isExtendedPictographic(cp));
+        return .extended_pictographic_base;
+    }
+
+    if (ziglyph.emoji.isEmojiModifier(cp)) return .emoji_modifier;
+    if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic;
+    if (ziglyph.grapheme_break.isL(cp)) return .L;
+    if (ziglyph.grapheme_break.isV(cp)) return .V;
+    if (ziglyph.grapheme_break.isT(cp)) return .T;
+    if (ziglyph.grapheme_break.isLv(cp)) return .LV;
+    if (ziglyph.grapheme_break.isLvt(cp)) return .LVT;
+    if (ziglyph.grapheme_break.isPrepend(cp)) return .prepend;
+    if (ziglyph.grapheme_break.isExtend(cp)) return .extend;
+    if (ziglyph.grapheme_break.isZwj(cp)) return .zwj;
+    if (ziglyph.grapheme_break.isSpacingmark(cp)) return .spacing_mark;
+    if (ziglyph.grapheme_break.isRegionalIndicator(cp)) return .regional_indicator;
+
+    // This is obviously not INVALID invalid, there is SOME grapheme
+    // boundary class for every codepoint. But we don't care about
+    // anything that doesn't fit into the above categories.
+    return .invalid;
+}
+
+pub fn get(cp: u21) Properties {
+    const zg_width = ziglyph.display_width.codePointWidth(cp, .half);
+    return .{
+        .width = @intCast(@min(2, @max(0, zg_width))),
+        .grapheme_boundary_class = graphemeBoundaryClass(cp),
+    };
+}
+
+/// Runnable binary to generate the lookup tables and output to stdout.
+pub fn main() !void {
+    var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+    defer arena_state.deinit();
+    const alloc = arena_state.allocator();
+
+    const gen: lut.Generator(
+        Properties,
+        struct {
+            pub fn get(ctx: @This(), cp: u21) !Properties {
+                _ = ctx;
+                return props.get(cp);
+            }
+
+            pub fn eql(ctx: @This(), a: Properties, b: Properties) bool {
+                _ = ctx;
+                return a.eql(b);
+            }
+        },
+    ) = .{};
+
+    const t = try gen.generate(alloc);
+    defer alloc.free(t.stage1);
+    defer alloc.free(t.stage2);
+    defer alloc.free(t.stage3);
+    try t.writeZig(std.io.getStdOut().writer());
+
+    // Uncomment when manually debugging to see our table sizes.
+    // std.log.warn("stage1={} stage2={} stage3={}", .{
+    //     t.stage1.len,
+    //     t.stage2.len,
+    //     t.stage3.len,
+    // });
+}
+
+// This is not very fast in debug modes, so it's commented out by default.
+// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
+// test "unicode props: tables match ziglyph" {
+//     const testing = std.testing;
+//
+//     const min = 0xFF + 1; // start outside ascii
+//     for (min..std.math.maxInt(u21)) |cp| {
+//         const t = @import("props_table.zig").table.get(@intCast(cp));
+//         const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
+//         if (t.width != zg) {
+//             std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
+//             try testing.expect(false);
+//         }
+//     }
+// }