From bf1278deff54ec70bc0a8bbf6ff55638b993c0af Mon Sep 17 00:00:00 2001
From: Mitchell Hashimoto <m@mitchellh.com>
Date: Sat, 20 Sep 2025 14:57:37 -0700
Subject: [PATCH] unicode: isolate properties, tables, and ziglyph into
 separate files

This makes it cleaner to add new sources of table generation and also
avoids inadvertently depending on different modules (despite Zig's lazy
analysis).

This also fixes up terminal to only use our lookup tables, which avoids
bringing ziglyph in for the terminal module.
---
 src/terminal/Terminal.zig     |  4 +-
 src/unicode/Properties.zig    | 75 +++++++++++++++++++++++++++
 src/unicode/main.zig          |  7 ++-
 src/unicode/props_table.zig   | 18 +++++++
 src/unicode/props_ziglyph.zig | 96 +++++++++++++++++++++++++++++++++++
 5 files changed, 194 insertions(+), 6 deletions(-)
 create mode 100644 src/unicode/Properties.zig
 create mode 100644 src/unicode/props_table.zig
 create mode 100644 src/unicode/props_ziglyph.zig

diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig
index 162cfde38..0d659eb9a 100644
--- a/src/terminal/Terminal.zig
+++ b/src/terminal/Terminal.zig
@@ -344,7 +344,7 @@ pub fn print(self: *Terminal, c: u21) !void {
             // VS15 makes it narrow.
             if (c == 0xFE0F or c == 0xFE0E) {
                 // This only applies to emoji
-                const prev_props = unicode.getProperties(prev.cell.content.codepoint);
+                const prev_props = unicode.table.get(prev.cell.content.codepoint);
                 const emoji = prev_props.grapheme_boundary_class.isExtendedPictographic();
                 if (!emoji) return;
 
@@ -470,7 +470,7 @@ pub fn print(self: *Terminal, c: u21) !void {
 
         // If this is a emoji variation selector, prev must be an emoji
         if (c == 0xFE0F or c == 0xFE0E) {
-            const prev_props = unicode.getProperties(prev.content.codepoint);
+            const prev_props = unicode.table.get(prev.content.codepoint);
             const emoji = prev_props.grapheme_boundary_class == .extended_pictographic;
             if (!emoji) return;
         }
diff --git a/src/unicode/Properties.zig b/src/unicode/Properties.zig
new file mode 100644
index 000000000..b7840743a
--- /dev/null
+++ b/src/unicode/Properties.zig
@@ -0,0 +1,75 @@
+//! Property set per codepoint that Ghostty cares about.
+//!
+//! Adding to this lets you find new properties but also potentially makes
+//! our lookup tables less efficient. Any changes to this should run the
+//! benchmarks in src/bench to verify that we haven't regressed.
+const Properties = @This();
+
+const std = @import("std");
+
+/// Codepoint width. We clamp to [0, 2] because Ghostty handles control
+/// characters itself and wide characters max out at 2 (e.g. a 3-em dash
+/// becomes a 2-em dash).
+width: u2 = 0,
+
+/// Grapheme boundary class.
+grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
+
+// Field-wise equality. Needed for lut.Generator.
+pub fn eql(a: Properties, b: Properties) bool {
+    if (a.width != b.width) return false;
+    return a.grapheme_boundary_class == b.grapheme_boundary_class;
+}
+
+// Needed for lut.Generator. Renders the value as Zig struct-literal text.
+pub fn format(
+    self: Properties,
+    comptime layout: []const u8,
+    opts: std.fmt.FormatOptions,
+    writer: anytype,
+) !void {
+    _ = layout;
+    _ = opts;
+
+    const class_name = @tagName(self.grapheme_boundary_class);
+    const template =
+        \\.{{
+        \\    .width= {},
+        \\    .grapheme_boundary_class= .{s},
+        \\}}
+    ;
+    try std.fmt.format(writer, template, .{ self.width, class_name });
+}
+
+/// Grapheme boundary classes that Ghostty tracks. This list is not
+/// exhaustive: control, CR, LF, etc. are omitted because the terminal
+/// handles them itself, so they are impossible in Ghostty's usage.
+pub const GraphemeBoundaryClass = enum(u4) {
+    /// The default for `Properties.grapheme_boundary_class`.
+    invalid,
+    L,
+    V,
+    T,
+    LV,
+    LVT,
+    prepend,
+    extend,
+    zwj,
+    spacing_mark,
+    regional_indicator,
+    extended_pictographic,
+    /// \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
+    extended_pictographic_base,
+    /// \p{Emoji_Modifier}
+    emoji_modifier,
+
+    /// Reports whether this class is an extended pictographic class.
+    /// Use this rather than comparing against a single enum value:
+    /// more than one class counts as extended pictographic, namely
+    /// `.extended_pictographic` itself and the modifier-base variant
+    /// `.extended_pictographic_base`.
+    pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
+        return self == .extended_pictographic or
+            self == .extended_pictographic_base;
+    }
+};
diff --git a/src/unicode/main.zig b/src/unicode/main.zig
index 17c86deca..ae50075ff 100644
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@@ -1,14 +1,13 @@
 pub const lut = @import("lut.zig");
 
 const grapheme = @import("grapheme.zig");
-const props = @import("props.zig");
-pub const table = props.table;
-pub const Properties = props.Properties;
-pub const getProperties = props.get;
+pub const table = @import("props_table.zig").table;
+pub const Properties = @import("Properties.zig");
 pub const graphemeBreak = grapheme.graphemeBreak;
 pub const GraphemeBreakState = grapheme.BreakState;
 
 test {
+    _ = @import("props_ziglyph.zig");
     _ = @import("symbols.zig");
     @import("std").testing.refAllDecls(@This());
 }
diff --git a/src/unicode/props_table.zig b/src/unicode/props_table.zig
new file mode 100644
index 000000000..80492346c
--- /dev/null
+++ b/src/unicode/props_table.zig
@@ -0,0 +1,18 @@
+const Properties = @import("Properties.zig");
+const lut = @import("lut.zig");
+
+/// Ghostty's lookup tables, backed by the build-generated stage arrays.
+pub const table = blk: {
+    // `unicode_tables` only becomes available after a generator has run
+    // as part of the Ghostty build.zig process. Thanks to Zig's lazy
+    // analysis we can still reference it here.
+    //
+    // For an example generator, see `main` in `props_ziglyph.zig`.
+    const Lut = lut.Tables(Properties);
+    const stages = @import("unicode_tables").Tables(Properties);
+    break :blk Lut{
+        .stage1 = &stages.stage1,
+        .stage2 = &stages.stage2,
+        .stage3 = &stages.stage3,
+    };
+};
diff --git a/src/unicode/props_ziglyph.zig b/src/unicode/props_ziglyph.zig
new file mode 100644
index 000000000..fd123f3b5
--- /dev/null
+++ b/src/unicode/props_ziglyph.zig
@@ -0,0 +1,96 @@
+const props = @This();
+
+const std = @import("std");
+const assert = std.debug.assert;
+const ziglyph = @import("ziglyph");
+const lut = @import("lut.zig");
+const Properties = @import("Properties.zig");
+const GraphemeBoundaryClass = Properties.GraphemeBoundaryClass;
+
+/// Determines the grapheme boundary class of a codepoint via ziglyph.
+/// This is VERY SLOW; it is only meant for generating lookup tables.
+fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
+    const emoji = ziglyph.emoji;
+    const gb = ziglyph.grapheme_break;
+    // Modifier bases get their own class and must be tested before the
+    // generic pictographic check: every base is also pictographic.
+    if (emoji.isEmojiModifierBase(cp)) {
+        assert(emoji.isExtendedPictographic(cp));
+        return .extended_pictographic_base;
+    }
+
+    if (emoji.isEmojiModifier(cp)) return .emoji_modifier;
+    if (emoji.isExtendedPictographic(cp)) return .extended_pictographic;
+    if (gb.isL(cp)) return .L;
+    if (gb.isV(cp)) return .V;
+    if (gb.isT(cp)) return .T;
+    if (gb.isLv(cp)) return .LV;
+    if (gb.isLvt(cp)) return .LVT;
+    if (gb.isPrepend(cp)) return .prepend;
+    if (gb.isExtend(cp)) return .extend;
+    if (gb.isZwj(cp)) return .zwj;
+    if (gb.isSpacingmark(cp)) return .spacing_mark;
+    if (gb.isRegionalIndicator(cp)) return .regional_indicator;
+
+    // Some class always exists; `.invalid` only means one we don't track.
+    return .invalid;
+}
+
+/// Computes the properties of `cp` straight from ziglyph, clamping the
+/// width to [0, 2]. Slow; intended for generating the lookup tables.
+pub fn get(cp: u21) Properties {
+    const zg_width = ziglyph.display_width.codePointWidth(cp, .half);
+    const width: u2 = @intCast(@min(2, @max(0, zg_width)));
+    return .{ .width = width, .grapheme_boundary_class = graphemeBoundaryClass(cp) };
+}
+
+/// Entry point of a runnable binary: generates the lookup tables and
+/// writes them as Zig source to stdout.
+pub fn main() !void {
+    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+    defer arena.deinit();
+    const alloc = arena.allocator();
+
+    // Adapter that hands lut.Generator our property lookup and equality.
+    const Context = struct {
+        pub fn get(ctx: @This(), cp: u21) !Properties {
+            _ = ctx;
+            return props.get(cp);
+        }
+
+        pub fn eql(ctx: @This(), a: Properties, b: Properties) bool {
+            _ = ctx;
+            return a.eql(b);
+        }
+    };
+    const gen: lut.Generator(Properties, Context) = .{};
+
+    const tables = try gen.generate(alloc);
+    defer alloc.free(tables.stage1);
+    defer alloc.free(tables.stage2);
+    defer alloc.free(tables.stage3);
+    try tables.writeZig(std.io.getStdOut().writer());
+
+    // Uncomment when manually debugging to see our table sizes.
+    // std.log.warn("stage1={} stage2={} stage3={}", .{
+    //     tables.stage1.len,
+    //     tables.stage2.len,
+    //     tables.stage3.len,
+    // });
+}
+
+// This is not very fast in debug modes, so it's commented out by default.
+// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
+// test "unicode props: tables match ziglyph" {
+//     const testing = std.testing;
+//
+//     const min = 0xFF + 1; // start outside ascii
+//     for (min..std.math.maxInt(u21)) |cp| {
+//         const t = @import("props_table.zig").table.get(@intCast(cp));
+//         const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
+//         if (t.width != zg) {
+//             std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
+//             try testing.expect(false);
+//         }
+//     }
+// }