unicode: switch to uucode grapheme break to (mostly) match unicode spec (#9680)

This PR builds on https://github.com/ghostty-org/ghostty/pull/9678 ~so the diff from there is included here (it's not possible to stack PRs unless it's a PR against my own fork)--review that one first!~ This PR updates the `graphemeBreak` calculation to use `uucode`'s `computeGraphemeBreakNoControl`, which has [tests in uucode](215ff09730/src/x/grapheme.zig (L753)) that confirm it passes the `GraphemeBreakTest.txt` (minus some exceptions). Note that the `grapheme_break` (and `grapheme_break_no_control`) property in `uucode` incorporates `emoji_modifier` and `emoji_modifier_base`, diverging from UAX #29 but matching UTS #51. See [this comment in uucode](215ff09730/src/grapheme.zig (L420-L434)) for details. The `grapheme_break_no_control` property and `computeGraphemeBreakNoControl` both assume `control`, `cr`, and `lf` have been filtered out, matching the current grapheme break logic in Ghostty. This PR keeps the `Precompute.data` logic mostly equivalent, since the `uucode` `precomputedGraphemeBreak` lacks benchmarks in the `uucode` repository (it was benchmarked in [the original PR adding `uucode` to Ghostty](https://github.com/ghostty-org/ghostty/pull/8757)). Note, however, that because `grapheme_break` is one bit larger than `grapheme_boundary_class` (and appears twice in the lookup key) and the new `BreakState` is also one bit larger, the lookup key grows by three bits (u10 -> u13), so the precomputed table jumps up by a factor of 8, to 8KB. ## Benchmarks ~I benchmarked the old `main` version versus this PR for `+grapheme-break` and surprisingly this PR is 2% faster (?). Looking at the assembly though, I'm thinking something else might be causing that. Once I get to the bottom of that I'll remove the below TODO and include the benchmark results here.~ When seeing the speedup with `data.txt` and maybe a tiny speedup on English wiki, I was surprised given the 1KB -> 8KB tables. 
Here's what AI said when I asked it to inspect the assembly: https://ampcode.com/threads/T-979b1743-19e7-47c9-8074-9778b4b2a61e, and here's what it said when I asked it to predict the faster version: https://ampcode.com/threads/T-3291dcd3-7a21-4d24-a192-7b3f6e18cd31 It looks like two loads got reordered and that put the load that depended on stage1 -> stage2 -> stage3 second, "hiding memory latency". So that makes the new one faster when looking up the `grapheme_break` property. These gains go away with the Japanese and Arabic benchmarks, which spend more time processing utf8, and may even have more grapheme clusters too. ### with data.txt (200 MB ghostty-gen random utf8) <img width="1822" height="464" alt="CleanShot 2025-11-26 at 08 42 03@2x" src="https://github.com/user-attachments/assets/56d4ee98-21db-4eab-93ab-a0463a653883" /> ### with English wiki dump <img width="2012" height="506" alt="CleanShot 2025-11-26 at 08 43 15@2x" src="https://github.com/user-attachments/assets/230fbfb7-272d-4a2a-93e7-7268962a9814" /> ### with Japanese wiki dump <img width="2008" height="518" alt="CleanShot 2025-11-26 at 08 43 49@2x" src="https://github.com/user-attachments/assets/edb408c8-a604-4a8f-bd5b-80f19e3d65ee" /> ### with Arabic wiki dump <img width="2010" height="512" alt="CleanShot 2025-11-26 at 08 44 25@2x" src="https://github.com/user-attachments/assets/81a29ac8-0586-4e82-8276-d7fa90c31c90" /> TODO: * [x] Take a closer look at the assembly and understand why this PR (8 KB vs 1 KB table) is faster on my machine. * [x] _(**edit**: checking this off because it seems unnecessary)_ If this turns out to actually be unacceptably slower, one possibility is to switch to `uucode`'s `precomputedGraphemeBreak` which uses a 1445 byte table since it uses a dense table (indexed using multiplication instead of bitCast, though, which did show up in the initial benchmarks from https://github.com/ghostty-org/ghostty/pull/8757 a small amount.) 
AI was used in some of the uucode changes in https://github.com/ghostty-org/ghostty/pull/9678 (Amp--primarily for tests), but everything was carefully vetted and much of it done by hand. This PR was made without AI with the exception of consulting AI about whether the "Prepend + ASCII" scenario is common (hopefully it's right about that being uncommon).
2026-05-31 00:55:46 +00:00 · 2026-01-20 09:44:15 -08:00
parent e5afaa47b3 7bddbfed1e
commit 49b2b8d644
8 changed files with 149 additions and 196 deletions
--- a/src/unicode/grapheme.zig
+++ b/src/unicode/grapheme.zig
@@ -1,6 +1,6 @@
 const std = @import("std");
 const table = @import("props_table.zig").table;
-const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
+const uucode = @import("uucode");

 /// Determines if there is a grapheme break between two codepoints. This
 /// must be called sequentially maintaining the state between calls.
@@ -9,11 +9,11 @@ const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
 /// line feeds, and carriage returns are expected to be filtered out before
 /// calling this function. This is because this function is tuned for
 /// Ghostty.
-pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
+pub fn graphemeBreak(cp1: u21, cp2: u21, state: *uucode.grapheme.BreakState) bool {
    const value = Precompute.data[
        (Precompute.Key{
-            .gbc1 = table.get(cp1).grapheme_boundary_class,
-            .gbc2 = table.get(cp2).grapheme_boundary_class,
+            .gb1 = table.get(cp1).grapheme_break,
+            .gb2 = table.get(cp2).grapheme_break,
            .state = state.*,
        }).index()
    ];
@@ -21,133 +21,64 @@ pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
    return value.result;
 }

-/// The state that must be maintained between calls to `graphemeBreak`.
-pub const BreakState = packed struct(u2) {
-    extended_pictographic: bool = false,
-    regional_indicator: bool = false,
-};
-
 /// This is all the structures and data for the precomputed lookup table
-/// for all possible permutations of state and grapheme boundary classes.
-/// Precomputation only requires 2^10 keys of 3 bit values so the whole
-/// table is less than 1KB.
+/// for all possible permutations of state and grapheme break properties.
+/// Precomputation requires 2^13 keys of 4 bit values so the whole table is
+/// 8KB.
 const Precompute = struct {
-    const Key = packed struct(u10) {
-        state: BreakState,
-        gbc1: GraphemeBoundaryClass,
-        gbc2: GraphemeBoundaryClass,
+    const Key = packed struct(u13) {
+        state: uucode.grapheme.BreakState,
+        gb1: uucode.x.types.GraphemeBreakNoControl,
+        gb2: uucode.x.types.GraphemeBreakNoControl,

        fn index(self: Key) usize {
-            return @intCast(@as(u10, @bitCast(self)));
+            return @intCast(@as(u13, @bitCast(self)));
        }
    };

-    const Value = packed struct(u3) {
+    const Value = packed struct(u4) {
        result: bool,
-        state: BreakState,
+        state: uucode.grapheme.BreakState,
    };

    const data = precompute: {
-        var result: [std.math.maxInt(u10)]Value = undefined;
+        var result: [std.math.maxInt(u13) + 1]Value = undefined;

-        @setEvalBranchQuota(3_000);
-        const info = @typeInfo(GraphemeBoundaryClass).@"enum";
-        for (0..std.math.maxInt(u2) + 1) |state_init| {
+        const max_state_int = blk: {
+            var max: usize = 0;
+            for (@typeInfo(uucode.grapheme.BreakState).@"enum".fields) |field| {
+                if (field.value > max) max = field.value;
+            }
+            break :blk max;
+        };
+
+        @setEvalBranchQuota(10_000);
+        const info = @typeInfo(uucode.x.types.GraphemeBreakNoControl).@"enum";
+        for (0..max_state_int + 1) |state_int| {
            for (info.fields) |field1| {
                for (info.fields) |field2| {
-                    var state: BreakState = @bitCast(@as(u2, @intCast(state_init)));
+                    var state: uucode.grapheme.BreakState = @enumFromInt(state_int);
+
                    const key: Key = .{
-                        .gbc1 = @field(GraphemeBoundaryClass, field1.name),
-                        .gbc2 = @field(GraphemeBoundaryClass, field2.name),
+                        .gb1 = @field(uucode.x.types.GraphemeBreakNoControl, field1.name),
+                        .gb2 = @field(uucode.x.types.GraphemeBreakNoControl, field2.name),
                        .state = state,
                    };
-                    const v = graphemeBreakClass(key.gbc1, key.gbc2, &state);
+                    const v = uucode.x.grapheme.computeGraphemeBreakNoControl(
+                        key.gb1,
+                        key.gb2,
+                        &state,
+                    );
                    result[key.index()] = .{ .result = v, .state = state };
                }
            }
        }

+        std.debug.assert(@sizeOf(@TypeOf(result)) == 8192);
        break :precompute result;
    };
 };

-/// This is the algorithm from utf8proc. We only use this offline for
-/// precomputing the lookup table.
-fn graphemeBreakClass(
-    gbc1: GraphemeBoundaryClass,
-    gbc2: GraphemeBoundaryClass,
-    state: *BreakState,
-) bool {
-    // GB11: Emoji Extend* ZWJ x Emoji
-    if (!state.extended_pictographic and gbc1.isExtendedPictographic()) {
-        state.extended_pictographic = true;
-    }
-
-    // These two properties are ignored because they're not relevant to
-    // Ghostty -- they're filtered out before checking grapheme boundaries.
-    // GB3: CR x LF
-    // GB4: Control
-
-    // GB6: Hangul L x (L|V|LV|VT)
-    if (gbc1 == .L) {
-        if (gbc2 == .L or
-            gbc2 == .V or
-            gbc2 == .LV or
-            gbc2 == .LVT) return false;
-    }
-
-    // GB7: Hangul (LV | V) x (V | T)
-    if (gbc1 == .LV or gbc1 == .V) {
-        if (gbc2 == .V or
-            gbc2 == .T) return false;
-    }
-
-    // GB8: Hangul (LVT | T) x T
-    if (gbc1 == .LVT or gbc1 == .T) {
-        if (gbc2 == .T) return false;
-    }
-
-    // GB9b: x (Extend | ZWJ)
-    if (gbc2 == .extend or gbc2 == .zwj) return false;
-
-    // GB9a: x Spacing
-    if (gbc2 == .spacing_mark) return false;
-
-    // GB9b: Prepend x
-    if (gbc1 == .prepend) return false;
-
-    // GB12, GB13: RI x RI
-    if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
-        if (state.regional_indicator) {
-            state.regional_indicator = false;
-            return true;
-        } else {
-            state.regional_indicator = true;
-            return false;
-        }
-    }
-
-    // GB11: Emoji Extend* ZWJ x Emoji
-    if (state.extended_pictographic and
-        gbc1 == .zwj and
-        gbc2.isExtendedPictographic())
-    {
-        state.extended_pictographic = false;
-        return false;
-    }
-
-    // UTS #51. This isn't covered by UAX #29 as far as I can tell (but
-    // I'm probably wrong). This is a special case for emoji modifiers
-    // which only do not break if they're next to a base.
-    //
-    // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
-    if (gbc2 == .emoji_modifier and gbc1 == .extended_pictographic_base) {
-        return false;
-    }
-
-    return true;
-}
-
 /// If you build this file as a binary, we will verify the grapheme break
 /// implementation. This iterates over billions of codepoints so it is
 /// SLOW. It's not meant to be run in CI, but it's useful for debugging.
@@ -156,13 +87,11 @@ fn graphemeBreakClass(
 /// adding a `-Demit-unicode-test` option for `zig build`, but that
 /// hasn't been done here.
 pub fn main() !void {
-    const uucode = @import("uucode");
-
    // Set the min and max to control the test range.
    const min = 0;
    const max = uucode.config.max_code_point + 1;

-    var state: BreakState = .{};
+    var state: uucode.grapheme.BreakState = .default;
    var uu_state: uucode.grapheme.BreakState = .default;
    for (min..max) |cp1| {
        if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
@@ -199,13 +128,53 @@ test "grapheme break: emoji modifier" {

    // Emoji and modifier
    {
-        var state: BreakState = .{};
+        var state: uucode.grapheme.BreakState = .default;
        try testing.expect(!graphemeBreak(0x261D, 0x1F3FF, &state));
    }

    // Non-emoji and emoji modifier
    {
-        var state: BreakState = .{};
+        var state: uucode.grapheme.BreakState = .default;
        try testing.expect(graphemeBreak(0x22, 0x1F3FF, &state));
    }
 }
+
+test "long emoji zwj sequences" {
+    var state: uucode.grapheme.BreakState = .default;
+    // 👩‍👩‍👧‍👦 (family: woman, woman, girl, boy)
+    var it = uucode.utf8.Iterator.init("\u{1F469}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}_");
+    var cp1 = it.next() orelse unreachable;
+    var cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x1F469); // 👩
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x200D);
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x1F469); // 👩
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x200D);
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x1F467); // 👧
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x200D);
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x1F466); // 👦
+    try std.testing.expect(graphemeBreak(cp1, cp2, &state)); // break
+}
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@@ -4,7 +4,6 @@ const grapheme = @import("grapheme.zig");
 pub const table = @import("props_table.zig").table;
 pub const Properties = @import("props.zig").Properties;
 pub const graphemeBreak = grapheme.graphemeBreak;
-pub const GraphemeBreakState = grapheme.BreakState;

 test {
    @import("std").testing.refAllDecls(@This());
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@@ -5,6 +5,7 @@
 //! benchmarks in src/bench to verify that we haven't regressed.

 const std = @import("std");
+const uucode = @import("uucode");

 pub const Properties = packed struct {
    /// Codepoint width. We clamp to [0, 2] since Ghostty handles control
@@ -12,8 +13,8 @@ pub const Properties = packed struct {
    /// becomes a 2-em dash).
    width: u2 = 0,

-    /// Grapheme boundary class.
-    grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
+    /// Grapheme break property.
+    grapheme_break: uucode.x.types.GraphemeBreakNoControl = .other,

    /// Emoji VS compatibility
    emoji_vs_base: bool = false,
@@ -21,7 +22,7 @@ pub const Properties = packed struct {
    // Needed for lut.Generator
    pub fn eql(a: Properties, b: Properties) bool {
        return a.width == b.width and
-            a.grapheme_boundary_class == b.grapheme_boundary_class and
+            a.grapheme_break == b.grapheme_break and
            a.emoji_vs_base == b.emoji_vs_base;
    }

@@ -33,46 +34,13 @@ pub const Properties = packed struct {
        try writer.print(
            \\.{{
            \\    .width= {},
-            \\    .grapheme_boundary_class= .{s},
+            \\    .grapheme_break= .{s},
            \\    .emoji_vs_base= {},
            \\}}
        , .{
            self.width,
-            @tagName(self.grapheme_boundary_class),
+            @tagName(self.grapheme_break),
            self.emoji_vs_base,
        });
    }
 };
-
-/// Possible grapheme boundary classes. This isn't an exhaustive list:
-/// we omit control, CR, LF, etc. because in Ghostty's usage that are
-/// impossible because they're handled by the terminal.
-pub const GraphemeBoundaryClass = enum(u4) {
-    invalid,
-    L,
-    V,
-    T,
-    LV,
-    LVT,
-    prepend,
-    extend,
-    zwj,
-    spacing_mark,
-    regional_indicator,
-    extended_pictographic,
-    extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
-    emoji_modifier, // \p{Emoji_Modifier}
-
-    /// Returns true if this is an extended pictographic type. This
-    /// should be used instead of comparing the enum value directly
-    /// because we classify multiple.
-    pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
-        return switch (self) {
-            .extended_pictographic,
-            .extended_pictographic_base,
-            => true,
-
-            else => false,
-        };
-    }
-};
--- a/src/unicode/props_uucode.zig
+++ b/src/unicode/props_uucode.zig
@@ -4,56 +4,17 @@ const assert = std.debug.assert;
 const uucode = @import("uucode");
 const lut = @import("lut.zig");
 const Properties = @import("props.zig").Properties;
-const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
-
-/// Gets the grapheme boundary class for a codepoint.
-/// The use case for this is only in generating lookup tables.
-fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
-    if (cp > uucode.config.max_code_point) return .invalid;
-
-    return switch (uucode.get(.grapheme_break, cp)) {
-        .extended_pictographic => .extended_pictographic,
-        .l => .L,
-        .v => .V,
-        .t => .T,
-        .lv => .LV,
-        .lvt => .LVT,
-        .prepend => .prepend,
-        .zwj => .zwj,
-        .spacing_mark => .spacing_mark,
-        .regional_indicator => .regional_indicator,
-        .emoji_modifier => .emoji_modifier,
-        .emoji_modifier_base => .extended_pictographic_base,
-
-        .zwnj,
-        .indic_conjunct_break_extend,
-        .indic_conjunct_break_linker,
-        => .extend,
-
-        // This is obviously not INVALID invalid, there is SOME grapheme
-        // boundary class for every codepoint. But we don't care about
-        // anything that doesn't fit into the above categories. Also note
-        // that `indic_conjunct_break_consonant` is `other` in
-        // 'GraphemeBreakProperty.txt' (it's missing).
-        .other,
-        .indic_conjunct_break_consonant,
-        .cr,
-        .lf,
-        .control,
-        => .invalid,
-    };
-}

 pub fn get(cp: u21) Properties {
    if (cp > uucode.config.max_code_point) return .{
        .width = 1,
-        .grapheme_boundary_class = .invalid,
+        .grapheme_break = .other,
        .emoji_vs_base = false,
    };

    return .{
        .width = uucode.get(.width, cp),
-        .grapheme_boundary_class = graphemeBoundaryClass(cp),
+        .grapheme_break = uucode.get(.grapheme_break_no_control, cp),
        .emoji_vs_base = uucode.get(.is_emoji_vs_base, cp),
    };
 }