unicode: switch to uucode grapheme break to match Unicode 16 spec

2026-06-05 19:34:18 +00:00 · 2025-11-23 19:16:16 -05:00
parent 97926ca307
commit 97aff0743e
8 changed files with 152 additions and 198 deletions
--- a/src/unicode/grapheme.zig
+++ b/src/unicode/grapheme.zig
@@ -1,6 +1,6 @@
 const std = @import("std");
 const table = @import("props_table.zig").table;
-const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
+const uucode = @import("uucode");

 /// Determines if there is a grapheme break between two codepoints. This
 /// must be called sequentially maintaining the state between calls.
@@ -9,11 +9,11 @@ const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
 /// line feeds, and carriage returns are expected to be filtered out before
 /// calling this function. This is because this function is tuned for
 /// Ghostty.
-pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
+pub fn graphemeBreak(cp1: u21, cp2: u21, state: *uucode.grapheme.BreakState) bool {
    const value = Precompute.data[
        (Precompute.Key{
-            .gbc1 = table.get(cp1).grapheme_boundary_class,
-            .gbc2 = table.get(cp2).grapheme_boundary_class,
+            .gb1 = table.get(cp1).grapheme_break,
+            .gb2 = table.get(cp2).grapheme_break,
            .state = state.*,
        }).index()
    ];
@@ -21,133 +21,64 @@ pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
    return value.result;
 }

-/// The state that must be maintained between calls to `graphemeBreak`.
-pub const BreakState = packed struct(u2) {
-    extended_pictographic: bool = false,
-    regional_indicator: bool = false,
-};
-
 /// This is all the structures and data for the precomputed lookup table
-/// for all possible permutations of state and grapheme boundary classes.
-/// Precomputation only requires 2^10 keys of 3 bit values so the whole
-/// table is less than 1KB.
+/// for all possible permutations of state and grapheme break properties.
+/// Precomputation requires 2^13 keys of 4-bit values, each stored in its
+/// own byte, so the whole table is 8KB.
 const Precompute = struct {
-    const Key = packed struct(u10) {
-        state: BreakState,
-        gbc1: GraphemeBoundaryClass,
-        gbc2: GraphemeBoundaryClass,
+    const Key = packed struct(u13) {
+        state: uucode.grapheme.BreakState,
+        gb1: uucode.x.types.GraphemeBreakNoControl,
+        gb2: uucode.x.types.GraphemeBreakNoControl,

        fn index(self: Key) usize {
-            return @intCast(@as(u10, @bitCast(self)));
+            return @intCast(@as(u13, @bitCast(self)));
        }
    };

-    const Value = packed struct(u3) {
+    const Value = packed struct(u4) {
        result: bool,
-        state: BreakState,
+        state: uucode.grapheme.BreakState,
    };

    const data = precompute: {
-        var result: [std.math.maxInt(u10)]Value = undefined;
+        var result: [std.math.maxInt(u13) + 1]Value = undefined;

-        @setEvalBranchQuota(3_000);
-        const info = @typeInfo(GraphemeBoundaryClass).@"enum";
-        for (0..std.math.maxInt(u2) + 1) |state_init| {
+        const max_state_int = blk: {
+            var max: usize = 0;
+            for (@typeInfo(uucode.grapheme.BreakState).@"enum".fields) |field| {
+                if (field.value > max) max = field.value;
+            }
+            break :blk max;
+        };
+
+        @setEvalBranchQuota(10_000);
+        const info = @typeInfo(uucode.x.types.GraphemeBreakNoControl).@"enum";
+        for (0..max_state_int + 1) |state_int| {
            for (info.fields) |field1| {
                for (info.fields) |field2| {
-                    var state: BreakState = @bitCast(@as(u2, @intCast(state_init)));
+                    var state: uucode.grapheme.BreakState = @enumFromInt(state_int);
+
                    const key: Key = .{
-                        .gbc1 = @field(GraphemeBoundaryClass, field1.name),
-                        .gbc2 = @field(GraphemeBoundaryClass, field2.name),
+                        .gb1 = @field(uucode.x.types.GraphemeBreakNoControl, field1.name),
+                        .gb2 = @field(uucode.x.types.GraphemeBreakNoControl, field2.name),
                        .state = state,
                    };
-                    const v = graphemeBreakClass(key.gbc1, key.gbc2, &state);
+                    const v = uucode.x.grapheme.computeGraphemeBreakNoControl(
+                        key.gb1,
+                        key.gb2,
+                        &state,
+                    );
                    result[key.index()] = .{ .result = v, .state = state };
                }
            }
        }

+        std.debug.assert(@sizeOf(@TypeOf(result)) == 8192);
        break :precompute result;
    };
 };

-/// This is the algorithm from utf8proc. We only use this offline for
-/// precomputing the lookup table.
-fn graphemeBreakClass(
-    gbc1: GraphemeBoundaryClass,
-    gbc2: GraphemeBoundaryClass,
-    state: *BreakState,
-) bool {
-    // GB11: Emoji Extend* ZWJ x Emoji
-    if (!state.extended_pictographic and gbc1.isExtendedPictographic()) {
-        state.extended_pictographic = true;
-    }
-
-    // These two properties are ignored because they're not relevant to
-    // Ghostty -- they're filtered out before checking grapheme boundaries.
-    // GB3: CR x LF
-    // GB4: Control
-
-    // GB6: Hangul L x (L|V|LV|VT)
-    if (gbc1 == .L) {
-        if (gbc2 == .L or
-            gbc2 == .V or
-            gbc2 == .LV or
-            gbc2 == .LVT) return false;
-    }
-
-    // GB7: Hangul (LV | V) x (V | T)
-    if (gbc1 == .LV or gbc1 == .V) {
-        if (gbc2 == .V or
-            gbc2 == .T) return false;
-    }
-
-    // GB8: Hangul (LVT | T) x T
-    if (gbc1 == .LVT or gbc1 == .T) {
-        if (gbc2 == .T) return false;
-    }
-
-    // GB9b: x (Extend | ZWJ)
-    if (gbc2 == .extend or gbc2 == .zwj) return false;
-
-    // GB9a: x Spacing
-    if (gbc2 == .spacing_mark) return false;
-
-    // GB9b: Prepend x
-    if (gbc1 == .prepend) return false;
-
-    // GB12, GB13: RI x RI
-    if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
-        if (state.regional_indicator) {
-            state.regional_indicator = false;
-            return true;
-        } else {
-            state.regional_indicator = true;
-            return false;
-        }
-    }
-
-    // GB11: Emoji Extend* ZWJ x Emoji
-    if (state.extended_pictographic and
-        gbc1 == .zwj and
-        gbc2.isExtendedPictographic())
-    {
-        state.extended_pictographic = false;
-        return false;
-    }
-
-    // UTS #51. This isn't covered by UAX #29 as far as I can tell (but
-    // I'm probably wrong). This is a special case for emoji modifiers
-    // which only do not break if they're next to a base.
-    //
-    // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
-    if (gbc2 == .emoji_modifier and gbc1 == .extended_pictographic_base) {
-        return false;
-    }
-
-    return true;
-}
-
 /// If you build this file as a binary, we will verify the grapheme break
 /// implementation. This iterates over billions of codepoints so it is
 /// SLOW. It's not meant to be run in CI, but it's useful for debugging.
@@ -156,13 +87,11 @@ fn graphemeBreakClass(
 /// adding a `-Demit-unicode-test` option for `zig build`, but that
 /// hasn't been done here.
 pub fn main() !void {
-    const uucode = @import("uucode");
-
    // Set the min and max to control the test range.
    const min = 0;
    const max = uucode.config.max_code_point + 1;

-    var state: BreakState = .{};
+    var state: uucode.grapheme.BreakState = .default;
    var uu_state: uucode.grapheme.BreakState = .default;
    for (min..max) |cp1| {
        if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
@@ -199,13 +128,53 @@ test "grapheme break: emoji modifier" {

    // Emoji and modifier
    {
-        var state: BreakState = .{};
+        var state: uucode.grapheme.BreakState = .default;
        try testing.expect(!graphemeBreak(0x261D, 0x1F3FF, &state));
    }

    // Non-emoji and emoji modifier
    {
-        var state: BreakState = .{};
+        var state: uucode.grapheme.BreakState = .default;
        try testing.expect(graphemeBreak(0x22, 0x1F3FF, &state));
    }
 }
+
+test "long emoji zwj sequences" {
+    var state: uucode.grapheme.BreakState = .default;
+    // 👩‍👩‍👧‍👦 (family: woman, woman, girl, boy)
+    var it = uucode.utf8.Iterator.init("\u{1F469}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}_");
+    var cp1 = it.next() orelse unreachable;
+    var cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x1F469); // 👩
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x200D);
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x1F469); // 👩
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x200D);
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x1F467); // 👧
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x200D);
+    try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
+
+    cp1 = cp2;
+    cp2 = it.next() orelse unreachable;
+    try std.testing.expect(cp1 == 0x1F466); // 👦
+    try std.testing.expect(graphemeBreak(cp1, cp2, &state)); // break
+}
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@@ -4,7 +4,6 @@ const grapheme = @import("grapheme.zig");
 pub const table = @import("props_table.zig").table;
 pub const Properties = @import("props.zig").Properties;
 pub const graphemeBreak = grapheme.graphemeBreak;
-pub const GraphemeBreakState = grapheme.BreakState;

 test {
    @import("std").testing.refAllDecls(@This());
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@@ -5,6 +5,7 @@
 //! benchmarks in src/bench to verify that we haven't regressed.

 const std = @import("std");
+const uucode = @import("uucode");

 pub const Properties = packed struct {
    /// Codepoint width. We clamp to [0, 2] since Ghostty handles control
@@ -12,8 +13,8 @@ pub const Properties = packed struct {
    /// becomes a 2-em dash).
    width: u2 = 0,

-    /// Grapheme boundary class.
-    grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
+    /// Grapheme break property.
+    grapheme_break: uucode.x.types.GraphemeBreakNoControl = .other,

    /// Emoji VS compatibility
    emoji_vs_text: bool = false,
@@ -22,7 +23,7 @@ pub const Properties = packed struct {
    // Needed for lut.Generator
    pub fn eql(a: Properties, b: Properties) bool {
        return a.width == b.width and
-            a.grapheme_boundary_class == b.grapheme_boundary_class and
+            a.grapheme_break == b.grapheme_break and
            a.emoji_vs_text == b.emoji_vs_text and
            a.emoji_vs_emoji == b.emoji_vs_emoji;
    }
@@ -35,48 +36,15 @@ pub const Properties = packed struct {
        try writer.print(
            \\.{{
            \\    .width= {},
-            \\    .grapheme_boundary_class= .{s},
+            \\    .grapheme_break= .{s},
            \\    .emoji_vs_text= {},
            \\    .emoji_vs_emoji= {},
            \\}}
        , .{
            self.width,
-            @tagName(self.grapheme_boundary_class),
+            @tagName(self.grapheme_break),
            self.emoji_vs_text,
            self.emoji_vs_emoji,
        });
    }
 };
-
-/// Possible grapheme boundary classes. This isn't an exhaustive list:
-/// we omit control, CR, LF, etc. because in Ghostty's usage that are
-/// impossible because they're handled by the terminal.
-pub const GraphemeBoundaryClass = enum(u4) {
-    invalid,
-    L,
-    V,
-    T,
-    LV,
-    LVT,
-    prepend,
-    extend,
-    zwj,
-    spacing_mark,
-    regional_indicator,
-    extended_pictographic,
-    extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
-    emoji_modifier, // \p{Emoji_Modifier}
-
-    /// Returns true if this is an extended pictographic type. This
-    /// should be used instead of comparing the enum value directly
-    /// because we classify multiple.
-    pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
-        return switch (self) {
-            .extended_pictographic,
-            .extended_pictographic_base,
-            => true,
-
-            else => false,
-        };
-    }
-};
--- a/src/unicode/props_uucode.zig
+++ b/src/unicode/props_uucode.zig
@@ -4,57 +4,18 @@ const assert = std.debug.assert;
 const uucode = @import("uucode");
 const lut = @import("lut.zig");
 const Properties = @import("props.zig").Properties;
-const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
-
-/// Gets the grapheme boundary class for a codepoint.
-/// The use case for this is only in generating lookup tables.
-fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
-    if (cp > uucode.config.max_code_point) return .invalid;
-
-    return switch (uucode.get(.grapheme_break, cp)) {
-        .extended_pictographic => .extended_pictographic,
-        .l => .L,
-        .v => .V,
-        .t => .T,
-        .lv => .LV,
-        .lvt => .LVT,
-        .prepend => .prepend,
-        .zwj => .zwj,
-        .spacing_mark => .spacing_mark,
-        .regional_indicator => .regional_indicator,
-        .emoji_modifier => .emoji_modifier,
-        .emoji_modifier_base => .extended_pictographic_base,
-
-        .zwnj,
-        .indic_conjunct_break_extend,
-        .indic_conjunct_break_linker,
-        => .extend,
-
-        // This is obviously not INVALID invalid, there is SOME grapheme
-        // boundary class for every codepoint. But we don't care about
-        // anything that doesn't fit into the above categories. Also note
-        // that `indic_conjunct_break_consonant` is `other` in
-        // 'GraphemeBreakProperty.txt' (it's missing).
-        .other,
-        .indic_conjunct_break_consonant,
-        .cr,
-        .lf,
-        .control,
-        => .invalid,
-    };
-}

 pub fn get(cp: u21) Properties {
    if (cp > uucode.config.max_code_point) return .{
        .width = 1,
-        .grapheme_boundary_class = .invalid,
+        .grapheme_break = .other,
        .emoji_vs_text = false,
        .emoji_vs_emoji = false,
    };

    return .{
        .width = uucode.get(.width, cp),
-        .grapheme_boundary_class = graphemeBoundaryClass(cp),
+        .grapheme_break = uucode.get(.grapheme_break_no_control, cp),
        .emoji_vs_text = uucode.get(.is_emoji_vs_text, cp),
        .emoji_vs_emoji = uucode.get(.is_emoji_vs_emoji, cp),
    };