fix up diff from benchmarks, and add tests against ziglyph

2026-06-10 13:48:20 +00:00 · 2025-09-18 11:46:05 -04:00
parent 285a33fbc0
commit 69594119c3
17 changed files with 227 additions and 161 deletions
--- a/src/unicode/grapheme.zig
+++ b/src/unicode/grapheme.zig
@@ -2,7 +2,6 @@ const std = @import("std");
 const props = @import("props.zig");
 const GraphemeBoundaryClass = props.GraphemeBoundaryClass;
 const table = props.table;
-const isExtendedPictographic = props.isExtendedPictographic;

 /// Determines if there is a grapheme break between two codepoints. This
 /// must be called sequentially maintaining the state between calls.
@@ -81,7 +80,7 @@ fn graphemeBreakClass(
    state: *BreakState,
 ) bool {
    // GB11: Emoji Extend* ZWJ x Emoji
-    if (!state.extended_pictographic and isExtendedPictographic(gbc1)) {
+    if (!state.extended_pictographic and gbc1.isExtendedPictographic()) {
        state.extended_pictographic = true;
    }

@@ -132,7 +131,7 @@ fn graphemeBreakClass(
    // GB11: Emoji Extend* ZWJ x Emoji
    if (state.extended_pictographic and
        gbc1 == .zwj and
-        isExtendedPictographic(gbc2))
+        gbc2.isExtendedPictographic())
    {
        state.extended_pictographic = false;
        return false;
@@ -156,38 +155,36 @@ fn graphemeBreakClass(
 /// TODO: this is hard to build with newer zig build, so
 /// https://github.com/ghostty-org/ghostty/pull/7806 took the approach of
 /// adding a `-Demit-unicode-test` option for `zig build`, but that
-/// hasn't been done here yet.
-/// TODO: this also still uses `ziglyph`, but could be switched to use
-/// `uucode`'s grapheme break once that is implemented.
+/// hasn't been done here.
 pub fn main() !void {
-    const ziglyph = @import("ziglyph");
+    const uucode = @import("uucode");

    // Set the min and max to control the test range.
    const min = 0;
    const max = std.math.maxInt(u21) + 1;

    var state: BreakState = .{};
-    var zg_state: u3 = 0;
+    var uu_state: uucode.grapheme.BreakState = .default;
    for (min..max) |cp1| {
        if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});

        if (cp1 == '\r' or cp1 == '\n' or
-            ziglyph.grapheme_break.isControl(@intCast(cp1))) continue;
+            uucode.get(.grapheme_break, @intCast(cp1)) == .control) continue;

        for (min..max) |cp2| {
            if (cp2 == '\r' or cp2 == '\n' or
-                ziglyph.grapheme_break.isControl(@intCast(cp2))) continue;
+                uucode.get(.grapheme_break, @intCast(cp2)) == .control) continue;

            const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
-            const zg_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state);
-            if (gb != zg_gb) {
-                std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
+            const uu_gb = uucode.grapheme.isBreak(@intCast(cp1), @intCast(cp2), &uu_state);
+            if (gb != uu_gb) {
+                std.log.warn("cp1={x} cp2={x} gb={} state={} uu_gb={} uu_state={}", .{
                    cp1,
                    cp2,
                    gb,
                    state,
-                    zg_gb,
-                    zg_state,
+                    uu_gb,
+                    uu_state,
                });
            }
        }
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@@ -7,7 +7,6 @@ pub const Properties = props.Properties;
 pub const getProperties = props.get;
 pub const graphemeBreak = grapheme.graphemeBreak;
 pub const GraphemeBreakState = grapheme.BreakState;
-pub const isExtendedPictographic = props.isExtendedPictographic;

 test {
    _ = @import("symbols.zig");
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@@ -76,66 +76,66 @@ pub const GraphemeBoundaryClass = enum(u4) {
    extended_pictographic,
    extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
    emoji_modifier, // \p{Emoji_Modifier}
+
+    /// Gets the grapheme boundary class for a codepoint.
+    /// The use case for this is only in generating lookup tables.
+    pub fn init(cp: u21) GraphemeBoundaryClass {
+        if (cp > uucode.config.max_code_point) return .invalid;
+        if (uucode.get(.is_emoji_modifier, cp)) return .emoji_modifier;
+        if (uucode.get(.is_emoji_modifier_base, cp)) return .extended_pictographic_base;
+
+        return switch (uucode.get(.grapheme_break, cp)) {
+            .extended_pictographic => .extended_pictographic,
+            .l => .L,
+            .v => .V,
+            .t => .T,
+            .lv => .LV,
+            .lvt => .LVT,
+            .prepend => .prepend,
+            .zwj => .zwj,
+            .spacing_mark => .spacing_mark,
+            .regional_indicator => .regional_indicator,
+
+            .zwnj,
+            .indic_conjunct_break_extend,
+            .indic_conjunct_break_linker,
+            => .extend,
+
+            // This is obviously not INVALID invalid, there is SOME grapheme
+            // boundary class for every codepoint. But we don't care about
+            // anything that doesn't fit into the above categories.
+            .other,
+            .indic_conjunct_break_consonant,
+            .cr,
+            .lf,
+            .control,
+            => .invalid,
+        };
+    }
+
+    /// Returns true if this is an extended pictographic type. This
+    /// should be used instead of comparing the enum value directly
+    /// because we classify multiple enum values as extended pictographic.
+    pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
+        return switch (self) {
+            .extended_pictographic,
+            .extended_pictographic_base,
+            => true,
+
+            else => false,
+        };
+    }
 };

-/// Gets the grapheme boundary class for a codepoint.
-/// The use case for this is only in generating lookup tables.
-fn computeGraphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
-    if (cp > uucode.config.max_code_point) return .invalid;
-    if (uucode.get(.is_emoji_modifier, cp)) return .emoji_modifier;
-    if (uucode.get(.is_emoji_modifier_base, cp)) return .extended_pictographic_base;
-
-    return switch (uucode.get(.grapheme_break, cp)) {
-        .extended_pictographic => .extended_pictographic,
-        .l => .L,
-        .v => .V,
-        .t => .T,
-        .lv => .LV,
-        .lvt => .LVT,
-        .prepend => .prepend,
-        .zwj => .zwj,
-        .spacing_mark => .spacing_mark,
-        .regional_indicator => .regional_indicator,
-
-        .zwnj,
-        .indic_conjunct_break_extend,
-        .indic_conjunct_break_linker,
-        => .extend,
-
-        // This is obviously not INVALID invalid, there is SOME grapheme
-        // boundary class for every codepoint. But we don't care about
-        // anything that doesn't fit into the above categories.
-        .other,
-        .indic_conjunct_break_consonant,
-        .cr,
-        .lf,
-        .control,
-        => .invalid,
-    };
-}
-
-/// Returns true if this is an extended pictographic type. This
-/// should be used instead of comparing the enum value directly
-/// because we classify multiple.
-pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
-    return switch (self) {
-        .extended_pictographic,
-        .extended_pictographic_base,
-        => true,
-
-        else => false,
-    };
-}
-
 pub fn get(cp: u21) Properties {
    const width = if (cp > uucode.config.max_code_point)
-        0
+        1
    else
        uucode.getX(.width, cp);

    return .{
        .width = width,
-        .grapheme_boundary_class = computeGraphemeBoundaryClass(cp),
+        .grapheme_boundary_class = .init(cp),
    };
 }

@@ -145,13 +145,6 @@ pub fn main() !void {
    defer arena_state.deinit();
    const alloc = arena_state.allocator();

-    var args_iter = try std.process.argsWithAllocator(alloc);
-    defer args_iter.deinit();
-    _ = args_iter.skip(); // Skip program name
-
-    const output_path = args_iter.next() orelse std.debug.panic("No output file arg for props exe!", .{});
-    std.debug.print("Unicode props_table output_path = {s}\n", .{output_path});
-
    const gen: lut.Generator(
        Properties,
        struct {
@@ -171,10 +164,7 @@ pub fn main() !void {
    defer alloc.free(t.stage1);
    defer alloc.free(t.stage2);
    defer alloc.free(t.stage3);
-    var out_file = try std.fs.cwd().createFile(output_path, .{});
-    defer out_file.close();
-    const writer = out_file.writer();
-    try t.writeZig(writer);
+    try t.writeZig(std.io.getStdOut().writer());

    // Uncomment when manually debugging to see our table sizes.
    // std.log.warn("stage1={} stage2={} stage3={}", .{
@@ -186,17 +176,78 @@ pub fn main() !void {

 // This is not very fast in debug modes, so its commented by default.
 // IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
-// test "unicode props: tables match uucode" {
-//     const testing = std.testing;
-//
-//     const min = 0xFF + 1; // start outside ascii
-//     const max = std.math.maxInt(u21) + 1;
-//     for (min..max) |cp| {
-//         const t = table.get(@intCast(cp));
-//         const uu = @min(2, @max(0, uucode.get(.wcwidth, @intCast(cp))));
-//         if (t.width != uu) {
-//             std.log.warn("mismatch cp=U+{x} t={} uucode={}", .{ cp, t, uu });
-//             try testing.expect(false);
-//         }
-//     }
-//}
+test "unicode props: tables match uucode" {
+    if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;
+
+    const testing = std.testing;
+
+    const min = 0xFF + 1; // start outside ascii
+    const max = std.math.maxInt(u21) + 1;
+    for (min..max) |cp| {
+        const t = table.get(@intCast(cp));
+        const uu = if (cp > uucode.config.max_code_point)
+            1
+        else
+            uucode.getX(.width, @intCast(cp));
+        if (t.width != uu) {
+            std.log.warn("mismatch cp=U+{x} t={} uu={}", .{ cp, t.width, uu });
+            try testing.expect(false);
+        }
+    }
+}
+
+test "unicode props: tables match ziglyph" {
+    if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;
+
+    const ziglyph = @import("ziglyph");
+    const testing = std.testing;
+
+    const min = 0xFF + 1; // start outside ascii
+    const max = std.math.maxInt(u21) + 1;
+    for (min..max) |cp| {
+        const t = table.get(@intCast(cp));
+        const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
+        if (t.width != zg) {
+
+            // Known exceptions
+            if (cp == 0x0897) continue; // non-spacing mark (t = 0)
+            if (cp == 0x2065) continue; // unassigned (t = 1)
+            if (cp >= 0x2630 and cp <= 0x2637) continue; // east asian width is wide (t = 2)
+            if (cp >= 0x268A and cp <= 0x268F) continue; // east asian width is wide (t = 2)
+            if (cp >= 0x2FFC and cp <= 0x2FFF) continue; // east asian width is wide (t = 2)
+            if (cp == 0x31E4 or cp == 0x31E5) continue; // east asian width is wide (t = 2)
+            if (cp == 0x31EF) continue; // east asian width is wide (t = 2)
+            if (cp >= 0x4DC0 and cp <= 0x4DFF) continue; // east asian width is wide (t = 2)
+            if (cp >= 0xFFF0 and cp <= 0xFFF8) continue; // unassigned (t = 1)
+            if (cp >= 0xFFF0 and cp <= 0xFFF8) continue; // unassigned (t = 1)
+            if (cp >= 0x10D69 and cp <= 0x10D6D) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp >= 0x10EFC and cp <= 0x10EFF) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp >= 0x113BB and cp <= 0x113C0) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp == 0x113CE) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp == 0x113D0) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp == 0x113D2) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp == 0x113E1) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp == 0x113E2) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp == 0x1171E) continue; // mark spacing combining (t = 1)
+            if (cp == 0x11F5A) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp == 0x1611E) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp == 0x1611F) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp >= 0x16120 and cp <= 0x1612F) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp >= 0xE0000 and cp <= 0xE0FFF) continue; // ziglyph ignores these with 0, but many are unassigned (t = 1)
+            if (cp == 0x18CFF) continue; // east asian width is wide (t = 2)
+            if (cp >= 0x1D300 and cp <= 0x1D376) continue; // east asian width is wide (t = 2)
+            if (cp == 0x1E5EE) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp == 0x1E5EF) continue; // non-spacing mark, despite being east asian width normal (t = 0)
+            if (cp == 0x1FA89) continue; // east asian width is wide (t = 2)
+            if (cp == 0x1FA8F) continue; // east asian width is wide (t = 2)
+            if (cp == 0x1FABE) continue; // east asian width is wide (t = 2)
+            if (cp == 0x1FAC6) continue; // east asian width is wide (t = 2)
+            if (cp == 0x1FADC) continue; // east asian width is wide (t = 2)
+            if (cp == 0x1FADF) continue; // east asian width is wide (t = 2)
+            if (cp == 0x1FAE9) continue; // east asian width is wide (t = 2)
+
+            std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t.width, zg });
+            try testing.expect(false);
+        }
+    }
+}
--- a/src/unicode/symbols.zig
+++ b/src/unicode/symbols.zig
@@ -17,37 +17,12 @@ pub const table = table: {
    };
 };

-/// Returns true of the codepoint is a "symbol-like" character, which
-/// for now we define as anything in a private use area and anything
-/// in several unicode blocks:
-/// - Dingbats
-/// - Emoticons
-/// - Miscellaneous Symbols
-/// - Enclosed Alphanumerics
-/// - Enclosed Alphanumeric Supplement
-/// - Miscellaneous Symbols and Pictographs
-/// - Transport and Map Symbols
-///
-/// In the future it may be prudent to expand this to encompass more
-/// symbol-like characters, and/or exclude some PUA sections.
-pub fn isSymbol(cp: u21) bool {
-    // TODO: probably can remove this method and just call uucode directly
-    return uucode.getX(.is_symbol, cp);
-}
-
 /// Runnable binary to generate the lookup tables and output to stdout.
 pub fn main() !void {
    var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_state.deinit();
    const alloc = arena_state.allocator();

-    var args_iter = try std.process.argsWithAllocator(alloc);
-    defer args_iter.deinit();
-    _ = args_iter.skip(); // Skip program name
-
-    const output_path = args_iter.next() orelse std.debug.panic("No output file arg for symbols exe!", .{});
-    std.debug.print("Unicode symbols_table output_path = {s}\n", .{output_path});
-
    const gen: lut.Generator(
        bool,
        struct {
@@ -56,7 +31,7 @@ pub fn main() !void {
                return if (cp > uucode.config.max_code_point)
                    false
                else
-                    isSymbol(@intCast(cp));
+                    uucode.getX(.is_symbol, @intCast(cp));
            }

            pub fn eql(ctx: @This(), a: bool, b: bool) bool {
@@ -70,10 +45,7 @@ pub fn main() !void {
    defer alloc.free(t.stage1);
    defer alloc.free(t.stage2);
    defer alloc.free(t.stage3);
-    var out_file = try std.fs.cwd().createFile(output_path, .{});
-    defer out_file.close();
-    const writer = out_file.writer();
-    try t.writeZig(writer);
+    try t.writeZig(std.io.getStdOut().writer());

    // Uncomment when manually debugging to see our table sizes.
    // std.log.warn("stage1={} stage2={} stage3={}", .{
@@ -83,8 +55,6 @@ pub fn main() !void {
    // });
 }

-// This is not very fast in debug modes, so its commented by default.
-// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CHANGES.
 test "unicode symbols: tables match uucode" {
    if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;

@@ -95,7 +65,7 @@ test "unicode symbols: tables match uucode" {
        const uu = if (cp > uucode.config.max_code_point)
            false
        else
-            isSymbol(@intCast(cp));
+            uucode.getX(.is_symbol, @intCast(cp));

        if (t != uu) {
            std.log.warn("mismatch cp=U+{x} t={} uu={}", .{ cp, t, uu });
@@ -103,3 +73,28 @@ test "unicode symbols: tables match uucode" {
        }
    }
 }
+
+test "unicode symbols: tables match ziglyph" {
+    if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;
+
+    const ziglyph = @import("ziglyph");
+    const testing = std.testing;
+
+    for (0..std.math.maxInt(u21) + 1) |cp_usize| {
+        const cp: u21 = @intCast(cp_usize);
+        const t = table.get(cp);
+        const zg = ziglyph.general_category.isPrivateUse(cp) or
+            ziglyph.blocks.isDingbats(cp) or
+            ziglyph.blocks.isEmoticons(cp) or
+            ziglyph.blocks.isMiscellaneousSymbols(cp) or
+            ziglyph.blocks.isEnclosedAlphanumerics(cp) or
+            ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or
+            ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or
+            ziglyph.blocks.isTransportAndMapSymbols(cp);
+
+        if (t != zg) {
+            std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
+            try testing.expect(false);
+        }
+    }
+}