diff --git a/src/benchmark/GraphemeBreak.zig b/src/benchmark/GraphemeBreak.zig index 328d63a75..8278c5c2f 100644 --- a/src/benchmark/GraphemeBreak.zig +++ b/src/benchmark/GraphemeBreak.zig @@ -10,6 +10,7 @@ const Benchmark = @import("Benchmark.zig"); const options = @import("options.zig"); const UTF8Decoder = @import("../terminal/UTF8Decoder.zig"); const unicode = @import("../unicode/main.zig"); +const uucode = @import("uucode"); const log = std.log.scoped(.@"terminal-stream-bench"); @@ -118,7 +119,7 @@ fn stepTable(ptr: *anyopaque) Benchmark.Error!void { var r = &f_reader.interface; var d: UTF8Decoder = .{}; - var state: unicode.GraphemeBreakState = .{}; + var state: uucode.grapheme.BreakState = .default; var cp1: u21 = 0; var buf: [4096]u8 align(std.atomic.cache_line) = undefined; while (true) { diff --git a/src/build/uucode_config.zig b/src/build/uucode_config.zig index 2fadbdb78..594a05366 100644 --- a/src/build/uucode_config.zig +++ b/src/build/uucode_config.zig @@ -4,6 +4,7 @@ const config = @import("config.zig"); const config_x = @import("config.x.zig"); const d = config.default; const wcwidth = config_x.wcwidth; +const grapheme_break_no_control = config_x.grapheme_break_no_control; const Allocator = std.mem.Allocator; @@ -85,10 +86,15 @@ pub const tables = [_]config.Table{ }, .{ .name = "buildtime", - .extensions = &.{ wcwidth, width, is_symbol }, + .extensions = &.{ + wcwidth, + grapheme_break_no_control, + width, + is_symbol, + }, .fields = &.{ width.field("width"), - d.field("grapheme_break"), + grapheme_break_no_control.field("grapheme_break_no_control"), is_symbol.field("is_symbol"), d.field("is_emoji_vs_base"), }, diff --git a/src/font/shaper/web_canvas.zig b/src/font/shaper/web_canvas.zig index c8334ec9d..590c1d2a3 100644 --- a/src/font/shaper/web_canvas.zig +++ b/src/font/shaper/web_canvas.zig @@ -4,6 +4,7 @@ const Allocator = std.mem.Allocator; const font = @import("../main.zig"); const terminal = @import("../../terminal/main.zig"); const unicode = 
@import("../../unicode/main.zig"); +const uucode = @import("uucode"); const log = std.log.scoped(.font_shaper); @@ -111,7 +112,7 @@ pub const Shaper = struct { // font ligatures. However, we do support grapheme clustering. // This means we can render things like skin tone emoji but // we can't render things like single glyph "=>". - var break_state: unicode.GraphemeBreakState = .{}; + var break_state: uucode.grapheme.BreakState = .default; var cp1: u21 = @intCast(codepoints[0]); var start: usize = 0; diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig index 7e02e3a24..45d19fa06 100644 --- a/src/terminal/Terminal.zig +++ b/src/terminal/Terminal.zig @@ -9,6 +9,7 @@ const assert = @import("../quirks.zig").inlineAssert; const testing = std.testing; const Allocator = std.mem.Allocator; const unicode = @import("../unicode/main.zig"); +const uucode = @import("uucode"); const ansi = @import("ansi.zig"); const modespkg = @import("modes.zig"); @@ -361,7 +362,7 @@ pub fn print(self: *Terminal, c: u21) !void { if (prev.cell.codepoint() == 0) break :grapheme; const grapheme_break = brk: { - var state: unicode.GraphemeBreakState = .{}; + var state: uucode.grapheme.BreakState = .default; var cp1: u21 = prev.cell.content.codepoint; if (prev.cell.hasGrapheme()) { const cps = self.screens.active.cursor.page_pin.node.data.lookupGrapheme(prev.cell).?; @@ -512,7 +513,7 @@ pub fn print(self: *Terminal, c: u21) !void { // If this is a emoji variation selector, prev must be an emoji if (c == 0xFE0F or c == 0xFE0E) { const prev_props = unicode.table.get(prev.content.codepoint); - const emoji = prev_props.grapheme_boundary_class == .extended_pictographic; + const emoji = prev_props.grapheme_break == .extended_pictographic; if (!emoji) return; } @@ -3996,6 +3997,53 @@ test "Terminal: overwrite multicodepoint grapheme tail clears grapheme data" { try testing.expectEqual(@as(usize, 0), page.graphemeCount()); } +test "Terminal: print breaks valid grapheme cluster with Prepend + 
ASCII for speed" { + const alloc = testing.allocator; + var t = try init(alloc, .{ .rows = 5, .cols = 5 }); + defer t.deinit(alloc); + t.modes.set(.grapheme_cluster, true); + + // Make sure we're not at cursor.x == 0 for the next char. + try t.print('_'); + + // U+0600 ARABIC NUMBER SIGN (Prepend) + try t.print(0x0600); + try t.print('1'); + + // We should have 3 cells taken up, each narrow. Note that this is + // **incorrect** grapheme break behavior, since a Prepend code point should + // not break with the one following it per UAX #29 GB9b. However, as an + // optimization we assume a grapheme break when c <= 255, and note that + // this deviation only affects these very uncommon scenarios (e.g. the + // Arabic number sign should precede Arabic-script digits). + try testing.expectEqual(@as(usize, 0), t.screens.active.cursor.y); + try testing.expectEqual(@as(usize, 3), t.screens.active.cursor.x); + // This is what we'd expect if we did break correctly: + //try testing.expectEqual(@as(usize, 2), t.screens.active.cursor.x); + + // Assert various properties about our screen to verify + // we have all expected cells. 
+ { + const list_cell = t.screens.active.pages.getCell(.{ .screen = .{ .x = 1, .y = 0 } }).?; + const cell = list_cell.cell; + try testing.expectEqual(@as(u21, 0x0600), cell.content.codepoint); + try testing.expect(!cell.hasGrapheme()); + // This is what we'd expect if we did break correctly: + //try testing.expect(cell.hasGrapheme()); + //try testing.expectEqualSlices(u21, &.{'1'}, list_cell.node.data.lookupGrapheme(cell).?); + try testing.expectEqual(Cell.Wide.narrow, cell.wide); + } + { + const list_cell = t.screens.active.pages.getCell(.{ .screen = .{ .x = 2, .y = 0 } }).?; + const cell = list_cell.cell; + try testing.expectEqual(@as(u21, '1'), cell.content.codepoint); + // This is what we'd expect if we did break correctly: + //try testing.expectEqual(@as(u21, 0), cell.content.codepoint); + try testing.expect(!cell.hasGrapheme()); + try testing.expectEqual(Cell.Wide.narrow, cell.wide); + } +} + test "Terminal: print writes to bottom if scrolled" { var t = try init(testing.allocator, .{ .cols = 5, .rows = 2 }); defer t.deinit(testing.allocator); diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig index 47be43bb0..d233dec72 100644 --- a/src/unicode/grapheme.zig +++ b/src/unicode/grapheme.zig @@ -1,6 +1,6 @@ const std = @import("std"); const table = @import("props_table.zig").table; -const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass; +const uucode = @import("uucode"); /// Determines if there is a grapheme break between two codepoints. This /// must be called sequentially maintaining the state between calls. @@ -9,11 +9,11 @@ const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass; /// line feeds, and carriage returns are expected to be filtered out before /// calling this function. This is because this function is tuned for /// Ghostty. 
-pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool { +pub fn graphemeBreak(cp1: u21, cp2: u21, state: *uucode.grapheme.BreakState) bool { const value = Precompute.data[ (Precompute.Key{ - .gbc1 = table.get(cp1).grapheme_boundary_class, - .gbc2 = table.get(cp2).grapheme_boundary_class, + .gb1 = table.get(cp1).grapheme_break, + .gb2 = table.get(cp2).grapheme_break, .state = state.*, }).index() ]; @@ -21,133 +21,64 @@ pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool { return value.result; } -/// The state that must be maintained between calls to `graphemeBreak`. -pub const BreakState = packed struct(u2) { - extended_pictographic: bool = false, - regional_indicator: bool = false, -}; - /// This is all the structures and data for the precomputed lookup table -/// for all possible permutations of state and grapheme boundary classes. -/// Precomputation only requires 2^10 keys of 3 bit values so the whole -/// table is less than 1KB. +/// for all possible permutations of state and grapheme break properties. +/// Precomputation requires 2^13 keys of 4 bit values so the whole table is +/// 8KB. 
const Precompute = struct { - const Key = packed struct(u10) { - state: BreakState, - gbc1: GraphemeBoundaryClass, - gbc2: GraphemeBoundaryClass, + const Key = packed struct(u13) { + state: uucode.grapheme.BreakState, + gb1: uucode.x.types.GraphemeBreakNoControl, + gb2: uucode.x.types.GraphemeBreakNoControl, fn index(self: Key) usize { - return @intCast(@as(u10, @bitCast(self))); + return @intCast(@as(u13, @bitCast(self))); } }; - const Value = packed struct(u3) { + const Value = packed struct(u4) { result: bool, - state: BreakState, + state: uucode.grapheme.BreakState, }; const data = precompute: { - var result: [std.math.maxInt(u10)]Value = undefined; + var result: [std.math.maxInt(u13) + 1]Value = undefined; - @setEvalBranchQuota(3_000); - const info = @typeInfo(GraphemeBoundaryClass).@"enum"; - for (0..std.math.maxInt(u2) + 1) |state_init| { + const max_state_int = blk: { + var max: usize = 0; + for (@typeInfo(uucode.grapheme.BreakState).@"enum".fields) |field| { + if (field.value > max) max = field.value; + } + break :blk max; + }; + + @setEvalBranchQuota(10_000); + const info = @typeInfo(uucode.x.types.GraphemeBreakNoControl).@"enum"; + for (0..max_state_int + 1) |state_int| { for (info.fields) |field1| { for (info.fields) |field2| { - var state: BreakState = @bitCast(@as(u2, @intCast(state_init))); + var state: uucode.grapheme.BreakState = @enumFromInt(state_int); + const key: Key = .{ - .gbc1 = @field(GraphemeBoundaryClass, field1.name), - .gbc2 = @field(GraphemeBoundaryClass, field2.name), + .gb1 = @field(uucode.x.types.GraphemeBreakNoControl, field1.name), + .gb2 = @field(uucode.x.types.GraphemeBreakNoControl, field2.name), .state = state, }; - const v = graphemeBreakClass(key.gbc1, key.gbc2, &state); + const v = uucode.x.grapheme.computeGraphemeBreakNoControl( + key.gb1, + key.gb2, + &state, + ); result[key.index()] = .{ .result = v, .state = state }; } } } + std.debug.assert(@sizeOf(@TypeOf(result)) == 8192); break :precompute result; }; }; -/// This is 
the algorithm from utf8proc. We only use this offline for -/// precomputing the lookup table. -fn graphemeBreakClass( - gbc1: GraphemeBoundaryClass, - gbc2: GraphemeBoundaryClass, - state: *BreakState, -) bool { - // GB11: Emoji Extend* ZWJ x Emoji - if (!state.extended_pictographic and gbc1.isExtendedPictographic()) { - state.extended_pictographic = true; - } - - // These two properties are ignored because they're not relevant to - // Ghostty -- they're filtered out before checking grapheme boundaries. - // GB3: CR x LF - // GB4: Control - - // GB6: Hangul L x (L|V|LV|VT) - if (gbc1 == .L) { - if (gbc2 == .L or - gbc2 == .V or - gbc2 == .LV or - gbc2 == .LVT) return false; - } - - // GB7: Hangul (LV | V) x (V | T) - if (gbc1 == .LV or gbc1 == .V) { - if (gbc2 == .V or - gbc2 == .T) return false; - } - - // GB8: Hangul (LVT | T) x T - if (gbc1 == .LVT or gbc1 == .T) { - if (gbc2 == .T) return false; - } - - // GB9b: x (Extend | ZWJ) - if (gbc2 == .extend or gbc2 == .zwj) return false; - - // GB9a: x Spacing - if (gbc2 == .spacing_mark) return false; - - // GB9b: Prepend x - if (gbc1 == .prepend) return false; - - // GB12, GB13: RI x RI - if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) { - if (state.regional_indicator) { - state.regional_indicator = false; - return true; - } else { - state.regional_indicator = true; - return false; - } - } - - // GB11: Emoji Extend* ZWJ x Emoji - if (state.extended_pictographic and - gbc1 == .zwj and - gbc2.isExtendedPictographic()) - { - state.extended_pictographic = false; - return false; - } - - // UTS #51. This isn't covered by UAX #29 as far as I can tell (but - // I'm probably wrong). This is a special case for emoji modifiers - // which only do not break if they're next to a base. 
- // - // emoji_modifier_sequence := emoji_modifier_base emoji_modifier - if (gbc2 == .emoji_modifier and gbc1 == .extended_pictographic_base) { - return false; - } - - return true; -} - /// If you build this file as a binary, we will verify the grapheme break /// implementation. This iterates over billions of codepoints so it is /// SLOW. It's not meant to be run in CI, but it's useful for debugging. @@ -156,13 +87,11 @@ fn graphemeBreakClass( /// adding a `-Demit-unicode-test` option for `zig build`, but that /// hasn't been done here. pub fn main() !void { - const uucode = @import("uucode"); - // Set the min and max to control the test range. const min = 0; const max = uucode.config.max_code_point + 1; - var state: BreakState = .{}; + var state: uucode.grapheme.BreakState = .default; var uu_state: uucode.grapheme.BreakState = .default; for (min..max) |cp1| { if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1}); @@ -199,13 +128,53 @@ test "grapheme break: emoji modifier" { // Emoji and modifier { - var state: BreakState = .{}; + var state: uucode.grapheme.BreakState = .default; try testing.expect(!graphemeBreak(0x261D, 0x1F3FF, &state)); } // Non-emoji and emoji modifier { - var state: BreakState = .{}; + var state: uucode.grapheme.BreakState = .default; try testing.expect(graphemeBreak(0x22, 0x1F3FF, &state)); } } + +test "long emoji zwj sequences" { + var state: uucode.grapheme.BreakState = .default; + // 👩‍👩‍👧‍👦 (family: woman, woman, girl, boy) + var it = uucode.utf8.Iterator.init("\u{1F469}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}_"); + var cp1 = it.next() orelse unreachable; + var cp2 = it.next() orelse unreachable; + try std.testing.expect(cp1 == 0x1F469); // 👩 + try std.testing.expect(!graphemeBreak(cp1, cp2, &state)); + + cp1 = cp2; + cp2 = it.next() orelse unreachable; + try std.testing.expect(cp1 == 0x200D); + try std.testing.expect(!graphemeBreak(cp1, cp2, &state)); + + cp1 = cp2; + cp2 = it.next() orelse 
unreachable; + try std.testing.expect(cp1 == 0x1F469); // 👩 + try std.testing.expect(!graphemeBreak(cp1, cp2, &state)); + + cp1 = cp2; + cp2 = it.next() orelse unreachable; + try std.testing.expect(cp1 == 0x200D); + try std.testing.expect(!graphemeBreak(cp1, cp2, &state)); + + cp1 = cp2; + cp2 = it.next() orelse unreachable; + try std.testing.expect(cp1 == 0x1F467); // 👧 + try std.testing.expect(!graphemeBreak(cp1, cp2, &state)); + + cp1 = cp2; + cp2 = it.next() orelse unreachable; + try std.testing.expect(cp1 == 0x200D); + try std.testing.expect(!graphemeBreak(cp1, cp2, &state)); + + cp1 = cp2; + cp2 = it.next() orelse unreachable; + try std.testing.expect(cp1 == 0x1F466); // 👦 + try std.testing.expect(graphemeBreak(cp1, cp2, &state)); // break +} diff --git a/src/unicode/main.zig b/src/unicode/main.zig index 427c65614..11ecbd903 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -4,7 +4,6 @@ const grapheme = @import("grapheme.zig"); pub const table = @import("props_table.zig").table; pub const Properties = @import("props.zig").Properties; pub const graphemeBreak = grapheme.graphemeBreak; -pub const GraphemeBreakState = grapheme.BreakState; test { @import("std").testing.refAllDecls(@This()); diff --git a/src/unicode/props.zig b/src/unicode/props.zig index 492dad34a..a6615e56e 100644 --- a/src/unicode/props.zig +++ b/src/unicode/props.zig @@ -5,6 +5,7 @@ //! benchmarks in src/bench to verify that we haven't regressed. const std = @import("std"); +const uucode = @import("uucode"); pub const Properties = packed struct { /// Codepoint width. We clamp to [0, 2] since Ghostty handles control @@ -12,8 +13,8 @@ pub const Properties = packed struct { /// becomes a 2-em dash). width: u2 = 0, - /// Grapheme boundary class. - grapheme_boundary_class: GraphemeBoundaryClass = .invalid, + /// Grapheme break property. 
+ grapheme_break: uucode.x.types.GraphemeBreakNoControl = .other, /// Emoji VS compatibility emoji_vs_base: bool = false, @@ -21,7 +22,7 @@ pub const Properties = packed struct { // Needed for lut.Generator pub fn eql(a: Properties, b: Properties) bool { return a.width == b.width and - a.grapheme_boundary_class == b.grapheme_boundary_class and + a.grapheme_break == b.grapheme_break and a.emoji_vs_base == b.emoji_vs_base; } @@ -33,46 +34,13 @@ pub const Properties = packed struct { try writer.print( \\.{{ \\ .width= {}, - \\ .grapheme_boundary_class= .{s}, + \\ .grapheme_break= .{s}, \\ .emoji_vs_base= {}, \\}} , .{ self.width, - @tagName(self.grapheme_boundary_class), + @tagName(self.grapheme_break), self.emoji_vs_base, }); } }; - -/// Possible grapheme boundary classes. This isn't an exhaustive list: -/// we omit control, CR, LF, etc. because in Ghostty's usage that are -/// impossible because they're handled by the terminal. -pub const GraphemeBoundaryClass = enum(u4) { - invalid, - L, - V, - T, - LV, - LVT, - prepend, - extend, - zwj, - spacing_mark, - regional_indicator, - extended_pictographic, - extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base} - emoji_modifier, // \p{Emoji_Modifier} - - /// Returns true if this is an extended pictographic type. This - /// should be used instead of comparing the enum value directly - /// because we classify multiple. 
- pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool { - return switch (self) { - .extended_pictographic, - .extended_pictographic_base, - => true, - - else => false, - }; - } -}; diff --git a/src/unicode/props_uucode.zig b/src/unicode/props_uucode.zig index bee942422..d876bf4ac 100644 --- a/src/unicode/props_uucode.zig +++ b/src/unicode/props_uucode.zig @@ -4,56 +4,17 @@ const assert = std.debug.assert; const uucode = @import("uucode"); const lut = @import("lut.zig"); const Properties = @import("props.zig").Properties; -const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass; - -/// Gets the grapheme boundary class for a codepoint. -/// The use case for this is only in generating lookup tables. -fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass { - if (cp > uucode.config.max_code_point) return .invalid; - - return switch (uucode.get(.grapheme_break, cp)) { - .extended_pictographic => .extended_pictographic, - .l => .L, - .v => .V, - .t => .T, - .lv => .LV, - .lvt => .LVT, - .prepend => .prepend, - .zwj => .zwj, - .spacing_mark => .spacing_mark, - .regional_indicator => .regional_indicator, - .emoji_modifier => .emoji_modifier, - .emoji_modifier_base => .extended_pictographic_base, - - .zwnj, - .indic_conjunct_break_extend, - .indic_conjunct_break_linker, - => .extend, - - // This is obviously not INVALID invalid, there is SOME grapheme - // boundary class for every codepoint. But we don't care about - // anything that doesn't fit into the above categories. Also note - // that `indic_conjunct_break_consonant` is `other` in - // 'GraphemeBreakProperty.txt' (it's missing). 
- .other, - .indic_conjunct_break_consonant, - .cr, - .lf, - .control, - => .invalid, - }; -} pub fn get(cp: u21) Properties { if (cp > uucode.config.max_code_point) return .{ .width = 1, - .grapheme_boundary_class = .invalid, + .grapheme_break = .other, .emoji_vs_base = false, }; return .{ .width = uucode.get(.width, cp), - .grapheme_boundary_class = graphemeBoundaryClass(cp), + .grapheme_break = uucode.get(.grapheme_break_no_control, cp), .emoji_vs_base = uucode.get(.is_emoji_vs_base, cp), }; }