diff --git a/build.zig.zon b/build.zig.zon index 79b06cf8b..4c3e36b89 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -37,8 +37,8 @@ .lazy = true, }, .uucode = .{ - .url = "https://github.com/jacobsandlund/uucode/archive/38b82297e69a3b2dc55dc8df25f3851be37f9327.tar.gz", - .hash = "uucode-0.0.0-ZZjBPiqdPwB-rG3ieaq3c6tMpnksWYs4_rGj2IvFGjjB", + .url = "https://github.com/jacobsandlund/uucode/archive/69782fbe79e06a34ee177978d3479ed5801ce0af.tar.gz", + .hash = "uucode-0.0.0-ZZjBPl_dPwC-BPhSJLID4Hs9O0zw-vZKGXdaOBFch8c8", }, .zig_wayland = .{ // codeberg ifreund/zig-wayland diff --git a/src/benchmark/CodepointWidth.zig b/src/benchmark/CodepointWidth.zig index e9207aed5..b6c719184 100644 --- a/src/benchmark/CodepointWidth.zig +++ b/src/benchmark/CodepointWidth.zig @@ -10,6 +10,7 @@ const assert = std.debug.assert; const Allocator = std.mem.Allocator; const Benchmark = @import("Benchmark.zig"); const options = @import("options.zig"); +const uucode = @import("uucode"); const UTF8Decoder = @import("../terminal/UTF8Decoder.zig"); const simd = @import("../simd/main.zig"); const table = @import("../unicode/main.zig").table; @@ -47,6 +48,9 @@ pub const Mode = enum { /// Test our lookup table implementation. table, + + /// Using uucode, with custom `width` extension based on `wcwidth`. + uucode, }; /// Create a new terminal stream handler for the given arguments. 
@@ -71,6 +75,7 @@ pub fn benchmark(self: *CodepointWidth) Benchmark { .wcwidth => stepWcwidth, .table => stepTable, .simd => stepSimd, + .uucode => stepUucode, }, .setupFn = setup, .teardownFn = teardown, @@ -192,6 +197,41 @@ fn stepSimd(ptr: *anyopaque) Benchmark.Error!void { } } +fn stepUucode(ptr: *anyopaque) Benchmark.Error!void { + const self: *CodepointWidth = @ptrCast(@alignCast(ptr)); + + const f = self.data_f orelse return; + var r = std.io.bufferedReader(f.reader()); + var d: UTF8Decoder = .{}; + var buf: [4096]u8 = undefined; + while (true) { + const n = r.read(&buf) catch |err| { + log.warn("error reading data file err={}", .{err}); + return error.BenchmarkFailed; + }; + if (n == 0) break; // EOF reached + + for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + if (cp_) |cp| { + // Codepoints <= 0xFF are assumed to be width 1 and skip the lookup, + // the same trick we do in terminal.zig. + const width = if (cp <= 0xFF) + 1 + else + //uucode.getX(.width, @intCast(cp)); + //uucode.getWidth(@intCast(cp)); + uucode.getSpecial(@intCast(cp)).width; + + // Write the width to the buffer to avoid it being compiled + // away + buf[0] = @intCast(width); + } + } + } +} + test CodepointWidth { const testing = std.testing; const alloc = testing.allocator; diff --git a/src/benchmark/GraphemeBreak.zig b/src/benchmark/GraphemeBreak.zig index 57effebe4..105371ea5 100644 --- a/src/benchmark/GraphemeBreak.zig +++ b/src/benchmark/GraphemeBreak.zig @@ -8,6 +8,7 @@ const assert = std.debug.assert; const Allocator = std.mem.Allocator; const Benchmark = @import("Benchmark.zig"); const options = @import("options.zig"); +const uucode = @import("uucode"); const UTF8Decoder = @import("../terminal/UTF8Decoder.zig"); const unicode = @import("../unicode/main.zig"); @@ -38,6 +39,9 @@ pub const Mode = enum { /// Ghostty's table-based approach. table, + + /// Grapheme breaks via a table precomputed with uucode. + uucode, }; /// Create a new terminal stream handler for the given arguments. 
@@ -60,6 +64,7 @@ pub fn benchmark(self: *GraphemeBreak) Benchmark { .stepFn = switch (self.opts.mode) { .noop => stepNoop, .table => stepTable, + .uucode => stepUucode, }, .setupFn = setup, .teardownFn = teardown, @@ -134,6 +139,160 @@ fn stepTable(ptr: *anyopaque) Benchmark.Error!void { } } +const GraphemeBoundaryClass = uucode.TypeOfX(.grapheme_boundary_class); + +pub fn computeGraphemeBoundaryClass( + gb1: GraphemeBoundaryClass, + gb2: GraphemeBoundaryClass, + state: *uucode.grapheme.BreakState, +) bool { + // Set state back to default when `gb1` or `gb2` is not expected in sequence. + switch (state.*) { + .regional_indicator => { + if (gb1 != .regional_indicator or gb2 != .regional_indicator) { + state.* = .default; + } + }, + .extended_pictographic => { + switch (gb1) { + .extend, + .zwj, + .extended_pictographic, + => {}, + + else => state.* = .default, + } + + switch (gb2) { + .extend, + .zwj, + .extended_pictographic, + => {}, + + else => state.* = .default, + } + }, + .default, .indic_conjunct_break_consonant, .indic_conjunct_break_linker => {}, + } + + // GB6: L x (L | V | LV | LVT) + if (gb1 == .L) { + if (gb2 == .L or + gb2 == .V or + gb2 == .LV or + gb2 == .LVT) return false; + } + + // GB7: (LV | V) x (V | T) + if (gb1 == .LV or gb1 == .V) { + if (gb2 == .V or gb2 == .T) return false; + } + + // GB8: (LVT | T) x T + if (gb1 == .LVT or gb1 == .T) { + if (gb2 == .T) return false; + } + + // Handle GB9 (Extend | ZWJ) later, since it can also match the start of + // GB9c (Indic) and GB11 (Emoji ZWJ) + + // GB9a: x SpacingMark + if (gb2 == .spacing_mark) return false; + + // GB9b: Prepend x + if (gb1 == .prepend) return false; + + // GB11: Emoji ZWJ sequence + if (gb1 == .extended_pictographic) { + // start of sequence: + + // In normal operation, we'll be in this state, but + // precomputeGraphemeBreak iterates all states. 
+ // std.debug.assert(state.* == .default); + + if (gb2 == .extend or gb2 == .zwj) { + state.* = .extended_pictographic; + return false; + } + // else, not an Emoji ZWJ sequence + } else if (state.* == .extended_pictographic) { + // continue or end sequence: + + if (gb1 == .extend and (gb2 == .extend or gb2 == .zwj)) { + // continue extend* ZWJ sequence + return false; + } else if (gb1 == .zwj and gb2 == .extended_pictographic) { + // ZWJ -> end of sequence + state.* = .default; + return false; + } else { + // Not a valid Emoji ZWJ sequence + state.* = .default; + } + } + + // GB12 and GB13: Regional Indicator + if (gb1 == .regional_indicator and gb2 == .regional_indicator) { + if (state.* == .default) { + state.* = .regional_indicator; + return false; + } else { + state.* = .default; + return true; + } + } + + // GB9: x (Extend | ZWJ) + if (gb2 == .extend or gb2 == .zwj) return false; + + // GB999: Otherwise, break everywhere + return true; +} + +pub fn isBreak( + cp1: u21, + cp2: u21, + state: *uucode.grapheme.BreakState, +) bool { + const table = comptime uucode.grapheme.precomputeGraphemeBreak( + GraphemeBoundaryClass, + computeGraphemeBoundaryClass, + ); + const gb1 = uucode.getX(.grapheme_boundary_class, cp1); + const gb2 = uucode.getX(.grapheme_boundary_class, cp2); + const result = table.get(gb1, gb2, state.*); + state.* = result.state; + return result.result; +} + +fn stepUucode(ptr: *anyopaque) Benchmark.Error!void { + const self: *GraphemeBreak = @ptrCast(@alignCast(ptr)); + + const f = self.data_f orelse return; + var r = std.io.bufferedReader(f.reader()); + var d: UTF8Decoder = .{}; + var state: uucode.grapheme.BreakState = .default; + var cp1: u21 = 0; + var buf: [4096]u8 = undefined; + while (true) { + const n = r.read(&buf) catch |err| { + log.warn("error reading data file err={}", .{err}); + return error.BenchmarkFailed; + }; + if (n == 0) break; // EOF reached + + for (buf[0..n]) |c| { + const cp_, const consumed = d.next(c); + assert(consumed); + 
if (cp_) |cp2| { + const v = isBreak(cp1, @intCast(cp2), &state); + buf[0] = @intCast(@intFromBool(v)); + cp1 = cp2; + } + } + } +} + test GraphemeBreak { const testing = std.testing; const alloc = testing.allocator; diff --git a/src/build/UnicodeTables.zig b/src/build/UnicodeTables.zig index 78bcef2c9..219b8589a 100644 --- a/src/build/UnicodeTables.zig +++ b/src/build/UnicodeTables.zig @@ -24,14 +24,16 @@ pub fn init(b: *std.Build, uucode_tables_zig: std.Build.LazyPath) !UnicodeTables if (b.lazyDependency("uucode", .{ .target = b.graph.host, .@"tables.zig" = uucode_tables_zig, + .build_config_path = b.path("src/build/uucode_config.zig"), })) |dep| { exe.root_module.addImport("uucode", dep.module("uucode")); } const run = b.addRunArtifact(exe); + const output = run.addOutputFileArg("tables.zig"); return .{ .exe = exe, - .output = run.captureStdOut(), + .output = output, }; } diff --git a/src/build/uucode_config.zig b/src/build/uucode_config.zig index 69d0d2fd3..e2e3c9163 100644 --- a/src/build/uucode_config.zig +++ b/src/build/uucode_config.zig @@ -3,6 +3,93 @@ const config_x = @import("config.x.zig"); const d = config.default; const wcwidth = config_x.wcwidth; +pub const log_level = .debug; + +fn computeWidth(cp: u21, data: anytype, backing: anytype, tracking: anytype) void { + _ = cp; + _ = backing; + _ = tracking; + if (data.wcwidth < 0) { + data.width = 0; + } else if (data.wcwidth > 2) { + data.width = 2; + } else { + data.width = @intCast(data.wcwidth); + } +} + +const width = config.Extension{ .inputs = &.{"wcwidth"}, .compute = &computeWidth, .fields = &.{ + .{ .name = "width", .type = u2 }, +} }; + +pub const GraphemeBoundaryClass = enum(u4) { + invalid, + L, + V, + T, + LV, + LVT, + prepend, + extend, + zwj, + spacing_mark, + regional_indicator, + extended_pictographic, + extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base} + emoji_modifier, // \p{Emoji_Modifier} +}; + +fn computeGraphemeBoundaryClass(cp: u21, data: anytype, 
backing: anytype, tracking: anytype) void { + _ = cp; + _ = backing; + _ = tracking; + if (data.is_emoji_modifier) { + data.grapheme_boundary_class = .emoji_modifier; + } else if (data.is_emoji_modifier_base) { + data.grapheme_boundary_class = .extended_pictographic_base; + } else { + data.grapheme_boundary_class = switch (data.grapheme_break) { + .extended_pictographic => .extended_pictographic, + .l => .L, + .v => .V, + .t => .T, + .lv => .LV, + .lvt => .LVT, + .prepend => .prepend, + .zwj => .zwj, + .spacing_mark => .spacing_mark, + .regional_indicator => .regional_indicator, + + .zwnj, + .indic_conjunct_break_extend, + .indic_conjunct_break_linker, + => .extend, + + // This is obviously not INVALID invalid, there is SOME grapheme + // boundary class for every codepoint. But we don't care about + // anything that doesn't fit into the above categories. + .other, + .indic_conjunct_break_consonant, + .cr, + .lf, + .control, + => .invalid, + }; + } +} + +const grapheme_boundary_class = config.Extension{ + .inputs = &.{ + "grapheme_break", + "is_emoji_modifier", + "is_emoji_modifier_base", + }, + .compute = &computeGraphemeBoundaryClass, + .fields = &.{ + .{ .name = "grapheme_boundary_class", .type = GraphemeBoundaryClass }, + }, +}; + pub const tables = [_]config.Table{ .{ .extensions = &.{wcwidth}, @@ -14,9 +101,16 @@ pub const tables = [_]config.Table{ d.field("case_folding_full"), // Alternative: // d.field("case_folding_simple"), - d.field("grapheme_break"), d.field("is_emoji_modifier"), d.field("is_emoji_modifier_base"), + d.field("grapheme_break"), + }, + }, + .{ + .extensions = &.{ wcwidth, width, grapheme_boundary_class }, + .fields = &.{ + width.field("width"), + grapheme_boundary_class.field("grapheme_boundary_class"), }, }, }; diff --git a/src/simd/codepoint_width.zig b/src/simd/codepoint_width.zig index e2383aff1..008c7ad9f 100644 --- a/src/simd/codepoint_width.zig +++ b/src/simd/codepoint_width.zig @@ -29,7 +29,8 @@ test "codepointWidth basic" { // 
const uucode = @import("uucode"); // // const min = 0xFF + 1; // start outside ascii -// for (min..uucode.code_point_range_end) |cp| { +// const max = std.math.maxInt(u21) + 1; +// for (min..max) |cp| { // const simd = codepointWidth(@intCast(cp)); // const uu = @min(2, @max(0, uucode.get(.wcwidth, @intCast(cp)))); // if (simd != uu) mismatch: { diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig index c46488f98..6b3b9252b 100644 --- a/src/terminal/Terminal.zig +++ b/src/terminal/Terminal.zig @@ -345,7 +345,7 @@ pub fn print(self: *Terminal, c: u21) !void { if (c == 0xFE0F or c == 0xFE0E) { // This only applies to emoji const prev_props = unicode.getProperties(prev.cell.content.codepoint); - const emoji = prev_props.grapheme_boundary_class.isExtendedPictographic(); + const emoji = unicode.isExtendedPictographic(prev_props.grapheme_boundary_class); if (!emoji) return; switch (c) { diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig index 0950bedba..b0cb4ead9 100644 --- a/src/unicode/grapheme.zig +++ b/src/unicode/grapheme.zig @@ -2,6 +2,7 @@ const std = @import("std"); const props = @import("props.zig"); const GraphemeBoundaryClass = props.GraphemeBoundaryClass; const table = props.table; +const isExtendedPictographic = props.isExtendedPictographic; /// Determines if there is a grapheme break between two codepoints. This /// must be called sequentially maintaining the state between calls. 
@@ -80,7 +81,7 @@ fn graphemeBreakClass( state: *BreakState, ) bool { // GB11: Emoji Extend* ZWJ x Emoji - if (!state.extended_pictographic and gbc1.isExtendedPictographic()) { + if (!state.extended_pictographic and isExtendedPictographic(gbc1)) { state.extended_pictographic = true; } @@ -131,7 +132,7 @@ fn graphemeBreakClass( // GB11: Emoji Extend* ZWJ x Emoji if (state.extended_pictographic and gbc1 == .zwj and - gbc2.isExtendedPictographic()) + isExtendedPictographic(gbc2)) { state.extended_pictographic = false; return false; diff --git a/src/unicode/main.zig b/src/unicode/main.zig index f5b911948..2b0b8ef9c 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -7,6 +7,7 @@ pub const Properties = props.Properties; pub const getProperties = props.get; pub const graphemeBreak = grapheme.graphemeBreak; pub const GraphemeBreakState = grapheme.BreakState; +pub const isExtendedPictographic = props.isExtendedPictographic; test { @import("std").testing.refAllDecls(@This()); diff --git a/src/unicode/props.zig b/src/unicode/props.zig index c06329876..579e59977 100644 --- a/src/unicode/props.zig +++ b/src/unicode/props.zig @@ -6,10 +6,11 @@ const lut = @import("lut.zig"); /// The lookup tables for Ghostty. pub const table = table: { + const Props = uucode.PackedTypeOf("1"); // This is only available after running main() below as part of the Ghostty // build.zig, but due to Zig's lazy analysis we can still reference it here. - const generated = @import("unicode_tables").Tables(Properties); - const Tables = lut.Tables(Properties); + const generated = @import("unicode_tables").Tables(Props); + const Tables = lut.Tables(Props); break :table Tables{ .stage1 = &generated.stage1, .stage2 = &generated.stage2, @@ -61,81 +62,62 @@ pub const Properties = struct { /// Possible grapheme boundary classes. This isn't an exhaustive list: /// we omit control, CR, LF, etc. because in Ghostty's usage that are /// impossible because they're handled by the terminal. 
-pub const GraphemeBoundaryClass = enum(u4) { - invalid, - L, - V, - T, - LV, - LVT, - prepend, - extend, - zwj, - spacing_mark, - regional_indicator, - extended_pictographic, - extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base} - emoji_modifier, // \p{Emoji_Modifier} +pub const GraphemeBoundaryClass = uucode.TypeOfX(.grapheme_boundary_class); - /// Gets the grapheme boundary class for a codepoint. - /// The use case for this is only in generating lookup tables. - pub fn init(cp: u21) GraphemeBoundaryClass { - if (cp < uucode.code_point_range_end) { - if (uucode.get(.is_emoji_modifier, cp)) return .emoji_modifier; - if (uucode.get(.is_emoji_modifier_base, cp)) return .extended_pictographic_base; +/// Gets the grapheme boundary class for a codepoint. +/// The use case for this is only in generating lookup tables. +pub fn computeGraphemeBoundaryClass(cp: u21) GraphemeBoundaryClass { + if (uucode.get(.is_emoji_modifier, cp)) return .emoji_modifier; + if (uucode.get(.is_emoji_modifier_base, cp)) return .extended_pictographic_base; - return switch (uucode.get(.grapheme_break, cp)) { - .extended_pictographic => .extended_pictographic, - .l => .L, - .v => .V, - .t => .T, - .lv => .LV, - .lvt => .LVT, - .prepend => .prepend, - .zwj => .zwj, - .spacing_mark => .spacing_mark, - .regional_indicator => .regional_indicator, + return switch (uucode.get(.grapheme_break, cp)) { + .extended_pictographic => .extended_pictographic, + .l => .L, + .v => .V, + .t => .T, + .lv => .LV, + .lvt => .LVT, + .prepend => .prepend, + .zwj => .zwj, + .spacing_mark => .spacing_mark, + .regional_indicator => .regional_indicator, - .zwnj, - .indic_conjunct_break_extend, - .indic_conjunct_break_linker, - => .extend, + .zwnj, + .indic_conjunct_break_extend, + .indic_conjunct_break_linker, + => .extend, - // This is obviously not INVALID invalid, there is SOME grapheme - // boundary class for every codepoint. 
But we don't care about - // anything that doesn't fit into the above categories. - .other, - .indic_conjunct_break_consonant, - .cr, - .lf, - .control, - => .invalid, - }; - } else { - return .invalid; - } - } + // This is obviously not INVALID invalid, there is SOME grapheme + // boundary class for every codepoint. But we don't care about + // anything that doesn't fit into the above categories. + .other, + .indic_conjunct_break_consonant, + .cr, + .lf, + .control, + => .invalid, + }; +} - /// Returns true if this is an extended pictographic type. This - /// should be used instead of comparing the enum value directly - /// because we classify multiple. - pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool { - return switch (self) { - .extended_pictographic, - .extended_pictographic_base, - => true, +/// Returns true if this is an extended pictographic type. This +/// should be used instead of comparing the enum value directly +/// because more than one class counts as extended pictographic. +pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool { + return switch (self) { + .extended_pictographic, + .extended_pictographic_base, + => true, - else => false, - }; - } -}; + else => false, + }; +} pub fn get(cp: u21) Properties { - const wcwidth = if (cp < uucode.code_point_range_end) uucode.get(.wcwidth, cp) else 0; + const wcwidth = uucode.get(.wcwidth, cp); return .{ .width = @intCast(@min(2, @max(0, wcwidth))), - .grapheme_boundary_class = .init(cp), + .grapheme_boundary_class = computeGraphemeBoundaryClass(cp), }; } @@ -145,6 +127,13 @@ pub fn main() !void { defer arena_state.deinit(); const alloc = arena_state.allocator(); + var args_iter = try std.process.argsWithAllocator(alloc); + defer args_iter.deinit(); + _ = args_iter.skip(); // Skip program name + + const output_path = args_iter.next() orelse std.debug.panic("No output file arg!", .{}); + std.debug.print("Unicode tables output_path = {s}\n", .{output_path}); + const gen: lut.Generator( Properties, struct { @@ 
-164,7 +153,10 @@ pub fn main() !void { defer alloc.free(t.stage1); defer alloc.free(t.stage2); defer alloc.free(t.stage3); - try t.writeZig(std.io.getStdOut().writer()); + var out_file = try std.fs.cwd().createFile(output_path, .{}); + defer out_file.close(); + const writer = out_file.writer(); + try t.writeZig(writer); // Uncomment when manually debugging to see our table sizes. // std.log.warn("stage1={} stage2={} stage3={}", .{ @@ -180,7 +172,8 @@ pub fn main() !void { // const testing = std.testing; // // const min = 0xFF + 1; // start outside ascii -// for (min..uucode.code_point_range_end) |cp| { +// const max = std.math.maxInt(u21) + 1; +// for (min..max) |cp| { // const t = table.get(@intCast(cp)); // const uu = @min(2, @max(0, uucode.get(.wcwidth, @intCast(cp)))); // if (t.width != uu) {