diff --git a/src/terminal/Terminal.zig b/src/terminal/Terminal.zig index 162cfde38..0d659eb9a 100644 --- a/src/terminal/Terminal.zig +++ b/src/terminal/Terminal.zig @@ -344,7 +344,7 @@ pub fn print(self: *Terminal, c: u21) !void { // VS15 makes it narrow. if (c == 0xFE0F or c == 0xFE0E) { // This only applies to emoji - const prev_props = unicode.getProperties(prev.cell.content.codepoint); + const prev_props = unicode.table.get(prev.cell.content.codepoint); const emoji = prev_props.grapheme_boundary_class.isExtendedPictographic(); if (!emoji) return; @@ -470,7 +470,7 @@ pub fn print(self: *Terminal, c: u21) !void { // If this is a emoji variation selector, prev must be an emoji if (c == 0xFE0F or c == 0xFE0E) { - const prev_props = unicode.getProperties(prev.content.codepoint); + const prev_props = unicode.table.get(prev.content.codepoint); const emoji = prev_props.grapheme_boundary_class == .extended_pictographic; if (!emoji) return; } diff --git a/src/unicode/Properties.zig b/src/unicode/Properties.zig new file mode 100644 index 000000000..b7840743a --- /dev/null +++ b/src/unicode/Properties.zig @@ -0,0 +1,75 @@ +//! Property set per codepoint that Ghostty cares about. +//! +//! Adding to this lets you find new properties but also potentially makes +//! our lookup tables less efficient. Any changes to this should run the +//! benchmarks in src/bench to verify that we haven't regressed. +const Properties = @This(); + +const std = @import("std"); + +/// Codepoint width. We clamp to [0, 2] since Ghostty handles control +/// characters and we max out at 2 for wide characters (i.e. 3-em dash +/// becomes a 2-em dash). +width: u2 = 0, + +/// Grapheme boundary class. 
+grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
+
+/// Compares two property sets field by field. Needed for lut.Generator.
+pub fn eql(a: Properties, b: Properties) bool {
+    if (a.width != b.width) return false;
+    return a.grapheme_boundary_class == b.grapheme_boundary_class;
+}
+
+/// Writes `self` to `writer` as a Zig struct literal containing both
+/// fields. `layout` and `opts` are ignored.
+/// Needed for lut.Generator.
+pub fn format(
+    self: Properties,
+    comptime layout: []const u8,
+    opts: std.fmt.FormatOptions,
+    writer: anytype,
+) !void {
+    _ = layout;
+    _ = opts;
+    const class_name = @tagName(self.grapheme_boundary_class);
+    return std.fmt.format(writer,
+        \\.{{
+        \\ .width= {},
+        \\ .grapheme_boundary_class= .{s},
+        \\}}
+    , .{ self.width, class_name });
+}
+
+/// Possible grapheme boundary classes. This isn't an exhaustive list:
+/// we omit control, CR, LF, etc. because in Ghostty's usage they are
+/// impossible: they're handled by the terminal.
+pub const GraphemeBoundaryClass = enum(u4) {
+    invalid,
+    L,
+    V,
+    T,
+    LV,
+    LVT,
+    prepend,
+    extend,
+    zwj,
+    spacing_mark,
+    regional_indicator,
+    extended_pictographic,
+    extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
+    emoji_modifier, // \p{Emoji_Modifier}
+
+    /// Returns true if this is an extended pictographic type. This
+    /// should be used instead of comparing the enum value directly
+    /// because more than one class counts as extended pictographic.
+ pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool { + return switch (self) { + .extended_pictographic, + .extended_pictographic_base, + => true, + + else => false, + }; + } +}; diff --git a/src/unicode/main.zig b/src/unicode/main.zig index 17c86deca..ae50075ff 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -1,14 +1,13 @@ pub const lut = @import("lut.zig"); const grapheme = @import("grapheme.zig"); -const props = @import("props.zig"); -pub const table = props.table; -pub const Properties = props.Properties; -pub const getProperties = props.get; +pub const table = @import("props_table.zig").table; +pub const Properties = @import("Properties.zig"); pub const graphemeBreak = grapheme.graphemeBreak; pub const GraphemeBreakState = grapheme.BreakState; test { + _ = @import("props_ziglyph.zig"); _ = @import("symbols.zig"); @import("std").testing.refAllDecls(@This()); } diff --git a/src/unicode/props_table.zig b/src/unicode/props_table.zig new file mode 100644 index 000000000..80492346c --- /dev/null +++ b/src/unicode/props_table.zig @@ -0,0 +1,18 @@ +const Properties = @import("Properties.zig"); +const lut = @import("lut.zig"); + +/// The lookup tables for Ghostty. +pub const table = table: { + // This is only available after running a generator as part of the Ghostty + // build.zig process, but due to Zig's lazy analysis we can still reference + // it here. 
+    //
+    // An example process is the `main` in `props_ziglyph.zig`
+    const generated = @import("unicode_tables").Tables(Properties);
+    const Tables = lut.Tables(Properties);
+    break :table Tables{
+        .stage1 = &generated.stage1,
+        .stage2 = &generated.stage2,
+        .stage3 = &generated.stage3,
+    };
+};
diff --git a/src/unicode/props_ziglyph.zig b/src/unicode/props_ziglyph.zig
new file mode 100644
index 000000000..fd123f3b5
--- /dev/null
+++ b/src/unicode/props_ziglyph.zig
@@ -0,0 +1,98 @@
+const props = @This();
+
+const std = @import("std");
+const assert = std.debug.assert;
+const ziglyph = @import("ziglyph");
+const lut = @import("lut.zig");
+const Properties = @import("Properties.zig");
+const GraphemeBoundaryClass = Properties.GraphemeBoundaryClass;
+
+/// Gets the grapheme boundary class for a codepoint. This is VERY
+/// SLOW. The use case for this is only in generating lookup tables.
+fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
+    // We special-case modifier bases because we should not break
+    // if a modifier isn't next to a base.
+    if (ziglyph.emoji.isEmojiModifierBase(cp)) {
+        assert(ziglyph.emoji.isExtendedPictographic(cp));
+        return .extended_pictographic_base;
+    }
+
+    if (ziglyph.emoji.isEmojiModifier(cp)) return .emoji_modifier;
+    if (ziglyph.emoji.isExtendedPictographic(cp)) return .extended_pictographic;
+    if (ziglyph.grapheme_break.isL(cp)) return .L;
+    if (ziglyph.grapheme_break.isV(cp)) return .V;
+    if (ziglyph.grapheme_break.isT(cp)) return .T;
+    if (ziglyph.grapheme_break.isLv(cp)) return .LV;
+    if (ziglyph.grapheme_break.isLvt(cp)) return .LVT;
+    if (ziglyph.grapheme_break.isPrepend(cp)) return .prepend;
+    if (ziglyph.grapheme_break.isExtend(cp)) return .extend;
+    if (ziglyph.grapheme_break.isZwj(cp)) return .zwj;
+    if (ziglyph.grapheme_break.isSpacingmark(cp)) return .spacing_mark;
+    if (ziglyph.grapheme_break.isRegionalIndicator(cp)) return .regional_indicator;
+
+    // This is obviously not INVALID invalid, there is SOME grapheme
+    // boundary class for every codepoint. But we don't care about
+    // anything that doesn't fit into the above categories.
+    return .invalid;
+}
+
+/// Builds the properties for a codepoint from ziglyph data: the display
+/// width clamped to [0, 2] and the grapheme boundary class.
+pub fn get(cp: u21) Properties {
+    const zg_width = ziglyph.display_width.codePointWidth(cp, .half);
+    return .{
+        .width = @intCast(@min(2, @max(0, zg_width))),
+        .grapheme_boundary_class = graphemeBoundaryClass(cp),
+    };
+}
+
+/// Runnable binary to generate the lookup tables and output to stdout.
+pub fn main() !void {
+    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+    defer arena.deinit();
+    const allocator = arena.allocator();
+
+    // Adapter that hands `props.get` and `Properties.eql` to the generator.
+    const Context = struct {
+        pub fn get(_: @This(), cp: u21) !Properties {
+            return props.get(cp);
+        }
+
+        pub fn eql(_: @This(), a: Properties, b: Properties) bool {
+            return a.eql(b);
+        }
+    };
+    const Generator = lut.Generator(Properties, Context);
+    const gen: Generator = .{};
+
+    // Deferred frees run in reverse order, before the arena is torn down.
+    const tables = try gen.generate(allocator);
+    defer allocator.free(tables.stage1);
+    defer allocator.free(tables.stage2);
+    defer allocator.free(tables.stage3);
+    try tables.writeZig(std.io.getStdOut().writer());
+
+    // Uncomment when manually debugging to see our table sizes.
+    // std.log.warn("stage1={} stage2={} stage3={}", .{
+    //     tables.stage1.len,
+    //     tables.stage2.len,
+    //     tables.stage3.len,
+    // });
+}
+
+// This is not very fast in debug modes, so it's commented out by default.
+// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
+// NOTE: this file does not declare `table`; import it from props_table.zig first.
+// test "unicode props: tables match ziglyph" {
+//     const testing = std.testing;
+//
+//     const min = 0xFF + 1; // start outside ascii
+//     for (min..std.math.maxInt(u21)) |cp| {
+//         const t = table.get(@intCast(cp));
+//         const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
+//         if (t.width != zg) {
+//             std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
+//             try testing.expect(false);
+//         }
+//     }
+// }