mirror of
https://github.com/ghostty-org/ghostty.git
synced 2026-04-18 05:20:29 +00:00
unicode: switch to uucode grapheme break to match unicode 16 spec
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
const std = @import("std");
|
||||
const table = @import("props_table.zig").table;
|
||||
const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
|
||||
const uucode = @import("uucode");
|
||||
|
||||
/// Determines if there is a grapheme break between two codepoints. This
|
||||
/// must be called sequentially maintaining the state between calls.
|
||||
@@ -9,11 +9,11 @@ const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
|
||||
/// line feeds, and carriage returns are expected to be filtered out before
|
||||
/// calling this function. This is because this function is tuned for
|
||||
/// Ghostty.
|
||||
pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
|
||||
pub fn graphemeBreak(cp1: u21, cp2: u21, state: *uucode.grapheme.BreakState) bool {
|
||||
const value = Precompute.data[
|
||||
(Precompute.Key{
|
||||
.gbc1 = table.get(cp1).grapheme_boundary_class,
|
||||
.gbc2 = table.get(cp2).grapheme_boundary_class,
|
||||
.gb1 = table.get(cp1).grapheme_break,
|
||||
.gb2 = table.get(cp2).grapheme_break,
|
||||
.state = state.*,
|
||||
}).index()
|
||||
];
|
||||
@@ -21,133 +21,64 @@ pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
|
||||
return value.result;
|
||||
}
|
||||
|
||||
/// The state that must be maintained between calls to `graphemeBreak`.
|
||||
pub const BreakState = packed struct(u2) {
|
||||
extended_pictographic: bool = false,
|
||||
regional_indicator: bool = false,
|
||||
};
|
||||
|
||||
/// This is all the structures and data for the precomputed lookup table
|
||||
/// for all possible permutations of state and grapheme boundary classes.
|
||||
/// Precomputation only requires 2^10 keys of 3 bit values so the whole
|
||||
/// table is less than 1KB.
|
||||
/// for all possible permutations of state and grapheme break properties.
|
||||
/// Precomputation requires 2^13 keys of 4 bit values so the whole table is
|
||||
/// 8KB.
|
||||
const Precompute = struct {
|
||||
const Key = packed struct(u10) {
|
||||
state: BreakState,
|
||||
gbc1: GraphemeBoundaryClass,
|
||||
gbc2: GraphemeBoundaryClass,
|
||||
const Key = packed struct(u13) {
|
||||
state: uucode.grapheme.BreakState,
|
||||
gb1: uucode.x.types.GraphemeBreakNoControl,
|
||||
gb2: uucode.x.types.GraphemeBreakNoControl,
|
||||
|
||||
fn index(self: Key) usize {
|
||||
return @intCast(@as(u10, @bitCast(self)));
|
||||
return @intCast(@as(u13, @bitCast(self)));
|
||||
}
|
||||
};
|
||||
|
||||
const Value = packed struct(u3) {
|
||||
const Value = packed struct(u4) {
|
||||
result: bool,
|
||||
state: BreakState,
|
||||
state: uucode.grapheme.BreakState,
|
||||
};
|
||||
|
||||
const data = precompute: {
|
||||
var result: [std.math.maxInt(u10)]Value = undefined;
|
||||
var result: [std.math.maxInt(u13) + 1]Value = undefined;
|
||||
|
||||
@setEvalBranchQuota(3_000);
|
||||
const info = @typeInfo(GraphemeBoundaryClass).@"enum";
|
||||
for (0..std.math.maxInt(u2) + 1) |state_init| {
|
||||
const max_state_int = blk: {
|
||||
var max: usize = 0;
|
||||
for (@typeInfo(uucode.grapheme.BreakState).@"enum".fields) |field| {
|
||||
if (field.value > max) max = field.value;
|
||||
}
|
||||
break :blk max;
|
||||
};
|
||||
|
||||
@setEvalBranchQuota(10_000);
|
||||
const info = @typeInfo(uucode.x.types.GraphemeBreakNoControl).@"enum";
|
||||
for (0..max_state_int + 1) |state_int| {
|
||||
for (info.fields) |field1| {
|
||||
for (info.fields) |field2| {
|
||||
var state: BreakState = @bitCast(@as(u2, @intCast(state_init)));
|
||||
var state: uucode.grapheme.BreakState = @enumFromInt(state_int);
|
||||
|
||||
const key: Key = .{
|
||||
.gbc1 = @field(GraphemeBoundaryClass, field1.name),
|
||||
.gbc2 = @field(GraphemeBoundaryClass, field2.name),
|
||||
.gb1 = @field(uucode.x.types.GraphemeBreakNoControl, field1.name),
|
||||
.gb2 = @field(uucode.x.types.GraphemeBreakNoControl, field2.name),
|
||||
.state = state,
|
||||
};
|
||||
const v = graphemeBreakClass(key.gbc1, key.gbc2, &state);
|
||||
const v = uucode.x.grapheme.computeGraphemeBreakNoControl(
|
||||
key.gb1,
|
||||
key.gb2,
|
||||
&state,
|
||||
);
|
||||
result[key.index()] = .{ .result = v, .state = state };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std.debug.assert(@sizeOf(@TypeOf(result)) == 8192);
|
||||
break :precompute result;
|
||||
};
|
||||
};
|
||||
|
||||
/// This is the algorithm from utf8proc. We only use this offline for
|
||||
/// precomputing the lookup table.
|
||||
fn graphemeBreakClass(
|
||||
gbc1: GraphemeBoundaryClass,
|
||||
gbc2: GraphemeBoundaryClass,
|
||||
state: *BreakState,
|
||||
) bool {
|
||||
// GB11: Emoji Extend* ZWJ x Emoji
|
||||
if (!state.extended_pictographic and gbc1.isExtendedPictographic()) {
|
||||
state.extended_pictographic = true;
|
||||
}
|
||||
|
||||
// These two properties are ignored because they're not relevant to
|
||||
// Ghostty -- they're filtered out before checking grapheme boundaries.
|
||||
// GB3: CR x LF
|
||||
// GB4: Control
|
||||
|
||||
// GB6: Hangul L x (L|V|LV|LVT)
|
||||
if (gbc1 == .L) {
|
||||
if (gbc2 == .L or
|
||||
gbc2 == .V or
|
||||
gbc2 == .LV or
|
||||
gbc2 == .LVT) return false;
|
||||
}
|
||||
|
||||
// GB7: Hangul (LV | V) x (V | T)
|
||||
if (gbc1 == .LV or gbc1 == .V) {
|
||||
if (gbc2 == .V or
|
||||
gbc2 == .T) return false;
|
||||
}
|
||||
|
||||
// GB8: Hangul (LVT | T) x T
|
||||
if (gbc1 == .LVT or gbc1 == .T) {
|
||||
if (gbc2 == .T) return false;
|
||||
}
|
||||
|
||||
// GB9: x (Extend | ZWJ)
|
||||
if (gbc2 == .extend or gbc2 == .zwj) return false;
|
||||
|
||||
// GB9a: x SpacingMark
|
||||
if (gbc2 == .spacing_mark) return false;
|
||||
|
||||
// GB9b: Prepend x
|
||||
if (gbc1 == .prepend) return false;
|
||||
|
||||
// GB12, GB13: RI x RI
|
||||
if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
|
||||
if (state.regional_indicator) {
|
||||
state.regional_indicator = false;
|
||||
return true;
|
||||
} else {
|
||||
state.regional_indicator = true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// GB11: Emoji Extend* ZWJ x Emoji
|
||||
if (state.extended_pictographic and
|
||||
gbc1 == .zwj and
|
||||
gbc2.isExtendedPictographic())
|
||||
{
|
||||
state.extended_pictographic = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
// UTS #51. This isn't covered by UAX #29 as far as I can tell (but
|
||||
// I'm probably wrong). This is a special case for emoji modifiers
|
||||
// which stay attached (no break) only when they directly follow a base.
|
||||
//
|
||||
// emoji_modifier_sequence := emoji_modifier_base emoji_modifier
|
||||
if (gbc2 == .emoji_modifier and gbc1 == .extended_pictographic_base) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// If you build this file as a binary, we will verify the grapheme break
|
||||
/// implementation. This iterates over billions of codepoints so it is
|
||||
/// SLOW. It's not meant to be run in CI, but it's useful for debugging.
|
||||
@@ -156,13 +87,11 @@ fn graphemeBreakClass(
|
||||
/// adding a `-Demit-unicode-test` option for `zig build`, but that
|
||||
/// hasn't been done here.
|
||||
pub fn main() !void {
|
||||
const uucode = @import("uucode");
|
||||
|
||||
// Set the min and max to control the test range.
|
||||
const min = 0;
|
||||
const max = uucode.config.max_code_point + 1;
|
||||
|
||||
var state: BreakState = .{};
|
||||
var state: uucode.grapheme.BreakState = .default;
|
||||
var uu_state: uucode.grapheme.BreakState = .default;
|
||||
for (min..max) |cp1| {
|
||||
if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
|
||||
@@ -199,13 +128,53 @@ test "grapheme break: emoji modifier" {
|
||||
|
||||
// Emoji and modifier
|
||||
{
|
||||
var state: BreakState = .{};
|
||||
var state: uucode.grapheme.BreakState = .default;
|
||||
try testing.expect(!graphemeBreak(0x261D, 0x1F3FF, &state));
|
||||
}
|
||||
|
||||
// Non-emoji and emoji modifier
|
||||
{
|
||||
var state: BreakState = .{};
|
||||
var state: uucode.grapheme.BreakState = .default;
|
||||
try testing.expect(graphemeBreak(0x22, 0x1F3FF, &state));
|
||||
}
|
||||
}
|
||||
|
||||
test "long emoji zwj sequences" {
|
||||
var state: uucode.grapheme.BreakState = .default;
|
||||
// 👩👩👧👦 (family: woman, woman, girl, boy)
|
||||
var it = uucode.utf8.Iterator.init("\u{1F469}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}_");
|
||||
var cp1 = it.next() orelse unreachable;
|
||||
var cp2 = it.next() orelse unreachable;
|
||||
try std.testing.expect(cp1 == 0x1F469); // 👩
|
||||
try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
|
||||
|
||||
cp1 = cp2;
|
||||
cp2 = it.next() orelse unreachable;
|
||||
try std.testing.expect(cp1 == 0x200D);
|
||||
try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
|
||||
|
||||
cp1 = cp2;
|
||||
cp2 = it.next() orelse unreachable;
|
||||
try std.testing.expect(cp1 == 0x1F469); // 👩
|
||||
try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
|
||||
|
||||
cp1 = cp2;
|
||||
cp2 = it.next() orelse unreachable;
|
||||
try std.testing.expect(cp1 == 0x200D);
|
||||
try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
|
||||
|
||||
cp1 = cp2;
|
||||
cp2 = it.next() orelse unreachable;
|
||||
try std.testing.expect(cp1 == 0x1F467); // 👧
|
||||
try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
|
||||
|
||||
cp1 = cp2;
|
||||
cp2 = it.next() orelse unreachable;
|
||||
try std.testing.expect(cp1 == 0x200D);
|
||||
try std.testing.expect(!graphemeBreak(cp1, cp2, &state));
|
||||
|
||||
cp1 = cp2;
|
||||
cp2 = it.next() orelse unreachable;
|
||||
try std.testing.expect(cp1 == 0x1F466); // 👦
|
||||
try std.testing.expect(graphemeBreak(cp1, cp2, &state)); // break
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@ const grapheme = @import("grapheme.zig");
|
||||
pub const table = @import("props_table.zig").table;
|
||||
pub const Properties = @import("props.zig").Properties;
|
||||
pub const graphemeBreak = grapheme.graphemeBreak;
|
||||
pub const GraphemeBreakState = grapheme.BreakState;
|
||||
|
||||
test {
|
||||
@import("std").testing.refAllDecls(@This());
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
//! benchmarks in src/bench to verify that we haven't regressed.
|
||||
|
||||
const std = @import("std");
|
||||
const uucode = @import("uucode");
|
||||
|
||||
pub const Properties = packed struct {
|
||||
/// Codepoint width. We clamp to [0, 2] since Ghostty handles control
|
||||
@@ -12,8 +13,8 @@ pub const Properties = packed struct {
|
||||
/// becomes a 2-em dash).
|
||||
width: u2 = 0,
|
||||
|
||||
/// Grapheme boundary class.
|
||||
grapheme_boundary_class: GraphemeBoundaryClass = .invalid,
|
||||
/// Grapheme break property.
|
||||
grapheme_break: uucode.x.types.GraphemeBreakNoControl = .other,
|
||||
|
||||
/// Emoji VS compatibility
|
||||
emoji_vs_text: bool = false,
|
||||
@@ -22,7 +23,7 @@ pub const Properties = packed struct {
|
||||
// Needed for lut.Generator
|
||||
pub fn eql(a: Properties, b: Properties) bool {
|
||||
return a.width == b.width and
|
||||
a.grapheme_boundary_class == b.grapheme_boundary_class and
|
||||
a.grapheme_break == b.grapheme_break and
|
||||
a.emoji_vs_text == b.emoji_vs_text and
|
||||
a.emoji_vs_emoji == b.emoji_vs_emoji;
|
||||
}
|
||||
@@ -35,48 +36,15 @@ pub const Properties = packed struct {
|
||||
try writer.print(
|
||||
\\.{{
|
||||
\\ .width= {},
|
||||
\\ .grapheme_boundary_class= .{s},
|
||||
\\ .grapheme_break= .{s},
|
||||
\\ .emoji_vs_text= {},
|
||||
\\ .emoji_vs_emoji= {},
|
||||
\\}}
|
||||
, .{
|
||||
self.width,
|
||||
@tagName(self.grapheme_boundary_class),
|
||||
@tagName(self.grapheme_break),
|
||||
self.emoji_vs_text,
|
||||
self.emoji_vs_emoji,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
/// Possible grapheme boundary classes. This isn't an exhaustive list:
|
||||
/// we omit control, CR, LF, etc. because in Ghostty's usage they are
|
||||
/// impossible because they're handled by the terminal.
|
||||
pub const GraphemeBoundaryClass = enum(u4) {
|
||||
invalid,
|
||||
L,
|
||||
V,
|
||||
T,
|
||||
LV,
|
||||
LVT,
|
||||
prepend,
|
||||
extend,
|
||||
zwj,
|
||||
spacing_mark,
|
||||
regional_indicator,
|
||||
extended_pictographic,
|
||||
extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
|
||||
emoji_modifier, // \p{Emoji_Modifier}
|
||||
|
||||
/// Returns true if this is an extended pictographic type. This
|
||||
/// should be used instead of comparing the enum value directly
|
||||
/// because more than one class counts as extended pictographic.
|
||||
pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
|
||||
return switch (self) {
|
||||
.extended_pictographic,
|
||||
.extended_pictographic_base,
|
||||
=> true,
|
||||
|
||||
else => false,
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
@@ -4,57 +4,18 @@ const assert = std.debug.assert;
|
||||
const uucode = @import("uucode");
|
||||
const lut = @import("lut.zig");
|
||||
const Properties = @import("props.zig").Properties;
|
||||
const GraphemeBoundaryClass = @import("props.zig").GraphemeBoundaryClass;
|
||||
|
||||
/// Gets the grapheme boundary class for a codepoint.
|
||||
/// The use case for this is only in generating lookup tables.
|
||||
fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
|
||||
if (cp > uucode.config.max_code_point) return .invalid;
|
||||
|
||||
return switch (uucode.get(.grapheme_break, cp)) {
|
||||
.extended_pictographic => .extended_pictographic,
|
||||
.l => .L,
|
||||
.v => .V,
|
||||
.t => .T,
|
||||
.lv => .LV,
|
||||
.lvt => .LVT,
|
||||
.prepend => .prepend,
|
||||
.zwj => .zwj,
|
||||
.spacing_mark => .spacing_mark,
|
||||
.regional_indicator => .regional_indicator,
|
||||
.emoji_modifier => .emoji_modifier,
|
||||
.emoji_modifier_base => .extended_pictographic_base,
|
||||
|
||||
.zwnj,
|
||||
.indic_conjunct_break_extend,
|
||||
.indic_conjunct_break_linker,
|
||||
=> .extend,
|
||||
|
||||
// This is obviously not INVALID invalid, there is SOME grapheme
|
||||
// boundary class for every codepoint. But we don't care about
|
||||
// anything that doesn't fit into the above categories. Also note
|
||||
// that `indic_conjunct_break_consonant` is `other` in
|
||||
// 'GraphemeBreakProperty.txt' (it's missing).
|
||||
.other,
|
||||
.indic_conjunct_break_consonant,
|
||||
.cr,
|
||||
.lf,
|
||||
.control,
|
||||
=> .invalid,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn get(cp: u21) Properties {
|
||||
if (cp > uucode.config.max_code_point) return .{
|
||||
.width = 1,
|
||||
.grapheme_boundary_class = .invalid,
|
||||
.grapheme_break = .other,
|
||||
.emoji_vs_text = false,
|
||||
.emoji_vs_emoji = false,
|
||||
};
|
||||
|
||||
return .{
|
||||
.width = uucode.get(.width, cp),
|
||||
.grapheme_boundary_class = graphemeBoundaryClass(cp),
|
||||
.grapheme_break = uucode.get(.grapheme_break_no_control, cp),
|
||||
.emoji_vs_text = uucode.get(.is_emoji_vs_text, cp),
|
||||
.emoji_vs_emoji = uucode.get(.is_emoji_vs_emoji, cp),
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user