mirror of
https://github.com/ghostty-org/ghostty.git
synced 2026-04-19 05:50:27 +00:00
fix up diff from benchmarks, and add tests against ziglyph
This commit is contained in:
@@ -2,7 +2,6 @@ const std = @import("std");
|
||||
const props = @import("props.zig");
|
||||
const GraphemeBoundaryClass = props.GraphemeBoundaryClass;
|
||||
const table = props.table;
|
||||
const isExtendedPictographic = props.isExtendedPictographic;
|
||||
|
||||
/// Determines if there is a grapheme break between two codepoints. This
|
||||
/// must be called sequentially maintaining the state between calls.
|
||||
@@ -81,7 +80,7 @@ fn graphemeBreakClass(
|
||||
state: *BreakState,
|
||||
) bool {
|
||||
// GB11: Emoji Extend* ZWJ x Emoji
|
||||
if (!state.extended_pictographic and isExtendedPictographic(gbc1)) {
|
||||
if (!state.extended_pictographic and gbc1.isExtendedPictographic()) {
|
||||
state.extended_pictographic = true;
|
||||
}
|
||||
|
||||
@@ -132,7 +131,7 @@ fn graphemeBreakClass(
|
||||
// GB11: Emoji Extend* ZWJ x Emoji
|
||||
if (state.extended_pictographic and
|
||||
gbc1 == .zwj and
|
||||
isExtendedPictographic(gbc2))
|
||||
gbc2.isExtendedPictographic())
|
||||
{
|
||||
state.extended_pictographic = false;
|
||||
return false;
|
||||
@@ -156,38 +155,36 @@ fn graphemeBreakClass(
|
||||
/// TODO: this is hard to build with newer zig build, so
|
||||
/// https://github.com/ghostty-org/ghostty/pull/7806 took the approach of
|
||||
/// adding a `-Demit-unicode-test` option for `zig build`, but that
|
||||
/// hasn't been done here yet.
|
||||
/// TODO: this also still uses `ziglyph`, but could be switched to use
|
||||
/// `uucode`'s grapheme break once that is implemented.
|
||||
/// hasn't been done here.
|
||||
pub fn main() !void {
|
||||
const ziglyph = @import("ziglyph");
|
||||
const uucode = @import("uucode");
|
||||
|
||||
// Set the min and max to control the test range.
|
||||
const min = 0;
|
||||
const max = std.math.maxInt(u21) + 1;
|
||||
|
||||
var state: BreakState = .{};
|
||||
var zg_state: u3 = 0;
|
||||
var uu_state: uucode.grapheme.BreakState = .default;
|
||||
for (min..max) |cp1| {
|
||||
if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
|
||||
|
||||
if (cp1 == '\r' or cp1 == '\n' or
|
||||
ziglyph.grapheme_break.isControl(@intCast(cp1))) continue;
|
||||
uucode.get(.grapheme_break, @intCast(cp1)) == .control) continue;
|
||||
|
||||
for (min..max) |cp2| {
|
||||
if (cp2 == '\r' or cp2 == '\n' or
|
||||
ziglyph.grapheme_break.isControl(@intCast(cp2))) continue;
|
||||
uucode.get(.grapheme_break, @intCast(cp1)) == .control) continue;
|
||||
|
||||
const gb = graphemeBreak(@intCast(cp1), @intCast(cp2), &state);
|
||||
const zg_gb = ziglyph.graphemeBreak(@intCast(cp1), @intCast(cp2), &zg_state);
|
||||
if (gb != zg_gb) {
|
||||
std.log.warn("cp1={x} cp2={x} gb={} state={} zg_gb={} zg_state={}", .{
|
||||
const uu_gb = uucode.grapheme.isBreak(@intCast(cp1), @intCast(cp2), &uu_state);
|
||||
if (gb != uu_gb) {
|
||||
std.log.warn("cp1={x} cp2={x} gb={} state={} uu_gb={} uu_state={}", .{
|
||||
cp1,
|
||||
cp2,
|
||||
gb,
|
||||
state,
|
||||
zg_gb,
|
||||
zg_state,
|
||||
uu_gb,
|
||||
uu_state,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@ pub const Properties = props.Properties;
|
||||
pub const getProperties = props.get;
|
||||
pub const graphemeBreak = grapheme.graphemeBreak;
|
||||
pub const GraphemeBreakState = grapheme.BreakState;
|
||||
pub const isExtendedPictographic = props.isExtendedPictographic;
|
||||
|
||||
test {
|
||||
_ = @import("symbols.zig");
|
||||
|
||||
@@ -76,66 +76,66 @@ pub const GraphemeBoundaryClass = enum(u4) {
|
||||
extended_pictographic,
|
||||
extended_pictographic_base, // \p{Extended_Pictographic} & \p{Emoji_Modifier_Base}
|
||||
emoji_modifier, // \p{Emoji_Modifier}
|
||||
|
||||
/// Classifies a codepoint into the grapheme boundary class used by the
/// lookup tables. This is only meant to be used while generating those
/// tables.
///
/// Codepoints above `uucode.config.max_code_point` are `.invalid`. The
/// emoji modifier and emoji modifier base properties are checked before
/// the `grapheme_break` property.
pub fn init(cp: u21) GraphemeBoundaryClass {
    if (cp > uucode.config.max_code_point) return .invalid;

    // Emoji modifier properties win over the grapheme_break value.
    if (uucode.get(.is_emoji_modifier, cp)) return .emoji_modifier;
    if (uucode.get(.is_emoji_modifier_base, cp)) return .extended_pictographic_base;

    const break_class = uucode.get(.grapheme_break, cp);
    return switch (break_class) {
        // Every codepoint has SOME grapheme boundary class, so these are
        // not truly invalid; we just don't care about any class that is
        // not handled by one of the arms below.
        .other,
        .indic_conjunct_break_consonant,
        .cr,
        .lf,
        .control,
        => .invalid,

        // ZWNJ and the Indic conjunct break extend/linker classes are
        // all folded into plain extend.
        .zwnj,
        .indic_conjunct_break_extend,
        .indic_conjunct_break_linker,
        => .extend,

        // Everything else maps one-to-one.
        .extended_pictographic => .extended_pictographic,
        .l => .L,
        .v => .V,
        .t => .T,
        .lv => .LV,
        .lvt => .LVT,
        .prepend => .prepend,
        .zwj => .zwj,
        .spacing_mark => .spacing_mark,
        .regional_indicator => .regional_indicator,
    };
}
|
||||
|
||||
/// Reports whether this class counts as extended pictographic.
///
/// Use this rather than comparing against `.extended_pictographic`
/// directly: `.extended_pictographic_base` is also an extended
/// pictographic class.
pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
    return self == .extended_pictographic or
        self == .extended_pictographic_base;
}
|
||||
};
|
||||
|
||||
/// Classifies a codepoint into the grapheme boundary class used by the
/// lookup tables. This is only meant to be used while generating those
/// tables.
///
/// Codepoints above `uucode.config.max_code_point` are `.invalid`. The
/// emoji modifier and emoji modifier base properties are checked before
/// the `grapheme_break` property.
fn computeGraphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
    if (cp > uucode.config.max_code_point) return .invalid;

    // Emoji modifier properties win over the grapheme_break value.
    if (uucode.get(.is_emoji_modifier, cp)) return .emoji_modifier;
    if (uucode.get(.is_emoji_modifier_base, cp)) return .extended_pictographic_base;

    const break_class = uucode.get(.grapheme_break, cp);
    return switch (break_class) {
        // Every codepoint has SOME grapheme boundary class, so these are
        // not truly invalid; we just don't care about any class that is
        // not handled by one of the arms below.
        .other,
        .indic_conjunct_break_consonant,
        .cr,
        .lf,
        .control,
        => .invalid,

        // ZWNJ and the Indic conjunct break extend/linker classes are
        // all folded into plain extend.
        .zwnj,
        .indic_conjunct_break_extend,
        .indic_conjunct_break_linker,
        => .extend,

        // Everything else maps one-to-one.
        .extended_pictographic => .extended_pictographic,
        .l => .L,
        .v => .V,
        .t => .T,
        .lv => .LV,
        .lvt => .LVT,
        .prepend => .prepend,
        .zwj => .zwj,
        .spacing_mark => .spacing_mark,
        .regional_indicator => .regional_indicator,
    };
}
|
||||
|
||||
/// Reports whether the given class counts as extended pictographic.
///
/// Use this rather than comparing against `.extended_pictographic`
/// directly: `.extended_pictographic_base` is also an extended
/// pictographic class.
pub fn isExtendedPictographic(self: GraphemeBoundaryClass) bool {
    return self == .extended_pictographic or
        self == .extended_pictographic_base;
}
|
||||
|
||||
/// Builds the `Properties` entry for a single codepoint.
///
/// The width comes from uucode's `.width` field. Codepoints above
/// `uucode.config.max_code_point` have no uucode data and are given a
/// width of 1, which is the value the "tables match uucode" test expects
/// for that range. The grapheme boundary class is computed by
/// `GraphemeBoundaryClass.init`.
pub fn get(cp: u21) Properties {
    const width = if (cp > uucode.config.max_code_point)
        1
    else
        uucode.getX(.width, cp);

    return .{
        .width = width,
        .grapheme_boundary_class = .init(cp),
    };
}
|
||||
|
||||
@@ -145,13 +145,6 @@ pub fn main() !void {
|
||||
defer arena_state.deinit();
|
||||
const alloc = arena_state.allocator();
|
||||
|
||||
var args_iter = try std.process.argsWithAllocator(alloc);
|
||||
defer args_iter.deinit();
|
||||
_ = args_iter.skip(); // Skip program name
|
||||
|
||||
const output_path = args_iter.next() orelse std.debug.panic("No output file arg for props exe!", .{});
|
||||
std.debug.print("Unicode props_table output_path = {s}\n", .{output_path});
|
||||
|
||||
const gen: lut.Generator(
|
||||
Properties,
|
||||
struct {
|
||||
@@ -171,10 +164,7 @@ pub fn main() !void {
|
||||
defer alloc.free(t.stage1);
|
||||
defer alloc.free(t.stage2);
|
||||
defer alloc.free(t.stage3);
|
||||
var out_file = try std.fs.cwd().createFile(output_path, .{});
|
||||
defer out_file.close();
|
||||
const writer = out_file.writer();
|
||||
try t.writeZig(writer);
|
||||
try t.writeZig(std.io.getStdOut().writer());
|
||||
|
||||
// Uncomment when manually debugging to see our table sizes.
|
||||
// std.log.warn("stage1={} stage2={} stage3={}", .{
|
||||
@@ -186,17 +176,78 @@ pub fn main() !void {
|
||||
|
||||
// This is not very fast in debug modes, so it's commented by default.
|
||||
// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
|
||||
// test "unicode props: tables match uucode" {
|
||||
// const testing = std.testing;
|
||||
//
|
||||
// const min = 0xFF + 1; // start outside ascii
|
||||
// const max = std.math.maxInt(u21) + 1;
|
||||
// for (min..max) |cp| {
|
||||
// const t = table.get(@intCast(cp));
|
||||
// const uu = @min(2, @max(0, uucode.get(.wcwidth, @intCast(cp))));
|
||||
// if (t.width != uu) {
|
||||
// std.log.warn("mismatch cp=U+{x} t={} uucode={}", .{ cp, t, uu });
|
||||
// try testing.expect(false);
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
// Checks every table entry from U+0100 up to maxInt(u21) against the
// width that uucode reports for the same codepoint.
test "unicode props: tables match uucode" {
    if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;

    const first = 0xFF + 1; // skip the first 256 codepoints
    const end = std.math.maxInt(u21) + 1; // exclusive upper bound

    var cp: usize = first;
    while (cp < end) : (cp += 1) {
        const entry = table.get(@intCast(cp));

        // Codepoints beyond uucode's configured maximum have no data;
        // a width of 1 is expected for them.
        const expected = if (cp > uucode.config.max_code_point)
            1
        else
            uucode.getX(.width, @intCast(cp));

        if (entry.width == expected) continue;

        std.log.warn("mismatch cp=U+{x} t={} uu={}", .{ cp, entry.width, expected });
        try std.testing.expect(false);
    }
}
|
||||
|
||||
// Compares table widths against ziglyph's codepoint width (clamped to
// 0..2), ignoring a fixed list of codepoints where the two are known to
// disagree.
test "unicode props: tables match ziglyph" {
    if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;

    const ziglyph = @import("ziglyph");
    const testing = std.testing;

    // Inclusive codepoint ranges that are known exceptions. The trailing
    // comment on each entry records the table width (t) for that range.
    const Span = struct { first: usize, last: usize };
    const known_exceptions = [_]Span{
        .{ .first = 0x0897, .last = 0x0897 }, // non-spacing mark (t = 0)
        .{ .first = 0x2065, .last = 0x2065 }, // unassigned (t = 1)
        .{ .first = 0x2630, .last = 0x2637 }, // east asian width is wide (t = 2)
        .{ .first = 0x268A, .last = 0x268F }, // east asian width is wide (t = 2)
        .{ .first = 0x2FFC, .last = 0x2FFF }, // east asian width is wide (t = 2)
        .{ .first = 0x31E4, .last = 0x31E5 }, // east asian width is wide (t = 2)
        .{ .first = 0x31EF, .last = 0x31EF }, // east asian width is wide (t = 2)
        .{ .first = 0x4DC0, .last = 0x4DFF }, // east asian width is wide (t = 2)
        .{ .first = 0xFFF0, .last = 0xFFF8 }, // unassigned (t = 1)
        .{ .first = 0x10D69, .last = 0x10D6D }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x10EFC, .last = 0x10EFF }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x113BB, .last = 0x113C0 }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x113CE, .last = 0x113CE }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x113D0, .last = 0x113D0 }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x113D2, .last = 0x113D2 }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x113E1, .last = 0x113E1 }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x113E2, .last = 0x113E2 }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x1171E, .last = 0x1171E }, // mark spacing combining (t = 1)
        .{ .first = 0x11F5A, .last = 0x11F5A }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x1611E, .last = 0x1611E }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x1611F, .last = 0x1611F }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x16120, .last = 0x1612F }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0xE0000, .last = 0xE0FFF }, // ziglyph ignores these with 0, but many are unassigned (t = 1)
        .{ .first = 0x18CFF, .last = 0x18CFF }, // east asian width is wide (t = 2)
        .{ .first = 0x1D300, .last = 0x1D376 }, // east asian width is wide (t = 2)
        .{ .first = 0x1E5EE, .last = 0x1E5EE }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x1E5EF, .last = 0x1E5EF }, // non-spacing mark, despite being east asian width normal (t = 0)
        .{ .first = 0x1FA89, .last = 0x1FA89 }, // east asian width is wide (t = 2)
        .{ .first = 0x1FA8F, .last = 0x1FA8F }, // east asian width is wide (t = 2)
        .{ .first = 0x1FABE, .last = 0x1FABE }, // east asian width is wide (t = 2)
        .{ .first = 0x1FAC6, .last = 0x1FAC6 }, // east asian width is wide (t = 2)
        .{ .first = 0x1FADC, .last = 0x1FADC }, // east asian width is wide (t = 2)
        .{ .first = 0x1FADF, .last = 0x1FADF }, // east asian width is wide (t = 2)
        .{ .first = 0x1FAE9, .last = 0x1FAE9 }, // east asian width is wide (t = 2)
    };

    const min = 0xFF + 1; // skip the first 256 codepoints
    const max = std.math.maxInt(u21) + 1; // exclusive upper bound
    for (min..max) |cp| {
        const t = table.get(@intCast(cp));
        const zg = @min(2, @max(0, ziglyph.display_width.codePointWidth(@intCast(cp), .half)));
        if (t.width == zg) continue;

        // Only consult the exception list once a mismatch is found.
        const is_known = for (known_exceptions) |span| {
            if (cp >= span.first and cp <= span.last) break true;
        } else false;
        if (is_known) continue;

        std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t.width, zg });
        try testing.expect(false);
    }
}
|
||||
|
||||
@@ -17,37 +17,12 @@ pub const table = table: {
|
||||
};
|
||||
};
|
||||
|
||||
/// Returns true if the codepoint is a "symbol-like" character. For now
/// that means anything in a private use area, plus anything in the
/// following Unicode blocks:
/// - Dingbats
/// - Emoticons
/// - Miscellaneous Symbols
/// - Enclosed Alphanumerics
/// - Enclosed Alphanumeric Supplement
/// - Miscellaneous Symbols and Pictographs
/// - Transport and Map Symbols
///
/// In the future it may be prudent to widen this to cover more
/// symbol-like characters, and/or to exclude some PUA sections.
pub fn isSymbol(cp: u21) bool {
    // TODO: this wrapper can probably be removed in favor of calling
    // uucode directly.
    const symbol_like = uucode.getX(.is_symbol, cp);
    return symbol_like;
}
|
||||
|
||||
/// Runnable binary to generate the lookup tables and output to stdout.
|
||||
pub fn main() !void {
|
||||
var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
|
||||
defer arena_state.deinit();
|
||||
const alloc = arena_state.allocator();
|
||||
|
||||
var args_iter = try std.process.argsWithAllocator(alloc);
|
||||
defer args_iter.deinit();
|
||||
_ = args_iter.skip(); // Skip program name
|
||||
|
||||
const output_path = args_iter.next() orelse std.debug.panic("No output file arg for symbols exe!", .{});
|
||||
std.debug.print("Unicode symbols_table output_path = {s}\n", .{output_path});
|
||||
|
||||
const gen: lut.Generator(
|
||||
bool,
|
||||
struct {
|
||||
@@ -56,7 +31,7 @@ pub fn main() !void {
|
||||
return if (cp > uucode.config.max_code_point)
|
||||
false
|
||||
else
|
||||
isSymbol(@intCast(cp));
|
||||
uucode.getX(.is_symbol, @intCast(cp));
|
||||
}
|
||||
|
||||
pub fn eql(ctx: @This(), a: bool, b: bool) bool {
|
||||
@@ -70,10 +45,7 @@ pub fn main() !void {
|
||||
defer alloc.free(t.stage1);
|
||||
defer alloc.free(t.stage2);
|
||||
defer alloc.free(t.stage3);
|
||||
var out_file = try std.fs.cwd().createFile(output_path, .{});
|
||||
defer out_file.close();
|
||||
const writer = out_file.writer();
|
||||
try t.writeZig(writer);
|
||||
try t.writeZig(std.io.getStdOut().writer());
|
||||
|
||||
// Uncomment when manually debugging to see our table sizes.
|
||||
// std.log.warn("stage1={} stage2={} stage3={}", .{
|
||||
@@ -83,8 +55,6 @@ pub fn main() !void {
|
||||
// });
|
||||
}
|
||||
|
||||
// This is not very fast in debug modes, so it's commented by default.
|
||||
// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CHANGES.
|
||||
test "unicode symbols: tables match uucode" {
|
||||
if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;
|
||||
|
||||
@@ -95,7 +65,7 @@ test "unicode symbols: tables match uucode" {
|
||||
const uu = if (cp > uucode.config.max_code_point)
|
||||
false
|
||||
else
|
||||
isSymbol(@intCast(cp));
|
||||
uucode.getX(.is_symbol, @intCast(cp));
|
||||
|
||||
if (t != uu) {
|
||||
std.log.warn("mismatch cp=U+{x} t={} uu={}", .{ cp, t, uu });
|
||||
@@ -103,3 +73,28 @@ test "unicode symbols: tables match uucode" {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that the symbols table agrees with the ziglyph-based definition
// of "symbol-like" (private use, or one of seven symbol blocks) for every
// value representable in a u21.
test "unicode symbols: tables match ziglyph" {
    if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;

    const ziglyph = @import("ziglyph");
    const testing = std.testing;

    // Range ends are exclusive, so add 1 to include maxInt(u21) itself.
    // This matches the bounds used by the other table tests.
    for (0..std.math.maxInt(u21) + 1) |cp_usize| {
        const cp: u21 = @intCast(cp_usize);
        const t = table.get(cp);
        const zg = ziglyph.general_category.isPrivateUse(cp) or
            ziglyph.blocks.isDingbats(cp) or
            ziglyph.blocks.isEmoticons(cp) or
            ziglyph.blocks.isMiscellaneousSymbols(cp) or
            ziglyph.blocks.isEnclosedAlphanumerics(cp) or
            ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or
            ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or
            ziglyph.blocks.isTransportAndMapSymbols(cp);

        if (t != zg) {
            std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
            try testing.expect(false);
        }
    }
}
|
||||
|
||||
Reference in New Issue
Block a user