Measure East_Asian_Width during grapheme decoding

This commit is contained in:
Feoramund
2024-06-19 22:35:36 -04:00
parent 4380934283
commit e620645a03
4 changed files with 310 additions and 7 deletions

View File

@@ -5,8 +5,10 @@ REPLACEMENT_CHAR :: '\ufffd' // Represented an invalid code point
MAX_ASCII :: '\u007f' // Maximum ASCII value
MAX_LATIN1 :: '\u00ff' // Maximum Latin-1 value
// Invisible formatting runes. `normalized_east_asian_width` reports a width
// of 0 for each of the four constants below.
ZERO_WIDTH_SPACE :: '\u200B'
ZERO_WIDTH_NON_JOINER :: '\u200C'
ZERO_WIDTH_JOINER :: '\u200D'
WORD_JOINER :: '\u2060'
@(require_results)
binary_search :: proc(c: i32, table: []i32, length, stride: int) -> int {
@@ -450,6 +452,41 @@ is_gcb_extend_class :: proc(r: rune) -> bool {
return is_grapheme_extend(r) || is_emoji_modifier(r)
}
// Estimates how many monospace cells a single rune occupies.
//
// Return values:
//
// - 0 if `is_control` accepts the rune or it is one of the zero-width
//   formatting runes checked below,
// - 2 if the rune falls in a table range marked East_Asian_Width=F or W, or
// - 1 in all other cases, including runes not covered by any table range.
//
@(require_results)
normalized_east_asian_width :: proc(r: rune) -> int {
	// U+FEFF doubles as the byte order mark; when it shows up in the middle of
	// text it is interpreted as a zero-width no-break space instead.
	ZERO_WIDTH_NO_BREAK_SPACE :: '\uFEFF'

	if is_control(r) {
		return 0
	}
	if r <= 0x10FF {
		// Every remaining rune this low is a single cell, so skip the search.
		return 1
	}

	// Formatting runes that never occupy a cell.
	if r == ZERO_WIDTH_NO_BREAK_SPACE ||
	   r == ZERO_WIDTH_SPACE ||
	   r == ZERO_WIDTH_NON_JOINER ||
	   r == ZERO_WIDTH_JOINER ||
	   r == WORD_JOINER {
		return 0
	}

	// The table is a flat list of (first, last, width) triples.
	ranges := normalized_east_asian_width_ranges[:]
	code := i32(r)
	idx := binary_search(code, ranges, len(ranges)/3, 3)

	// No candidate range, or the rune sits in a gap between ranges.
	if idx < 0 || code < ranges[idx] || code > ranges[idx+1] {
		return 1
	}
	return int(ranges[idx+2])
}
//
// End of Unicode 15.1.0 block.
//

View File

@@ -3716,6 +3716,177 @@ indic_conjunct_break_extend_ranges := [?]i32 {
0x1E944, 0x1E94A,
}
// Lookup table for `normalized_east_asian_width`.
//
// The data is a flat list of triples: first code point, last code point
// (inclusive), and the cell width for that range. Fullwidth (F) and Wide (W)
// are counted as 2. Everything else is 1.
//
// Triples are listed in ascending order of their first code point, and the
// lookup searches them with a stride of 3. Some code points fall in gaps
// between consecutive triples (for example 0x2E5E through 0x2E7F); the lookup
// treats those as width 1.
//
// Derived from: https://unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt
@(rodata)
normalized_east_asian_width_ranges := [?]i32 {
0x0000, 0x10FF, 1,
0x1100, 0x115F, 2,
0x1160, 0x2319, 1,
0x231A, 0x231B, 2,
0x231C, 0x2328, 1,
0x2329, 0x232A, 2,
0x232B, 0x23E8, 1,
0x23E9, 0x23EC, 2,
0x23ED, 0x23EF, 1,
0x23F0, 0x23F0, 2,
0x23F1, 0x23F2, 1,
0x23F3, 0x23F3, 2,
0x23F4, 0x25FC, 1,
0x25FD, 0x25FE, 2,
0x25FF, 0x2613, 1,
0x2614, 0x2615, 2,
0x2616, 0x2647, 1,
0x2648, 0x2653, 2,
0x2654, 0x267E, 1,
0x267F, 0x267F, 2,
0x2680, 0x2692, 1,
0x2693, 0x2693, 2,
0x2694, 0x26A0, 1,
0x26A1, 0x26A1, 2,
0x26A2, 0x26A9, 1,
0x26AA, 0x26AB, 2,
0x26AC, 0x26BC, 1,
0x26BD, 0x26BE, 2,
0x26BF, 0x26C3, 1,
0x26C4, 0x26C5, 2,
0x26C6, 0x26CD, 1,
0x26CE, 0x26CE, 2,
0x26CF, 0x26D3, 1,
0x26D4, 0x26D4, 2,
0x26D5, 0x26E9, 1,
0x26EA, 0x26EA, 2,
0x26EB, 0x26F1, 1,
0x26F2, 0x26F3, 2,
0x26F4, 0x26F4, 1,
0x26F5, 0x26F5, 2,
0x26F6, 0x26F9, 1,
0x26FA, 0x26FA, 2,
0x26FB, 0x26FC, 1,
0x26FD, 0x26FD, 2,
0x26FE, 0x2704, 1,
0x2705, 0x2705, 2,
0x2706, 0x2709, 1,
0x270A, 0x270B, 2,
0x270C, 0x2727, 1,
0x2728, 0x2728, 2,
0x2729, 0x274B, 1,
0x274C, 0x274C, 2,
0x274D, 0x274D, 1,
0x274E, 0x274E, 2,
0x274F, 0x2752, 1,
0x2753, 0x2755, 2,
0x2756, 0x2756, 1,
0x2757, 0x2757, 2,
0x2758, 0x2794, 1,
0x2795, 0x2797, 2,
0x2798, 0x27AF, 1,
0x27B0, 0x27B0, 2,
0x27B1, 0x27BE, 1,
0x27BF, 0x27BF, 2,
0x27C0, 0x2B1A, 1,
0x2B1B, 0x2B1C, 2,
0x2B1D, 0x2B4F, 1,
0x2B50, 0x2B50, 2,
0x2B51, 0x2B54, 1,
0x2B55, 0x2B55, 2,
0x2B56, 0x2E5D, 1,
0x2E80, 0x303E, 2,
0x303F, 0x303F, 1,
0x3041, 0x3247, 2,
0x3248, 0x324F, 1,
0x3250, 0x4DBF, 2,
0x4DC0, 0x4DFF, 1,
0x4E00, 0xA4C6, 2,
0xA4D0, 0xA95F, 1,
0xA960, 0xA97C, 2,
0xA980, 0xABF9, 1,
0xAC00, 0xD7A3, 2,
0xD7B0, 0xF8FF, 1,
0xF900, 0xFAFF, 2,
0xFB00, 0xFE0F, 1,
0xFE10, 0xFE19, 2,
0xFE20, 0xFE2F, 1,
0xFE30, 0xFE6B, 2,
0xFE70, 0xFEFF, 1,
0xFF01, 0xFF60, 2,
0xFF61, 0xFFDC, 1,
0xFFE0, 0xFFE6, 2,
0xFFE8, 0x16F9F, 1,
0x16FE0, 0x1B2FB, 2,
0x1BC00, 0x1F003, 1,
0x1F004, 0x1F004, 2,
0x1F005, 0x1F0CE, 1,
0x1F0CF, 0x1F0CF, 2,
0x1F0D1, 0x1F18D, 1,
0x1F18E, 0x1F18E, 2,
0x1F18F, 0x1F190, 1,
0x1F191, 0x1F19A, 2,
0x1F19B, 0x1F1FF, 1,
0x1F200, 0x1F320, 2,
0x1F321, 0x1F32C, 1,
0x1F32D, 0x1F335, 2,
0x1F336, 0x1F336, 1,
0x1F337, 0x1F37C, 2,
0x1F37D, 0x1F37D, 1,
0x1F37E, 0x1F393, 2,
0x1F394, 0x1F39F, 1,
0x1F3A0, 0x1F3CA, 2,
0x1F3CB, 0x1F3CE, 1,
0x1F3CF, 0x1F3D3, 2,
0x1F3D4, 0x1F3DF, 1,
0x1F3E0, 0x1F3F0, 2,
0x1F3F1, 0x1F3F3, 1,
0x1F3F4, 0x1F3F4, 2,
0x1F3F5, 0x1F3F7, 1,
0x1F3F8, 0x1F43E, 2,
0x1F43F, 0x1F43F, 1,
0x1F440, 0x1F440, 2,
0x1F441, 0x1F441, 1,
0x1F442, 0x1F4FC, 2,
0x1F4FD, 0x1F4FE, 1,
0x1F4FF, 0x1F53D, 2,
0x1F53E, 0x1F54A, 1,
0x1F54B, 0x1F54E, 2,
0x1F54F, 0x1F54F, 1,
0x1F550, 0x1F567, 2,
0x1F568, 0x1F579, 1,
0x1F57A, 0x1F57A, 2,
0x1F57B, 0x1F594, 1,
0x1F595, 0x1F596, 2,
0x1F597, 0x1F5A3, 1,
0x1F5A4, 0x1F5A4, 2,
0x1F5A5, 0x1F5FA, 1,
0x1F5FB, 0x1F64F, 2,
0x1F650, 0x1F67F, 1,
0x1F680, 0x1F6C5, 2,
0x1F6C6, 0x1F6CB, 1,
0x1F6CC, 0x1F6CC, 2,
0x1F6CD, 0x1F6CF, 1,
0x1F6D0, 0x1F6D2, 2,
0x1F6D3, 0x1F6D4, 1,
0x1F6D5, 0x1F6DF, 2,
0x1F6E0, 0x1F6EA, 1,
0x1F6EB, 0x1F6EC, 2,
0x1F6F0, 0x1F6F3, 1,
0x1F6F4, 0x1F6FC, 2,
0x1F700, 0x1F7D9, 1,
0x1F7E0, 0x1F7F0, 2,
0x1F800, 0x1F90B, 1,
0x1F90C, 0x1F93A, 2,
0x1F93B, 0x1F93B, 1,
0x1F93C, 0x1F945, 2,
0x1F946, 0x1F946, 1,
0x1F947, 0x1F9FF, 2,
0x1FA00, 0x1FA6D, 1,
0x1FA70, 0x1FAF8, 2,
0x1FB00, 0x1FBF9, 1,
0x20000, 0x3FFFD, 2,
0xE0001, 0x10FFFD, 1,
}
//
// End of Unicode 15.1.0 block.
//

View File

@@ -17,11 +17,13 @@ is_spacing_mark :: unicode.is_spacing_mark
is_gcb_prepend_class :: unicode.is_gcb_prepend_class
is_emoji_extended_pictographic :: unicode.is_emoji_extended_pictographic
is_regional_indicator :: unicode.is_regional_indicator
normalized_east_asian_width :: unicode.normalized_east_asian_width
// Per-cluster record that `decode_grapheme_clusters` appends when grapheme
// tracking is enabled.
Grapheme :: struct {
// Value of the decoder's `byte_index` when the cluster was recorded.
// NOTE(review): presumably the byte offset of the cluster's first rune;
// confirm against the decode loop.
byte_index: int,
// Number of runes counted before this cluster was recorded.
rune_index: int,
// Cell width added to the running total for this cluster, i.e. the
// `normalized_east_asian_width` of the rune that opened it.
width: int,
}
/*
@@ -33,10 +35,11 @@ Inputs:
Returns:
- graphemes: The number of graphemes in the string.
- runes: The number of runes in the string.
- width: The width of the string in number of monospace cells.
*/
@(require_results)
grapheme_count :: proc(str: string) -> (graphemes, runes: int) {
_, graphemes, runes = decode_grapheme_clusters(str, false)
grapheme_count :: proc(str: string) -> (graphemes, runes, width: int) {
_, graphemes, runes, width = decode_grapheme_clusters(str, false)
return
}
@@ -54,6 +57,7 @@ Returns:
- graphemes: Extra data about each grapheme.
- grapheme_count: The number of graphemes in the string.
- rune_count: The number of runes in the string.
- width: The width of the string in number of monospace cells.
*/
@(require_results)
decode_grapheme_clusters :: proc(
@@ -64,6 +68,7 @@ decode_grapheme_clusters :: proc(
graphemes: [dynamic]Grapheme,
grapheme_count: int,
rune_count: int,
width: int,
) {
// The following procedure implements text segmentation by breaking on
// Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
@@ -115,6 +120,24 @@ decode_grapheme_clusters :: proc(
// [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
// [4]: https://www.unicode.org/reports/tr29/#Conformance
// Additionally, this procedure now takes into account Standard Annex #11,
// in order to estimate how visually wide the string will appear on a
// monospaced display. This can only ever be a rough guess, as this tends
// to be an implementation detail relating to which fonts are being used,
// how codepoints are interpreted and drawn, if codepoint sequences are
// interpreted correctly, et cetera.
//
// For example, a program may not properly interpret an emoji modifier
// sequence and print the component glyphs instead of one whole glyph.
//
// See here for more information: https://www.unicode.org/reports/tr11/
//
// NOTE: There is no explicit mention of what to do with zero-width spaces
// as far as grapheme cluster segmentation goes, therefore this
// implementation may count and return graphemes with a `width` of zero.
//
// In other words, they are segmented like any other space; only the
// reported width differs.
Grapheme_Cluster_Sequence :: enum {
None,
Indic,
@@ -127,6 +150,7 @@ decode_grapheme_clusters :: proc(
last_rune: rune
last_rune_breaks_forward: bool
last_width: int
last_grapheme_count: int
bypass_next_rune: bool
@@ -145,10 +169,19 @@ decode_grapheme_clusters :: proc(
if rune_count == 0 && grapheme_count == 0 {
grapheme_count += 1
}
if track_graphemes && grapheme_count > last_grapheme_count {
append(&graphemes, Grapheme{ byte_index, rune_count })
if grapheme_count > last_grapheme_count {
width += normalized_east_asian_width(this_rune)
if track_graphemes {
append(&graphemes, Grapheme{
byte_index,
rune_count,
width - last_width,
})
}
last_grapheme_count = grapheme_count
last_width = width
}
last_grapheme_count = grapheme_count
last_rune = this_rune
rune_count += 1

View File

@@ -13,7 +13,7 @@ run_test_cases :: proc(t: ^testing.T, test_cases: []Test_Case, loc := #caller_lo
failed := 0
for c, i in test_cases {
log.debugf("(#% 4i) %q ...", i, c.str)
result, _ := utf8.grapheme_count(c.str)
result, _, _ := utf8.grapheme_count(c.str)
if !testing.expectf(t, result == c.expected_clusters,
"(#% 4i) graphemes: %i != %i, %q %s", i, result, c.expected_clusters, c.str, c.str,
loc = loc)
@@ -43,7 +43,7 @@ test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {
str := SAMPLE_1 + SAMPLE_2 + SAMPLE_3 + SAMPLE_2 + SAMPLE_1
graphemes, _, _ := utf8.decode_grapheme_clusters(str)
graphemes, _, _, _ := utf8.decode_grapheme_clusters(str)
defer delete(graphemes)
defer if testing.failed(t) {
@@ -71,3 +71,65 @@ test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {
testing.expectf(t, grapheme_4 == SAMPLE_2, "expected %q, got %q", SAMPLE_2, grapheme_2)
testing.expectf(t, grapheme_5 == SAMPLE_1, "expected %q, got %q", SAMPLE_1, grapheme_1)
}
// Checks the monospace width estimate reported next to the grapheme count,
// both for whole strings and for individually tracked graphemes.
@test
test_width :: proc(t: ^testing.T) {
	// A zero-width joiner is expected to add neither a cluster nor any width.
	{
		graphemes, _, width := utf8.grapheme_count("He\u200dllo")
		testing.expect_value(t, graphemes, 5)
		testing.expect_value(t, width, 5)
	}

	// Note that a zero-width space is still considered a grapheme as far
	// as the specification is concerned; it just contributes no width.
	{
		graphemes, _, width := utf8.grapheme_count("He\u200bllo")
		testing.expect_value(t, graphemes, 6)
		testing.expect_value(t, width, 5)
	}

	// Emoji + modifier + ZWJ + symbol: one cluster, two cells.
	{
		graphemes, _, width := utf8.grapheme_count("\U0001F926\U0001F3FC\u200D\u2642")
		testing.expect_value(t, graphemes, 1)
		testing.expect_value(t, width, 2)
	}

	// Combining overlays must not widen their base characters.
	{
		graphemes, _, width := utf8.grapheme_count("H̷e̶l̵l̸o̴p̵e̷ ̸w̶o̸r̵l̶d̵!̴")
		testing.expect_value(t, graphemes, 14)
		testing.expect_value(t, width, 14)
	}

	// Mixed narrow and wide runes, verified per grapheme as well as in total.
	{
		graphemes, grapheme_count, _, width := utf8.decode_grapheme_clusters("aカ.ヒフ")
		defer delete(graphemes)

		testing.expect_value(t, grapheme_count, 5)
		testing.expect_value(t, width, 8)

		// Only index into the slice when the cluster count is what we expect.
		if grapheme_count == 5 {
			testing.expect_value(t, graphemes[0].width, 1)
			testing.expect_value(t, graphemes[1].width, 2)
			testing.expect_value(t, graphemes[2].width, 1)
			testing.expect_value(t, graphemes[3].width, 2)
			testing.expect_value(t, graphemes[4].width, 2)
		}
	}

	// All-wide kana: two cells per grapheme.
	{
		graphemes, _, width := utf8.grapheme_count("いろはにほへ")
		testing.expect_value(t, graphemes, 6)
		testing.expect_value(t, width, 12)
	}

	// Wide ideographs interleaved with fullwidth punctuation.
	{
		graphemes, _, width := utf8.grapheme_count("舍利弗,是諸法空相,不生不滅,不垢不淨,不增不減。")
		testing.expect_value(t, graphemes, 25)
		testing.expect_value(t, width, 50)
	}
}