diff --git a/core/unicode/letter.odin b/core/unicode/letter.odin
index f88907501..af345f733 100644
--- a/core/unicode/letter.odin
+++ b/core/unicode/letter.odin
@@ -5,8 +5,10 @@ REPLACEMENT_CHAR :: '\ufffd' // Represented an invalid code point
 MAX_ASCII :: '\u007f' // Maximum ASCII value
 MAX_LATIN1 :: '\u00ff' // Maximum Latin-1 value
 
+ZERO_WIDTH_SPACE :: '\u200B'
 ZERO_WIDTH_NON_JOINER :: '\u200C'
 ZERO_WIDTH_JOINER :: '\u200D'
+WORD_JOINER :: '\u2060'
 
 @(require_results)
 binary_search :: proc(c: i32, table: []i32, length, stride: int) -> int {
@@ -450,6 +452,52 @@ is_gcb_extend_class :: proc(r: rune) -> bool {
 	return is_grapheme_extend(r) || is_emoji_modifier(r)
 }
 
+// Estimates how many monospace cells a single rune occupies.
+//
+// Returns:
+//
+// - 0 if `is_control` reports true or the rune is one of the zero-width runes checked below,
+// - 2 if East_Asian_Width=F or W,
+// - 1 in all other cases.
+//
+@(require_results)
+normalized_east_asian_width :: proc(r: rune) -> int {
+	// U+FEFF doubles as the byte order mark; this is its other interpretation,
+	// which applies when it occurs in the middle of text.
+	ZERO_WIDTH_NO_BREAK_SPACE :: '\uFEFF'
+
+	// Runes up to and including this one are answered without a table lookup:
+	// one cell, unless `is_control` already returned zero for them.
+	LAST_LOW_RUNE :: 0x10FF
+
+	// The range table is read as flat (first, last, width) triples.
+	STRIDE :: 3
+
+	if is_control(r) {
+		return 0
+	}
+	if r <= LAST_LOW_RUNE {
+		return 1
+	}
+
+	if r == ZERO_WIDTH_NO_BREAK_SPACE ||
+	   r == ZERO_WIDTH_SPACE ||
+	   r == ZERO_WIDTH_NON_JOINER ||
+	   r == ZERO_WIDTH_JOINER ||
+	   r == WORD_JOINER {
+		return 0
+	}
+
+	table := normalized_east_asian_width_ranges[:]
+	code := i32(r)
+	at := binary_search(code, table, len(table)/STRIDE, STRIDE)
+	if at < 0 || code < table[at] || table[at+1] < code {
+		// Not inside any listed range.
+		return 1
+	}
+	return int(table[at+2])
+}
+
 //
 // End of Unicode 15.1.0 block.
 //
diff --git a/core/unicode/tables.odin b/core/unicode/tables.odin
index 5b3d0806e..c0b3fe434 100644
--- a/core/unicode/tables.odin
+++ b/core/unicode/tables.odin
@@ -3716,6 +3716,177 @@ indic_conjunct_break_extend_ranges := [?]i32 {
 	0x1E944, 0x1E94A,
 }
 
+// Fullwidth (F) and Wide (W) are counted as 2.
+// Everything else is 1.
+// Flat (first, last, width) triples in ascending order, as needed by the binary search in normalized_east_asian_width; runes in no row are width 1 there.
+// Derived from: https://unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt
+@(rodata)
+normalized_east_asian_width_ranges := [?]i32 {
+	0x0000, 0x10FF, 1, // Same upper bound as the early-out in normalized_east_asian_width.
+	0x1100, 0x115F, 2,
+	0x1160, 0x2319, 1,
+	0x231A, 0x231B, 2,
+	0x231C, 0x2328, 1,
+	0x2329, 0x232A, 2,
+	0x232B, 0x23E8, 1,
+	0x23E9, 0x23EC, 2,
+	0x23ED, 0x23EF, 1,
+	0x23F0, 0x23F0, 2,
+	0x23F1, 0x23F2, 1,
+	0x23F3, 0x23F3, 2,
+	0x23F4, 0x25FC, 1,
+	0x25FD, 0x25FE, 2,
+	0x25FF, 0x2613, 1,
+	0x2614, 0x2615, 2,
+	0x2616, 0x2647, 1,
+	0x2648, 0x2653, 2,
+	0x2654, 0x267E, 1,
+	0x267F, 0x267F, 2,
+	0x2680, 0x2692, 1,
+	0x2693, 0x2693, 2,
+	0x2694, 0x26A0, 1,
+	0x26A1, 0x26A1, 2,
+	0x26A2, 0x26A9, 1,
+	0x26AA, 0x26AB, 2,
+	0x26AC, 0x26BC, 1,
+	0x26BD, 0x26BE, 2,
+	0x26BF, 0x26C3, 1,
+	0x26C4, 0x26C5, 2,
+	0x26C6, 0x26CD, 1,
+	0x26CE, 0x26CE, 2,
+	0x26CF, 0x26D3, 1,
+	0x26D4, 0x26D4, 2,
+	0x26D5, 0x26E9, 1,
+	0x26EA, 0x26EA, 2,
+	0x26EB, 0x26F1, 1,
+	0x26F2, 0x26F3, 2,
+	0x26F4, 0x26F4, 1,
+	0x26F5, 0x26F5, 2,
+	0x26F6, 0x26F9, 1,
+	0x26FA, 0x26FA, 2,
+	0x26FB, 0x26FC, 1,
+	0x26FD, 0x26FD, 2,
+	0x26FE, 0x2704, 1,
+	0x2705, 0x2705, 2,
+	0x2706, 0x2709, 1,
+	0x270A, 0x270B, 2,
+	0x270C, 0x2727, 1,
+	0x2728, 0x2728, 2,
+	0x2729, 0x274B, 1,
+	0x274C, 0x274C, 2,
+	0x274D, 0x274D, 1,
+	0x274E, 0x274E, 2,
+	0x274F, 0x2752, 1,
+	0x2753, 0x2755, 2,
+	0x2756, 0x2756, 1,
+	0x2757, 0x2757, 2,
+	0x2758, 0x2794, 1,
+	0x2795, 0x2797, 2,
+	0x2798, 0x27AF, 1,
+	0x27B0, 0x27B0, 2,
+	0x27B1, 0x27BE, 1,
+	0x27BF, 0x27BF, 2,
+	0x27C0, 0x2B1A, 1,
+	0x2B1B, 0x2B1C, 2,
+	0x2B1D, 0x2B4F, 1,
+	0x2B50, 0x2B50, 2,
+	0x2B51, 0x2B54, 1,
+	0x2B55, 0x2B55, 2,
+	0x2B56, 0x2E5D, 1,
+	0x2E80, 0x303E, 2,
+	0x303F, 0x303F, 1,
+	0x3041, 0x3247, 2,
+	0x3248, 0x324F, 1,
+	0x3250, 0x4DBF, 2,
+	0x4DC0, 0x4DFF, 1,
+	0x4E00, 0xA4C6, 2,
+	0xA4D0, 0xA95F, 1,
+	0xA960, 0xA97C, 2,
+	0xA980, 0xABF9, 1,
+	0xAC00, 0xD7A3, 2,
+	0xD7B0, 0xF8FF, 1,
+	0xF900, 0xFAFF, 2,
+	0xFB00, 0xFE0F, 1,
+	0xFE10, 0xFE19, 2,
+	0xFE20, 0xFE2F, 1,
+	0xFE30, 0xFE6B, 2,
+	0xFE70, 0xFEFF, 1,
+	0xFF01, 0xFF60, 2,
+	0xFF61, 0xFFDC, 1,
+	0xFFE0, 0xFFE6, 2,
+	0xFFE8, 0x16F9F, 1,
+	0x16FE0, 0x1B2FB, 2,
+	0x1BC00, 0x1F003, 1,
+	0x1F004, 0x1F004, 2,
+	0x1F005, 0x1F0CE, 1,
+	0x1F0CF, 0x1F0CF, 2,
+	0x1F0D1, 0x1F18D, 1,
+	0x1F18E, 0x1F18E, 2,
+	0x1F18F, 0x1F190, 1,
+	0x1F191, 0x1F19A, 2,
+	0x1F19B, 0x1F1FF, 1,
+	0x1F200, 0x1F320, 2,
+	0x1F321, 0x1F32C, 1,
+	0x1F32D, 0x1F335, 2,
+	0x1F336, 0x1F336, 1,
+	0x1F337, 0x1F37C, 2,
+	0x1F37D, 0x1F37D, 1,
+	0x1F37E, 0x1F393, 2,
+	0x1F394, 0x1F39F, 1,
+	0x1F3A0, 0x1F3CA, 2,
+	0x1F3CB, 0x1F3CE, 1,
+	0x1F3CF, 0x1F3D3, 2,
+	0x1F3D4, 0x1F3DF, 1,
+	0x1F3E0, 0x1F3F0, 2,
+	0x1F3F1, 0x1F3F3, 1,
+	0x1F3F4, 0x1F3F4, 2,
+	0x1F3F5, 0x1F3F7, 1,
+	0x1F3F8, 0x1F43E, 2,
+	0x1F43F, 0x1F43F, 1,
+	0x1F440, 0x1F440, 2,
+	0x1F441, 0x1F441, 1,
+	0x1F442, 0x1F4FC, 2,
+	0x1F4FD, 0x1F4FE, 1,
+	0x1F4FF, 0x1F53D, 2,
+	0x1F53E, 0x1F54A, 1,
+	0x1F54B, 0x1F54E, 2,
+	0x1F54F, 0x1F54F, 1,
+	0x1F550, 0x1F567, 2,
+	0x1F568, 0x1F579, 1,
+	0x1F57A, 0x1F57A, 2,
+	0x1F57B, 0x1F594, 1,
+	0x1F595, 0x1F596, 2,
+	0x1F597, 0x1F5A3, 1,
+	0x1F5A4, 0x1F5A4, 2,
+	0x1F5A5, 0x1F5FA, 1,
+	0x1F5FB, 0x1F64F, 2,
+	0x1F650, 0x1F67F, 1,
+	0x1F680, 0x1F6C5, 2,
+	0x1F6C6, 0x1F6CB, 1,
+	0x1F6CC, 0x1F6CC, 2,
+	0x1F6CD, 0x1F6CF, 1,
+	0x1F6D0, 0x1F6D2, 2,
+	0x1F6D3, 0x1F6D4, 1,
+	0x1F6D5, 0x1F6DF, 2,
+	0x1F6E0, 0x1F6EA, 1,
+	0x1F6EB, 0x1F6EC, 2,
+	0x1F6F0, 0x1F6F3, 1,
+	0x1F6F4, 0x1F6FC, 2,
+	0x1F700, 0x1F7D9, 1,
+	0x1F7E0, 0x1F7F0, 2,
+	0x1F800, 0x1F90B, 1,
+	0x1F90C, 0x1F93A, 2,
+	0x1F93B, 0x1F93B, 1,
+	0x1F93C, 0x1F945, 2,
+	0x1F946, 0x1F946, 1,
+	0x1F947, 0x1F9FF, 2,
+	0x1FA00, 0x1FA6D, 1,
+	0x1FA70, 0x1FAF8, 2,
+	0x1FB00, 0x1FBF9, 1,
+	0x20000, 0x3FFFD, 2,
+	0xE0001, 0x10FFFD, 1,
+}
+
 //
 // End of Unicode 15.1.0 block.
 //
diff --git a/core/unicode/utf8/grapheme.odin b/core/unicode/utf8/grapheme.odin
index c0851c6ea..911165af9 100644
--- a/core/unicode/utf8/grapheme.odin
+++ b/core/unicode/utf8/grapheme.odin
@@ -17,11 +17,13 @@ is_spacing_mark :: unicode.is_spacing_mark
 is_gcb_prepend_class :: unicode.is_gcb_prepend_class
 is_emoji_extended_pictographic :: unicode.is_emoji_extended_pictographic
 is_regional_indicator :: unicode.is_regional_indicator
+normalized_east_asian_width :: unicode.normalized_east_asian_width
 
 
 Grapheme :: struct {
 	byte_index: int,
 	rune_index: int,
+	width: int, // Estimated width of this grapheme in monospace cells; may be zero.
 }
 
 /*
@@ -33,10 +35,11 @@ Inputs:
 Returns:
 - graphemes: The number of graphemes in the string.
 - runes: The number of runes in the string.
+- width: The estimated width of the string, in monospace cells.
 */
 @(require_results)
-grapheme_count :: proc(str: string) -> (graphemes, runes: int) {
-	_, graphemes, runes = decode_grapheme_clusters(str, false)
+grapheme_count :: proc(str: string) -> (graphemes, runes, width: int) {
+	_, graphemes, runes, width = decode_grapheme_clusters(str, false)
 	return
 }
 
@@ -54,6 +57,7 @@ Returns:
 - graphemes: Extra data about each grapheme.
 - grapheme_count: The number of graphemes in the string.
 - rune_count: The number of runes in the string.
+- width: The estimated width of the string, in monospace cells.
 */
 @(require_results)
 decode_grapheme_clusters :: proc(
@@ -64,6 +68,7 @@ decode_grapheme_clusters :: proc(
 	graphemes: [dynamic]Grapheme,
 	grapheme_count: int,
 	rune_count: int,
+	width: int,
 ) {
 	// The following procedure implements text segmentation by breaking on
 	// Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
@@ -115,6 +120,24 @@ decode_grapheme_clusters :: proc(
 	// [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
 	// [4]: https://www.unicode.org/reports/tr29/#Conformance
 
+	// Additionally, this procedure takes Unicode Standard Annex #11 into account
+	// in order to estimate how visually wide the string will appear on a
+	// monospaced display. This can only ever be a rough guess, as this tends
+	// to be an implementation detail relating to which fonts are being used,
+	// how codepoints are interpreted and drawn, whether codepoint sequences
+	// are interpreted correctly, et cetera.
+	//
+	// For example, a program may not properly interpret an emoji modifier
+	// sequence and print the component glyphs instead of one whole glyph.
+	//
+	// See here for more information: https://www.unicode.org/reports/tr11/
+	//
+	// NOTE: There is no explicit mention of what to do with zero-width spaces
+	// as far as grapheme cluster segmentation goes, therefore this
+	// implementation may count and return graphemes with a `width` of zero.
+	//
+	// Treat them as any other space.
+
 	Grapheme_Cluster_Sequence :: enum {
 		None,
 		Indic,
@@ -127,6 +150,7 @@ decode_grapheme_clusters :: proc(
 
 	last_rune: rune
 	last_rune_breaks_forward: bool
+	last_width: int
 
 	last_grapheme_count: int
 	bypass_next_rune: bool
@@ -145,10 +169,21 @@ decode_grapheme_clusters :: proc(
 		if rune_count == 0 && grapheme_count == 0 {
 			grapheme_count += 1
 		}
-		if track_graphemes && grapheme_count > last_grapheme_count {
-			append(&graphemes, Grapheme{ byte_index, rune_count })
+
+		if grapheme_count > last_grapheme_count {
+			// The grapheme count went up since the last recorded grapheme: add the
+			// width of the current rune only, and store that amount on the grapheme.
+			width += normalized_east_asian_width(this_rune)
+			if track_graphemes {
+				append(&graphemes, Grapheme{
+					byte_index,
+					rune_count,
+					width - last_width,
+				})
+			}
+			last_grapheme_count = grapheme_count
+			last_width = width
 		}
-		last_grapheme_count = grapheme_count
 
 		last_rune = this_rune
 		rune_count += 1
diff --git a/tests/core/unicode/test_core_unicode.odin b/tests/core/unicode/test_core_unicode.odin
index c097d518a..a1f6ac934 100644
--- a/tests/core/unicode/test_core_unicode.odin
+++ b/tests/core/unicode/test_core_unicode.odin
@@ -13,7 +13,7 @@ run_test_cases :: proc(t: ^testing.T, test_cases: []Test_Case, loc := #caller_lo
 	failed := 0
 	for c, i in test_cases {
 		log.debugf("(#% 4i) %q ...", i, c.str)
-		result, _ := utf8.grapheme_count(c.str)
+		result, _, _ := utf8.grapheme_count(c.str)
 		if !testing.expectf(t, result == c.expected_clusters,
 			"(#% 4i) graphemes: %i != %i, %q %s", i, result, c.expected_clusters, c.str, c.str,
 			loc = loc)
@@ -43,7 +43,7 @@ test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {
 
 	str := SAMPLE_1 + SAMPLE_2 + SAMPLE_3 + SAMPLE_2 + SAMPLE_1
 
-	graphemes, _, _ := utf8.decode_grapheme_clusters(str)
+	graphemes, _, _, _ := utf8.decode_grapheme_clusters(str)
 	defer delete(graphemes)
 
 	defer if testing.failed(t) {
@@ -71,3 +71,74 @@ test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {
-	testing.expectf(t, grapheme_4 == SAMPLE_2, "expected %q, got %q", SAMPLE_2, grapheme_2)
-	testing.expectf(t, grapheme_5 == SAMPLE_1, "expected %q, got %q", SAMPLE_1, grapheme_1)
+	testing.expectf(t, grapheme_4 == SAMPLE_2, "expected %q, got %q", SAMPLE_2, grapheme_4)
+	testing.expectf(t, grapheme_5 == SAMPLE_1, "expected %q, got %q", SAMPLE_1, grapheme_5)
 }
+
+// Checks the `width` result of `grapheme_count` and `decode_grapheme_clusters`
+// alongside the number of graphemes reported for the same string.
+@test
+test_width :: proc(t: ^testing.T) {
+	{
+		// The zero-width joiner is expected to add neither a grapheme nor any width.
+		str := "He\u200dllo"
+		graphemes, _, width := utf8.grapheme_count(str)
+		testing.expect_value(t, graphemes, 5)
+		testing.expect_value(t, width, 5)
+	}
+
+	{
+		// Note that a zero-width space is still considered a grapheme as far
+		// as the specification is concerned.
+		str := "He\u200bllo"
+		graphemes, _, width := utf8.grapheme_count(str)
+		testing.expect_value(t, graphemes, 6)
+		testing.expect_value(t, width, 5)
+	}
+
+	{
+		// An emoji followed by a modifier and a ZWJ sequence: one grapheme, two cells.
+		str := "\U0001F926\U0001F3FC\u200D\u2642"
+		graphemes, _, width := utf8.grapheme_count(str)
+		testing.expect_value(t, graphemes, 1)
+		testing.expect_value(t, width, 2)
+	}
+
+	{
+		// Combining marks are expected to add neither graphemes nor width.
+		str := "H̷e̶l̵l̸o̴p̵e̷ ̸w̶o̸r̵l̶d̵!̴"
+		graphemes, _, width := utf8.grapheme_count(str)
+		testing.expect_value(t, graphemes, 14)
+		testing.expect_value(t, width, 14)
+	}
+
+	{
+		// Mixed narrow and wide runes, also checked per grapheme.
+		str := "aカ.ヒフ"
+		graphemes, grapheme_count, _, width := utf8.decode_grapheme_clusters(str)
+		defer delete(graphemes)
+		testing.expect_value(t, grapheme_count, 5)
+		testing.expect_value(t, width, 8)
+		// Only index into `graphemes` once the count is known to match.
+		if grapheme_count == 5 {
+			testing.expect_value(t, graphemes[0].width, 1)
+			testing.expect_value(t, graphemes[1].width, 2)
+			testing.expect_value(t, graphemes[2].width, 1)
+			testing.expect_value(t, graphemes[3].width, 2)
+			testing.expect_value(t, graphemes[4].width, 2)
+		}
+	}
+
+	{
+		// Six wide runes, two cells each.
+		str := "いろはにほへ"
+		graphemes, _, width := utf8.grapheme_count(str)
+		testing.expect_value(t, graphemes, 6)
+		testing.expect_value(t, width, 12)
+	}
+
+	{
+		// The commas are FULLWIDTH COMMA (U+FF0C), spelled as escapes so they cannot
+		// be mistaken for ASCII commas: 25 wide runes, 50 cells.
+		str := "舍利弗\uFF0C是諸法空相\uFF0C不生不滅\uFF0C不垢不淨\uFF0C不增不減。"
+		graphemes, _, width := utf8.grapheme_count(str)
+		testing.expect_value(t, graphemes, 25)
+		testing.expect_value(t, width, 50)
+	}
+}