Measure East_Asian_Width during grapheme decoding

2026-02-19 17:38:23 +00:00 · 2024-06-19 22:35:36 -04:00
parent 4380934283
commit e620645a03
4 changed files with 310 additions and 7 deletions
--- a/core/unicode/letter.odin
+++ b/core/unicode/letter.odin
@@ -5,8 +5,10 @@ REPLACEMENT_CHAR :: '\ufffd'     // Represented an invalid code point
 MAX_ASCII        :: '\u007f'     // Maximum ASCII value
 MAX_LATIN1       :: '\u00ff'     // Maximum Latin-1 value

+ZERO_WIDTH_SPACE      :: '\u200B'
 ZERO_WIDTH_NON_JOINER :: '\u200C'
 ZERO_WIDTH_JOINER     :: '\u200D'
+WORD_JOINER           :: '\u2060'

 @(require_results)
 binary_search :: proc(c: i32, table: []i32, length, stride: int) -> int {
@@ -450,6 +452,41 @@ is_gcb_extend_class :: proc(r: rune) -> bool {
 	return is_grapheme_extend(r) || is_emoji_modifier(r)
 }

+// Estimates how many monospace cells a single rune occupies.
+//
+// Returns:
+// - 0 for control characters and the zero-width runes listed below,
+// - 2 when East_Asian_Width is F (Fullwidth) or W (Wide),
+// - 1 otherwise, including runes the table does not cover.
+//
+@(require_results)
+normalized_east_asian_width :: proc(r: rune) -> int {
+	// U+FEFF doubles as the BOM; in the middle of text it is a zero-width no-break space.
+	ZERO_WIDTH_NO_BREAK_SPACE :: '\uFEFF'
+
+	switch {
+	case is_control(r):
+		return 0
+	case r <= 0x10FF:
+		// Cheap exit: the table's first row maps all of 0x0000..0x10FF to 1.
+		return 1
+	}
+
+	switch r {
+	case ZERO_WIDTH_SPACE, ZERO_WIDTH_NON_JOINER, ZERO_WIDTH_JOINER, WORD_JOINER, ZERO_WIDTH_NO_BREAK_SPACE:
+		return 0
+	}
+
+	// `binary_search` gives the offset of a candidate (first, last, width) triple; a negative offset is a miss.
+	table := normalized_east_asian_width_ranges[:]
+	code := i32(r)
+	at := binary_search(code, table, len(table)/3, 3)
+	if at < 0 || code < table[at] || code > table[at+1] {
+		return 1
+	}
+	return int(table[at+2])
+}
+
 //
 // End of Unicode 15.1.0 block.
 //
--- a/core/unicode/tables.odin
+++ b/core/unicode/tables.odin
@@ -3716,6 +3716,177 @@ indic_conjunct_break_extend_ranges := [?]i32 {
 	0x1E944, 0x1E94A,
 }

+// Flat list of (first rune, last rune, width) triples in ascending order, searched with a stride of 3.
+// East_Asian_Width Fullwidth (F) and Wide (W) ranges are 2; every other listed range is 1.
+//
+// Derived from: https://unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt
+@(rodata)
+normalized_east_asian_width_ranges := [?]i32 {
+	0x0000, 0x10FF, 1,
+	0x1100, 0x115F, 2,
+	0x1160, 0x2319, 1,
+	0x231A, 0x231B, 2,
+	0x231C, 0x2328, 1,
+	0x2329, 0x232A, 2,
+	0x232B, 0x23E8, 1,
+	0x23E9, 0x23EC, 2,
+	0x23ED, 0x23EF, 1,
+	0x23F0, 0x23F0, 2,
+	0x23F1, 0x23F2, 1,
+	0x23F3, 0x23F3, 2,
+	0x23F4, 0x25FC, 1,
+	0x25FD, 0x25FE, 2,
+	0x25FF, 0x2613, 1,
+	0x2614, 0x2615, 2,
+	0x2616, 0x2647, 1,
+	0x2648, 0x2653, 2,
+	0x2654, 0x267E, 1,
+	0x267F, 0x267F, 2,
+	0x2680, 0x2692, 1,
+	0x2693, 0x2693, 2,
+	0x2694, 0x26A0, 1,
+	0x26A1, 0x26A1, 2,
+	0x26A2, 0x26A9, 1,
+	0x26AA, 0x26AB, 2,
+	0x26AC, 0x26BC, 1,
+	0x26BD, 0x26BE, 2,
+	0x26BF, 0x26C3, 1,
+	0x26C4, 0x26C5, 2,
+	0x26C6, 0x26CD, 1,
+	0x26CE, 0x26CE, 2,
+	0x26CF, 0x26D3, 1,
+	0x26D4, 0x26D4, 2,
+	0x26D5, 0x26E9, 1,
+	0x26EA, 0x26EA, 2,
+	0x26EB, 0x26F1, 1,
+	0x26F2, 0x26F3, 2,
+	0x26F4, 0x26F4, 1,
+	0x26F5, 0x26F5, 2,
+	0x26F6, 0x26F9, 1,
+	0x26FA, 0x26FA, 2,
+	0x26FB, 0x26FC, 1,
+	0x26FD, 0x26FD, 2,
+	0x26FE, 0x2704, 1,
+	0x2705, 0x2705, 2,
+	0x2706, 0x2709, 1,
+	0x270A, 0x270B, 2,
+	0x270C, 0x2727, 1,
+	0x2728, 0x2728, 2,
+	0x2729, 0x274B, 1,
+	0x274C, 0x274C, 2,
+	0x274D, 0x274D, 1,
+	0x274E, 0x274E, 2,
+	0x274F, 0x2752, 1,
+	0x2753, 0x2755, 2,
+	0x2756, 0x2756, 1,
+	0x2757, 0x2757, 2,
+	0x2758, 0x2794, 1,
+	0x2795, 0x2797, 2,
+	0x2798, 0x27AF, 1,
+	0x27B0, 0x27B0, 2,
+	0x27B1, 0x27BE, 1,
+	0x27BF, 0x27BF, 2,
+	0x27C0, 0x2B1A, 1,
+	0x2B1B, 0x2B1C, 2,
+	0x2B1D, 0x2B4F, 1,
+	0x2B50, 0x2B50, 2,
+	0x2B51, 0x2B54, 1,
+	0x2B55, 0x2B55, 2,
+	0x2B56, 0x2E5D, 1,
+	0x2E80, 0x303E, 2,
+	0x303F, 0x303F, 1,
+	0x3041, 0x3247, 2,
+	0x3248, 0x324F, 1,
+	0x3250, 0x4DBF, 2,
+	0x4DC0, 0x4DFF, 1,
+	0x4E00, 0xA4C6, 2,
+	0xA4D0, 0xA95F, 1,
+	0xA960, 0xA97C, 2,
+	0xA980, 0xABF9, 1,
+	0xAC00, 0xD7A3, 2,
+	0xD7B0, 0xF8FF, 1,
+	0xF900, 0xFAFF, 2,
+	0xFB00, 0xFE0F, 1,
+	0xFE10, 0xFE19, 2,
+	0xFE20, 0xFE2F, 1,
+	0xFE30, 0xFE6B, 2,
+	0xFE70, 0xFEFF, 1,
+	0xFF01, 0xFF60, 2,
+	0xFF61, 0xFFDC, 1,
+	0xFFE0, 0xFFE6, 2,
+	0xFFE8, 0x16F9F, 1,
+	0x16FE0, 0x1B2FB, 2,
+	0x1BC00, 0x1F003, 1,
+	0x1F004, 0x1F004, 2,
+	0x1F005, 0x1F0CE, 1,
+	0x1F0CF, 0x1F0CF, 2,
+	0x1F0D1, 0x1F18D, 1,
+	0x1F18E, 0x1F18E, 2,
+	0x1F18F, 0x1F190, 1,
+	0x1F191, 0x1F19A, 2,
+	0x1F19B, 0x1F1FF, 1,
+	0x1F200, 0x1F320, 2,
+	0x1F321, 0x1F32C, 1,
+	0x1F32D, 0x1F335, 2,
+	0x1F336, 0x1F336, 1,
+	0x1F337, 0x1F37C, 2,
+	0x1F37D, 0x1F37D, 1,
+	0x1F37E, 0x1F393, 2,
+	0x1F394, 0x1F39F, 1,
+	0x1F3A0, 0x1F3CA, 2,
+	0x1F3CB, 0x1F3CE, 1,
+	0x1F3CF, 0x1F3D3, 2,
+	0x1F3D4, 0x1F3DF, 1,
+	0x1F3E0, 0x1F3F0, 2,
+	0x1F3F1, 0x1F3F3, 1,
+	0x1F3F4, 0x1F3F4, 2,
+	0x1F3F5, 0x1F3F7, 1,
+	0x1F3F8, 0x1F43E, 2,
+	0x1F43F, 0x1F43F, 1,
+	0x1F440, 0x1F440, 2,
+	0x1F441, 0x1F441, 1,
+	0x1F442, 0x1F4FC, 2,
+	0x1F4FD, 0x1F4FE, 1,
+	0x1F4FF, 0x1F53D, 2,
+	0x1F53E, 0x1F54A, 1,
+	0x1F54B, 0x1F54E, 2,
+	0x1F54F, 0x1F54F, 1,
+	0x1F550, 0x1F567, 2,
+	0x1F568, 0x1F579, 1,
+	0x1F57A, 0x1F57A, 2,
+	0x1F57B, 0x1F594, 1,
+	0x1F595, 0x1F596, 2,
+	0x1F597, 0x1F5A3, 1,
+	0x1F5A4, 0x1F5A4, 2,
+	0x1F5A5, 0x1F5FA, 1,
+	0x1F5FB, 0x1F64F, 2,
+	0x1F650, 0x1F67F, 1,
+	0x1F680, 0x1F6C5, 2,
+	0x1F6C6, 0x1F6CB, 1,
+	0x1F6CC, 0x1F6CC, 2,
+	0x1F6CD, 0x1F6CF, 1,
+	0x1F6D0, 0x1F6D2, 2,
+	0x1F6D3, 0x1F6D4, 1,
+	0x1F6D5, 0x1F6DF, 2,
+	0x1F6E0, 0x1F6EA, 1,
+	0x1F6EB, 0x1F6EC, 2,
+	0x1F6F0, 0x1F6F3, 1,
+	0x1F6F4, 0x1F6FC, 2,
+	0x1F700, 0x1F7D9, 1,
+	0x1F7E0, 0x1F7F0, 2,
+	0x1F800, 0x1F90B, 1,
+	0x1F90C, 0x1F93A, 2,
+	0x1F93B, 0x1F93B, 1,
+	0x1F93C, 0x1F945, 2,
+	0x1F946, 0x1F946, 1,
+	0x1F947, 0x1F9FF, 2,
+	0x1FA00, 0x1FA6D, 1,
+	0x1FA70, 0x1FAF8, 2,
+	0x1FB00, 0x1FBF9, 1,
+	0x20000, 0x3FFFD, 2,
+	0xE0001, 0x10FFFD, 1,
+}
+
 //
 // End of Unicode 15.1.0 block.
 //
--- a/core/unicode/utf8/grapheme.odin
+++ b/core/unicode/utf8/grapheme.odin
@@ -17,11 +17,13 @@ is_spacing_mark                   :: unicode.is_spacing_mark
 is_gcb_prepend_class              :: unicode.is_gcb_prepend_class
 is_emoji_extended_pictographic    :: unicode.is_emoji_extended_pictographic
 is_regional_indicator             :: unicode.is_regional_indicator
+normalized_east_asian_width       :: unicode.normalized_east_asian_width


 Grapheme :: struct {
 	byte_index: int,
 	rune_index: int,
+	width: int, // Estimated width in monospace cells; zero-width graphemes may report 0.
 }

 /*
@@ -33,10 +35,11 @@ Inputs:
 Returns:
 - graphemes: The number of graphemes in the string.
 - runes: The number of runes in the string.
+- width: The width of the string in number of monospace cells.
 */
 @(require_results)
-grapheme_count :: proc(str: string) -> (graphemes, runes: int) {
-	_, graphemes, runes = decode_grapheme_clusters(str, false)
+grapheme_count :: proc(str: string) -> (graphemes, runes, width: int) {
+	_, graphemes, runes, width = decode_grapheme_clusters(str, false) // discard the grapheme list; only the totals are returned
 	return
 }

@@ -54,6 +57,7 @@ Returns:
 - graphemes: Extra data about each grapheme.
 - grapheme_count: The number of graphemes in the string.
 - rune_count: The number of runes in the string.
+- width: The width of the string in number of monospace cells.
 */
 @(require_results)
 decode_grapheme_clusters :: proc(
@@ -64,6 +68,7 @@ decode_grapheme_clusters :: proc(
 	graphemes:      [dynamic]Grapheme,
 	grapheme_count: int,
 	rune_count:     int,
+	width:          int,
 ) {
 	// The following procedure implements text segmentation by breaking on
 	// Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
@@ -115,6 +120,24 @@ decode_grapheme_clusters :: proc(
 	// [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
 	// [4]: https://www.unicode.org/reports/tr29/#Conformance

+	// Additionally, this procedure now takes into account Standard Annex #11,
+	// in order to estimate how visually wide the string will appear on a
+	// monospaced display. This can only ever be a rough guess, as this tends
+	// to be an implementation detail relating to which fonts are being used,
+	// how codepoints are interpreted and drawn, if codepoint sequences are
+	// interpreted correctly, et cetera.
+	//
+	// For example, a program may not properly interpret an emoji modifier
+	// sequence and print the component glyphs instead of one whole glyph.
+	//
+	// See here for more information: https://www.unicode.org/reports/tr11/
+	//
+	// NOTE: There is no explicit mention of what to do with zero-width spaces
+	// as far as grapheme cluster segmentation goes, therefore this
+	// implementation may count and return graphemes with a `width` of zero.
+	//
+	// Zero-width spaces are therefore treated like any other space.
+
 	Grapheme_Cluster_Sequence :: enum {
 		None,
 		Indic,
@@ -127,6 +150,7 @@ decode_grapheme_clusters :: proc(
 	last_rune: rune
 	last_rune_breaks_forward: bool

+	last_width: int
 	last_grapheme_count: int

 	bypass_next_rune: bool
@@ -145,10 +169,19 @@ decode_grapheme_clusters :: proc(
 			if rune_count == 0 && grapheme_count == 0 {
 				grapheme_count += 1
 			}
-			if track_graphemes && grapheme_count > last_grapheme_count {
-				append(&graphemes, Grapheme{ byte_index, rune_count })
+
+			if grapheme_count > last_grapheme_count {
+				width += normalized_east_asian_width(this_rune)
+				if track_graphemes {
+					append(&graphemes, Grapheme{
+						byte_index,
+						rune_count,
+						width - last_width,
+					})
+				}
+				last_grapheme_count = grapheme_count
+				last_width = width
 			}
-			last_grapheme_count = grapheme_count

 			last_rune = this_rune
 			rune_count += 1
--- a/tests/core/unicode/test_core_unicode.odin
+++ b/tests/core/unicode/test_core_unicode.odin
@@ -13,7 +13,7 @@ run_test_cases :: proc(t: ^testing.T, test_cases: []Test_Case, loc := #caller_lo
 	failed := 0
 	for c, i in test_cases {
 		log.debugf("(#% 4i) %q ...", i, c.str)
-		result, _ := utf8.grapheme_count(c.str)
+		result, _, _ := utf8.grapheme_count(c.str)
 		if !testing.expectf(t, result == c.expected_clusters,
 			"(#% 4i) graphemes: %i != %i, %q %s", i, result, c.expected_clusters, c.str, c.str,
 			loc = loc)
@@ -43,7 +43,7 @@ test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {

 	str := SAMPLE_1 + SAMPLE_2 + SAMPLE_3 + SAMPLE_2 + SAMPLE_1

-	graphemes, _, _ := utf8.decode_grapheme_clusters(str)
+	graphemes, _, _, _ := utf8.decode_grapheme_clusters(str)
 	defer delete(graphemes)

 	defer if testing.failed(t) {
@@ -71,3 +71,65 @@ test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {
 	testing.expectf(t, grapheme_4 == SAMPLE_2, "expected %q, got %q", SAMPLE_2, grapheme_2)
 	testing.expectf(t, grapheme_5 == SAMPLE_1, "expected %q, got %q", SAMPLE_1, grapheme_1)
 }
+
+@test
+test_width :: proc(t: ^testing.T) {
+	{
+		// ZWJ (U+200D) does not start a cluster of its own and adds no width.
+		graphemes, _, width := utf8.grapheme_count("He\u200dllo")
+		testing.expect_value(t, graphemes, 5)
+		testing.expect_value(t, width, 5)
+	}
+
+	{
+		// A zero-width space (U+200B) is still considered a grapheme as far as
+		// the specification is concerned, so it is counted as a cluster even
+		// though it contributes nothing to the width.
+		graphemes, _, width := utf8.grapheme_count("He\u200bllo")
+		testing.expect_value(t, graphemes, 6)
+		testing.expect_value(t, width, 5)
+	}
+
+	{
+		// Emoji, skin-tone modifier, ZWJ and gender sign: one cluster, two cells.
+		graphemes, _, width := utf8.grapheme_count("\U0001F926\U0001F3FC\u200D\u2642")
+		testing.expect_value(t, graphemes, 1)
+		testing.expect_value(t, width, 2)
+	}
+
+	{
+		// Each base character carries a combining overlay, which adds no width.
+		graphemes, _, width := utf8.grapheme_count("H̷e̶l̵l̸o̴p̵e̷ ̸w̶o̸r̵l̶d̵!̴")
+		testing.expect_value(t, graphemes, 14)
+		testing.expect_value(t, width, 14)
+	}
+
+	{
+		// Narrow and wide runes mixed: check each grapheme's width and the total.
+		graphemes, grapheme_count, _, width := utf8.decode_grapheme_clusters("aカ.ヒフ")
+		defer delete(graphemes)
+		testing.expect_value(t, grapheme_count, 5)
+		testing.expect_value(t, width, 8)
+		if grapheme_count == 5 {
+			testing.expect_value(t, graphemes[0].width, 1)
+			testing.expect_value(t, graphemes[1].width, 2)
+			testing.expect_value(t, graphemes[2].width, 1)
+			testing.expect_value(t, graphemes[3].width, 2)
+			testing.expect_value(t, graphemes[4].width, 2)
+		}
+	}
+
+	{
+		// Hiragana only: every grapheme is two cells wide.
+		graphemes, _, width := utf8.grapheme_count("いろはにほへ")
+		testing.expect_value(t, graphemes, 6)
+		testing.expect_value(t, width, 12)
+	}
+
+	{
+		// Han ideographs with fullwidth punctuation: two cells per grapheme.
+		graphemes, _, width := utf8.grapheme_count("舍利弗，是諸法空相，不生不滅，不垢不淨，不增不減。")
+		testing.expect_value(t, graphemes, 25)
+		testing.expect_value(t, width, 50)
+	}
+}