Measure East_Asian_Width during grapheme decoding
@@ -5,8 +5,10 @@ REPLACEMENT_CHAR :: '\ufffd' // Represents an invalid code point
MAX_ASCII :: '\u007f' // Maximum ASCII value
MAX_LATIN1 :: '\u00ff' // Maximum Latin-1 value

ZERO_WIDTH_SPACE :: '\u200B'
ZERO_WIDTH_NON_JOINER :: '\u200C'
ZERO_WIDTH_JOINER :: '\u200D'
WORD_JOINER :: '\u2060'

@(require_results)
binary_search :: proc(c: i32, table: []i32, length, stride: int) -> int {
@@ -450,6 +452,41 @@ is_gcb_extend_class :: proc(r: rune) -> bool {
	return is_grapheme_extend(r) || is_emoji_modifier(r)
}

// Return values:
//
// - 2 if East_Asian_Width is F (Fullwidth) or W (Wide),
// - 0 if the rune is non-printable or zero-width, or
// - 1 in all other cases.
//
@(require_results)
normalized_east_asian_width :: proc(r: rune) -> int {
	// This is a different interpretation of the BOM, for when it occurs in the middle of text.
	ZERO_WIDTH_NO_BREAK_SPACE :: '\uFEFF'

	if is_control(r) {
		return 0
	} else if r <= 0x10FF {
		// Easy early out for low runes.
		return 1
	}

	switch r {
	case ZERO_WIDTH_NO_BREAK_SPACE,
		ZERO_WIDTH_SPACE,
		ZERO_WIDTH_NON_JOINER,
		ZERO_WIDTH_JOINER,
		WORD_JOINER:
		return 0
	}

	c := i32(r)
	p := binary_search(c, normalized_east_asian_width_ranges[:], len(normalized_east_asian_width_ranges)/3, 3)
	if p >= 0 && normalized_east_asian_width_ranges[p] <= c && c <= normalized_east_asian_width_ranges[p+1] {
		return cast(int)normalized_east_asian_width_ranges[p+2]
	}
	return 1
}
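
A brief usage sketch (illustrative only, not part of this commit): summing the per-rune result gives a rough width estimate for a whole string. The grapheme-aware accumulation added later in this commit is the better measure for display purposes, since continuation runes of a cluster contribute nothing there.

package example

import "core:unicode"

// Rough estimate: adds up the normalized East Asian width of every rune.
rough_width :: proc(s: string) -> (total: int) {
	for r in s {
		total += unicode.normalized_east_asian_width(r)
	}
	return
}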

//
// End of Unicode 15.1.0 block.
//

@@ -3716,6 +3716,177 @@ indic_conjunct_break_extend_ranges := [?]i32 {
	0x1E944, 0x1E94A,
}

// Fullwidth (F) and Wide (W) are counted as 2.
// Everything else is 1.
//
// Derived from: https://unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt
@(rodata)
normalized_east_asian_width_ranges := [?]i32 {
	0x0000, 0x10FF, 1,
	0x1100, 0x115F, 2,
	0x1160, 0x2319, 1,
	0x231A, 0x231B, 2,
	0x231C, 0x2328, 1,
	0x2329, 0x232A, 2,
	0x232B, 0x23E8, 1,
	0x23E9, 0x23EC, 2,
	0x23ED, 0x23EF, 1,
	0x23F0, 0x23F0, 2,
	0x23F1, 0x23F2, 1,
	0x23F3, 0x23F3, 2,
	0x23F4, 0x25FC, 1,
	0x25FD, 0x25FE, 2,
	0x25FF, 0x2613, 1,
	0x2614, 0x2615, 2,
	0x2616, 0x2647, 1,
	0x2648, 0x2653, 2,
	0x2654, 0x267E, 1,
	0x267F, 0x267F, 2,
	0x2680, 0x2692, 1,
	0x2693, 0x2693, 2,
	0x2694, 0x26A0, 1,
	0x26A1, 0x26A1, 2,
	0x26A2, 0x26A9, 1,
	0x26AA, 0x26AB, 2,
	0x26AC, 0x26BC, 1,
	0x26BD, 0x26BE, 2,
	0x26BF, 0x26C3, 1,
	0x26C4, 0x26C5, 2,
	0x26C6, 0x26CD, 1,
	0x26CE, 0x26CE, 2,
	0x26CF, 0x26D3, 1,
	0x26D4, 0x26D4, 2,
	0x26D5, 0x26E9, 1,
	0x26EA, 0x26EA, 2,
	0x26EB, 0x26F1, 1,
	0x26F2, 0x26F3, 2,
	0x26F4, 0x26F4, 1,
	0x26F5, 0x26F5, 2,
	0x26F6, 0x26F9, 1,
	0x26FA, 0x26FA, 2,
	0x26FB, 0x26FC, 1,
	0x26FD, 0x26FD, 2,
	0x26FE, 0x2704, 1,
	0x2705, 0x2705, 2,
	0x2706, 0x2709, 1,
	0x270A, 0x270B, 2,
	0x270C, 0x2727, 1,
	0x2728, 0x2728, 2,
	0x2729, 0x274B, 1,
	0x274C, 0x274C, 2,
	0x274D, 0x274D, 1,
	0x274E, 0x274E, 2,
	0x274F, 0x2752, 1,
	0x2753, 0x2755, 2,
	0x2756, 0x2756, 1,
	0x2757, 0x2757, 2,
	0x2758, 0x2794, 1,
	0x2795, 0x2797, 2,
	0x2798, 0x27AF, 1,
	0x27B0, 0x27B0, 2,
	0x27B1, 0x27BE, 1,
	0x27BF, 0x27BF, 2,
	0x27C0, 0x2B1A, 1,
	0x2B1B, 0x2B1C, 2,
	0x2B1D, 0x2B4F, 1,
	0x2B50, 0x2B50, 2,
	0x2B51, 0x2B54, 1,
	0x2B55, 0x2B55, 2,
	0x2B56, 0x2E5D, 1,
	0x2E80, 0x303E, 2,
	0x303F, 0x303F, 1,
	0x3041, 0x3247, 2,
	0x3248, 0x324F, 1,
	0x3250, 0x4DBF, 2,
	0x4DC0, 0x4DFF, 1,
	0x4E00, 0xA4C6, 2,
	0xA4D0, 0xA95F, 1,
	0xA960, 0xA97C, 2,
	0xA980, 0xABF9, 1,
	0xAC00, 0xD7A3, 2,
	0xD7B0, 0xF8FF, 1,
	0xF900, 0xFAFF, 2,
	0xFB00, 0xFE0F, 1,
	0xFE10, 0xFE19, 2,
	0xFE20, 0xFE2F, 1,
	0xFE30, 0xFE6B, 2,
	0xFE70, 0xFEFF, 1,
	0xFF01, 0xFF60, 2,
	0xFF61, 0xFFDC, 1,
	0xFFE0, 0xFFE6, 2,
	0xFFE8, 0x16F9F, 1,
	0x16FE0, 0x1B2FB, 2,
	0x1BC00, 0x1F003, 1,
	0x1F004, 0x1F004, 2,
	0x1F005, 0x1F0CE, 1,
	0x1F0CF, 0x1F0CF, 2,
	0x1F0D1, 0x1F18D, 1,
	0x1F18E, 0x1F18E, 2,
	0x1F18F, 0x1F190, 1,
	0x1F191, 0x1F19A, 2,
	0x1F19B, 0x1F1FF, 1,
	0x1F200, 0x1F320, 2,
	0x1F321, 0x1F32C, 1,
	0x1F32D, 0x1F335, 2,
	0x1F336, 0x1F336, 1,
	0x1F337, 0x1F37C, 2,
	0x1F37D, 0x1F37D, 1,
	0x1F37E, 0x1F393, 2,
	0x1F394, 0x1F39F, 1,
	0x1F3A0, 0x1F3CA, 2,
	0x1F3CB, 0x1F3CE, 1,
	0x1F3CF, 0x1F3D3, 2,
	0x1F3D4, 0x1F3DF, 1,
	0x1F3E0, 0x1F3F0, 2,
	0x1F3F1, 0x1F3F3, 1,
	0x1F3F4, 0x1F3F4, 2,
	0x1F3F5, 0x1F3F7, 1,
	0x1F3F8, 0x1F43E, 2,
	0x1F43F, 0x1F43F, 1,
	0x1F440, 0x1F440, 2,
	0x1F441, 0x1F441, 1,
	0x1F442, 0x1F4FC, 2,
	0x1F4FD, 0x1F4FE, 1,
	0x1F4FF, 0x1F53D, 2,
	0x1F53E, 0x1F54A, 1,
	0x1F54B, 0x1F54E, 2,
	0x1F54F, 0x1F54F, 1,
	0x1F550, 0x1F567, 2,
	0x1F568, 0x1F579, 1,
	0x1F57A, 0x1F57A, 2,
	0x1F57B, 0x1F594, 1,
	0x1F595, 0x1F596, 2,
	0x1F597, 0x1F5A3, 1,
	0x1F5A4, 0x1F5A4, 2,
	0x1F5A5, 0x1F5FA, 1,
	0x1F5FB, 0x1F64F, 2,
	0x1F650, 0x1F67F, 1,
	0x1F680, 0x1F6C5, 2,
	0x1F6C6, 0x1F6CB, 1,
	0x1F6CC, 0x1F6CC, 2,
	0x1F6CD, 0x1F6CF, 1,
	0x1F6D0, 0x1F6D2, 2,
	0x1F6D3, 0x1F6D4, 1,
	0x1F6D5, 0x1F6DF, 2,
	0x1F6E0, 0x1F6EA, 1,
	0x1F6EB, 0x1F6EC, 2,
	0x1F6F0, 0x1F6F3, 1,
	0x1F6F4, 0x1F6FC, 2,
	0x1F700, 0x1F7D9, 1,
	0x1F7E0, 0x1F7F0, 2,
	0x1F800, 0x1F90B, 1,
	0x1F90C, 0x1F93A, 2,
	0x1F93B, 0x1F93B, 1,
	0x1F93C, 0x1F945, 2,
	0x1F946, 0x1F946, 1,
	0x1F947, 0x1F9FF, 2,
	0x1FA00, 0x1FA6D, 1,
	0x1FA70, 0x1FAF8, 2,
	0x1FB00, 0x1FBF9, 1,
	0x20000, 0x3FFFD, 2,
	0xE0001, 0x10FFFD, 1,
}
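
The table is laid out as (low, high, width) triples sorted by the low code point, which is why `binary_search` above is called with a stride of 3. As an illustration only (a hypothetical helper in the same package, not part of the commit), the equivalent lookup written as a plain linear scan:

// Hypothetical, unoptimized equivalent of the stride-3 binary search lookup.
// Control and zero-width runes are handled separately in normalized_east_asian_width.
east_asian_width_linear :: proc(r: rune) -> int {
	c := i32(r)
	for i := 0; i + 2 < len(normalized_east_asian_width_ranges); i += 3 {
		lo := normalized_east_asian_width_ranges[i]
		hi := normalized_east_asian_width_ranges[i+1]
		if lo <= c && c <= hi {
			return int(normalized_east_asian_width_ranges[i+2])
		}
	}
	return 1 // not covered by any range: assume one cell
}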

//
// End of Unicode 15.1.0 block.
//

@@ -17,11 +17,13 @@ is_spacing_mark :: unicode.is_spacing_mark
is_gcb_prepend_class :: unicode.is_gcb_prepend_class
is_emoji_extended_pictographic :: unicode.is_emoji_extended_pictographic
is_regional_indicator :: unicode.is_regional_indicator
normalized_east_asian_width :: unicode.normalized_east_asian_width


Grapheme :: struct {
	byte_index: int,
	rune_index: int,
	width: int,
}
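
For context (an illustrative sketch, not part of the diff), the new per-grapheme `width` field can be consumed like this, assuming `core:unicode/utf8` as in the tests below:

package example

import "core:fmt"
import "core:unicode/utf8"

main :: proc() {
	// Mixed narrow/wide input: 'a' occupies 1 cell, 'カ' occupies 2.
	graphemes, grapheme_count, rune_count, width := utf8.decode_grapheme_clusters("aカ")
	defer delete(graphemes)

	fmt.println(grapheme_count, rune_count, width) // 2 2 3
	for g in graphemes {
		fmt.println(g.byte_index, g.rune_index, g.width)
	}
}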

/*
@@ -33,10 +35,11 @@ Inputs:
Returns:
- graphemes: The number of graphemes in the string.
- runes: The number of runes in the string.
- width: The width of the string in number of monospace cells.
*/
@(require_results)
grapheme_count :: proc(str: string) -> (graphemes, runes: int) {
	_, graphemes, runes = decode_grapheme_clusters(str, false)
grapheme_count :: proc(str: string) -> (graphemes, runes, width: int) {
	_, graphemes, runes, width = decode_grapheme_clusters(str, false)
	return
}
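
A minimal call-site sketch for the new signature (illustrative, not from the diff):

package example

import "core:fmt"
import "core:unicode/utf8"

main :: proc() {
	graphemes, runes, width := utf8.grapheme_count("いろは")
	fmt.println(graphemes, runes, width) // 3 3 6: three hiragana, each two cells wide
}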

@@ -54,6 +57,7 @@ Returns:
- graphemes: Extra data about each grapheme.
- grapheme_count: The number of graphemes in the string.
- rune_count: The number of runes in the string.
- width: The width of the string in number of monospace cells.
*/
@(require_results)
decode_grapheme_clusters :: proc(
@@ -64,6 +68,7 @@ decode_grapheme_clusters :: proc(
	graphemes: [dynamic]Grapheme,
	grapheme_count: int,
	rune_count: int,
	width: int,
) {
	// The following procedure implements text segmentation by breaking on
	// Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
@@ -115,6 +120,24 @@ decode_grapheme_clusters :: proc(
	// [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
	// [4]: https://www.unicode.org/reports/tr29/#Conformance

	// Additionally, this procedure now takes Unicode Standard Annex #11 into
	// account in order to estimate how visually wide the string will appear
	// on a monospaced display. This can only ever be a rough guess, as the
	// result depends on implementation details: which fonts are being used,
	// how codepoints are interpreted and drawn, whether codepoint sequences
	// are interpreted correctly, and so on.
	//
	// For example, a program may not properly interpret an emoji modifier
	// sequence and print the component glyphs instead of one whole glyph.
	//
	// See here for more information: https://www.unicode.org/reports/tr11/
	//
	// NOTE: There is no explicit mention of what to do with zero-width
	// spaces as far as grapheme cluster segmentation goes, so this
	// implementation may count and return graphemes with a `width` of zero.
	//
	// Treat them as any other space.

	Grapheme_Cluster_Sequence :: enum {
		None,
		Indic,
@@ -127,6 +150,7 @@ decode_grapheme_clusters :: proc(
	last_rune: rune
	last_rune_breaks_forward: bool

	last_width: int
	last_grapheme_count: int

	bypass_next_rune: bool
@@ -145,10 +169,19 @@ decode_grapheme_clusters :: proc(
		if rune_count == 0 && grapheme_count == 0 {
			grapheme_count += 1
		}
		if track_graphemes && grapheme_count > last_grapheme_count {
			append(&graphemes, Grapheme{ byte_index, rune_count })

		if grapheme_count > last_grapheme_count {
			width += normalized_east_asian_width(this_rune)
			if track_graphemes {
				append(&graphemes, Grapheme{
					byte_index,
					rune_count,
					width - last_width,
				})
			}
			last_grapheme_count = grapheme_count
			last_width = width
		}
		last_grapheme_count = grapheme_count

		last_rune = this_rune
		rune_count += 1
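
To make the change above concrete (an illustrative trace, not text from the diff): width only advances on the rune that opens a new cluster. For "e" followed by U+0301 (combining acute), the 'e' starts a cluster and adds normalized_east_asian_width('e') = 1, while the combining mark extends the same cluster and adds nothing; the Grapheme recorded for that cluster therefore gets width - last_width = 1, after which last_width is brought up to date.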

@@ -13,7 +13,7 @@ run_test_cases :: proc(t: ^testing.T, test_cases: []Test_Case, loc := #caller_lo
	failed := 0
	for c, i in test_cases {
		log.debugf("(#% 4i) %q ...", i, c.str)
		result, _ := utf8.grapheme_count(c.str)
		result, _, _ := utf8.grapheme_count(c.str)
		if !testing.expectf(t, result == c.expected_clusters,
			"(#% 4i) graphemes: %i != %i, %q %s", i, result, c.expected_clusters, c.str, c.str,
			loc = loc)
@@ -43,7 +43,7 @@ test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {

	str := SAMPLE_1 + SAMPLE_2 + SAMPLE_3 + SAMPLE_2 + SAMPLE_1

	graphemes, _, _ := utf8.decode_grapheme_clusters(str)
	graphemes, _, _, _ := utf8.decode_grapheme_clusters(str)
	defer delete(graphemes)

	defer if testing.failed(t) {
@@ -71,3 +71,65 @@ test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {
	testing.expectf(t, grapheme_4 == SAMPLE_2, "expected %q, got %q", SAMPLE_2, grapheme_4)
	testing.expectf(t, grapheme_5 == SAMPLE_1, "expected %q, got %q", SAMPLE_1, grapheme_5)
}

@test
test_width :: proc(t: ^testing.T) {
	{
		str := "He\u200dllo"
		graphemes, _, width := utf8.grapheme_count(str)
		testing.expect_value(t, graphemes, 5)
		testing.expect_value(t, width, 5)
	}

	{
		// Note that a zero-width space is still considered a grapheme as far
		// as the specification is concerned.
		str := "He\u200bllo"
		graphemes, _, width := utf8.grapheme_count(str)
		testing.expect_value(t, graphemes, 6)
		testing.expect_value(t, width, 5)
	}

	{
		str := "\U0001F926\U0001F3FC\u200D\u2642"
		graphemes, _, width := utf8.grapheme_count(str)
		testing.expect_value(t, graphemes, 1)
		testing.expect_value(t, width, 2)
	}

	{
		str := "H̷e̶l̵l̸o̴p̵e̷ ̸w̶o̸r̵l̶d̵!̴"
		graphemes, _, width := utf8.grapheme_count(str)
		testing.expect_value(t, graphemes, 14)
		testing.expect_value(t, width, 14)
	}

	{
		str := "aカ.ヒフ"
		graphemes, grapheme_count, _, width := utf8.decode_grapheme_clusters(str)
		defer delete(graphemes)
		testing.expect_value(t, grapheme_count, 5)
		testing.expect_value(t, width, 8)
		if grapheme_count == 5 {
			testing.expect_value(t, graphemes[0].width, 1)
			testing.expect_value(t, graphemes[1].width, 2)
			testing.expect_value(t, graphemes[2].width, 1)
			testing.expect_value(t, graphemes[3].width, 2)
			testing.expect_value(t, graphemes[4].width, 2)
		}
	}

	{
		str := "いろはにほへ"
		graphemes, _, width := utf8.grapheme_count(str)
		testing.expect_value(t, graphemes, 6)
		testing.expect_value(t, width, 12)
	}

	{
		str := "舍利弗，是諸法空相，不生不滅，不垢不淨，不增不減。"
		graphemes, _, width := utf8.grapheme_count(str)
		testing.expect_value(t, graphemes, 25)
		testing.expect_value(t, width, 50)
	}
}