Merge pull request #3775 from Feoramund/unicode-graphemes

Add grapheme analysis facilities to `core:unicode`
gingerBill
2024-06-18 12:48:31 +01:00
committed by GitHub
5 changed files with 8084 additions and 4 deletions


@@ -5,6 +5,10 @@ REPLACEMENT_CHAR :: '\ufffd' // Represents an invalid code point
MAX_ASCII :: '\u007f' // Maximum ASCII value
MAX_LATIN1 :: '\u00ff' // Maximum Latin-1 value
ZERO_WIDTH_NON_JOINER :: '\u200C'
ZERO_WIDTH_JOINER :: '\u200D'
@(require_results)
binary_search :: proc(c: i32, table: []i32, length, stride: int) -> int {
n := length
t := 0
@@ -24,6 +28,7 @@ binary_search :: proc(c: i32, table: []i32, length, stride: int) -> int {
return -1
}
@(require_results)
to_lower :: proc(r: rune) -> rune {
c := i32(r)
p := binary_search(c, to_lower_ranges[:], len(to_lower_ranges)/3, 3)
@@ -36,6 +41,7 @@ to_lower :: proc(r: rune) -> rune {
}
return rune(c)
}
@(require_results)
to_upper :: proc(r: rune) -> rune {
c := i32(r)
p := binary_search(c, to_upper_ranges[:], len(to_upper_ranges)/3, 3)
@@ -48,6 +54,7 @@ to_upper :: proc(r: rune) -> rune {
}
return rune(c)
}
@(require_results)
to_title :: proc(r: rune) -> rune {
c := i32(r)
p := binary_search(c, to_title_singlets[:], len(to_title_singlets)/2, 2)
@@ -58,6 +65,7 @@ to_title :: proc(r: rune) -> rune {
}
@(require_results)
is_lower :: proc(r: rune) -> bool {
if r <= MAX_ASCII {
return u32(r)-'a' < 26
@@ -74,6 +82,7 @@ is_lower :: proc(r: rune) -> bool {
return false
}
@(require_results)
is_upper :: proc(r: rune) -> bool {
if r <= MAX_ASCII {
return u32(r)-'A' < 26
@@ -91,6 +100,7 @@ is_upper :: proc(r: rune) -> bool {
}
is_alpha :: is_letter
@(require_results)
is_letter :: proc(r: rune) -> bool {
if u32(r) <= MAX_LATIN1 {
return char_properties[u8(r)]&pLmask != 0
@@ -111,10 +121,12 @@ is_letter :: proc(r: rune) -> bool {
return false
}
@(require_results)
is_title :: proc(r: rune) -> bool {
return is_upper(r) && is_lower(r)
}
@(require_results)
is_digit :: proc(r: rune) -> bool {
if r <= MAX_LATIN1 {
return '0' <= r && r <= '9'
@@ -124,6 +136,7 @@ is_digit :: proc(r: rune) -> bool {
is_white_space :: is_space
@(require_results)
is_space :: proc(r: rune) -> bool {
if u32(r) <= MAX_LATIN1 {
switch r {
@@ -140,18 +153,20 @@ is_space :: proc(r: rune) -> bool {
return false
}
@(require_results)
is_combining :: proc(r: rune) -> bool {
c := i32(r)
return c >= 0x0300 && (c <= 0x036f ||
(c >= 0x1ab0 && c <= 0x1aff) ||
(c >= 0x1dc0 && c <= 0x1dff) ||
(c >= 0x20d0 && c <= 0x20ff) ||
(c >= 0xfe20 && c <= 0xfe2f))
}
@(require_results)
is_graphic :: proc(r: rune) -> bool {
if u32(r) <= MAX_LATIN1 {
return char_properties[u8(r)]&pg != 0
@@ -159,6 +174,7 @@ is_graphic :: proc(r: rune) -> bool {
return false
}
@(require_results)
is_print :: proc(r: rune) -> bool {
if u32(r) <= MAX_LATIN1 {
return char_properties[u8(r)]&pp != 0
@@ -166,6 +182,7 @@ is_print :: proc(r: rune) -> bool {
return false
}
@(require_results)
is_control :: proc(r: rune) -> bool {
if u32(r) <= MAX_LATIN1 {
return char_properties[u8(r)]&pC != 0
@@ -173,6 +190,7 @@ is_control :: proc(r: rune) -> bool {
return false
}
@(require_results)
is_number :: proc(r: rune) -> bool {
if u32(r) <= MAX_LATIN1 {
return char_properties[u8(r)]&pN != 0
@@ -180,6 +198,7 @@ is_number :: proc(r: rune) -> bool {
return false
}
@(require_results)
is_punct :: proc(r: rune) -> bool {
if u32(r) <= MAX_LATIN1 {
return char_properties[u8(r)]&pP != 0
@@ -187,9 +206,249 @@ is_punct :: proc(r: rune) -> bool {
return false
}
@(require_results)
is_symbol :: proc(r: rune) -> bool {
if u32(r) <= MAX_LATIN1 {
return char_properties[u8(r)]&pS != 0
}
return false
}
//
// The procedures below are accurate as of Unicode 15.1.0.
//
// Emoji_Modifier
@(require_results)
is_emoji_modifier :: proc(r: rune) -> bool {
return 0x1F3FB <= r && r <= 0x1F3FF
}
// Regional_Indicator
@(require_results)
is_regional_indicator :: proc(r: rune) -> bool {
return 0x1F1E6 <= r && r <= 0x1F1FF
}
// General_Category=Enclosing_Mark
@(require_results)
is_enclosing_mark :: proc(r: rune) -> bool {
switch r {
case 0x0488,
0x0489,
0x1ABE,
0x20DD ..= 0x20E0,
0x20E2 ..= 0x20E4,
0xA670 ..= 0xA672: return true
}
return false
}
// Prepended_Concatenation_Mark
@(require_results)
is_prepended_concatenation_mark :: proc(r: rune) -> bool {
switch r {
case 0x00600 ..= 0x00605,
0x006DD,
0x0070F,
0x00890 ..= 0x00891,
0x008E2,
0x110BD,
0x110CD:
return true
case:
return false
}
}
// General_Category=Spacing_Mark
@(require_results)
is_spacing_mark :: proc(r: rune) -> bool {
c := i32(r)
p := binary_search(c, spacing_mark_ranges[:], len(spacing_mark_ranges)/2, 2)
if p >= 0 && spacing_mark_ranges[p] <= c && c <= spacing_mark_ranges[p+1] {
return true
}
return false
}
// General_Category=Nonspacing_Mark
@(require_results)
is_nonspacing_mark :: proc(r: rune) -> bool {
c := i32(r)
p := binary_search(c, nonspacing_mark_ranges[:], len(nonspacing_mark_ranges)/2, 2)
if p >= 0 && nonspacing_mark_ranges[p] <= c && c <= nonspacing_mark_ranges[p+1] {
return true
}
return false
}
// Extended_Pictographic
@(require_results)
is_emoji_extended_pictographic :: proc(r: rune) -> bool {
c := i32(r)
p := binary_search(c, emoji_extended_pictographic_ranges[:], len(emoji_extended_pictographic_ranges)/2, 2)
if p >= 0 && emoji_extended_pictographic_ranges[p] <= c && c <= emoji_extended_pictographic_ranges[p+1] {
return true
}
return false
}
// Grapheme_Extend
@(require_results)
is_grapheme_extend :: proc(r: rune) -> bool {
c := i32(r)
p := binary_search(c, grapheme_extend_ranges[:], len(grapheme_extend_ranges)/2, 2)
if p >= 0 && grapheme_extend_ranges[p] <= c && c <= grapheme_extend_ranges[p+1] {
return true
}
return false
}
// Hangul_Syllable_Type=Leading_Jamo
@(require_results)
is_hangul_syllable_leading :: proc(r: rune) -> bool {
return 0x1100 <= r && r <= 0x115F || 0xA960 <= r && r <= 0xA97C
}
// Hangul_Syllable_Type=Vowel_Jamo
@(require_results)
is_hangul_syllable_vowel :: proc(r: rune) -> bool {
return 0x1160 <= r && r <= 0x11A7 || 0xD7B0 <= r && r <= 0xD7C6
}
// Hangul_Syllable_Type=Trailing_Jamo
@(require_results)
is_hangul_syllable_trailing :: proc(r: rune) -> bool {
return 0x11A8 <= r && r <= 0x11FF || 0xD7CB <= r && r <= 0xD7FB
}
// Hangul_Syllable_Type=LV_Syllable
@(require_results)
is_hangul_syllable_lv :: proc(r: rune) -> bool {
c := i32(r)
p := binary_search(c, hangul_syllable_lv_singlets[:], len(hangul_syllable_lv_singlets), 1)
if p >= 0 && c == hangul_syllable_lv_singlets[p] {
return true
}
return false
}
// Hangul_Syllable_Type=LVT_Syllable
@(require_results)
is_hangul_syllable_lvt :: proc(r: rune) -> bool {
c := i32(r)
p := binary_search(c, hangul_syllable_lvt_ranges[:], len(hangul_syllable_lvt_ranges)/2, 2)
if p >= 0 && hangul_syllable_lvt_ranges[p] <= c && c <= hangul_syllable_lvt_ranges[p+1] {
return true
}
return false
}
// Indic_Syllabic_Category=Consonant_Preceding_Repha
@(require_results)
is_indic_consonant_preceding_repha :: proc(r: rune) -> bool {
switch r {
case 0x00D4E,
0x11941,
0x11D46,
0x11F02:
return true
case:
return false
}
}
// Indic_Syllabic_Category=Consonant_Prefixed
@(require_results)
is_indic_consonant_prefixed :: proc(r: rune) -> bool {
switch r {
case 0x111C2 ..= 0x111C3,
0x1193F,
0x11A3A,
0x11A84 ..= 0x11A89:
return true
case:
return false
}
}
// Indic_Conjunct_Break=Linker
@(require_results)
is_indic_conjunct_break_linker :: proc(r: rune) -> bool {
switch r {
case 0x094D,
0x09CD,
0x0ACD,
0x0B4D,
0x0C4D,
0x0D4D:
return true
case:
return false
}
}
// Indic_Conjunct_Break=Consonant
@(require_results)
is_indic_conjunct_break_consonant :: proc(r: rune) -> bool {
c := i32(r)
p := binary_search(c, indic_conjunct_break_consonant_ranges[:], len(indic_conjunct_break_consonant_ranges)/2, 2)
if p >= 0 && indic_conjunct_break_consonant_ranges[p] <= c && c <= indic_conjunct_break_consonant_ranges[p+1] {
return true
}
return false
}
// Indic_Conjunct_Break=Extend
@(require_results)
is_indic_conjunct_break_extend :: proc(r: rune) -> bool {
c := i32(r)
p := binary_search(c, indic_conjunct_break_extend_ranges[:], len(indic_conjunct_break_extend_ranges)/2, 2)
if p >= 0 && indic_conjunct_break_extend_ranges[p] <= c && c <= indic_conjunct_break_extend_ranges[p+1] {
return true
}
return false
}
/*
For grapheme text segmentation, from Unicode TR 29 Rev 43:
```
Indic_Syllabic_Category = Consonant_Preceding_Repha, or
Indic_Syllabic_Category = Consonant_Prefixed, or
Prepended_Concatenation_Mark = Yes
```
*/
@(require_results)
is_gcb_prepend_class :: proc(r: rune) -> bool {
return is_indic_consonant_preceding_repha(r) || is_indic_consonant_prefixed(r) || is_prepended_concatenation_mark(r)
}
/*
For grapheme text segmentation, from Unicode TR 29 Rev 43:
```
Grapheme_Extend = Yes, or
Emoji_Modifier = Yes
This includes:
General_Category = Nonspacing_Mark
General_Category = Enclosing_Mark
U+200C ZERO WIDTH NON-JOINER
plus a few General_Category = Spacing_Mark needed for canonical equivalence.
```
*/
@(require_results)
is_gcb_extend_class :: proc(r: rune) -> bool {
return is_grapheme_extend(r) || is_emoji_modifier(r)
}
//
// End of Unicode 15.1.0 block.
//
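Illustrative usage sketch (not part of this commit): exercising a few of the new classification procedures from user code. The demo package name and sample runes are assumptions.
```
package unicode_class_demo

import "core:fmt"
import "core:unicode"

main :: proc() {
	// Standalone demo; the sample code points span several of the new checks.
	samples := []rune{'a', 0x1F3FB, 0x1F1E6, 0x0300, 0x1100}
	for r in samples {
		fmt.printf("U+%X: extend=%v prepend=%v emoji_mod=%v regional=%v jamo_L=%v\n",
			i32(r),
			unicode.is_gcb_extend_class(r),
			unicode.is_gcb_prepend_class(r),
			unicode.is_emoji_modifier(r),
			unicode.is_regional_indicator(r),
			unicode.is_hangul_syllable_leading(r))
	}
}
```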

File diff suppressed because it is too large.


@@ -0,0 +1,387 @@
package utf8
import "core:unicode"
ZERO_WIDTH_JOINER :: unicode.ZERO_WIDTH_JOINER
is_control :: unicode.is_control
is_hangul_syllable_leading :: unicode.is_hangul_syllable_leading
is_hangul_syllable_vowel :: unicode.is_hangul_syllable_vowel
is_hangul_syllable_trailing :: unicode.is_hangul_syllable_trailing
is_hangul_syllable_lv :: unicode.is_hangul_syllable_lv
is_hangul_syllable_lvt :: unicode.is_hangul_syllable_lvt
is_indic_conjunct_break_extend :: unicode.is_indic_conjunct_break_extend
is_indic_conjunct_break_linker :: unicode.is_indic_conjunct_break_linker
is_indic_conjunct_break_consonant :: unicode.is_indic_conjunct_break_consonant
is_gcb_extend_class :: unicode.is_gcb_extend_class
is_spacing_mark :: unicode.is_spacing_mark
is_gcb_prepend_class :: unicode.is_gcb_prepend_class
is_emoji_extended_pictographic :: unicode.is_emoji_extended_pictographic
is_regional_indicator :: unicode.is_regional_indicator
Grapheme :: struct {
byte_index: int,
rune_index: int,
}
/*
Count the individual graphemes in a UTF-8 string.
Inputs:
- str: The input string.
Returns:
- graphemes: The number of graphemes in the string.
- runes: The number of runes in the string.
*/
@(require_results)
grapheme_count :: proc(str: string) -> (graphemes, runes: int) {
_, graphemes, runes = decode_grapheme_clusters(str, false)
return
}
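Illustrative usage sketch for `grapheme_count` (not part of this diff); the demo package name and sample string are assumptions.
```
package grapheme_count_demo

import "core:fmt"
import "core:unicode/utf8"

main :: proc() {
	// A waving hand plus a skin-tone modifier is two runes but a single grapheme.
	graphemes, runes := utf8.grapheme_count("héllo \U0001F44B\U0001F3FB")
	fmt.printf("graphemes=%v runes=%v\n", graphemes, runes)
}
```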
/*
Decode the individual graphemes in a UTF-8 string.
*Allocates Using Provided Allocator*
Inputs:
- str: The input string.
- track_graphemes: Whether or not to allocate and return `graphemes` with extra data about each grapheme.
- allocator: (default: context.allocator)
Returns:
- graphemes: Extra data about each grapheme (only allocated and filled when `track_graphemes` is true).
- grapheme_count: The number of graphemes in the string.
- rune_count: The number of runes in the string.
*/
@(require_results)
decode_grapheme_clusters :: proc(
str: string,
track_graphemes := true,
allocator := context.allocator,
) -> (
graphemes: [dynamic]Grapheme,
grapheme_count: int,
rune_count: int,
) {
// The following procedure implements text segmentation by breaking on
// Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
// the Unicode® Standard Annex #29, entitled:
//
// UNICODE TEXT SEGMENTATION
//
// Version: Unicode 15.1.0
// Date: 2023-08-16
// Revision: 43
//
// This procedure is conformant[4] to UAX29-C1-1, otherwise known as the
// extended, non-legacy ruleset.
//
// Please see the references below for more information.
//
//
// NOTE(Feoramund): This procedure has not been highly optimized.
// A couple opportunities were taken to bypass repeated checking when a
// rune is outside of certain codepoint ranges, but little else has been
// done. Standard switches, conditionals, and binary search are used to
// see if a rune fits into a certain category.
//
// I did find that only one prior rune of state was necessary to build an
// algorithm that successfully passes all 4,835 test cases provided with
// this implementation from the Unicode organization's website.
//
// My initial implementation tracked explicit breaks and counted them once
// the string iteration had terminated. I've found this current
// implementation to be far simpler, and it needs no allocations (unless
// the caller wants position data).
//
// Most rules work backwards instead of forwards, which has helped keep
// this simple, despite its length and verbosity.
//
//
// The implementation has been left verbose and in the order described by
// the specification, to enable better readability and future upkeep.
//
// Some possible optimizations might include:
//
// - saving the type of `last_rune` instead of the exact rune.
// - reordering rules.
// - combining tables.
//
//
// [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
// [2]: https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
// [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
// [4]: https://www.unicode.org/reports/tr29/#Conformance
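// (Editor's illustration, not part of the original comment: under these
// rules, a flag such as "\U0001F1FA\U0001F1F8" (two Regional_Indicator
// runes) forms a single cluster per GB12/GB13, and a ZWJ emoji sequence
// such as "\U0001F469\u200D\U0001F4BB" forms a single cluster per GB11.)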
Grapheme_Cluster_Sequence :: enum {
None,
Indic,
Emoji,
Regional,
}
context.allocator = allocator
last_rune: rune
last_rune_breaks_forward: bool
last_grapheme_count: int
bypass_next_rune: bool
regional_indicator_counter: int
current_sequence: Grapheme_Cluster_Sequence
continue_sequence: bool
for this_rune, byte_index in str {
defer {
// "Break at the start and end of text, unless the text is empty."
//
// GB1: sot ÷ Any
// GB2: Any ÷ eot
if rune_count == 0 && grapheme_count == 0 {
grapheme_count += 1
}
if track_graphemes && grapheme_count > last_grapheme_count {
append(&graphemes, Grapheme{ byte_index, rune_count })
}
last_grapheme_count = grapheme_count
last_rune = this_rune
rune_count += 1
if !continue_sequence {
current_sequence = .None
regional_indicator_counter = 0
}
continue_sequence = false
}
// "Do not break between a CR and LF. Otherwise, break before and after controls."
//
// GB3: CR × LF
// GB4: (Control | CR | LF) ÷
// GB5: ÷ (Control | CR | LF)
if this_rune == '\n' && last_rune == '\r' {
last_rune_breaks_forward = false
bypass_next_rune = false
continue
}
if is_control(this_rune) {
grapheme_count += 1
last_rune_breaks_forward = true
bypass_next_rune = true
continue
}
// (This check is for rules that work forwards, instead of backwards.)
if bypass_next_rune {
if last_rune_breaks_forward {
grapheme_count += 1
last_rune_breaks_forward = false
}
bypass_next_rune = false
continue
}
// (Optimization 1: Prevent low runes from proceeding further.)
//
// * 0xA9 and 0xAE are in the Extended_Pictographic range,
// which is checked later in GB11.
if this_rune != 0xA9 && this_rune != 0xAE && this_rune <= 0x2FF {
grapheme_count += 1
continue
}
// (Optimization 2: Check if the rune is in the Hangul space before getting specific.)
if 0x1100 <= this_rune && this_rune <= 0xD7FB {
// "Do not break Hangul syllable sequences."
//
// GB6: L × (L | V | LV | LVT)
// GB7: (LV | V) × (V | T)
// GB8: (LVT | T) × T
if is_hangul_syllable_leading(this_rune) ||
is_hangul_syllable_lv(this_rune) ||
is_hangul_syllable_lvt(this_rune)
{
if !is_hangul_syllable_leading(last_rune) {
grapheme_count += 1
}
continue
}
if is_hangul_syllable_vowel(this_rune) {
if is_hangul_syllable_leading(last_rune) ||
is_hangul_syllable_vowel(last_rune) ||
is_hangul_syllable_lv(last_rune)
{
continue
}
grapheme_count += 1
continue
}
if is_hangul_syllable_trailing(this_rune) {
if is_hangul_syllable_trailing(last_rune) ||
is_hangul_syllable_lvt(last_rune) ||
is_hangul_syllable_lv(last_rune) ||
is_hangul_syllable_vowel(last_rune)
{
continue
}
grapheme_count += 1
continue
}
}
// "Do not break before extending characters or ZWJ."
//
// GB9: × (Extend | ZWJ)
if this_rune == ZERO_WIDTH_JOINER {
continue_sequence = true
continue
}
if is_gcb_extend_class(this_rune) {
// (Support for GB9c.)
if current_sequence == .Indic {
if is_indic_conjunct_break_extend(this_rune) && (
is_indic_conjunct_break_linker(last_rune) ||
is_indic_conjunct_break_consonant(last_rune) )
{
continue_sequence = true
continue
}
if is_indic_conjunct_break_linker(this_rune) && (
is_indic_conjunct_break_linker(last_rune) ||
is_indic_conjunct_break_extend(last_rune) ||
is_indic_conjunct_break_consonant(last_rune) )
{
continue_sequence = true
continue
}
continue
}
// (Support for GB11.)
if current_sequence == .Emoji && (
is_gcb_extend_class(last_rune) ||
is_emoji_extended_pictographic(last_rune) )
{
continue_sequence = true
}
continue
}
// _The GB9a and GB9b rules only apply to extended grapheme clusters:_
// "Do not break before SpacingMarks, or after Prepend characters."
//
// GB9a: × SpacingMark
// GB9b: Prepend ×
if is_spacing_mark(this_rune) {
continue
}
if is_gcb_prepend_class(this_rune) {
grapheme_count += 1
bypass_next_rune = true
continue
}
// _The GB9c rule only applies to extended grapheme clusters:_
// "Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker."
//
// GB9c: \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* × \p{InCB=Consonant}
if is_indic_conjunct_break_consonant(this_rune) {
if current_sequence == .Indic {
if last_rune == ZERO_WIDTH_JOINER ||
is_indic_conjunct_break_linker(last_rune)
{
continue_sequence = true
} else {
grapheme_count += 1
}
} else {
grapheme_count += 1
current_sequence = .Indic
continue_sequence = true
}
continue
}
if is_indic_conjunct_break_extend(this_rune) {
if current_sequence == .Indic {
if is_indic_conjunct_break_consonant(last_rune) ||
is_indic_conjunct_break_linker(last_rune)
{
continue_sequence = true
} else {
grapheme_count += 1
}
}
continue
}
if is_indic_conjunct_break_linker(this_rune) {
if current_sequence == .Indic {
if is_indic_conjunct_break_extend(last_rune) ||
is_indic_conjunct_break_linker(last_rune)
{
continue_sequence = true
} else {
grapheme_count += 1
}
}
continue
}
//
// (Curiously, there is no GB10.)
//
// "Do not break within emoji modifier sequences or emoji zwj sequences."
//
// GB11: \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
if is_emoji_extended_pictographic(this_rune) {
if current_sequence != .Emoji || last_rune != ZERO_WIDTH_JOINER {
grapheme_count += 1
}
current_sequence = .Emoji
continue_sequence = true
continue
}
// "Do not break within emoji flag sequences.
// That is, do not break between regional indicator (RI) symbols
// if there is an odd number of RI characters before the break point."
//
// GB12: sot (RI RI)* RI × RI
// GB13: [^RI] (RI RI)* RI × RI
if is_regional_indicator(this_rune) {
if regional_indicator_counter & 1 == 0 {
grapheme_count += 1
}
current_sequence = .Regional
continue_sequence = true
regional_indicator_counter += 1
continue
}
// "Otherwise, break everywhere."
//
// GB999: Any ÷ Any
grapheme_count += 1
}
return
}
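Illustrative usage sketch for `decode_grapheme_clusters` (not part of this diff): slicing a string by cluster using the returned byte indices. The demo package name and sample string are assumptions.
```
package decode_clusters_demo

import "core:fmt"
import "core:unicode/utf8"

main :: proc() {
	str := "e\u0301=mc\u00B2 \U0001F9D1\u200D\U0001F680"
	graphemes, cluster_count, _ := utf8.decode_grapheme_clusters(str)
	defer delete(graphemes)

	// Each byte_index marks where a cluster starts; the next entry
	// (or the end of the string) marks where it ends.
	for g, i in graphemes {
		end := len(str) if i+1 == len(graphemes) else graphemes[i+1].byte_index
		fmt.printf("%v: %q\n", i, str[g.byte_index:end])
	}
	fmt.printf("clusters: %v\n", cluster_count)
}
```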


@@ -0,0 +1,73 @@
package test_core_unicode
import "core:log"
import "core:testing"
import "core:unicode/utf8"
Test_Case :: struct {
str: string,
expected_clusters: int,
}
run_test_cases :: proc(t: ^testing.T, test_cases: []Test_Case, loc := #caller_location) {
failed := 0
for c, i in test_cases {
log.debugf("(#% 4i) %q ...", i, c.str)
result, _ := utf8.grapheme_count(c.str)
if !testing.expectf(t, result == c.expected_clusters,
"(#% 4i) graphemes: %i != %i, %q %s", i, result, c.expected_clusters, c.str, c.str,
loc = loc)
{
failed += 1
}
}
log.logf(.Error if failed > 0 else .Info, "% 4i/% 4i test cases failed.", failed, len(test_cases), location = loc)
}
@test
test_official_gcb_cases :: proc(t: ^testing.T) {
run_test_cases(t, official_grapheme_break_test_cases)
}
@test
test_official_emoji_cases :: proc(t: ^testing.T) {
run_test_cases(t, official_emoji_test_cases)
}
@test
test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {
SAMPLE_1 :: "\U0001F600"
SAMPLE_2 :: "\U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006E\U000E0067\U000E007F"
SAMPLE_3 :: "\U0001F468\U0001F3FB\u200D\U0001F9B0"
str := SAMPLE_1 + SAMPLE_2 + SAMPLE_3 + SAMPLE_2 + SAMPLE_1
graphemes, _, _ := utf8.decode_grapheme_clusters(str)
defer delete(graphemes)
defer if testing.failed(t) {
log.infof("%#v\n%q\n%v", graphemes, str, transmute([]u8)str)
}
if !testing.expect_value(t, len(graphemes), 5) {
return
}
testing.expect_value(t, graphemes[0].rune_index, 0)
testing.expect_value(t, graphemes[1].rune_index, 1)
testing.expect_value(t, graphemes[2].rune_index, 8)
testing.expect_value(t, graphemes[3].rune_index, 12)
testing.expect_value(t, graphemes[4].rune_index, 19)
grapheme_1 := str[graphemes[0].byte_index:graphemes[1].byte_index]
grapheme_2 := str[graphemes[1].byte_index:graphemes[2].byte_index]
grapheme_3 := str[graphemes[2].byte_index:graphemes[3].byte_index]
grapheme_4 := str[graphemes[3].byte_index:graphemes[4].byte_index]
grapheme_5 := str[graphemes[4].byte_index:]
testing.expectf(t, grapheme_1 == SAMPLE_1, "expected %q, got %q", SAMPLE_1, grapheme_1)
testing.expectf(t, grapheme_2 == SAMPLE_2, "expected %q, got %q", SAMPLE_2, grapheme_2)
testing.expectf(t, grapheme_3 == SAMPLE_3, "expected %q, got %q", SAMPLE_3, grapheme_3)
testing.expectf(t, grapheme_4 == SAMPLE_2, "expected %q, got %q", SAMPLE_2, grapheme_4)
testing.expectf(t, grapheme_5 == SAMPLE_1, "expected %q, got %q", SAMPLE_1, grapheme_5)
}

File diff suppressed because it is too large.