Merge pull request #3775 from Feoramund/unicode-graphemes

Add grapheme analysis facilities to `core:unicode`
2026-02-17 08:34:08 +00:00 · 2024-06-18 12:48:31 +01:00
parent e8c17ac356 9e4899d35c
commit de23965ecb
5 changed files with 8084 additions and 4 deletions
--- a/core/unicode/letter.odin
+++ b/core/unicode/letter.odin
@@ -5,6 +5,10 @@ REPLACEMENT_CHAR :: '\ufffd'     // Represented an invalid code point
 MAX_ASCII        :: '\u007f'     // Maximum ASCII value
 MAX_LATIN1       :: '\u00ff'     // Maximum Latin-1 value

+ZERO_WIDTH_NON_JOINER :: '\u200C'
+ZERO_WIDTH_JOINER     :: '\u200D'
+
+@(require_results)
 binary_search :: proc(c: i32, table: []i32, length, stride: int) -> int {
 	n := length
 	t := 0
@@ -24,6 +28,7 @@ binary_search :: proc(c: i32, table: []i32, length, stride: int) -> int {
 	return -1
 }

+@(require_results)
 to_lower :: proc(r: rune) -> rune {
 	c := i32(r)
 	p := binary_search(c, to_lower_ranges[:], len(to_lower_ranges)/3, 3)
@@ -36,6 +41,7 @@ to_lower :: proc(r: rune) -> rune {
 	}
 	return rune(c)
 }
+@(require_results)
 to_upper :: proc(r: rune) -> rune {
 	c := i32(r)
 	p := binary_search(c, to_upper_ranges[:], len(to_upper_ranges)/3, 3)
@@ -48,6 +54,7 @@ to_upper :: proc(r: rune) -> rune {
 	}
 	return rune(c)
 }
+// to_title looks `r` up in `to_title_singlets`, a table searched in pairs (stride 2).
+// The slice and the length handed to `binary_search` must describe the same table.
+@(require_results)
 to_title :: proc(r: rune) -> rune {
 	c := i32(r)
 	p := binary_search(c, to_title_singlets[:], len(to_title_singlets)/2, 2)
@@ -58,6 +65,7 @@ to_title :: proc(r: rune) -> rune {
 }


+@(require_results)
 is_lower :: proc(r: rune) -> bool {
 	if r <= MAX_ASCII {
 		return u32(r)-'a' < 26
@@ -74,6 +82,7 @@ is_lower :: proc(r: rune) -> bool {
 	return false
 }

+@(require_results)
 is_upper :: proc(r: rune) -> bool {
 	if r <= MAX_ASCII {
 		return u32(r)-'A' < 26
@@ -91,6 +100,7 @@ is_upper :: proc(r: rune) -> bool {
 }

 is_alpha :: is_letter
+@(require_results)
 is_letter :: proc(r: rune) -> bool {
 	if u32(r) <= MAX_LATIN1 {
 		return char_properties[u8(r)]&pLmask != 0
@@ -111,10 +121,12 @@ is_letter :: proc(r: rune) -> bool {
 	return false
 }

+// is_title reports whether `r` satisfies both `is_upper` and `is_lower`.
+// NOTE(review): presumably only title-case code points are listed by both predicates — confirm against the tables.
+@(require_results)
 is_title :: proc(r: rune) -> bool {
 	return is_upper(r) && is_lower(r)
 }

+@(require_results)
 is_digit :: proc(r: rune) -> bool {
 	if r <= MAX_LATIN1 {
 		return '0' <= r && r <= '9'
@@ -124,6 +136,7 @@ is_digit :: proc(r: rune) -> bool {


 is_white_space :: is_space
+@(require_results)
 is_space :: proc(r: rune) -> bool {
 	if u32(r) <= MAX_LATIN1 {
 		switch r {
@@ -140,18 +153,20 @@ is_space :: proc(r: rune) -> bool {
 	return false
 }

+// is_combining reports whether `r` lies in one of five hard-coded ranges:
+// U+0300..U+036F, U+1AB0..U+1AFF, U+1DC0..U+1DFF, U+20D0..U+20FF or U+FE20..U+FE2F.
+// The leading `c >= 0x0300` test rejects everything below the first range up front.
+@(require_results)
 is_combining :: proc(r: rune) -> bool {
 	c := i32(r)

 	return c >= 0x0300 && (c <= 0x036f ||
-          (c >= 0x1ab0 && c <= 0x1aff) ||
-          (c >= 0x1dc0 && c <= 0x1dff) ||
-          (c >= 0x20d0 && c <= 0x20ff) ||
-          (c >= 0xfe20 && c <= 0xfe2f))
+	      (c >= 0x1ab0 && c <= 0x1aff) ||
+	      (c >= 0x1dc0 && c <= 0x1dff) ||
+	      (c >= 0x20d0 && c <= 0x20ff) ||
+	      (c >= 0xfe20 && c <= 0xfe2f))
 }



+@(require_results)
 is_graphic :: proc(r: rune) -> bool {
 	if u32(r) <= MAX_LATIN1 {
 		return char_properties[u8(r)]&pg != 0
@@ -159,6 +174,7 @@ is_graphic :: proc(r: rune) -> bool {
 	return false
 }

+@(require_results)
 is_print :: proc(r: rune) -> bool {
 	if u32(r) <= MAX_LATIN1 {
 		return char_properties[u8(r)]&pp != 0
@@ -166,6 +182,7 @@ is_print :: proc(r: rune) -> bool {
 	return false
 }

+@(require_results)
 is_control :: proc(r: rune) -> bool {
 	if u32(r) <= MAX_LATIN1 {
 		return char_properties[u8(r)]&pC != 0
@@ -173,6 +190,7 @@ is_control :: proc(r: rune) -> bool {
 	return false
 }

+@(require_results)
 is_number :: proc(r: rune) -> bool {
 	if u32(r) <= MAX_LATIN1 {
 		return char_properties[u8(r)]&pN != 0
@@ -180,6 +198,7 @@ is_number :: proc(r: rune) -> bool {
 	return false
 }

+@(require_results)
 is_punct :: proc(r: rune) -> bool {
 	if u32(r) <= MAX_LATIN1 {
 		return char_properties[u8(r)]&pP != 0
@@ -187,9 +206,249 @@ is_punct :: proc(r: rune) -> bool {
 	return false
 }

+// is_symbol reports whether `r` has the `pS` bit set in `char_properties`.
+// Only runes up to MAX_LATIN1 are looked up; every rune above that yields false.
+@(require_results)
 is_symbol :: proc(r: rune) -> bool {
 	if u32(r) <= MAX_LATIN1 {
 		return char_properties[u8(r)]&pS != 0
 	}
 	return false
 }
+
+//
+// The procedures below are accurate as of Unicode 15.1.0.
+//
+
+// Emoji_Modifier
+//
+// Reports whether `r` lies in the closed range U+1F3FB..U+1F3FF.
+@(require_results)
+is_emoji_modifier :: proc(r: rune) -> bool {
+	switch r {
+	case 0x1F3FB ..= 0x1F3FF:
+		return true
+	}
+	return false
+}
+
+// Regional_Indicator
+//
+// Reports whether `r` lies in the closed range U+1F1E6..U+1F1FF.
+@(require_results)
+is_regional_indicator :: proc(r: rune) -> bool {
+	switch r {
+	case 0x1F1E6 ..= 0x1F1FF:
+		return true
+	}
+	return false
+}
+
+// General_Category=Enclosing_Mark
+//
+// Reports whether `r` is one of the code points listed below.
+@(require_results)
+is_enclosing_mark :: proc(r: rune) -> bool {
+	switch r {
+	case 0x0488, 0x0489, 0x1ABE:
+		return true
+	case 0x20DD ..= 0x20E0, 0x20E2 ..= 0x20E4:
+		return true
+	case 0xA670 ..= 0xA672:
+		return true
+	case:
+		return false
+	}
+}
+
+// Prepended_Concatenation_Mark
+//
+// Reports whether `r` is one of the code points listed below.
+@(require_results)
+is_prepended_concatenation_mark :: proc(r: rune) -> bool {
+	switch r {
+	case 0x0600 ..= 0x0605, 0x06DD, 0x070F:
+		return true
+	case 0x0890 ..= 0x0891, 0x08E2:
+		return true
+	case 0x110BD, 0x110CD:
+		return true
+	}
+	return false
+}
+
+// General_Category=Spacing_Mark
+//
+// Searches `spacing_mark_ranges` (pairs of inclusive bounds) for a pair containing `r`.
+@(require_results)
+is_spacing_mark :: proc(r: rune) -> bool {
+	code := i32(r)
+	idx := binary_search(code, spacing_mark_ranges[:], len(spacing_mark_ranges)/2, 2)
+	return idx >= 0 && spacing_mark_ranges[idx] <= code && code <= spacing_mark_ranges[idx+1]
+}
+
+// General_Category=Nonspacing_Mark
+//
+// Searches `nonspacing_mark_ranges` (pairs of inclusive bounds) for a pair containing `r`.
+@(require_results)
+is_nonspacing_mark :: proc(r: rune) -> bool {
+	code := i32(r)
+	idx := binary_search(code, nonspacing_mark_ranges[:], len(nonspacing_mark_ranges)/2, 2)
+	return idx >= 0 && nonspacing_mark_ranges[idx] <= code && code <= nonspacing_mark_ranges[idx+1]
+}
+
+// Extended_Pictographic
+//
+// Searches `emoji_extended_pictographic_ranges` (pairs of inclusive bounds) for a pair containing `r`.
+@(require_results)
+is_emoji_extended_pictographic :: proc(r: rune) -> bool {
+	code := i32(r)
+	idx := binary_search(code, emoji_extended_pictographic_ranges[:], len(emoji_extended_pictographic_ranges)/2, 2)
+	return idx >= 0 && emoji_extended_pictographic_ranges[idx] <= code && code <= emoji_extended_pictographic_ranges[idx+1]
+}
+
+// Grapheme_Extend
+//
+// Searches `grapheme_extend_ranges` (pairs of inclusive bounds) for a pair containing `r`.
+@(require_results)
+is_grapheme_extend :: proc(r: rune) -> bool {
+	code := i32(r)
+	idx := binary_search(code, grapheme_extend_ranges[:], len(grapheme_extend_ranges)/2, 2)
+	return idx >= 0 && grapheme_extend_ranges[idx] <= code && code <= grapheme_extend_ranges[idx+1]
+}
+
+
+// Hangul_Syllable_Type=Leading_Jamo
+//
+// Reports whether `r` lies in U+1100..U+115F or U+A960..U+A97C.
+@(require_results)
+is_hangul_syllable_leading :: proc(r: rune) -> bool {
+	switch r {
+	case 0x1100 ..= 0x115F, 0xA960 ..= 0xA97C:
+		return true
+	}
+	return false
+}
+
+// Hangul_Syllable_Type=Vowel_Jamo
+//
+// Reports whether `r` lies in U+1160..U+11A7 or U+D7B0..U+D7C6.
+@(require_results)
+is_hangul_syllable_vowel :: proc(r: rune) -> bool {
+	switch r {
+	case 0x1160 ..= 0x11A7, 0xD7B0 ..= 0xD7C6:
+		return true
+	}
+	return false
+}
+
+// Hangul_Syllable_Type=Trailing_Jamo
+//
+// Reports whether `r` lies in U+11A8..U+11FF or U+D7CB..U+D7FB.
+@(require_results)
+is_hangul_syllable_trailing :: proc(r: rune) -> bool {
+	switch r {
+	case 0x11A8 ..= 0x11FF, 0xD7CB ..= 0xD7FB:
+		return true
+	}
+	return false
+}
+
+// Hangul_Syllable_Type=LV_Syllable
+//
+// Searches `hangul_syllable_lv_singlets` (one entry per code point, stride 1) for an exact match of `r`.
+@(require_results)
+is_hangul_syllable_lv :: proc(r: rune) -> bool {
+	code := i32(r)
+	idx := binary_search(code, hangul_syllable_lv_singlets[:], len(hangul_syllable_lv_singlets), 1)
+	return idx >= 0 && code == hangul_syllable_lv_singlets[idx]
+}
+
+// Hangul_Syllable_Type=LVT_Syllable
+//
+// Searches `hangul_syllable_lvt_ranges` (pairs of inclusive bounds) for a pair containing `r`.
+@(require_results)
+is_hangul_syllable_lvt :: proc(r: rune) -> bool {
+	code := i32(r)
+	idx := binary_search(code, hangul_syllable_lvt_ranges[:], len(hangul_syllable_lvt_ranges)/2, 2)
+	return idx >= 0 && hangul_syllable_lvt_ranges[idx] <= code && code <= hangul_syllable_lvt_ranges[idx+1]
+}
+
+
+// Indic_Syllabic_Category=Consonant_Preceding_Repha
+//
+// Reports whether `r` is one of the four code points listed below.
+@(require_results)
+is_indic_consonant_preceding_repha :: proc(r: rune) -> bool {
+	switch r {
+	case 0x0D4E, 0x11941, 0x11D46, 0x11F02:
+		return true
+	}
+	return false
+}
+
+// Indic_Syllabic_Category=Consonant_Prefixed
+//
+// Reports whether `r` is one of the code points listed below.
+@(require_results)
+is_indic_consonant_prefixed :: proc(r: rune) -> bool {
+	switch r {
+	case 0x111C2 ..= 0x111C3, 0x1193F, 0x11A3A:
+		return true
+	case 0x11A84 ..= 0x11A89:
+		return true
+	}
+	return false
+}
+
+// Indic_Conjunct_Break=Linker
+//
+// Reports whether `r` is one of the six code points listed below.
+@(require_results)
+is_indic_conjunct_break_linker :: proc(r: rune) -> bool {
+	switch r {
+	case 0x094D, 0x09CD, 0x0ACD, 0x0B4D, 0x0C4D, 0x0D4D:
+		return true
+	}
+	return false
+}
+
+// Indic_Conjunct_Break=Consonant
+//
+// Searches `indic_conjunct_break_consonant_ranges` (pairs of inclusive bounds) for a pair containing `r`.
+@(require_results)
+is_indic_conjunct_break_consonant :: proc(r: rune) -> bool {
+	code := i32(r)
+	idx := binary_search(code, indic_conjunct_break_consonant_ranges[:], len(indic_conjunct_break_consonant_ranges)/2, 2)
+	return idx >= 0 && indic_conjunct_break_consonant_ranges[idx] <= code && code <= indic_conjunct_break_consonant_ranges[idx+1]
+}
+
+// Indic_Conjunct_Break=Extend
+//
+// Searches `indic_conjunct_break_extend_ranges` (pairs of inclusive bounds) for a pair containing `r`.
+@(require_results)
+is_indic_conjunct_break_extend :: proc(r: rune) -> bool {
+	code := i32(r)
+	idx := binary_search(code, indic_conjunct_break_extend_ranges[:], len(indic_conjunct_break_extend_ranges)/2, 2)
+	return idx >= 0 && indic_conjunct_break_extend_ranges[idx] <= code && code <= indic_conjunct_break_extend_ranges[idx+1]
+}
+
+
+/*
+Grapheme_Cluster_Break = Prepend, as used for grapheme text segmentation.
+
+Unicode TR 29 Rev 43 defines the class as:
+
+```
+Indic_Syllabic_Category = Consonant_Preceding_Repha, or
+Indic_Syllabic_Category = Consonant_Prefixed, or
+Prepended_Concatenation_Mark = Yes
+```
+
+The three conditions are tested in that order; the first match wins.
+*/
+@(require_results)
+is_gcb_prepend_class :: proc(r: rune) -> bool {
+	if is_indic_consonant_preceding_repha(r) {
+		return true
+	}
+	if is_indic_consonant_prefixed(r) {
+		return true
+	}
+	return is_prepended_concatenation_mark(r)
+}
+
+/*
+Grapheme_Cluster_Break = Extend, as used for grapheme text segmentation.
+
+Unicode TR 29 Rev 43 defines the class as:
+
+```
+Grapheme_Extend = Yes, or
+Emoji_Modifier = Yes
+
+This includes:
+General_Category = Nonspacing_Mark
+General_Category = Enclosing_Mark
+U+200C ZERO WIDTH NON-JOINER
+
+plus a few General_Category = Spacing_Mark needed for canonical equivalence.
+```
+
+`is_grapheme_extend` is consulted first, then `is_emoji_modifier`.
+*/
+@(require_results)
+is_gcb_extend_class :: proc(r: rune) -> bool {
+	if is_grapheme_extend(r) {
+		return true
+	}
+	return is_emoji_modifier(r)
+}
+
+//
+// End of Unicode 15.1.0 block.
+//
--- a/core/unicode/tables.odin
+++ b/core/unicode/tables.odin
--- a/core/unicode/utf8/grapheme.odin
+++ b/core/unicode/utf8/grapheme.odin
@@ -0,0 +1,387 @@
+package utf8
+
+import "core:unicode"
+
+ZERO_WIDTH_JOINER                 :: unicode.ZERO_WIDTH_JOINER
+is_control                        :: unicode.is_control
+is_hangul_syllable_leading        :: unicode.is_hangul_syllable_leading
+is_hangul_syllable_vowel          :: unicode.is_hangul_syllable_vowel
+is_hangul_syllable_trailing       :: unicode.is_hangul_syllable_trailing
+is_hangul_syllable_lv             :: unicode.is_hangul_syllable_lv
+is_hangul_syllable_lvt            :: unicode.is_hangul_syllable_lvt
+is_indic_conjunct_break_extend    :: unicode.is_indic_conjunct_break_extend
+is_indic_conjunct_break_linker    :: unicode.is_indic_conjunct_break_linker
+is_indic_conjunct_break_consonant :: unicode.is_indic_conjunct_break_consonant
+is_gcb_extend_class               :: unicode.is_gcb_extend_class
+is_spacing_mark                   :: unicode.is_spacing_mark
+is_gcb_prepend_class              :: unicode.is_gcb_prepend_class
+is_emoji_extended_pictographic    :: unicode.is_emoji_extended_pictographic
+is_regional_indicator             :: unicode.is_regional_indicator
+
+
+// Position record for one grapheme cluster, as appended by `decode_grapheme_clusters`.
+//
+// `byte_index` is the byte offset and `rune_index` the zero-based rune position
+// of the rune at which the cluster was counted.
+Grapheme :: struct {
+	byte_index, rune_index: int,
+}
+
+/*
+Count the grapheme clusters and runes in a UTF-8 string.
+
+This calls `decode_grapheme_clusters` with `track_graphemes` set to false,
+so no per-grapheme position data is collected.
+
+Inputs:
+- str: The input string.
+
+Returns:
+- graphemes: The number of graphemes in the string.
+- runes: The number of runes in the string.
+*/
+@(require_results)
+grapheme_count :: proc(str: string) -> (graphemes, runes: int) {
+	_, cluster_total, rune_total := decode_grapheme_clusters(str, false)
+	return cluster_total, rune_total
+}
+
+/*
+Decode the individual graphemes in a UTF-8 string.
+
+The string is walked one rune at a time; each rune either starts a new grapheme
+cluster (incrementing `grapheme_count`) or is absorbed into the current one.
+
+*Allocates Using Provided Allocator*
+
+Inputs:
+- str: The input string.
+- track_graphemes: Whether or not to allocate and return `graphemes` with extra data about each grapheme.
+  When false, nothing is appended to `graphemes`.
+- allocator: Installed as `context.allocator` for the duration of the call. (default: context.allocator)
+
+Returns:
+- graphemes: One `Grapheme` per counted cluster, holding the byte offset and rune index of the rune
+  at which the cluster was counted. Only filled when `track_graphemes` is true.
+- grapheme_count: The number of graphemes in the string.
+- rune_count: The number of runes in the string.
+
+NOTE(review): GB4/GB5 are driven by `unicode.is_control`, which appears to consult only runes up to
+U+00FF. UAX #29 Control-class code points above that (e.g. U+2028) would fall through to the later
+rules - confirm whether this is intended.
+*/
+@(require_results)
+decode_grapheme_clusters :: proc(
+	str: string,
+	track_graphemes := true,
+	allocator       := context.allocator,
+) -> (
+	graphemes:      [dynamic]Grapheme,
+	grapheme_count: int,
+	rune_count:     int,
+) {
+	// The following procedure implements text segmentation by breaking on
+	// Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
+	// the Unicode® Standard Annex #29, entitled:
+	//
+	// UNICODE TEXT SEGMENTATION
+	//
+	// Version:  Unicode 15.1.0
+	// Date:     2023-08-16
+	// Revision: 43
+	//
+	// This procedure is conformant[4] to UAX29-C1-1, otherwise known as the
+	// extended, non-legacy ruleset.
+	//
+	// Please see the references below for more information.
+	//
+	//
+	// NOTE(Feoramund): This procedure has not been highly optimized.
+	// A couple opportunities were taken to bypass repeated checking when a
+	// rune is outside of certain codepoint ranges, but little else has been
+	// done. Standard switches, conditionals, and binary search are used to
+	// see if a rune fits into a certain category.
+	//
+	// I did find that only one prior rune of state was necessary to build an
+	// algorithm that successfully passes all 4,835 test cases provided with
+	// this implementation from the Unicode organization's website.
+	//
+	// My initial implementation tracked explicit breaks and counted them once
+	// the string iteration had terminated. I've found this current
+	// implementation to be far simpler and need no allocations (unless the
+	// caller wants position data).
+	//
+	// Most rules work backwards instead of forwards which has helped keep this
+	// simple, despite its length and verbosity.
+	//
+	//
+	// The implementation has been left verbose and in the order described by
+	// the specification, to enable better readability and future upkeep.
+	//
+	// Some possible optimizations might include:
+	//
+	// - saving the type of `last_rune` instead of the exact rune.
+	// - reordering rules.
+	// - combining tables.
+	//
+	//
+	// [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
+	// [2]: https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
+	// [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
+	// [4]: https://www.unicode.org/reports/tr29/#Conformance
+
+	Grapheme_Cluster_Sequence :: enum {
+		None,
+		Indic,
+		Emoji,
+		Regional,
+	}
+
+	context.allocator = allocator
+
+	last_rune: rune // The rune seen on the previous iteration (stored by the deferred block).
+	last_rune_breaks_forward: bool // Set after a Control rune; makes the next bypassed rune start a new cluster.
+
+	last_grapheme_count: int // `grapheme_count` as of the end of the previous iteration.
+
+	bypass_next_rune: bool // When set, the next non-control rune skips every rule below the bypass check.
+
+	regional_indicator_counter: int // Regional_Indicator runes seen in the current uninterrupted run.
+
+	current_sequence: Grapheme_Cluster_Sequence // Multi-rune sequence currently in progress, if any.
+	continue_sequence: bool // Must be set on each iteration to keep `current_sequence` alive.
+
+	for this_rune, byte_index in str {
+		defer {
+			// "Break at the start and end of text, unless the text is empty."
+			//
+			// GB1: sot  ÷  Any
+			// GB2: Any  ÷  eot
+			if rune_count == 0 && grapheme_count == 0 {
+				grapheme_count += 1
+			}
+			if track_graphemes && grapheme_count > last_grapheme_count {
+				append(&graphemes, Grapheme{ byte_index, rune_count })
+			}
+			last_grapheme_count = grapheme_count
+
+			last_rune = this_rune
+			rune_count += 1
+
+			if !continue_sequence {
+				current_sequence = .None
+				regional_indicator_counter = 0
+			}
+			continue_sequence = false
+		}
+
+		// "Do not break between a CR and LF. Otherwise, break before and after controls."
+		//
+		// GB3:                 CR   ×   LF
+		// GB4: (Control | CR | LF)  ÷
+		// GB5:                      ÷  (Control | CR | LF)
+		if this_rune == '\n' && last_rune == '\r' {
+			last_rune_breaks_forward = false
+			bypass_next_rune = false
+			continue
+		}
+
+		if is_control(this_rune) {
+			grapheme_count += 1
+			last_rune_breaks_forward = true
+			bypass_next_rune = true
+			continue
+		}
+
+		// (This check is for rules that work forwards, instead of backwards.)
+		if bypass_next_rune {
+			if last_rune_breaks_forward {
+				grapheme_count += 1
+				last_rune_breaks_forward = false
+			}
+
+			bypass_next_rune = false
+			continue
+		}
+
+		// (Optimization 1: Prevent low runes from proceeding further.)
+		//
+		//  * 0xA9 and 0xAE are in the Extended_Pictographic range,
+		//    which is checked later in GB11.
+		if this_rune != 0xA9 && this_rune != 0xAE && this_rune <= 0x2FF {
+			grapheme_count += 1
+			continue
+		}
+
+		// (Optimization 2: Check if the rune is in the Hangul space before getting specific.)
+		if 0x1100 <= this_rune && this_rune <= 0xD7FB {
+			// "Do not break Hangul syllable sequences."
+			//
+			// GB6:        L   ×  (L | V | LV | LVT)
+			// GB7:  (LV | V)  ×  (V | T)
+			// GB8: (LVT | T)  ×   T
+			if is_hangul_syllable_leading(this_rune) ||
+			   is_hangul_syllable_lv(this_rune)      ||
+			   is_hangul_syllable_lvt(this_rune)
+			{
+				if !is_hangul_syllable_leading(last_rune) {
+					grapheme_count += 1
+				}
+				continue
+			}
+
+			if is_hangul_syllable_vowel(this_rune) {
+				if is_hangul_syllable_leading(last_rune) ||
+				   is_hangul_syllable_vowel(last_rune)   ||
+				   is_hangul_syllable_lv(last_rune)
+				{
+					continue
+				}
+				grapheme_count += 1
+				continue
+			}
+
+			if is_hangul_syllable_trailing(this_rune) {
+				if is_hangul_syllable_trailing(last_rune) ||
+				   is_hangul_syllable_lvt(last_rune)      ||
+				   is_hangul_syllable_lv(last_rune)       ||
+				   is_hangul_syllable_vowel(last_rune)
+				{
+					continue
+				}
+				grapheme_count += 1
+				continue
+			}
+		}
+
+		// "Do not break before extending characters or ZWJ."
+		//
+		// GB9:         × (Extend | ZWJ)
+		if this_rune == ZERO_WIDTH_JOINER {
+			continue_sequence = true
+			continue
+		}
+
+		if is_gcb_extend_class(this_rune) {
+			// (Support for GB9c.)
+			if current_sequence == .Indic {
+				if is_indic_conjunct_break_extend(this_rune)    && (
+				   is_indic_conjunct_break_linker(last_rune)    ||
+				   is_indic_conjunct_break_consonant(last_rune)    )
+				{
+					continue_sequence = true
+					continue
+				}
+
+				if is_indic_conjunct_break_linker(this_rune)    && (
+				   is_indic_conjunct_break_linker(last_rune)    ||
+				   is_indic_conjunct_break_extend(last_rune)    ||
+				   is_indic_conjunct_break_consonant(last_rune)    )
+				{
+					continue_sequence = true
+					continue
+				}
+
+				continue
+			}
+
+			// (Support for GB11.)
+			if current_sequence == .Emoji                && (
+			   is_gcb_extend_class(last_rune)            ||
+			   is_emoji_extended_pictographic(last_rune)    )
+			{
+				continue_sequence = true
+			}
+
+			continue
+		}
+
+		// _The GB9a and GB9b rules only apply to extended grapheme clusters:_
+		// "Do not break before SpacingMarks, or after Prepend characters."
+		//
+		// GB9a:          ×  SpacingMark
+		// GB9b: Prepend  ×
+		if is_spacing_mark(this_rune) {
+			continue
+		}
+
+		if is_gcb_prepend_class(this_rune) {
+			grapheme_count += 1
+			bypass_next_rune = true
+			continue
+		}
+
+		// _The GB9c rule only applies to extended grapheme clusters:_
+		// "Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker."
+		//
+		// GB9c: \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]*  ×  \p{InCB=Consonant}
+		if is_indic_conjunct_break_consonant(this_rune) {
+			if current_sequence == .Indic {
+				if last_rune == ZERO_WIDTH_JOINER            ||
+				   is_indic_conjunct_break_linker(last_rune)
+				{
+					continue_sequence = true
+				} else {
+					grapheme_count += 1
+				}
+			} else {
+				grapheme_count += 1
+				current_sequence = .Indic
+				continue_sequence = true
+			}
+			continue
+		}
+
+		if is_indic_conjunct_break_extend(this_rune) {
+			if current_sequence == .Indic {
+				if is_indic_conjunct_break_consonant(last_rune) ||
+				   is_indic_conjunct_break_linker(last_rune)
+				{
+					continue_sequence = true
+				} else {
+					grapheme_count += 1
+				}
+			}
+			continue
+		}
+
+		if is_indic_conjunct_break_linker(this_rune) {
+			if current_sequence == .Indic {
+				if is_indic_conjunct_break_extend(last_rune) ||
+				   is_indic_conjunct_break_linker(last_rune)
+				{
+					continue_sequence = true
+				} else {
+					grapheme_count += 1
+				}
+			}
+			continue
+		}
+
+		//
+		// (Curiously, there is no GB10.)
+		//
+
+		// "Do not break within emoji modifier sequences or emoji zwj sequences."
+		//
+		// GB11: \p{Extended_Pictographic} Extend* ZWJ  ×  \p{Extended_Pictographic}
+		if is_emoji_extended_pictographic(this_rune) {
+			if current_sequence != .Emoji || last_rune != ZERO_WIDTH_JOINER {
+				grapheme_count += 1
+			}
+			current_sequence = .Emoji
+			continue_sequence = true
+			continue
+		}
+
+		// "Do not break within emoji flag sequences.
+		//  That is, do not break between regional indicator (RI) symbols
+		//  if there is an odd number of RI characters before the break point."
+		//
+		// GB12:   sot (RI RI)* RI  ×  RI
+		// GB13: [^RI] (RI RI)* RI  ×  RI
+		if is_regional_indicator(this_rune) {
+			if regional_indicator_counter & 1 == 0 {
+				grapheme_count += 1
+			}
+
+			current_sequence = .Regional
+			continue_sequence = true
+			regional_indicator_counter += 1
+
+			continue
+		}
+
+		// "Otherwise, break everywhere."
+		//
+		// GB999: Any ÷ Any
+		grapheme_count += 1
+	}
+
+	return
+}
--- a/tests/core/unicode/test_core_unicode.odin
+++ b/tests/core/unicode/test_core_unicode.odin
@@ -0,0 +1,73 @@
+package test_core_unicode
+
+import "core:log"
+import "core:testing"
+import "core:unicode/utf8"
+
+// One grapheme-counting test vector: an input string and the number of
+// grapheme clusters `utf8.grapheme_count` is expected to report for it.
+Test_Case :: struct {
+	// Input text handed to `utf8.grapheme_count`.
+	str:               string,
+	// Expected number of grapheme clusters in `str`.
+	expected_clusters: int,
+}
+
+// Runs `utf8.grapheme_count` over every entry of `test_cases`, records an
+// expectation failure for each mismatch, and logs a summary line at the end
+// (at `.Error` level when anything failed, `.Info` otherwise).
+//
+// `loc` is forwarded to the expectation and the summary log call.
+run_test_cases :: proc(t: ^testing.T, test_cases: []Test_Case, loc := #caller_location) {
+	failure_total := 0
+	for test_case, index in test_cases {
+		log.debugf("(#% 4i) %q ...", index, test_case.str)
+		cluster_total, _ := utf8.grapheme_count(test_case.str)
+		passed := testing.expectf(t, cluster_total == test_case.expected_clusters,
+			"(#% 4i) graphemes: %i != %i, %q %s", index, cluster_total, test_case.expected_clusters, test_case.str, test_case.str,
+			loc = loc)
+		if !passed {
+			failure_total += 1
+		}
+	}
+
+	log.logf(.Error if failure_total > 0 else .Info, "% 4i/% 4i test cases failed.", failure_total, len(test_cases), location = loc)
+}
+
+// Runs the `official_grapheme_break_test_cases` table through `run_test_cases`.
+@test
+test_official_gcb_cases :: proc(t: ^testing.T) {
+	run_test_cases(t, test_cases = official_grapheme_break_test_cases)
+}
+
+// Runs the `official_emoji_test_cases` table through `run_test_cases`.
+@test
+test_official_emoji_cases :: proc(t: ^testing.T) {
+	run_test_cases(t, test_cases = official_emoji_test_cases)
+}
+
+// Checks that the `Grapheme` records returned by `utf8.decode_grapheme_clusters`
+// carry the expected rune indices and that their byte indices can be used to
+// slice the input back into its individual clusters.
+@test
+test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {
+	// 1 rune.
+	SAMPLE_1 :: "\U0001F600"
+	// 7 runes: U+1F3F4 followed by six U+E00xx code points.
+	SAMPLE_2 :: "\U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006E\U000E0067\U000E007F"
+	// 4 runes, with a ZERO WIDTH JOINER (U+200D) before the last one.
+	SAMPLE_3 :: "\U0001F468\U0001F3FB\u200D\U0001F9B0"
+
+	str := SAMPLE_1 + SAMPLE_2 + SAMPLE_3 + SAMPLE_2 + SAMPLE_1
+
+	graphemes, _, _ := utf8.decode_grapheme_clusters(str)
+	defer delete(graphemes)
+
+	// Dump the decoded data only when something below has failed.
+	defer if testing.failed(t) {
+		log.infof("%#v\n%q\n%v", graphemes, str, transmute([]u8)str)
+	}
+	// Indexing below assumes exactly five entries, so bail out early otherwise.
+	if !testing.expect_value(t, len(graphemes), 5) {
+		return
+	}
+
+	// Expected start positions, in runes: 0, 0+1, 1+7, 8+4, 12+7.
+	testing.expect_value(t, graphemes[0].rune_index, 0)
+	testing.expect_value(t, graphemes[1].rune_index, 1)
+	testing.expect_value(t, graphemes[2].rune_index, 8)
+	testing.expect_value(t, graphemes[3].rune_index, 12)
+	testing.expect_value(t, graphemes[4].rune_index, 19)
+
+	grapheme_1 := str[graphemes[0].byte_index:graphemes[1].byte_index]
+	grapheme_2 := str[graphemes[1].byte_index:graphemes[2].byte_index]
+	grapheme_3 := str[graphemes[2].byte_index:graphemes[3].byte_index]
+	grapheme_4 := str[graphemes[3].byte_index:graphemes[4].byte_index]
+	grapheme_5 := str[graphemes[4].byte_index:]
+
+	// Each failure message reports the slice that was actually compared.
+	testing.expectf(t, grapheme_1 == SAMPLE_1, "expected %q, got %q", SAMPLE_1, grapheme_1)
+	testing.expectf(t, grapheme_2 == SAMPLE_2, "expected %q, got %q", SAMPLE_2, grapheme_2)
+	testing.expectf(t, grapheme_3 == SAMPLE_3, "expected %q, got %q", SAMPLE_3, grapheme_3)
+	testing.expectf(t, grapheme_4 == SAMPLE_2, "expected %q, got %q", SAMPLE_2, grapheme_4)
+	testing.expectf(t, grapheme_5 == SAMPLE_1, "expected %q, got %q", SAMPLE_1, grapheme_5)
+}
--- a/tests/core/unicode/test_core_unicode_data.odin
+++ b/tests/core/unicode/test_core_unicode_data.odin