Add Grapheme_Iterator

2026-07-20 22:51:09 +00:00 · 2025-10-13 15:28:23 +01:00
parent 0d7efeab5f
commit 2bc7344f27
1 changed files with 148 additions and 172 deletions
--- a/core/unicode/utf8/grapheme.odin
+++ b/core/unicode/utf8/grapheme.odin
@@ -23,9 +23,40 @@ normalized_east_asian_width       :: unicode.normalized_east_asian_width
 Grapheme :: struct {
 	byte_index: int,
 	rune_index: int,
-	width: int,
+	width:      int,
 }

+
+Grapheme_Cluster_Sequence :: enum {
+	None,
+	Indic,
+	Emoji,
+	Regional,
+}
+
+Grapheme_Iterator :: struct {
+	str:            string,
+	curr_offset:    int,
+
+	grapheme_count: int, // The number of graphemes in the string
+	rune_count:     int, // The number of runes in the string
+	width:          int, // The widrth of the string in number of monospace cells
+
+	last_rune:                  rune,
+	last_rune_breaks_forward:   bool,
+
+	last_width:                 int,
+	last_grapheme_count:        int,
+
+	bypass_next_rune:           bool,
+
+	regional_indicator_counter: int,
+
+	current_sequence:           Grapheme_Cluster_Sequence,
+	continue_sequence:          bool,
+}
+
+
 /*
 Count the individual graphemes in a UTF-8 string.

@@ -39,7 +70,9 @@ Returns:
 */
@(require_results)
 grapheme_count :: proc(str: string) -> (graphemes, runes, width: int) {
-	_, graphemes, runes, width = decode_grapheme_clusters(str, false)
+	it := decode_grapheme_iterator_make(str)
+	for _, _ in decode_grapheme_iterate(&it) {/**/}
+	graphemes, runes, width = it.grapheme_count, it.rune_count, it.width
 	return
 }

@@ -70,155 +103,98 @@ decode_grapheme_clusters :: proc(
 	rune_count:     int,
 	width:          int,
 ) {
-	// The following procedure implements text segmentation by breaking on
-	// Grapheme Cluster Boundaries[1], using the values[2] and rules[3] from
-	// the Unicode® Standard Annex #29, entitled:
-	//
-	// UNICODE TEXT SEGMENTATION
-	//
-	// Version:  Unicode 15.1.0
-	// Date:     2023-08-16
-	// Revision: 43
-	//
-	// This procedure is conformant[4] to UAX29-C1-1, otherwise known as the
-	// extended, non-legacy ruleset.
-	//
-	// Please see the references below for more information.
-	//
-	//
-	// NOTE(Feoramund): This procedure has not been highly optimized.
-	// A couple opportunities were taken to bypass repeated checking when a
-	// rune is outside of certain codepoint ranges, but little else has been
-	// done. Standard switches, conditionals, and binary search are used to
-	// see if a rune fits into a certain category.
-	//
-	// I did find that only one prior rune of state was necessary to build an
-	// algorithm that successfully passes all 4,835 test cases provided with
-	// this implementation from the Unicode organization's website.
-	//
-	// My initial implementation tracked explicit breaks and counted them once
-	// the string iteration had terminated. I've found this current
-	// implementation to be far simpler and need no allocations (unless the
-	// caller wants position data).
-	//
-	// Most rules work backwards instead of forwards which has helped keep this
-	// simple, despite its length and verbosity.
-	//
-	//
-	// The implementation has been left verbose and in the order described by
-	// the specification, to enable better readability and future upkeep.
-	//
-	// Some possible optimizations might include:
-	//
-	// - saving the type of `last_rune` instead of the exact rune.
-	// - reordering rules.
-	// - combining tables.
-	//
-	//
-	// [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
-	// [2]: https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
-	// [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
-	// [4]: https://www.unicode.org/reports/tr29/#Conformance
-
-	// Additionally, this procedure now takes into account Standard Annex #11,
-	// in order to estimate how visually wide the string will appear on a
-	// monospaced display. This can only ever be a rough guess, as this tends
-	// to be an implementation detail relating to which fonts are being used,
-	// how codepoints are interpreted and drawn, if codepoint sequences are
-	// interpreted correctly, and et cetera.
-	//
-	// For example, a program may not properly interpret an emoji modifier
-	// sequence and print the component glyphs instead of one whole glyph.
-	//
-	// See here for more information: https://www.unicode.org/reports/tr11/
-	//
-	// NOTE: There is no explicit mention of what to do with zero-width spaces
-	// as far as grapheme cluster segmentation goes, therefore this
-	// implementation may count and return graphemes with a `width` of zero.
-	//
-	// Treat them as any other space.
-
-	Grapheme_Cluster_Sequence :: enum {
-		None,
-		Indic,
-		Emoji,
-		Regional,
-	}
-
 	context.allocator = allocator

-	last_rune: rune
-	last_rune_breaks_forward: bool
+	it := decode_grapheme_iterator_make(str)
+	for _, grapheme in decode_grapheme_iterate(&it) {
+		append(&graphemes, grapheme)
+	}

-	last_width: int
-	last_grapheme_count: int
+	grapheme_count = it.grapheme_count
+	rune_count     = it.rune_count
+	width          = it.width
+	return
+}

-	bypass_next_rune: bool
+@(require_results)
+decode_grapheme_iterator_make :: proc(str: string) -> (it: Grapheme_Iterator) {
+	it.str = str
+	return
+}

-	regional_indicator_counter: int
+@(require_results)
+decode_grapheme_iterate :: proc(it: ^Grapheme_Iterator) -> (text: string, grapheme: Grapheme, ok: bool) {
+	for it.curr_offset < len(it.str) {
+		if ok {
+			return
+		}

-	current_sequence: Grapheme_Cluster_Sequence
-	continue_sequence: bool
+		str := it.str[it.curr_offset:]
+		this_rune, this_rune_width := decode_rune(str)
+		byte_index := it.curr_offset
+		it.curr_offset += this_rune_width

-	for this_rune, byte_index in str {
 		defer {
 			// "Break at the start and end of text, unless the text is empty."
 			//
 			// GB1: sot  ÷  Any
 			// GB2: Any  ÷  eot
-			if rune_count == 0 && grapheme_count == 0 {
-				grapheme_count += 1
+			if it.rune_count == 0 && it.grapheme_count == 0 {
+				it.grapheme_count += 1
 			}

-			if grapheme_count > last_grapheme_count {
-				width += normalized_east_asian_width(this_rune)
-				if track_graphemes {
-					append(&graphemes, Grapheme{
-						byte_index,
-						rune_count,
-						width - last_width,
-					})
+			if it.grapheme_count > it.last_grapheme_count {
+				it.width += normalized_east_asian_width(this_rune)
+				grapheme = Grapheme{
+					byte_index,
+					it.rune_count,
+					it.width - it.last_width,
 				}
-				last_grapheme_count = grapheme_count
-				last_width = width
+				text = it.str[byte_index:][:grapheme.width]
+				ok = true
+
+
+				it.last_grapheme_count = it.grapheme_count
+				it.last_width = it.width
 			}

-			last_rune = this_rune
-			rune_count += 1
+			it.last_rune = this_rune
+			it.rune_count += 1

-			if !continue_sequence {
-				current_sequence = .None
-				regional_indicator_counter = 0
+			if !it.continue_sequence {
+				it.current_sequence = .None
+				it.regional_indicator_counter = 0
 			}
-			continue_sequence = false
+			it.continue_sequence = false
 		}

+
 		// "Do not break between a CR and LF. Otherwise, break before and after controls."
 		//
 		// GB3:                 CR   ×   LF
 		// GB4: (Control | CR | LF)  ÷
 		// GB5:                      ÷  (Control | CR | LF)
-		if this_rune == '\n' && last_rune == '\r' {
-			last_rune_breaks_forward = false
-			bypass_next_rune = false
+		if this_rune == '\n' && it.last_rune == '\r' {
+			it.last_rune_breaks_forward = false
+			it.bypass_next_rune = false
 			continue
 		}

 		if is_control(this_rune) {
-			grapheme_count += 1
-			last_rune_breaks_forward = true
-			bypass_next_rune = true
+			it.grapheme_count += 1
+			it.last_rune_breaks_forward = true
+			it.bypass_next_rune = true
 			continue
 		}

 		// (This check is for rules that work forwards, instead of backwards.)
-		if bypass_next_rune {
-			if last_rune_breaks_forward {
-				grapheme_count += 1
-				last_rune_breaks_forward = false
+		if it.bypass_next_rune {
+			if it.last_rune_breaks_forward {
+				it.grapheme_count += 1
+				it.last_rune_breaks_forward = false
 			}

-			bypass_next_rune = false
+			it.bypass_next_rune = false
 			continue
 		}

@@ -227,7 +203,7 @@ decode_grapheme_clusters :: proc(
 		//  * 0xA9 and 0xAE are in the Extended_Pictographic range,
 		//    which is checked later in GB11.
 		if this_rune != 0xA9 && this_rune != 0xAE && this_rune <= 0x2FF {
-			grapheme_count += 1
+			it.grapheme_count += 1
 			continue
 		}

@@ -241,30 +217,30 @@ decode_grapheme_clusters :: proc(
 			if is_hangul_syllable_leading(this_rune) ||
 			   is_hangul_syllable_lv(this_rune)      ||
 			   is_hangul_syllable_lvt(this_rune) {
-				if !is_hangul_syllable_leading(last_rune) {
-					grapheme_count += 1
+				if !is_hangul_syllable_leading(it.last_rune) {
+					it.grapheme_count += 1
 				}
 				continue
 			}

 			if is_hangul_syllable_vowel(this_rune) {
-				if is_hangul_syllable_leading(last_rune) ||
-				   is_hangul_syllable_vowel(last_rune)   ||
-				   is_hangul_syllable_lv(last_rune) {
+				if is_hangul_syllable_leading(it.last_rune) ||
+				   is_hangul_syllable_vowel(it.last_rune)   ||
+				   is_hangul_syllable_lv(it.last_rune) {
 					continue
 				}
-				grapheme_count += 1
+				it.grapheme_count += 1
 				continue
 			}

 			if is_hangul_syllable_trailing(this_rune) {
-				if is_hangul_syllable_trailing(last_rune) ||
-				   is_hangul_syllable_lvt(last_rune)      ||
-				   is_hangul_syllable_lv(last_rune)       ||
-				   is_hangul_syllable_vowel(last_rune) {
+				if is_hangul_syllable_trailing(it.last_rune) ||
+				   is_hangul_syllable_lvt(it.last_rune)      ||
+				   is_hangul_syllable_lv(it.last_rune)       ||
+				   is_hangul_syllable_vowel(it.last_rune) {
 					continue
 				}
-				grapheme_count += 1
+				it.grapheme_count += 1
 				continue
 			}
 		}
@@ -273,25 +249,25 @@ decode_grapheme_clusters :: proc(
 		//
 		// GB9:         × (Extend | ZWJ)
 		if this_rune == ZERO_WIDTH_JOINER {
-			continue_sequence = true
+			it.continue_sequence = true
 			continue
 		}

 		if is_gcb_extend_class(this_rune) {
 			// (Support for GB9c.)
-			if current_sequence == .Indic {
+			if it.current_sequence == .Indic {
 				if is_indic_conjunct_break_extend(this_rune)    && (
-				   is_indic_conjunct_break_linker(last_rune)    ||
-				   is_indic_conjunct_break_consonant(last_rune)    ) {
-					continue_sequence = true
+				   is_indic_conjunct_break_linker(it.last_rune) ||
+				   is_indic_conjunct_break_consonant(it.last_rune) ) {
+					it.continue_sequence = true
 					continue
 				}

-				if is_indic_conjunct_break_linker(this_rune)    && (
-				   is_indic_conjunct_break_linker(last_rune)    ||
-				   is_indic_conjunct_break_extend(last_rune)    ||
-				   is_indic_conjunct_break_consonant(last_rune)    ) {
-					continue_sequence = true
+				if is_indic_conjunct_break_linker(this_rune)       && (
+				   is_indic_conjunct_break_linker(it.last_rune)    ||
+				   is_indic_conjunct_break_extend(it.last_rune)    ||
+				   is_indic_conjunct_break_consonant(it.last_rune)    ) {
+					it.continue_sequence = true
 					continue
 				}

@@ -299,10 +275,10 @@ decode_grapheme_clusters :: proc(
 			}

 			// (Support for GB11.)
-			if current_sequence == .Emoji                && (
-			   is_gcb_extend_class(last_rune)            ||
-			   is_emoji_extended_pictographic(last_rune)    ) {
-				continue_sequence = true
+			if it.current_sequence == .Emoji                && (
+			   is_gcb_extend_class(it.last_rune)            ||
+			   is_emoji_extended_pictographic(it.last_rune)    ) {
+				it.continue_sequence = true
 			}

 			continue
@@ -318,8 +294,8 @@ decode_grapheme_clusters :: proc(
 		}

 		if is_gcb_prepend_class(this_rune) {
-			grapheme_count += 1
-			bypass_next_rune = true
+			it.grapheme_count += 1
+			it.bypass_next_rune = true
 			continue
 		}

@@ -328,40 +304,40 @@ decode_grapheme_clusters :: proc(
 		//
 		// GB9c: \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]*  ×  \p{InCB=Consonant}
 		if is_indic_conjunct_break_consonant(this_rune) {
-			if current_sequence == .Indic {
-				if last_rune == ZERO_WIDTH_JOINER            ||
-				   is_indic_conjunct_break_linker(last_rune) {
-					continue_sequence = true
+			if it.current_sequence == .Indic {
+				if it.last_rune == ZERO_WIDTH_JOINER            ||
+				   is_indic_conjunct_break_linker(it.last_rune) {
+					it.continue_sequence = true
 				} else {
-					grapheme_count += 1
+					it.grapheme_count += 1
 				}
 			} else {
-				grapheme_count += 1
-				current_sequence = .Indic
-				continue_sequence = true
+				it.grapheme_count += 1
+				it.current_sequence = .Indic
+				it.continue_sequence = true
 			}
 			continue
 		}

 		if is_indic_conjunct_break_extend(this_rune) {
-			if current_sequence == .Indic {
-				if is_indic_conjunct_break_consonant(last_rune) ||
-				   is_indic_conjunct_break_linker(last_rune) {
-					continue_sequence = true
+			if it.current_sequence == .Indic {
+				if is_indic_conjunct_break_consonant(it.last_rune) ||
+				   is_indic_conjunct_break_linker(it.last_rune) {
+					it.continue_sequence = true
 				} else {
-					grapheme_count += 1
+					it.grapheme_count += 1
 				}
 			}
 			continue
 		}

 		if is_indic_conjunct_break_linker(this_rune) {
-			if current_sequence == .Indic {
-				if is_indic_conjunct_break_extend(last_rune) ||
-				   is_indic_conjunct_break_linker(last_rune) {
-					continue_sequence = true
+			if it.current_sequence == .Indic {
+				if is_indic_conjunct_break_extend(it.last_rune) ||
+				   is_indic_conjunct_break_linker(it.last_rune) {
+					it.continue_sequence = true
 				} else {
-					grapheme_count += 1
+					it.grapheme_count += 1
 				}
 			}
 			continue
@@ -375,11 +351,11 @@ decode_grapheme_clusters :: proc(
 		//
 		// GB11: \p{Extended_Pictographic} Extend* ZWJ  ×  \p{Extended_Pictographic}
 		if is_emoji_extended_pictographic(this_rune) {
-			if current_sequence != .Emoji || last_rune != ZERO_WIDTH_JOINER {
-				grapheme_count += 1
+			if it.current_sequence != .Emoji || it.last_rune != ZERO_WIDTH_JOINER {
+				it.grapheme_count += 1
 			}
-			current_sequence = .Emoji
-			continue_sequence = true
+			it.current_sequence = .Emoji
+			it.continue_sequence = true
 			continue
 		}

@@ -390,13 +366,13 @@ decode_grapheme_clusters :: proc(
 		// GB12:   sot (RI RI)* RI  ×  RI
 		// GB13: [^RI] (RI RI)* RI  ×  RI
 		if is_regional_indicator(this_rune) {
-			if regional_indicator_counter & 1 == 0 {
-				grapheme_count += 1
+			if it.regional_indicator_counter & 1 == 0 {
+				it.grapheme_count += 1
 			}

-			current_sequence = .Regional
-			continue_sequence = true
-			regional_indicator_counter += 1
+			it.current_sequence = .Regional
+			it.continue_sequence = true
+			it.regional_indicator_counter += 1

 			continue
 		}
@@ -404,8 +380,8 @@ decode_grapheme_clusters :: proc(
 		// "Otherwise, break everywhere."
 		//
 		// GB999: Any ÷ Any
-		grapheme_count += 1
+		it.grapheme_count += 1
 	}

 	return
-}
+}