mirror of
https://github.com/odin-lang/Odin.git
synced 2026-01-02 11:12:31 +00:00
Merge pull request #3775 from Feoramund/unicode-graphemes
Add grapheme analysis facilities to `core:unicode`
This commit is contained in:
@@ -5,6 +5,10 @@ REPLACEMENT_CHAR :: '\ufffd' // Represented an invalid code point
|
||||
MAX_ASCII :: '\u007f' // Maximum ASCII value
|
||||
MAX_LATIN1 :: '\u00ff' // Maximum Latin-1 value
|
||||
|
||||
// U+200C ZERO WIDTH NON-JOINER
ZERO_WIDTH_NON_JOINER :: '\u200C'
|
||||
// U+200D, the joiner counterpart of the constant above.
ZERO_WIDTH_JOINER :: '\u200D'
|
||||
|
||||
@(require_results)
|
||||
binary_search :: proc(c: i32, table: []i32, length, stride: int) -> int {
|
||||
n := length
|
||||
t := 0
|
||||
@@ -24,6 +28,7 @@ binary_search :: proc(c: i32, table: []i32, length, stride: int) -> int {
|
||||
return -1
|
||||
}
|
||||
|
||||
@(require_results)
|
||||
to_lower :: proc(r: rune) -> rune {
|
||||
c := i32(r)
|
||||
p := binary_search(c, to_lower_ranges[:], len(to_lower_ranges)/3, 3)
|
||||
@@ -36,6 +41,7 @@ to_lower :: proc(r: rune) -> rune {
|
||||
}
|
||||
return rune(c)
|
||||
}
|
||||
@(require_results)
|
||||
to_upper :: proc(r: rune) -> rune {
|
||||
c := i32(r)
|
||||
p := binary_search(c, to_upper_ranges[:], len(to_upper_ranges)/3, 3)
|
||||
@@ -48,6 +54,7 @@ to_upper :: proc(r: rune) -> rune {
|
||||
}
|
||||
return rune(c)
|
||||
}
|
||||
@(require_results)
|
||||
to_title :: proc(r: rune) -> rune {
|
||||
c := i32(r)
|
||||
p := binary_search(c, to_upper_singlets[:], len(to_title_singlets)/2, 2)
|
||||
@@ -58,6 +65,7 @@ to_title :: proc(r: rune) -> rune {
|
||||
}
|
||||
|
||||
|
||||
@(require_results)
|
||||
is_lower :: proc(r: rune) -> bool {
|
||||
if r <= MAX_ASCII {
|
||||
return u32(r)-'a' < 26
|
||||
@@ -74,6 +82,7 @@ is_lower :: proc(r: rune) -> bool {
|
||||
return false
|
||||
}
|
||||
|
||||
@(require_results)
|
||||
is_upper :: proc(r: rune) -> bool {
|
||||
if r <= MAX_ASCII {
|
||||
return u32(r)-'A' < 26
|
||||
@@ -91,6 +100,7 @@ is_upper :: proc(r: rune) -> bool {
|
||||
}
|
||||
|
||||
is_alpha :: is_letter
|
||||
@(require_results)
|
||||
is_letter :: proc(r: rune) -> bool {
|
||||
if u32(r) <= MAX_LATIN1 {
|
||||
return char_properties[u8(r)]&pLmask != 0
|
||||
@@ -111,10 +121,12 @@ is_letter :: proc(r: rune) -> bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// Reports whether `r` is accepted by both `is_upper` and `is_lower`.
// `is_lower` is only consulted when `is_upper` has already accepted `r`.
@(require_results)
is_title :: proc(r: rune) -> bool {
	if !is_upper(r) {
		return false
	}
	return is_lower(r)
}
|
||||
|
||||
@(require_results)
|
||||
is_digit :: proc(r: rune) -> bool {
|
||||
if r <= MAX_LATIN1 {
|
||||
return '0' <= r && r <= '9'
|
||||
@@ -124,6 +136,7 @@ is_digit :: proc(r: rune) -> bool {
|
||||
|
||||
|
||||
is_white_space :: is_space
|
||||
@(require_results)
|
||||
is_space :: proc(r: rune) -> bool {
|
||||
if u32(r) <= MAX_LATIN1 {
|
||||
switch r {
|
||||
@@ -140,18 +153,20 @@ is_space :: proc(r: rune) -> bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// Reports whether `r` falls inside one of the hard-coded combining ranges:
// 0x0300..0x036F, 0x1AB0..0x1AFF, 0x1DC0..0x1DFF, 0x20D0..0x20FF or
// 0xFE20..0xFE2F (all bounds inclusive).
@(require_results)
is_combining :: proc(r: rune) -> bool {
	c := i32(r)

	// Everything below 0x0300 is rejected up front; the first range only
	// needs its upper bound checked after that.
	return c >= 0x0300 && (c <= 0x036f ||
	       (c >= 0x1ab0 && c <= 0x1aff) ||
	       (c >= 0x1dc0 && c <= 0x1dff) ||
	       (c >= 0x20d0 && c <= 0x20ff) ||
	       (c >= 0xfe20 && c <= 0xfe2f))
}
|
||||
|
||||
|
||||
|
||||
@(require_results)
|
||||
is_graphic :: proc(r: rune) -> bool {
|
||||
if u32(r) <= MAX_LATIN1 {
|
||||
return char_properties[u8(r)]&pg != 0
|
||||
@@ -159,6 +174,7 @@ is_graphic :: proc(r: rune) -> bool {
|
||||
return false
|
||||
}
|
||||
|
||||
@(require_results)
|
||||
is_print :: proc(r: rune) -> bool {
|
||||
if u32(r) <= MAX_LATIN1 {
|
||||
return char_properties[u8(r)]&pp != 0
|
||||
@@ -166,6 +182,7 @@ is_print :: proc(r: rune) -> bool {
|
||||
return false
|
||||
}
|
||||
|
||||
@(require_results)
|
||||
is_control :: proc(r: rune) -> bool {
|
||||
if u32(r) <= MAX_LATIN1 {
|
||||
return char_properties[u8(r)]&pC != 0
|
||||
@@ -173,6 +190,7 @@ is_control :: proc(r: rune) -> bool {
|
||||
return false
|
||||
}
|
||||
|
||||
@(require_results)
|
||||
is_number :: proc(r: rune) -> bool {
|
||||
if u32(r) <= MAX_LATIN1 {
|
||||
return char_properties[u8(r)]&pN != 0
|
||||
@@ -180,6 +198,7 @@ is_number :: proc(r: rune) -> bool {
|
||||
return false
|
||||
}
|
||||
|
||||
@(require_results)
|
||||
is_punct :: proc(r: rune) -> bool {
|
||||
if u32(r) <= MAX_LATIN1 {
|
||||
return char_properties[u8(r)]&pP != 0
|
||||
@@ -187,9 +206,249 @@ is_punct :: proc(r: rune) -> bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// Reports whether `r` has the `pS` bit set in `char_properties`.
// Only runes up to MAX_LATIN1 are looked up; anything above yields false.
@(require_results)
is_symbol :: proc(r: rune) -> bool {
	if u32(r) > MAX_LATIN1 {
		return false
	}
	flags := char_properties[u8(r)]
	return flags&pS != 0
}
|
||||
|
||||
//
|
||||
// The procedures below are accurate as of Unicode 15.1.0.
|
||||
//
|
||||
|
||||
// Emoji_Modifier
//
// True for runes in the inclusive range 0x1F3FB..0x1F3FF.
@(require_results)
is_emoji_modifier :: proc(r: rune) -> bool {
	switch r {
	case 0x1F3FB ..= 0x1F3FF:
		return true
	}
	return false
}
|
||||
|
||||
// Regional_Indicator
//
// True for runes in the inclusive range 0x1F1E6..0x1F1FF.
@(require_results)
is_regional_indicator :: proc(r: rune) -> bool {
	switch r {
	case 0x1F1E6 ..= 0x1F1FF:
		return true
	}
	return false
}
|
||||
|
||||
// General_Category=Enclosing_Mark
//
// Matches three single code points and three short inclusive ranges.
@(require_results)
is_enclosing_mark :: proc(r: rune) -> bool {
	if r == 0x0488 || r == 0x0489 || r == 0x1ABE {
		return true
	}
	return (0x20DD <= r && r <= 0x20E0) ||
	       (0x20E2 <= r && r <= 0x20E4) ||
	       (0xA670 <= r && r <= 0xA672)
}
|
||||
|
||||
// Prepended_Concatenation_Mark
//
// Matches two inclusive ranges plus five individual code points.
@(require_results)
is_prepended_concatenation_mark :: proc(r: rune) -> bool {
	in_range := (0x00600 <= r && r <= 0x00605) ||
	            (0x00890 <= r && r <= 0x00891)
	if in_range {
		return true
	}
	return r == 0x006DD ||
	       r == 0x0070F ||
	       r == 0x008E2 ||
	       r == 0x110BD ||
	       r == 0x110CD
}
|
||||
|
||||
// General_Category=Spacing_Mark
//
// Looks `r` up in `spacing_mark_ranges`, which is searched as consecutive
// (low, high) pairs; a hit means `low <= r <= high` for the returned pair.
@(require_results)
is_spacing_mark :: proc(r: rune) -> bool {
	code := i32(r)
	pair_count := len(spacing_mark_ranges) / 2
	idx := binary_search(code, spacing_mark_ranges[:], pair_count, 2)
	if idx < 0 {
		return false
	}
	return spacing_mark_ranges[idx] <= code && code <= spacing_mark_ranges[idx+1]
}
|
||||
|
||||
// General_Category=Nonspacing_Mark
//
// Looks `r` up in `nonspacing_mark_ranges`, which is searched as consecutive
// (low, high) pairs; a hit means `low <= r <= high` for the returned pair.
@(require_results)
is_nonspacing_mark :: proc(r: rune) -> bool {
	code := i32(r)
	pair_count := len(nonspacing_mark_ranges) / 2
	idx := binary_search(code, nonspacing_mark_ranges[:], pair_count, 2)
	if idx < 0 {
		return false
	}
	return nonspacing_mark_ranges[idx] <= code && code <= nonspacing_mark_ranges[idx+1]
}
|
||||
|
||||
// Extended_Pictographic
//
// Looks `r` up in `emoji_extended_pictographic_ranges`, which is searched as
// consecutive (low, high) pairs; a hit means `low <= r <= high`.
@(require_results)
is_emoji_extended_pictographic :: proc(r: rune) -> bool {
	code := i32(r)
	pair_count := len(emoji_extended_pictographic_ranges) / 2
	idx := binary_search(code, emoji_extended_pictographic_ranges[:], pair_count, 2)
	if idx < 0 {
		return false
	}
	return emoji_extended_pictographic_ranges[idx] <= code && code <= emoji_extended_pictographic_ranges[idx+1]
}
|
||||
|
||||
// Grapheme_Extend
//
// Looks `r` up in `grapheme_extend_ranges`, which is searched as consecutive
// (low, high) pairs; a hit means `low <= r <= high` for the returned pair.
@(require_results)
is_grapheme_extend :: proc(r: rune) -> bool {
	code := i32(r)
	pair_count := len(grapheme_extend_ranges) / 2
	idx := binary_search(code, grapheme_extend_ranges[:], pair_count, 2)
	if idx < 0 {
		return false
	}
	return grapheme_extend_ranges[idx] <= code && code <= grapheme_extend_ranges[idx+1]
}
|
||||
|
||||
|
||||
// Hangul_Syllable_Type=Leading_Jamo
//
// True for 0x1100..0x115F and 0xA960..0xA97C (inclusive).
@(require_results)
is_hangul_syllable_leading :: proc(r: rune) -> bool {
	switch r {
	case 0x1100 ..= 0x115F,
	     0xA960 ..= 0xA97C:
		return true
	}
	return false
}
|
||||
|
||||
// Hangul_Syllable_Type=Vowel_Jamo
//
// True for 0x1160..0x11A7 and 0xD7B0..0xD7C6 (inclusive).
@(require_results)
is_hangul_syllable_vowel :: proc(r: rune) -> bool {
	switch r {
	case 0x1160 ..= 0x11A7,
	     0xD7B0 ..= 0xD7C6:
		return true
	}
	return false
}
|
||||
|
||||
// Hangul_Syllable_Type=Trailing_Jamo
//
// True for 0x11A8..0x11FF and 0xD7CB..0xD7FB (inclusive).
@(require_results)
is_hangul_syllable_trailing :: proc(r: rune) -> bool {
	switch r {
	case 0x11A8 ..= 0x11FF,
	     0xD7CB ..= 0xD7FB:
		return true
	}
	return false
}
|
||||
|
||||
// Hangul_Syllable_Type=LV_Syllable
//
// Looks `r` up in `hangul_syllable_lv_singlets`, searched one entry at a
// time (stride 1); only an exact match counts.
@(require_results)
is_hangul_syllable_lv :: proc(r: rune) -> bool {
	code := i32(r)
	idx := binary_search(code, hangul_syllable_lv_singlets[:], len(hangul_syllable_lv_singlets), 1)
	if idx < 0 {
		return false
	}
	return code == hangul_syllable_lv_singlets[idx]
}
|
||||
|
||||
// Hangul_Syllable_Type=LVT_Syllable
//
// Looks `r` up in `hangul_syllable_lvt_ranges`, which is searched as
// consecutive (low, high) pairs; a hit means `low <= r <= high`.
@(require_results)
is_hangul_syllable_lvt :: proc(r: rune) -> bool {
	code := i32(r)
	pair_count := len(hangul_syllable_lvt_ranges) / 2
	idx := binary_search(code, hangul_syllable_lvt_ranges[:], pair_count, 2)
	if idx < 0 {
		return false
	}
	return hangul_syllable_lvt_ranges[idx] <= code && code <= hangul_syllable_lvt_ranges[idx+1]
}
|
||||
|
||||
|
||||
// Indic_Syllabic_Category=Consonant_Preceding_Repha
//
// Matches exactly four code points.
@(require_results)
is_indic_consonant_preceding_repha :: proc(r: rune) -> bool {
	return r == 0x00D4E ||
	       r == 0x11941 ||
	       r == 0x11D46 ||
	       r == 0x11F02
}
|
||||
|
||||
// Indic_Syllabic_Category=Consonant_Prefixed
//
// Matches two inclusive ranges and two individual code points.
@(require_results)
is_indic_consonant_prefixed :: proc(r: rune) -> bool {
	if r == 0x1193F || r == 0x11A3A {
		return true
	}
	return (0x111C2 <= r && r <= 0x111C3) ||
	       (0x11A84 <= r && r <= 0x11A89)
}
|
||||
|
||||
// Indic_Conjunct_Break=Linker
//
// Matches exactly six code points.
@(require_results)
is_indic_conjunct_break_linker :: proc(r: rune) -> bool {
	return r == 0x094D ||
	       r == 0x09CD ||
	       r == 0x0ACD ||
	       r == 0x0B4D ||
	       r == 0x0C4D ||
	       r == 0x0D4D
}
|
||||
|
||||
// Indic_Conjunct_Break=Consonant
//
// Looks `r` up in `indic_conjunct_break_consonant_ranges`, which is searched
// as consecutive (low, high) pairs; a hit means `low <= r <= high`.
@(require_results)
is_indic_conjunct_break_consonant :: proc(r: rune) -> bool {
	code := i32(r)
	pair_count := len(indic_conjunct_break_consonant_ranges) / 2
	idx := binary_search(code, indic_conjunct_break_consonant_ranges[:], pair_count, 2)
	if idx < 0 {
		return false
	}
	return indic_conjunct_break_consonant_ranges[idx] <= code && code <= indic_conjunct_break_consonant_ranges[idx+1]
}
|
||||
|
||||
// Indic_Conjunct_Break=Extend
//
// Looks `r` up in `indic_conjunct_break_extend_ranges`, which is searched
// as consecutive (low, high) pairs; a hit means `low <= r <= high`.
@(require_results)
is_indic_conjunct_break_extend :: proc(r: rune) -> bool {
	code := i32(r)
	pair_count := len(indic_conjunct_break_extend_ranges) / 2
	idx := binary_search(code, indic_conjunct_break_extend_ranges[:], pair_count, 2)
	if idx < 0 {
		return false
	}
	return indic_conjunct_break_extend_ranges[idx] <= code && code <= indic_conjunct_break_extend_ranges[idx+1]
}
|
||||
|
||||
|
||||
/*
Grapheme_Cluster_Break "Prepend" class, as defined for grapheme text
segmentation by Unicode TR 29 Rev 43:

```
Indic_Syllabic_Category = Consonant_Preceding_Repha, or
Indic_Syllabic_Category = Consonant_Prefixed, or
Prepended_Concatenation_Mark = Yes
```

The three property checks are tried in that order and stop at the first match.
*/
@(require_results)
is_gcb_prepend_class :: proc(r: rune) -> bool {
	if is_indic_consonant_preceding_repha(r) {
		return true
	}
	if is_indic_consonant_prefixed(r) {
		return true
	}
	return is_prepended_concatenation_mark(r)
}
|
||||
|
||||
/*
Grapheme_Cluster_Break "Extend" class, as defined for grapheme text
segmentation by Unicode TR 29 Rev 43:

```
Grapheme_Extend = Yes, or
Emoji_Modifier = Yes

This includes:
General_Category = Nonspacing_Mark
General_Category = Enclosing_Mark
U+200C ZERO WIDTH NON-JOINER

plus a few General_Category = Spacing_Mark needed for canonical equivalence.
```

`is_grapheme_extend` is checked first; `is_emoji_modifier` only when it fails.
*/
@(require_results)
is_gcb_extend_class :: proc(r: rune) -> bool {
	if is_grapheme_extend(r) {
		return true
	}
	return is_emoji_modifier(r)
}
|
||||
|
||||
//
|
||||
// End of Unicode 15.1.0 block.
|
||||
//
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
387
core/unicode/utf8/grapheme.odin
Normal file
387
core/unicode/utf8/grapheme.odin
Normal file
@@ -0,0 +1,387 @@
|
||||
package utf8
|
||||
|
||||
import "core:unicode"
|
||||
|
||||
// Package-local aliases for the `core:unicode` constant and predicates used
// by the grapheme segmentation code in this file, so they can be referenced
// without the `unicode.` prefix.
ZERO_WIDTH_JOINER :: unicode.ZERO_WIDTH_JOINER
|
||||
is_control :: unicode.is_control
|
||||
is_hangul_syllable_leading :: unicode.is_hangul_syllable_leading
|
||||
is_hangul_syllable_vowel :: unicode.is_hangul_syllable_vowel
|
||||
is_hangul_syllable_trailing :: unicode.is_hangul_syllable_trailing
|
||||
is_hangul_syllable_lv :: unicode.is_hangul_syllable_lv
|
||||
is_hangul_syllable_lvt :: unicode.is_hangul_syllable_lvt
|
||||
is_indic_conjunct_break_extend :: unicode.is_indic_conjunct_break_extend
|
||||
is_indic_conjunct_break_linker :: unicode.is_indic_conjunct_break_linker
|
||||
is_indic_conjunct_break_consonant :: unicode.is_indic_conjunct_break_consonant
|
||||
is_gcb_extend_class :: unicode.is_gcb_extend_class
|
||||
is_spacing_mark :: unicode.is_spacing_mark
|
||||
is_gcb_prepend_class :: unicode.is_gcb_prepend_class
|
||||
is_emoji_extended_pictographic :: unicode.is_emoji_extended_pictographic
|
||||
is_regional_indicator :: unicode.is_regional_indicator
|
||||
|
||||
|
||||
// Start position of one grapheme cluster, as recorded by
// `decode_grapheme_clusters` when `track_graphemes` is true.
Grapheme :: struct {
|
||||
// Byte offset into the string of the rune that starts the grapheme.
byte_index: int,
|
||||
// Zero-based index of that rune, counted in runes from the start of the string.
rune_index: int,
|
||||
}
|
||||
|
||||
/*
Count the graphemes and runes in a UTF-8 string.

This is `decode_grapheme_clusters` with grapheme tracking turned off; the
per-grapheme position data it would otherwise return is discarded.

Inputs:
- str: The input string.

Returns:
- graphemes: The number of graphemes in the string.
- runes: The number of runes in the string.
*/
@(require_results)
grapheme_count :: proc(str: string) -> (graphemes, runes: int) {
	_, cluster_total, rune_total := decode_grapheme_clusters(str, false)
	return cluster_total, rune_total
}
|
||||
|
||||
/*
Decode the individual graphemes in a UTF-8 string.

*Allocates Using Provided Allocator*

Inputs:
- str: The input string.
- track_graphemes: Whether or not to allocate and return `graphemes` with extra data about each grapheme.
- allocator: (default: context.allocator)

Returns:
- graphemes: Extra data about each grapheme (one entry per grapheme start; left empty when `track_graphemes` is false).
- grapheme_count: The number of graphemes in the string.
- rune_count: The number of runes in the string.
*/
@(require_results)
decode_grapheme_clusters :: proc(
	str: string,
	track_graphemes := true,
	allocator := context.allocator,
) -> (
	graphemes: [dynamic]Grapheme,
	grapheme_count: int,
	rune_count: int,
) {
	// Text segmentation on Grapheme Cluster Boundaries[1], using the
	// values[2] and rules[3] of Unicode Standard Annex #29,
	// "UNICODE TEXT SEGMENTATION":
	//
	//     Version:  Unicode 15.1.0
	//     Date:     2023-08-16
	//     Revision: 43
	//
	// The original author states the procedure is conformant[4] to
	// UAX29-C1-1 (the extended, non-legacy ruleset) and passes all 4,835
	// official test cases shipped with the implementation.
	//
	// NOTE(Feoramund): not highly optimized. Two shortcuts skip repeated
	// checks for runes outside certain code point ranges; otherwise plain
	// switches, conditionals and binary searches classify each rune.
	//
	// Design notes:
	//
	// - Only one previous rune of state is kept.
	// - A rune that starts a new grapheme bumps `grapheme_count`; nothing
	//   is allocated unless the caller asked for position data.
	// - Most rules look backwards (at the previous rune), not forwards.
	// - Rules are kept verbose and in specification order for upkeep.
	//
	// Possible future optimizations named by the original author:
	//
	// - saving the type of the previous rune instead of the exact rune.
	// - reordering rules.
	// - combining tables.
	//
	// [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
	// [2]: https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
	// [3]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
	// [4]: https://www.unicode.org/reports/tr29/#Conformance

	// Which multi-rune sequence (if any) the previous rune belonged to.
	Sequence_Kind :: enum {
		None,
		Indic,
		Emoji,
		Regional,
	}

	context.allocator = allocator

	prev_rune:         rune          // rune seen in the previous iteration
	prev_forces_break: bool          // previous rune breaks after itself
	count_before:      int           // grapheme_count at the end of the previous iteration
	skip_rules_once:   bool          // next rune is handled by the "forward" check only
	ri_run_length:     int           // regional indicators seen in the current run
	sequence:          Sequence_Kind // sequence the previous rune belonged to
	keep_sequence:     bool          // set by a rule to carry `sequence` into the next iteration

	for r, offset in str {
		// Bookkeeping that must run however the iteration ends, including
		// through any of the `continue` statements below.
		defer {
			// "Break at the start and end of text, unless the text is empty."
			//
			// GB1:   sot  ÷  Any
			// GB2:   Any  ÷  eot
			if rune_count == 0 && grapheme_count == 0 {
				grapheme_count += 1
			}

			// The count grew during this iteration, so this rune starts a grapheme.
			if track_graphemes && grapheme_count > count_before {
				append(&graphemes, Grapheme{ byte_index = offset, rune_index = rune_count })
			}
			count_before = grapheme_count

			prev_rune = r
			rune_count += 1

			// Sequence state survives only if a rule asked for it.
			if !keep_sequence {
				sequence = .None
				ri_run_length = 0
			}
			keep_sequence = false
		}

		// "Do not break between a CR and LF. Otherwise, break before and after controls."
		//
		// GB3:                   CR  ×  LF
		// GB4:  (Control | CR | LF)  ÷
		// GB5:                       ÷  (Control | CR | LF)
		if r == '\n' && prev_rune == '\r' {
			prev_forces_break = false
			skip_rules_once = false
			continue
		}

		if is_control(r) {
			grapheme_count += 1
			prev_forces_break = true
			skip_rules_once = true
			continue
		}

		// (This check is for rules that work forwards, instead of backwards.)
		if skip_rules_once {
			if prev_forces_break {
				grapheme_count += 1
				prev_forces_break = false
			}

			skip_rules_once = false
			continue
		}

		// (Optimization 1: Prevent low runes from proceeding further.)
		//
		// * 0xA9 and 0xAE are in the Extended_Pictographic range,
		//   which is checked later in GB11.
		is_low_rune := r <= 0x2FF
		is_low_pictographic := r == 0xA9 || r == 0xAE
		if is_low_rune && !is_low_pictographic {
			grapheme_count += 1
			continue
		}

		// (Optimization 2: Check if the rune is in the Hangul space before getting specific.)
		if 0x1100 <= r && r <= 0xD7FB {
			// "Do not break Hangul syllable sequences."
			//
			// GB6:         L  ×  (L | V | LV | LVT)
			// GB7:  (LV | V)  ×  (V | T)
			// GB8: (LVT | T)  ×  T
			if is_hangul_syllable_leading(r) ||
			   is_hangul_syllable_lv(r) ||
			   is_hangul_syllable_lvt(r)
			{
				if !is_hangul_syllable_leading(prev_rune) {
					grapheme_count += 1
				}
				continue
			}

			if is_hangul_syllable_vowel(r) {
				joins_previous := is_hangul_syllable_leading(prev_rune) ||
				                  is_hangul_syllable_vowel(prev_rune) ||
				                  is_hangul_syllable_lv(prev_rune)
				if !joins_previous {
					grapheme_count += 1
				}
				continue
			}

			if is_hangul_syllable_trailing(r) {
				joins_previous := is_hangul_syllable_trailing(prev_rune) ||
				                  is_hangul_syllable_lvt(prev_rune) ||
				                  is_hangul_syllable_lv(prev_rune) ||
				                  is_hangul_syllable_vowel(prev_rune)
				if !joins_previous {
					grapheme_count += 1
				}
				continue
			}
		}

		// "Do not break before extending characters or ZWJ."
		//
		// GB9:  ×  (Extend | ZWJ)
		if r == ZERO_WIDTH_JOINER {
			keep_sequence = true
			continue
		}

		if is_gcb_extend_class(r) {
			if sequence == .Indic {
				// (Support for GB9c.)
				if is_indic_conjunct_break_extend(r) && (
				   is_indic_conjunct_break_linker(prev_rune) ||
				   is_indic_conjunct_break_consonant(prev_rune) )
				{
					keep_sequence = true
				} else if is_indic_conjunct_break_linker(r) && (
				   is_indic_conjunct_break_linker(prev_rune) ||
				   is_indic_conjunct_break_extend(prev_rune) ||
				   is_indic_conjunct_break_consonant(prev_rune) )
				{
					keep_sequence = true
				}
			} else if sequence == .Emoji && (
			   is_gcb_extend_class(prev_rune) ||
			   is_emoji_extended_pictographic(prev_rune) )
			{
				// (Support for GB11.)
				keep_sequence = true
			}

			continue
		}

		// _The GB9a and GB9b rules only apply to extended grapheme clusters:_
		// "Do not break before SpacingMarks, or after Prepend characters."
		//
		// GB9a:           ×  SpacingMark
		// GB9b:  Prepend  ×
		if is_spacing_mark(r) {
			continue
		}

		if is_gcb_prepend_class(r) {
			grapheme_count += 1
			skip_rules_once = true
			continue
		}

		// _The GB9c rule only applies to extended grapheme clusters:_
		// "Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker."
		//
		// GB9c: \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* × \p{InCB=Consonant}
		if is_indic_conjunct_break_consonant(r) {
			switch {
			case sequence != .Indic:
				// First consonant: starts a grapheme and opens an Indic sequence.
				grapheme_count += 1
				sequence = .Indic
				keep_sequence = true
			case prev_rune == ZERO_WIDTH_JOINER || is_indic_conjunct_break_linker(prev_rune):
				keep_sequence = true
			case:
				grapheme_count += 1
			}
			continue
		}

		if is_indic_conjunct_break_extend(r) {
			switch {
			case sequence != .Indic:
				// Outside an Indic sequence: no break, no state change.
			case is_indic_conjunct_break_consonant(prev_rune) || is_indic_conjunct_break_linker(prev_rune):
				keep_sequence = true
			case:
				grapheme_count += 1
			}
			continue
		}

		if is_indic_conjunct_break_linker(r) {
			switch {
			case sequence != .Indic:
				// Outside an Indic sequence: no break, no state change.
			case is_indic_conjunct_break_extend(prev_rune) || is_indic_conjunct_break_linker(prev_rune):
				keep_sequence = true
			case:
				grapheme_count += 1
			}
			continue
		}

		//
		// (Curiously, there is no GB10.)
		//

		// "Do not break within emoji modifier sequences or emoji zwj sequences."
		//
		// GB11: \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
		if is_emoji_extended_pictographic(r) {
			joined_by_zwj := sequence == .Emoji && prev_rune == ZERO_WIDTH_JOINER
			if !joined_by_zwj {
				grapheme_count += 1
			}
			sequence = .Emoji
			keep_sequence = true
			continue
		}

		// "Do not break within emoji flag sequences.
		// That is, do not break between regional indicator (RI) symbols
		// if there is an odd number of RI characters before the break point."
		//
		// GB12:   sot (RI RI)* RI × RI
		// GB13: [^RI] (RI RI)* RI × RI
		if is_regional_indicator(r) {
			// An even number of preceding RIs means this one opens a new pair.
			if (ri_run_length & 1) == 0 {
				grapheme_count += 1
			}

			sequence = .Regional
			keep_sequence = true
			ri_run_length += 1

			continue
		}

		// "Otherwise, break everywhere."
		//
		// GB999: Any ÷ Any
		grapheme_count += 1
	}

	return
}
|
||||
73
tests/core/unicode/test_core_unicode.odin
Normal file
73
tests/core/unicode/test_core_unicode.odin
Normal file
@@ -0,0 +1,73 @@
|
||||
package test_core_unicode
|
||||
|
||||
import "core:log"
|
||||
import "core:testing"
|
||||
import "core:unicode/utf8"
|
||||
|
||||
// One grapheme-counting test: an input string and the number of grapheme
// clusters `utf8.grapheme_count` is expected to report for it.
Test_Case :: struct {
|
||||
// Input text handed to `utf8.grapheme_count`.
str: string,
|
||||
// Expected grapheme count for `str`.
expected_clusters: int,
|
||||
}
|
||||
|
||||
// Runs `utf8.grapheme_count` over every case and checks the grapheme total
// against `expected_clusters`. Each mismatch is reported through
// `testing.expectf`; a summary line is logged at the end, at error level
// when at least one case failed and at info level otherwise.
run_test_cases :: proc(t: ^testing.T, test_cases: []Test_Case, loc := #caller_location) {
	failure_total := 0

	for test_case, index in test_cases {
		log.debugf("(#% 4i) %q ...", index, test_case.str)

		got, _ := utf8.grapheme_count(test_case.str)
		matched := testing.expectf(t, got == test_case.expected_clusters,
			"(#% 4i) graphemes: %i != %i, %q %s", index, got, test_case.expected_clusters, test_case.str, test_case.str,
			loc = loc)
		if !matched {
			failure_total += 1
		}
	}

	summary_level: log.Level = .Error if failure_total > 0 else .Info
	log.logf(summary_level, "% 4i/% 4i test cases failed.", failure_total, len(test_cases), location = loc)
}
|
||||
|
||||
// Runs the `official_grapheme_break_test_cases` table through `run_test_cases`.
@test
test_official_gcb_cases :: proc(t: ^testing.T) {
	cases := official_grapheme_break_test_cases
	run_test_cases(t, cases)
}
|
||||
|
||||
// Runs the `official_emoji_test_cases` table through `run_test_cases`.
@test
test_official_emoji_cases :: proc(t: ^testing.T) {
	cases := official_emoji_test_cases
	run_test_cases(t, cases)
}
|
||||
|
||||
// Checks that the `byte_index` / `rune_index` values returned by
// `utf8.decode_grapheme_clusters` can be used to slice a string back into
// its individual graphemes.
@test
test_grapheme_byte_index_segmentation :: proc(t: ^testing.T) {
	// Three multi-rune samples of 1, 7 and 4 runes respectively.
	SAMPLE_1 :: "\U0001F600"
	SAMPLE_2 :: "\U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006E\U000E0067\U000E007F"
	SAMPLE_3 :: "\U0001F468\U0001F3FB\u200D\U0001F9B0"

	str := SAMPLE_1 + SAMPLE_2 + SAMPLE_3 + SAMPLE_2 + SAMPLE_1

	graphemes, _, _ := utf8.decode_grapheme_clusters(str)
	defer delete(graphemes)

	// Dump the decoded data only when something below has failed.
	defer if testing.failed(t) {
		log.infof("%#v\n%q\n%v", graphemes, str, transmute([]u8)str)
	}

	// Bail out early: the indexing below assumes exactly five entries.
	if !testing.expect_value(t, len(graphemes), 5) {
		return
	}

	// Expected starts are the running sums of the sample rune counts.
	testing.expect_value(t, graphemes[0].rune_index, 0)
	testing.expect_value(t, graphemes[1].rune_index, 1)
	testing.expect_value(t, graphemes[2].rune_index, 8)
	testing.expect_value(t, graphemes[3].rune_index, 12)
	testing.expect_value(t, graphemes[4].rune_index, 19)

	grapheme_1 := str[graphemes[0].byte_index:graphemes[1].byte_index]
	grapheme_2 := str[graphemes[1].byte_index:graphemes[2].byte_index]
	grapheme_3 := str[graphemes[2].byte_index:graphemes[3].byte_index]
	grapheme_4 := str[graphemes[3].byte_index:graphemes[4].byte_index]
	grapheme_5 := str[graphemes[4].byte_index:]

	// Each failure message reports the slice that was actually compared.
	testing.expectf(t, grapheme_1 == SAMPLE_1, "expected %q, got %q", SAMPLE_1, grapheme_1)
	testing.expectf(t, grapheme_2 == SAMPLE_2, "expected %q, got %q", SAMPLE_2, grapheme_2)
	testing.expectf(t, grapheme_3 == SAMPLE_3, "expected %q, got %q", SAMPLE_3, grapheme_3)
	testing.expectf(t, grapheme_4 == SAMPLE_2, "expected %q, got %q", SAMPLE_2, grapheme_4)
	testing.expectf(t, grapheme_5 == SAMPLE_1, "expected %q, got %q", SAMPLE_1, grapheme_5)
}
|
||||
4912
tests/core/unicode/test_core_unicode_data.odin
Normal file
4912
tests/core/unicode/test_core_unicode_data.odin
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user