Odin/core/unicode/letter.odin

package unicode

import "base:runtime"

MAX_RUNE         :: '\U0010ffff' // Maximum valid Unicode code point (U+10FFFF)
REPLACEMENT_CHAR :: '\ufffd'     // Represents an invalid code point
MAX_ASCII        :: '\u007f'     // Maximum ASCII value
MAX_LATIN1       :: '\u00ff'     // Maximum Latin-1 value

// Zero-width / joiner code points.
// `normalized_east_asian_width` reports a width of 0 for each of these.
ZERO_WIDTH_SPACE      :: '\u200B' // U+200B
ZERO_WIDTH_NON_JOINER :: '\u200C' // U+200C
ZERO_WIDTH_JOINER     :: '\u200D' // U+200D
WORD_JOINER           :: '\u2060' // U+2060

// Bisects `table`, viewed as `length` records of `stride` entries each, on the
// first entry (the key) of every record.
//
// Returns the index into `table` of the key of the last record whose key is
// `<= c`, or -1 when `c` is smaller than the first key. The caller still has to
// check whether `c` actually matches/falls inside the record that was found.
//
// The bisection only gives meaningful answers when the keys are in ascending
// order. The highest index that could be touched (`length*stride-1`) is
// validated once up front against `len(table)` and reported at `loc`; the loop
// itself runs without bounds checks.
@(require_results)
binary_search :: proc(c: i32, table: []i32, length, stride: int, loc := #caller_location) -> int #no_bounds_check {
	runtime.bounds_check_error_loc(loc, length*stride-1, len(table))

	remaining := length // number of candidate records still in play
	base := 0           // index of the key of the first candidate record

	for remaining > 1 {
		half := remaining / 2
		probe := base + half*stride
		if c < table[probe] {
			// The answer lies strictly before the probed record.
			remaining = half
			continue
		}
		// The probed record is still a candidate; drop everything before it.
		base = probe
		remaining -= half
	}

	if remaining == 0 || c < table[base] {
		return -1
	}
	return base
}

// Maps `r` through the lower-case tables; a rune with no entry in either table
// is returned unchanged.
@(require_results)
to_lower :: proc(r: rune) -> rune #no_bounds_check {
	code := i32(r)

	// Range table: records of three entries read as [first, last, delta+500].
	idx := binary_search(code, to_lower_ranges[:], len(to_lower_ranges)/3, 3)
	if idx >= 0 && to_lower_ranges[idx] <= code && code <= to_lower_ranges[idx+1] {
		shifted := code + to_lower_ranges[idx+2]
		return rune(shifted - 500)
	}

	// Singlet table: records of two entries read as [code point, delta+500].
	idx = binary_search(code, to_lower_singlets[:], len(to_lower_singlets)/2, 2)
	if idx >= 0 && code == to_lower_singlets[idx] {
		shifted := code + to_lower_singlets[idx+1]
		return rune(shifted - 500)
	}

	return rune(code)
}
// Maps `r` through the upper-case tables; a rune with no entry in either table
// is returned unchanged.
@(require_results)
to_upper :: proc(r: rune) -> rune #no_bounds_check {
	code := i32(r)

	// Range table: records of three entries read as [first, last, delta+500].
	idx := binary_search(code, to_upper_ranges[:], len(to_upper_ranges)/3, 3)
	if idx >= 0 && to_upper_ranges[idx] <= code && code <= to_upper_ranges[idx+1] {
		shifted := code + to_upper_ranges[idx+2]
		return rune(shifted - 500)
	}

	// Singlet table: records of two entries read as [code point, delta+500].
	idx = binary_search(code, to_upper_singlets[:], len(to_upper_singlets)/2, 2)
	if idx >= 0 && code == to_upper_singlets[idx] {
		shifted := code + to_upper_singlets[idx+1]
		return rune(shifted - 500)
	}

	return rune(code)
}
// Maps `r` through the title-case singlet table; a rune with no entry is
// returned unchanged.
//
// `to_title_singlets` is read as records of two entries: [code point, delta+500].
@(require_results)
to_title :: proc(r: rune) -> rune #no_bounds_check {
	c := i32(r)
	// The search, the equality check and the delta lookup must all use the same
	// table: an index found in one table is meaningless in another, and the
	// record count passed to `binary_search` has to describe the table searched.
	p := binary_search(c, to_title_singlets[:], len(to_title_singlets)/2, 2)
	if p >= 0 && c == to_title_singlets[p] {
		return rune(c + to_title_singlets[p+1] - 500)
	}
	return rune(c)
}


// Reports whether `r` is a lower-case letter.
//
// Runes up to MAX_ASCII are tested directly against 'a'..'z'. Anything else is
// considered lower-case when it has an entry in the upper-case mapping tables.
@(require_results)
is_lower :: proc(r: rune) -> bool #no_bounds_check {
	if r <= MAX_ASCII {
		// Unsigned subtraction wraps, so anything below 'a' becomes a large value.
		return u32(r)-'a' < 26
	}

	code := i32(r)

	idx := binary_search(code, to_upper_ranges[:], len(to_upper_ranges)/3, 3)
	in_range := idx >= 0 && to_upper_ranges[idx] <= code && code <= to_upper_ranges[idx+1]
	if in_range {
		return true
	}

	idx = binary_search(code, to_upper_singlets[:], len(to_upper_singlets)/2, 2)
	return idx >= 0 && code == to_upper_singlets[idx]
}

// Reports whether `r` is an upper-case letter.
//
// Runes up to MAX_ASCII are tested directly against 'A'..'Z'. Anything else is
// considered upper-case when it has an entry in the lower-case mapping tables.
@(require_results)
is_upper :: proc(r: rune) -> bool #no_bounds_check {
	if r <= MAX_ASCII {
		// Unsigned subtraction wraps, so anything below 'A' becomes a large value.
		return u32(r)-'A' < 26
	}

	code := i32(r)

	idx := binary_search(code, to_lower_ranges[:], len(to_lower_ranges)/3, 3)
	in_range := idx >= 0 && to_lower_ranges[idx] <= code && code <= to_lower_ranges[idx+1]
	if in_range {
		return true
	}

	idx = binary_search(code, to_lower_singlets[:], len(to_lower_singlets)/2, 2)
	return idx >= 0 && code == to_lower_singlets[idx]
}

is_alpha :: is_letter
// Reports whether `r` is a letter.
//
// Runes up to MAX_LATIN1 are answered from `char_properties` (any bit of
// `pLmask` set). Above that, a rune is a letter when it is upper- or lower-case,
// or when it is found in `alpha_ranges` / `alpha_singlets`.
@(require_results)
is_letter :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) <= MAX_LATIN1 {
		props := char_properties[u8(r)]
		return props&pLmask != 0
	}

	if is_upper(r) {
		return true
	}
	if is_lower(r) {
		return true
	}

	code := i32(r)

	// Range table: records of two entries read as [first, last].
	idx := binary_search(code, alpha_ranges[:], len(alpha_ranges)/2, 2)
	if idx >= 0 && alpha_ranges[idx] <= code && code <= alpha_ranges[idx+1] {
		return true
	}

	// Singlet table: one code point per record.
	idx = binary_search(code, alpha_singlets[:], len(alpha_singlets), 1)
	return idx >= 0 && code == alpha_singlets[idx]
}

// Reports whether `r` is title-case, defined here as a rune for which both
// `is_upper` and `is_lower` report true.
@(require_results)
is_title :: proc(r: rune) -> bool {
	if !is_upper(r) {
		return false
	}
	return is_lower(r)
}

// Reports whether `r` is one of the digits '0' through '9'.
// Every rune above MAX_LATIN1 reports false.
@(require_results)
is_digit :: proc(r: rune) -> bool {
	if r > MAX_LATIN1 {
		return false
	}
	return '0' <= r && r <= '9'
}


is_white_space :: is_space
// Reports whether `r` is white space.
//
// Up to MAX_LATIN1 the set is fixed: '\t', '\n', '\v', '\f', '\r', ' ',
// 0x85 and 0xa0. Above that, `r` must fall inside a range of `space_ranges`.
@(require_results)
is_space :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) <= MAX_LATIN1 {
		switch r {
		case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xa0:
			return true
		case:
			return false
		}
	}

	code := i32(r)
	// Range table: records of two entries read as [first, last].
	idx := binary_search(code, space_ranges[:], len(space_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return space_ranges[idx] <= code && code <= space_ranges[idx+1]
}

// Reports whether `r` lies in one of these fixed code point ranges:
// 0x0300-0x036f, 0x1ab0-0x1aff, 0x1dc0-0x1dff, 0x20d0-0x20ff or 0xfe20-0xfe2f.
@(require_results)
is_combining :: proc(r: rune) -> bool {
	switch r {
	case 0x0300 ..= 0x036f,
	     0x1ab0 ..= 0x1aff,
	     0x1dc0 ..= 0x1dff,
	     0x20d0 ..= 0x20ff,
	     0xfe20 ..= 0xfe2f:
		return true
	}
	return false
}


// Reports whether the `char_properties` entry for `r` has the `pg` flag set.
// Only runes up to MAX_LATIN1 are covered; every other rune reports false.
@(require_results)
is_graphic :: proc(r: rune) -> bool {
	if u32(r) > MAX_LATIN1 {
		return false
	}
	props := char_properties[u8(r)]
	return props&pg != 0
}

// Reports whether the `char_properties` entry for `r` has the `pp` flag set.
// Only runes up to MAX_LATIN1 are covered; every other rune reports false.
@(require_results)
is_print :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) > MAX_LATIN1 {
		return false
	}
	props := char_properties[u8(r)]
	return props&pp != 0
}

// Reports whether the `char_properties` entry for `r` has the `pC` flag set.
// Only runes up to MAX_LATIN1 are covered; every other rune reports false.
@(require_results)
is_control :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) > MAX_LATIN1 {
		return false
	}
	props := char_properties[u8(r)]
	return props&pC != 0
}

// Reports whether the `char_properties` entry for `r` has the `pN` flag set.
// Only runes up to MAX_LATIN1 are covered; every other rune reports false.
@(require_results)
is_number :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) > MAX_LATIN1 {
		return false
	}
	props := char_properties[u8(r)]
	return props&pN != 0
}

// Reports whether the `char_properties` entry for `r` has the `pP` flag set.
// Only runes up to MAX_LATIN1 are covered; every other rune reports false.
@(require_results)
is_punct :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) > MAX_LATIN1 {
		return false
	}
	props := char_properties[u8(r)]
	return props&pP != 0
}

// Reports whether the `char_properties` entry for `r` has the `pS` flag set.
// Only runes up to MAX_LATIN1 are covered; every other rune reports false.
@(require_results)
is_symbol :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) > MAX_LATIN1 {
		return false
	}
	props := char_properties[u8(r)]
	return props&pS != 0
}

//
// The procedures below are accurate as of Unicode 15.1.0.
//

// Emoji_Modifier
//
// True for the five code points U+1F3FB through U+1F3FF.
@(require_results)
is_emoji_modifier :: proc(r: rune) -> bool {
	switch r {
	case 0x1F3FB ..= 0x1F3FF:
		return true
	}
	return false
}

// Regional_Indicator
//
// True for U+1F1E6 through U+1F1FF.
@(require_results)
is_regional_indicator :: proc(r: rune) -> bool {
	switch r {
	case 0x1F1E6 ..= 0x1F1FF:
		return true
	}
	return false
}

// General_Category=Enclosing_Mark
//
// Membership is a fixed list of code points and small ranges.
@(require_results)
is_enclosing_mark :: proc(r: rune) -> bool {
	switch r {
	case 0x0488 ..= 0x0489,
	     0x1ABE,
	     0x20DD ..= 0x20E0,
	     0x20E2 ..= 0x20E4,
	     0xA670 ..= 0xA672:
		return true
	case:
		return false
	}
}

// Prepended_Concatenation_Mark
//
// Membership is a fixed list of code points and small ranges.
@(require_results)
is_prepended_concatenation_mark :: proc(r: rune) -> bool {
	switch r {
	case 0x00600 ..= 0x00605, 0x006DD, 0x0070F:
		return true
	case 0x00890 ..= 0x00891, 0x008E2:
		return true
	case 0x110BD, 0x110CD:
		return true
	}
	return false
}

// General_Category=Spacing_Mark
//
// True when `r` falls inside one of the [first, last] records of
// `spacing_mark_ranges`.
@(require_results)
is_spacing_mark :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, spacing_mark_ranges[:], len(spacing_mark_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return spacing_mark_ranges[idx] <= code && code <= spacing_mark_ranges[idx+1]
}

// General_Category=Nonspacing_Mark
//
// True when `r` falls inside one of the [first, last] records of
// `nonspacing_mark_ranges`.
@(require_results)
is_nonspacing_mark :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, nonspacing_mark_ranges[:], len(nonspacing_mark_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return nonspacing_mark_ranges[idx] <= code && code <= nonspacing_mark_ranges[idx+1]
}

// Extended_Pictographic
//
// True when `r` falls inside one of the [first, last] records of
// `emoji_extended_pictographic_ranges`.
@(require_results)
is_emoji_extended_pictographic :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, emoji_extended_pictographic_ranges[:], len(emoji_extended_pictographic_ranges)/2, 2)
	found := idx >= 0 && emoji_extended_pictographic_ranges[idx] <= code && code <= emoji_extended_pictographic_ranges[idx+1]
	return found
}

// Grapheme_Extend
//
// True when `r` falls inside one of the [first, last] records of
// `grapheme_extend_ranges`.
@(require_results)
is_grapheme_extend :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, grapheme_extend_ranges[:], len(grapheme_extend_ranges)/2, 2)
	found := idx >= 0 && grapheme_extend_ranges[idx] <= code && code <= grapheme_extend_ranges[idx+1]
	return found
}


// Hangul_Syllable_Type=Leading_Jamo
//
// True for U+1100-U+115F and U+A960-U+A97C.
@(require_results)
is_hangul_syllable_leading :: proc(r: rune) -> bool {
	switch r {
	case 0x1100 ..= 0x115F,
	     0xA960 ..= 0xA97C:
		return true
	}
	return false
}

// Hangul_Syllable_Type=Vowel_Jamo
//
// True for U+1160-U+11A7 and U+D7B0-U+D7C6.
@(require_results)
is_hangul_syllable_vowel :: proc(r: rune) -> bool {
	switch r {
	case 0x1160 ..= 0x11A7,
	     0xD7B0 ..= 0xD7C6:
		return true
	}
	return false
}

// Hangul_Syllable_Type=Trailing_Jamo
//
// True for U+11A8-U+11FF and U+D7CB-U+D7FB.
@(require_results)
is_hangul_syllable_trailing :: proc(r: rune) -> bool {
	switch r {
	case 0x11A8 ..= 0x11FF,
	     0xD7CB ..= 0xD7FB:
		return true
	}
	return false
}

// Hangul_Syllable_Type=LV_Syllable
//
// True when `r` is listed in `hangul_syllable_lv_singlets` (one code point per
// record).
@(require_results)
is_hangul_syllable_lv :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, hangul_syllable_lv_singlets[:], len(hangul_syllable_lv_singlets), 1)
	if idx < 0 {
		return false
	}
	return code == hangul_syllable_lv_singlets[idx]
}

// Hangul_Syllable_Type=LVT_Syllable
//
// True when `r` falls inside one of the [first, last] records of
// `hangul_syllable_lvt_ranges`.
@(require_results)
is_hangul_syllable_lvt :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, hangul_syllable_lvt_ranges[:], len(hangul_syllable_lvt_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return hangul_syllable_lvt_ranges[idx] <= code && code <= hangul_syllable_lvt_ranges[idx+1]
}


// Indic_Syllabic_Category=Consonant_Preceding_Repha
//
// Membership is a fixed list of four code points.
@(require_results)
is_indic_consonant_preceding_repha :: proc(r: rune) -> bool {
	return r == 0x00D4E ||
	       r == 0x11941 ||
	       r == 0x11D46 ||
	       r == 0x11F02
}

// Indic_Syllabic_Category=Consonant_Prefixed
//
// Membership is a fixed list of code points and small ranges.
@(require_results)
is_indic_consonant_prefixed :: proc(r: rune) -> bool {
	switch r {
	case 0x111C2 ..= 0x111C3, 0x1193F, 0x11A3A, 0x11A84 ..= 0x11A89:
		return true
	}
	return false
}

// Indic_Conjunct_Break=Linker
//
// Membership is a fixed list of six code points.
@(require_results)
is_indic_conjunct_break_linker :: proc(r: rune) -> bool {
	return r == 0x094D ||
	       r == 0x09CD ||
	       r == 0x0ACD ||
	       r == 0x0B4D ||
	       r == 0x0C4D ||
	       r == 0x0D4D
}

// Indic_Conjunct_Break=Consonant
//
// True when `r` falls inside one of the [first, last] records of
// `indic_conjunct_break_consonant_ranges`.
@(require_results)
is_indic_conjunct_break_consonant :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, indic_conjunct_break_consonant_ranges[:], len(indic_conjunct_break_consonant_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return indic_conjunct_break_consonant_ranges[idx] <= code && code <= indic_conjunct_break_consonant_ranges[idx+1]
}

// Indic_Conjunct_Break=Extend
//
// True when `r` falls inside one of the [first, last] records of
// `indic_conjunct_break_extend_ranges`.
@(require_results)
is_indic_conjunct_break_extend :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, indic_conjunct_break_extend_ranges[:], len(indic_conjunct_break_extend_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return indic_conjunct_break_extend_ranges[idx] <= code && code <= indic_conjunct_break_extend_ranges[idx+1]
}


/*
Grapheme_Cluster_Break=Prepend, for grapheme text segmentation.

From Unicode TR 29 Rev 43, a rune is in this class when any of these holds:

```
Indic_Syllabic_Category = Consonant_Preceding_Repha, or
Indic_Syllabic_Category = Consonant_Prefixed, or
Prepended_Concatenation_Mark = Yes
```
*/
@(require_results)
is_gcb_prepend_class :: proc(r: rune) -> bool {
	if is_indic_consonant_preceding_repha(r) {
		return true
	}
	if is_indic_consonant_prefixed(r) {
		return true
	}
	return is_prepended_concatenation_mark(r)
}

/*
Grapheme_Cluster_Break=Extend, for grapheme text segmentation.

From Unicode TR 29 Rev 43, a rune is in this class when either of these holds:

```
Grapheme_Extend = Yes, or
Emoji_Modifier = Yes

This includes:
General_Category = Nonspacing_Mark
General_Category = Enclosing_Mark
U+200C ZERO WIDTH NON-JOINER

plus a few General_Category = Spacing_Mark needed for canonical equivalence.
```
*/
@(require_results)
is_gcb_extend_class :: proc(r: rune) -> bool {
	if is_grapheme_extend(r) {
		return true
	}
	return is_emoji_modifier(r)
}

// Reports how many columns `r` is expected to occupy.
//
// Return values:
//
// - 2 if East_Asian_Width=F or W, or
// - 0 if non-printable / zero-width, or
// - 1 in all other cases.
//
// The value for runes above 0x10FF comes from the third entry of the matching
// [first, last, width] record in `normalized_east_asian_width_ranges`.
@(require_results)
normalized_east_asian_width :: proc(r: rune) -> int #no_bounds_check {
	// This is a different interpretation of the BOM which occurs in the middle of text.
	ZERO_WIDTH_NO_BREAK_SPACE :: '\uFEFF'

	if is_control(r) {
		return 0
	}
	if r <= 0x10FF {
		// Easy early out for low runes.
		return 1
	}

	// Explicit zero-width code points.
	switch r {
	case ZERO_WIDTH_NO_BREAK_SPACE,
	     ZERO_WIDTH_SPACE,
	     ZERO_WIDTH_NON_JOINER,
	     ZERO_WIDTH_JOINER,
	     WORD_JOINER:
		return 0
	}

	code := i32(r)
	idx := binary_search(code, normalized_east_asian_width_ranges[:], len(normalized_east_asian_width_ranges)/3, 3)
	if idx < 0 {
		return 1
	}

	in_range := normalized_east_asian_width_ranges[idx] <= code && code <= normalized_east_asian_width_ranges[idx+1]
	if !in_range {
		return 1
	}
	return int(normalized_east_asian_width_ranges[idx+2])
}

//
// End of Unicode 15.1.0 block.
//