Merge pull request #6393 from Kelimion/unicode

Unicode
2026-07-21 15:11:12 +00:00 · 2026-03-09 20:00:09 +01:00
parent 1b23231e4a d880404b84
commit dca824c6af
10 changed files with 46464 additions and 34 deletions
--- a/core/unicode/generated.odin
+++ b/core/unicode/generated.odin
--- a/core/unicode/inrange.odin
+++ b/core/unicode/inrange.odin
@@ -0,0 +1,45 @@
+package unicode
+
+/*
+Check to see if the rune `r` is in `range`.
+
+The 16-bit tables are searched only when `r` lies in 0..=0xFFFF; if that gives no
+match (or was skipped) the 32-bit tables are searched. `ranges_*` hold inclusive
+(first, last) pairs and `single_*` hold individual code points.
+
+Inputs:
+- r: The rune to look up.
+- range: The set of tables to search.
+
+Returns:
+`true` if `r` is found in any table of `range`, `false` otherwise.
+*/
+in_range :: proc(r: rune, range: Range) -> bool {
+	// Negative runes must not reach the u16 cast: it would wrap them onto a valid code point.
+	if r >= 0 && r <= 0xFFFF {
+		r16 := cast(u16) r
+		length := len(range.ranges_16)
+		index := binary_search(r16, range.ranges_16, length/2, 2) if length > 0 else -1
+		if index >= 0 && range.ranges_16[index] <= r16 && range.ranges_16[index+1] >= r16 {
+			return true
+		}
+
+		length = len(range.single_16)
+		index = binary_search(r16, range.single_16, length, 1) if length > 0 else -1
+		if index >= 0 && range.single_16[index] == r16 {
+			return true
+		}
+	}
+
+	r32 := cast(i32) r
+
+	length := len(range.ranges_32)
+	index := binary_search(r32, range.ranges_32, length/2, 2) if length > 0 else -1
+	if index >= 0 && range.ranges_32[index] <= r32 && range.ranges_32[index+1] >= r32 {
+		return true
+	}
+
+	length = len(range.single_32)
+	index = binary_search(r32, range.single_32, length, 1) if length > 0 else -1
+	return index >= 0 && range.single_32[index] == r32
+}
--- a/core/unicode/letter.odin
+++ b/core/unicode/letter.odin
@@ -13,7 +13,7 @@ ZERO_WIDTH_JOINER     :: '\u200D'
 WORD_JOINER           :: '\u2060'

@(require_results)
-binary_search :: proc(c: i32, table: []i32, length, stride: int, loc := #caller_location) -> int #no_bounds_check {
+binary_search :: proc(c: $T, table: []T, length, stride: int, loc := #caller_location) -> int #no_bounds_check {
 	runtime.bounds_check_error_loc(loc, length*stride-1, len(table))
 	n := length
 	t := 0
@@ -75,16 +75,7 @@ is_lower :: proc(r: rune) -> bool #no_bounds_check {
 	if r <= MAX_ASCII {
 		return u32(r)-'a' < 26
 	}
-	c := i32(r)
-	p := binary_search(c, to_upper_ranges[:], len(to_upper_ranges)/3, 3)
-	if p >= 0 && to_upper_ranges[p] <= c && c <= to_upper_ranges[p+1] {
-		return true
-	}
-	p = binary_search(c, to_upper_singlets[:], len(to_upper_singlets)/2, 2)
-	if p >= 0 && c == to_upper_singlets[p] {
-		return true
-	}
-	return false
+	return in_range(r, ll_ranges) || in_range(r, other_lowercase_ranges)
 }

@(require_results)
@@ -92,19 +83,22 @@ is_upper :: proc(r: rune) -> bool #no_bounds_check {
 	if r <= MAX_ASCII {
 		return u32(r)-'A' < 26
 	}
-	c := i32(r)
-	p := binary_search(c, to_lower_ranges[:], len(to_lower_ranges)/3, 3)
-	if p >= 0 && to_lower_ranges[p] <= c && c <= to_lower_ranges[p+1] {
-		return true
-	}
-	p = binary_search(c, to_lower_singlets[:], len(to_lower_singlets)/2, 2)
-	if p >= 0 && c == to_lower_singlets[p] {
-		return true
-	}
-	return false
+	return in_range(r, lu_ranges) || in_range(r, other_uppercase_ranges)
 }

 is_alpha :: is_letter
+
+/*
+Return true if the rune `r` is a letter. Being a letter means that the rune has
+the Unicode general category property of L. In practice, the character will have
+a general category property of Ll, Lm, Lo, Lt, or Lu.
+
+Inputs:
+- r: The rune which will be checked for having the property of being a letter.
+
+Returns:
+`true` when the rune `r` is a letter. `false` will be returned in all other cases.
+*/
@(require_results)
 is_letter :: proc(r: rune) -> bool #no_bounds_check {
 	if u32(r) <= MAX_LATIN1 {
@@ -114,16 +108,9 @@ is_letter :: proc(r: rune) -> bool #no_bounds_check {
 		return true
 	}

-	c := i32(r)
-	p := binary_search(c, alpha_ranges[:], len(alpha_ranges)/2, 2)
-	if p >= 0 && alpha_ranges[p] <= c && c <= alpha_ranges[p+1] {
-		return true
-	}
-	p = binary_search(c, alpha_singlets[:], len(alpha_singlets), 1)
-	if p >= 0 && c == alpha_singlets[p] {
-		return true
-	}
-	return false
+	ll_lu := in_range(r, ll_ranges) || in_range(r, lu_ranges) 	
+
+	return ll_lu || in_range(r, lo_ranges) || in_range(r, lt_ranges) || in_range(r, lm_ranges) 
 }

@(require_results)
@@ -131,11 +118,45 @@ is_title :: proc(r: rune) -> bool {
 	return is_upper(r) && is_lower(r)
 }

+/*
+Returns true if the rune `r` is in the General Category Nd.
+
+Inputs:
+- r: The rune to check if it is in the general category Nd.
+
+Returns:
+`true` if the rune is found in the `nd_ranges` table (general category Nd) and
+`false` otherwise.
+*/
+is_decimal :: proc(r: rune) -> bool {
+	return in_range(r, nd_ranges)
+}
+
+/*
+This function determines if a rune is a digit. To be a digit the
+character either has a Numeric_Type of Digit or Decimal.
+
+Inputs:
+- r: The rune to check if it is a digit.
+
+Returns:
+`true` if the rune `r` is a digit, `false` in all other cases
+
+*/
 @(require_results)
 is_digit :: proc(r: rune) -> bool {
 	if r <= MAX_LATIN1 {
-		return '0' <= r && r <= '9'
+		return ('0' <= r && r <= '9') || r == 0x00B9 || (r >= 0x00B2 && r <= 0x0B3) // '0'..'9' plus U+00B9, U+00B2 and U+00B3
 	}
+
+	if in_range(r, nd_ranges) {
+		return true
+	}
+	
+	if in_range(r, extra_digits_ranges) {
+		return true
+	}
+
 	return false
 }

@@ -176,6 +197,15 @@ is_graphic :: proc(r: rune) -> bool {
 	if u32(r) <= MAX_LATIN1 {
 		return char_properties[u8(r)]&pg != 0
 	}
+
+	if is_letter(r) || is_number(r) || is_punct(r) || is_symbol(r) || in_range(r, zs_ranges) {
+		return true
+	}
+
+	if  in_range(r, mc_ranges) || in_range(r, me_ranges) || in_range(r, mn_ranges) {
+		return true
+	}
+
 	return false
 }

@@ -195,12 +225,25 @@ is_control :: proc(r: rune) -> bool #no_bounds_check {
 	return false
 }

+/*
+Checks to see if the rune `r` is a number. This means the rune is a member
+of the general category Nd, Nl, or No.
+
+Inputs:
+- r: The rune to check if it is a number.
+
+Returns:
+`true` if the rune belongs to the general category Nd, Nl, or No. `false`
+is returned in all other cases.
+
+*/
 @(require_results)
 is_number :: proc(r: rune) -> bool #no_bounds_check {
 	if u32(r) <= MAX_LATIN1 {
 		return char_properties[u8(r)]&pN != 0
 	}
-	return false
+
+	return in_range(r, nd_ranges) || in_range(r, nl_ranges) || in_range(r, no_ranges)
 }

@(require_results)
@@ -208,7 +251,16 @@ is_punct :: proc(r: rune) -> bool #no_bounds_check {
 	if u32(r) <= MAX_LATIN1 {
 		return char_properties[u8(r)]&pP != 0
 	}
-	return false
+
+	if in_range(r, pc_ranges) || in_range(r, pd_ranges) || in_range(r, pe_ranges) {
+		return true
+	}
+	
+	if in_range(r, pf_ranges) || in_range(r, pi_ranges) || in_range(r, po_ranges) {
+		return true
+	}
+
+	return in_range(r, ps_ranges)
 }

@(require_results)
@@ -216,6 +268,13 @@ is_symbol :: proc(r: rune) -> bool #no_bounds_check {
 	if u32(r) <= MAX_LATIN1 {
 		return char_properties[u8(r)]&pS != 0
 	}
+
+	s := in_range(r, sc_ranges) || in_range(r, sm_ranges) 
+	
+	if s || in_range(r, so_ranges) || in_range(r, sk_ranges) {
+		return true
+	}
+
 	return false
 }

--- a/core/unicode/tools/ucd/generate_unicode.odin
+++ b/core/unicode/tools/ucd/generate_unicode.odin
@@ -0,0 +1,287 @@
+package ucd
+
+import "core:fmt"
+import "core:os"
+import "core:strings"
+import "core:mem"
+import "core:io"
+import "core:log"
+
+// Table 2-3. Types of Code Points
+// Table 4-4. General_Category Values page 229
+// Reference https://www.unicode.org/reports/tr44/
+
+/*
+Emits the tables of a `Dynamic_Range` as fixed-length array declarations on an `io.Writer`.
+The arrays are named `<name>_singles16`, `<name>_ranges16`, `<name>_singles32` and
+`<name>_ranges32`; a table that is empty in `range` produces no output at all.
+Each array that is written is preceded by an `@(rodata)` attribute line.
+
+Inputs:
+- writer: The `io.Writer` that receives the generated text.
+- name: Prefix placed in front of each array name.
+- range: The `Dynamic_Range` whose tables are written.
+*/
+write_range_arrays :: proc(writer: io.Writer, name: string, range: Dynamic_Range) {
+	// 16-bit single code points, eight values per line.
+	if len(range.single_16) != 0 {
+		fmt.wprintln(writer, "@(rodata)")
+		fmt.wprintf(writer, "%s_singles16 := [?]u16{{", name)
+		for value, i in range.single_16 {
+			if i % 8 != 0 {
+				fmt.wprintf(writer, " 0x%4X,", value)
+			} else {
+				fmt.wprintf(writer, "\n\t0x%4X,", value)
+			}
+		}
+		fmt.wprintln(writer, "\n}\n")
+	}
+
+	// 16-bit ranges, one (first, last) pair per line.
+	if len(range.ranges_16) != 0 {
+		fmt.wprintln(writer, "@(rodata)")
+		fmt.wprintfln(writer, "%s_ranges16 := [?]u16{{", name)
+		for pair in range.ranges_16 {
+			fmt.wprintfln(writer, "\t0x%4X, 0x%4X,", pair.first, pair.last)
+		}
+		fmt.wprintln(writer, "}\n")
+	}
+
+	// 32-bit single code points, eight values per line.
+	if len(range.single_32) != 0 {
+		fmt.wprintln(writer, "@(rodata)")
+		fmt.wprintf(writer, "%s_singles32 := [?]i32{{", name)
+		for value, i in range.single_32 {
+			if i % 8 != 0 {
+				fmt.wprintf(writer, " 0x%4X,", value)
+			} else {
+				fmt.wprintf(writer, "\n\t0x%4X,", value)
+			}
+		}
+		fmt.wprintln(writer, "\n}\n")
+	}
+
+	// 32-bit ranges, one (first, last) pair per line.
+	if len(range.ranges_32) != 0 {
+		fmt.wprintln(writer, "@(rodata)")
+		fmt.wprintfln(writer, "%s_ranges32 := [?]i32{{", name)
+		for pair in range.ranges_32 {
+			fmt.wprintfln(writer, "\t0x%4X, 0x%4X,", pair.first, pair.last)
+		}
+		fmt.wprintln(writer, "}\n")
+	}
+}
+
+// Writes the tables of `range` through `write_range_arrays`, then a `<name>_ranges := Range{...}`
+// literal that slices every array that was emitted. `name` is lowercased before it is used.
+write_range :: proc(writer: io.Writer, name: union{string, General_Category}, range: Dynamic_Range) {
+	buffer: [128]byte
+	prefix: string
+
+	switch n in name {
+	case General_Category:
+		prefix = fmt.bprintf(buffer[:], "%s", n)
+
+	case string:
+		assert(len(n) <= len(buffer))
+		copy(buffer[:], n)
+		prefix = string(buffer[:len(n)])
+	}
+
+	// `prefix` is backed by `buffer`, so rewriting these bytes lowercases the table name.
+	for i in 0..<len(prefix) {
+		if 'A' <= buffer[i] && buffer[i] <= 'Z' {
+			buffer[i] += 'a' - 'A'
+		}
+	}
+
+	write_range_arrays(writer, prefix, range)
+
+	fmt.wprintfln(writer, "%s_ranges := Range{{", prefix)
+	if len(range.single_16) != 0 {
+		fmt.wprintfln(writer, "\tsingle_16 = %s_singles16[:],", prefix)
+	}
+	if len(range.ranges_16) != 0 {
+		fmt.wprintfln(writer, "\tranges_16 = %s_ranges16[:],", prefix)
+	}
+	if len(range.single_32) != 0 {
+		fmt.wprintfln(writer, "\tsingle_32 = %s_singles32[:],", prefix)
+	}
+	if len(range.ranges_32) != 0 {
+		fmt.wprintfln(writer, "\tranges_32 = %s_ranges32[:],", prefix)
+	}
+	fmt.wprintln(writer, "}\n")
+}
+
+GENERATED :: `/*
+	------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
+*/
+`
+
+MESSAGE :: `/*
+	This file is generated from UnicodeData.txt and PropList.txt. These files
+	are part of the Unicode Database (UCD) and are covered by the license
+	listed further down. They may be downloaded from the following locations;
+
+	https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
+	https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+	https://www.unicode.org/license.txt
+
+	------------------------------------------------------------------------------
+	UNICODE LICENSE V3
+
+	COPYRIGHT AND PERMISSION NOTICE
+
+	Copyright © 1991-2026 Unicode, Inc.
+
+	NOTICE TO USER: Carefully read the following legal agreement. BY
+	DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
+	SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+	TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
+	DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
+
+	Permission is hereby granted, free of charge, to any person obtaining a
+	copy of data files and any associated documentation (the "Data Files") or
+	software and any associated documentation (the "Software") to deal in the
+	Data Files or Software without restriction, including without limitation
+	the rights to use, copy, modify, merge, publish, distribute, and/or sell
+	copies of the Data Files or Software, and to permit persons to whom the
+	Data Files or Software are furnished to do so, provided that either (a)
+	this copyright and permission notice appear with all copies of the Data
+	Files or Software, or (b) this copyright and permission notice appear in
+	associated Documentation.
+
+	THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+	KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+	THIRD PARTY RIGHTS.
+
+	IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
+	BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
+	OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+	WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+	ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
+	FILES OR SOFTWARE.
+
+	Except as contained in this notice, the name of a copyright holder shall
+	not be used in advertising or otherwise to promote the sale, use or other
+	dealings in these Data Files or Software without prior written
+	authorization of the copyright holder.
+
+*/
+`
+
+// Generates `core/unicode/generated.odin` from the UCD files `UnicodeData.txt` and `PropList.txt`
+// found under `tests/core/assets/UCD/`. Nothing is written when either input fails to load.
+main :: proc() {
+	track: mem.Tracking_Allocator
+	mem.tracking_allocator_init(&track, context.allocator)
+	defer {
+		if len(track.allocation_map) > 0 {
+			fmt.eprintf("=== %v allocations not freed: ===\n", len(track.allocation_map))
+			for _, entry in track.allocation_map {
+				fmt.eprintf("- %v bytes @ %v\n", entry.size, entry.location)
+			}
+		}
+		mem.tracking_allocator_destroy(&track)
+	}
+
+	context.allocator = mem.tracking_allocator(&track)
+
+	context.logger = log.create_console_logger()
+	defer log.destroy_console_logger(context.logger)
+
+	ucd_path := ODIN_ROOT + "tests/core/assets/UCD/UnicodeData.txt"
+	unicode_data, ucd_err := load_unicode_data(ucd_path)
+	if ucd_err != nil {
+		log.errorf("Error loading Unicode data. %s", ucd_err)
+		return // do not generate tables from data that failed to load
+	}
+	defer destroy_unicode_data(unicode_data)
+
+	general_category_ranges := gc_ranges(&unicode_data)
+	defer destroy_general_category_ranges(general_category_ranges)
+
+	extra_digits := extra_digits(&unicode_data)
+	defer destroy_dynamic_range(extra_digits)
+
+	proplist_path := ODIN_ROOT + "tests/core/assets/UCD/PropList.txt"
+	proplist, proplist_err := load_property_list(proplist_path)
+	if proplist_err != nil {
+		log.errorf("Error loading PropList.txt. %s", proplist_err)
+		return
+	}
+	defer destroy_property_list(proplist)
+
+	sb := strings.builder_make_len_cap(0, 1024*32)
+	defer strings.builder_destroy(&sb)
+
+	writer := strings.to_writer(&sb)
+
+	fmt.wprintfln(writer, "package unicode\n")
+	fmt.wprintln(writer, GENERATED)
+	fmt.wprintln(writer, MESSAGE)
+
+	Range_Type :: "Range :: struct {\n" +
+		"\tsingle_16 : []u16,\n" +
+		"\tranges_16 : []u16,\n" +
+		"\tsingle_32 : []i32,\n" +
+		"\tranges_32 : []i32,\n" +
+		"}\n"
+
+	fmt.wprintfln(writer, "%s", Range_Type)
+
+	// List of the general categories to skip when generating the code for
+	// core/unicode/generated.odin.
+	to_exclude := [?]General_Category{
+		.Cc, // Control, a C0 or C1 control code
+		.Cf, // Format, a format control character
+		.Cn, // Unassigned, a reserved unassigned code point or a noncharacter
+		.Co, // Private_Use, a private-use character
+		.Cs, // Surrogate, a surrogate code point
+		// .Ll, // Lowercase_Letter, a lowercase letter
+		// .Lm, // Modifier_Letter, a modifier letter
+		// .Lo, // Other_Letter, other letters, including syllables and ideographs
+		// .Lt, // Titlecase_Letter, a digraph encoded as a single character, with first part uppercase
+		// .Lu, // Uppercase_Letter, an uppercase letter
+		// .Mc, // Spacing_Mark, a spacing combining mark (positive advance width)
+		// .Me, // Enclosing_Mark, an enclosing combining mark
+		// .Mn, // Nonspacing_Mark, a nonspacing combining mark (zero advance width)
+		//.Nd, // Decimal_Number, a decimal digit
+		//.Nl, // Letter_Number, a letterlike numeric character
+		//.No, // Other_Number, a numeric character of other type
+		// .Pc, // Connector_Punctuation, a connecting punctuation mark, like a tie
+		// .Pd, // Dash_Punctuation, a dash or hyphen punctuation mark
+		// .Pe, // Close_Punctuation, a closing punctuation mark (of a pair)
+		// .Pf, // Final_Punctuation, a final quotation mark
+		// .Pi, // Initial_Punctuation, an initial quotation mark
+		// .Po, // Other_Punctuation, a punctuation mark of other type
+		// .Ps, // Open_Punctuation, an opening punctuation mark (of a pair)
+		// .Sc, // Currency_Symbol, a currency sign
+		// .Sk, // Modifier_Symbol, a non-letterlike modifier symbol
+		// .Sm, // Math_Symbol, a symbol of mathematical use
+		// .So, // Other_Symbol, a symbol of other type
+		 .Zl, // Line_Separator, U+2028 LINE SEPARATOR only
+		 .Zp, // Paragraph_Separator, U+2029 PARAGRAPH SEPARATOR only
+		//.Zs, // Space_Separator, a space character (of various non-zero widths)
+	}
+
+	write_loop: for range, category in general_category_ranges {
+		for excluded in to_exclude {
+			if category == excluded {
+				continue write_loop
+			}
+		}
+		write_range(writer, category, range)
+	}
+
+	write_range(writer, "extra_digits",    extra_digits)
+	write_range(writer, "other_lowercase", proplist[.Other_Lowercase])
+	write_range(writer, "other_uppercase", proplist[.Other_Uppercase])
+
+	file_name := ODIN_ROOT + "core/unicode/generated.odin"
+	if write_error := os.write_entire_file_from_string(file_name, strings.to_string(sb)); write_error != nil {
+		log.errorf("Error %v writing %q", write_error, file_name)
+	}
+}
--- a/core/unicode/tools/ucd/string_to.odin
+++ b/core/unicode/tools/ucd/string_to.odin
@@ -0,0 +1,162 @@
+package ucd
+
+string_to_general_category :: proc "contextless" (str: string) -> (gc: General_Category, err: Error) {
+	switch str { // arms follow the declaration order of `General_Category`
+	case "Cc": gc = .Cc
+	case "Cf": gc = .Cf
+	case "Cn": gc = .Cn
+	case "Co": gc = .Co
+	case "Cs": gc = .Cs
+	case "Ll": gc = .Ll
+	case "Lm": gc = .Lm
+	case "Lo": gc = .Lo
+	case "Lt": gc = .Lt
+	case "Lu": gc = .Lu
+	case "Mc": gc = .Mc
+	case "Me": gc = .Me
+	case "Mn": gc = .Mn
+	case "Nd": gc = .Nd
+	case "Nl": gc = .Nl
+	case "No": gc = .No
+	case "Pc": gc = .Pc
+	case "Pd": gc = .Pd
+	case "Pe": gc = .Pe
+	case "Pf": gc = .Pf
+	case "Pi": gc = .Pi
+	case "Po": gc = .Po
+	case "Ps": gc = .Ps
+	case "Sc": gc = .Sc
+	case "Sk": gc = .Sk
+	case "Sm": gc = .Sm
+	case "So": gc = .So
+	case "Zl": gc = .Zl
+	case "Zp": gc = .Zp
+	case "Zs": gc = .Zs
+	case: err = .Invalid_General_Category // unrecognised abbreviation: `gc` keeps its zero value
+	}
+	return
+}
+
+// Maps a property name string to its `Prop_List_Property`; unrecognised names give `.Unknown_Property`.
+string_to_proplist_property :: proc(str: string) -> (prop: Prop_List_Property) {
+	switch str {
+	case "White_Space":                        return .White_Space
+	case "Bidi_Control":                       return .Bidi_Control
+	case "Join_Control":                       return .Join_Control
+	case "Dash":                               return .Dash
+	case "Hyphen":                             return .Hyphen
+	case "Quotation_Mark":                     return .Quotation_Mark
+	case "Terminal_Punctuation":               return .Terminal_Punctuation
+	case "Other_Math":                         return .Other_Math
+	case "Hex_Digit":                          return .Hex_Digit
+	case "ASCII_Hex_Digit":                    return .ASCII_Hex_Digit
+	case "Other_Alphabetic":                   return .Other_Alphabetic
+	case "Ideographic":                        return .Ideographic
+	case "Diacritic":                          return .Diacritic
+	case "Extender":                           return .Extender
+	case "Other_Lowercase":                    return .Other_Lowercase
+	case "Other_Uppercase":                    return .Other_Uppercase
+	case "Noncharacter_Code_Point":            return .Noncharacter_Code_Point
+	case "Other_Grapheme_Extend":              return .Other_Grapheme_Extend
+	case "IDS_Binary_Operator":                return .IDS_Binary_Operator
+	case "IDS_Trinary_Operator":               return .IDS_Trinary_Operator
+	case "IDS_Unary_Operator":                 return .IDS_Unary_Operator
+	case "Radical":                            return .Radical
+	case "Unified_Ideograph":                  return .Unified_Ideograph
+	case "Other_Default_Ignorable_Code_Point": return .Other_Default_Ignorable_Code_Point
+	case "Deprecated":                         return .Deprecated
+	case "Soft_Dotted":                        return .Soft_Dotted
+	case "Logical_Order_Exception":            return .Logical_Order_Exception
+	case "Other_ID_Start":                     return .Other_ID_Start
+	case "Other_ID_Continue":                  return .Other_ID_Continue
+	case "ID_Compat_Math_Continue":            return .ID_Compat_Math_Continue
+	case "ID_Compat_Math_Start":               return .ID_Compat_Math_Start
+	case "Sentence_Terminal":                  return .Sentence_Terminal
+	case "Variation_Selector":                 return .Variation_Selector
+	case "Pattern_White_Space":                return .Pattern_White_Space
+	case "Pattern_Syntax":                     return .Pattern_Syntax
+	case "Prepended_Concatenation_Mark":       return .Prepended_Concatenation_Mark
+	case "Regional_Indicator":                 return .Regional_Indicator
+	case "Modifier_Combining_Mark":            return .Modifier_Combining_Mark
+	case:                                      return .Unknown_Property
+	}
+}
+
+// Maps a version string such as "5.2" (or "unassigned") to its `Age`; anything else gives `.Age_Unknown`.
+@(deprecated="Unused?")
+string_to_age :: proc "contextless" (str: string) -> (age: Age) {
+	switch str {
+	case "1.1":        return .Age_1_1
+	case "2.0":        return .Age_2_0
+	case "2.1":        return .Age_2_1
+	case "3.0":        return .Age_3_0
+	case "3.1":        return .Age_3_1
+	case "3.2":        return .Age_3_2
+	case "4.0":        return .Age_4_0
+	case "4.1":        return .Age_4_1
+	case "5.0":        return .Age_5_0
+	case "5.1":        return .Age_5_1
+	case "5.2":        return .Age_5_2
+	case "6.0":        return .Age_6_0
+	case "6.1":        return .Age_6_1
+	case "6.2":        return .Age_6_2
+	case "6.3":        return .Age_6_3
+	case "7.0":        return .Age_7_0
+	case "8.0":        return .Age_8_0
+	case "9.0":        return .Age_9_0
+	case "10.0":       return .Age_10_0
+	case "11.0":       return .Age_11_0
+	case "12.0":       return .Age_12_0
+	case "12.1":       return .Age_12_1
+	case "13.0":       return .Age_13_0
+	case "14.0":       return .Age_14_0
+	case "15.0":       return .Age_15_0
+	case "15.1":       return .Age_15_1
+	case "16.0":       return .Age_16_0
+	case "17.0":       return .Age_17_0
+	case "unassigned": return .Age_Unassigned
+	case:              return .Age_Unknown
+	}
+}
+
+// Maps "o", "c" or "n" to the matching `Paired_Bracket_Type`; anything else gives `.Unknown`.
+@(deprecated="Unused?")
+string_to_paired_bracket_type :: proc "contextless" (str: string) -> (pbt: Paired_Bracket_Type) {
+	switch str {
+	case "o": return .Open
+	case "c": return .Close
+	case "n": return .None
+	case:     return .Unknown
+	}
+}
+
+// Maps a bidirectional class abbreviation to its `Bidi_Class`; unrecognised input gives `.Unknown`.
+@(deprecated="Unused?")
+string_to_bidi_class :: proc "contextless" (str: string) -> (class: Bidi_Class) {
+	switch str {
+	case "AL":  return .AL
+	case "AN":  return .AN
+	case "B":   return .B
+	case "BN":  return .BN
+	case "CS":  return .CS
+	case "EN":  return .EN
+	case "ES":  return .ES
+	case "ET":  return .ET
+	case "FSI": return .FSI
+	case "L":   return .L
+	case "LRE": return .LRE
+	case "LRI": return .LRI
+	case "LRO": return .LRO
+	case "NSM": return .NSM
+	case "ON":  return .ON
+	case "PDF": return .PDF
+	case "PDI": return .PDI
+	case "R":   return .R
+	case "RLE": return .RLE
+	case "RLI": return .RLI
+	case "RLO": return .RLO
+	case "S":   return .S
+	case "WS":  return .WS
+	case:       return .Unknown
+	}
+}
--- a/core/unicode/tools/ucd/types.odin
+++ b/core/unicode/tools/ucd/types.odin
@@ -0,0 +1,694 @@
+package ucd
+
+import "core:os"
+
+Age :: enum byte {
+	Age_Unknown = 0,
+	Age_1_1,
+	Age_2_0,
+	Age_2_1,
+	Age_3_0,
+	Age_3_1,
+	Age_3_2,
+	Age_4_0,
+	Age_4_1,
+	Age_5_0,
+	Age_5_1,
+	Age_5_2,
+	Age_6_0,
+	Age_6_1,
+	Age_6_2,
+	Age_6_3,
+	Age_7_0,
+	Age_8_0,
+	Age_9_0,
+	Age_10_0,
+	Age_11_0,
+	Age_12_0,
+	Age_12_1,
+	Age_13_0,
+	Age_14_0,
+	Age_15_0,
+	Age_15_1,
+	Age_16_0,
+	Age_17_0,
+	Age_Unassigned,
+}
+
+General_Category :: enum {
+	Cc, // Control, a C0 or C1 control code
+	Cf, // Format, a format control character
+	Cn, // Unassigned, a reserved unassigned code point or a noncharacter
+	Co, // Private_Use, a private-use character
+	Cs, // Surrogate, a surrogate code point
+	Ll, // Lowercase_Letter, a lowercase letter
+	Lm, // Modifier_Letter, a modifier letter
+	Lo, // Other_Letter, other letters, including syllables and ideographs
+	Lt, // Titlecase_Letter, a digraph encoded as a single character, with first part uppercase
+	Lu, // Uppercase_Letter, an uppercase letter
+	Mc, // Spacing_Mark, a spacing combining mark (positive advance width)
+	Me, // Enclosing_Mark, an enclosing combining mark
+	Mn, // Nonspacing_Mark, a nonspacing combining mark (zero advance width)
+	Nd, // Decimal_Number, a decimal digit
+	Nl, // Letter_Number, a letterlike numeric character
+	No, // Other_Number, a numeric character of other type
+	Pc, // Connector_Punctuation, a connecting punctuation mark, like a tie
+	Pd, // Dash_Punctuation, a dash or hyphen punctuation mark
+	Pe, // Close_Punctuation, a closing punctuation mark (of a pair)
+	Pf, // Final_Punctuation, a final quotation mark
+	Pi, // Initial_Punctuation, an initial quotation mark
+	Po, // Other_Punctuation, a punctuation mark of other type
+	Ps, // Open_Punctuation, an opening punctuation mark (of a pair)
+	Sc, // Currency_Symbol, a currency sign
+	Sk, // Modifier_Symbol, a non-letterlike modifier symbol
+	Sm, // Math_Symbol, a symbol of mathematical use
+	So, // Other_Symbol, a symbol of other type
+	Zl, // Line_Separator, U+2028 LINE SEPARATOR only
+	Zp, // Paragraph_Separator, U+2029 PARAGRAPH SEPARATOR only
+	Zs, // Space_Separator, a space character (of various non-zero widths)
+}
+
+Block :: enum {
+	Nil = 0,
+	Adlam,
+	Aegean_Numbers,
+	Ahom,
+	Alchemical,
+	Alphabetic_PF,
+	Anatolian_Hieroglyphs,
+	Ancient_Greek_Music,
+	Ancient_Greek_Numbers,
+	Ancient_Symbols,
+	Arabic,
+	Arabic_Ext_A,
+	Arabic_Ext_B,
+	Arabic_Ext_C,
+	Arabic_Math,
+	Arabic_PF_A,
+	Arabic_PF_B,
+	Arabic_Sup,
+	Armenian,
+	Arrows,
+	ASCII,
+	Avestan,
+	Balinese,
+	Bamum,
+	Bamum_Sup,
+	Bassa_Vah,
+	Batak,
+	Bengali,
+	Beria_Erfe,
+	Bhaiksuki,
+	Block_Elements,
+	Bopomofo,
+	Bopomofo_Ext,
+	Box_Drawing,
+	Brahmi,
+	Braille,
+	Buginese,
+	Buhid,
+	Byzantine_Music,
+	Carian,
+	Caucasian_Albanian,
+	Chakma,
+	Cham,
+	Cherokee,
+	Cherokee_Sup,
+	Chess_Symbols,
+	Chorasmian,
+	CJK,
+	CJK_Compat,
+	CJK_Compat_Forms,
+	CJK_Compat_Ideographs,
+	CJK_Compat_Ideographs_Sup,
+	CJK_Ext_A,
+	CJK_Ext_B,
+	CJK_Ext_C,
+	CJK_Ext_D,
+	CJK_Ext_E,
+	CJK_Ext_F,
+	CJK_Ext_G,
+	CJK_Ext_H,
+	CJK_Ext_I,
+	CJK_Ext_J,
+	CJK_Radicals_Sup,
+	CJK_Strokes,
+	CJK_Symbols,
+	Compat_Jamo,
+	Control_Pictures,
+	Coptic,
+	Coptic_Epact_Numbers,
+	Counting_Rod,
+	Cuneiform,
+	Cuneiform_Numbers,
+	Currency_Symbols,
+	Cypriot_Syllabary,
+	Cypro_Minoan,
+	Cyrillic,
+	Cyrillic_Ext_A,
+	Cyrillic_Ext_B,
+	Cyrillic_Ext_C,
+	Cyrillic_Ext_D,
+	Cyrillic_Sup,
+	Deseret,
+	Devanagari,
+	Devanagari_Ext,
+	Devanagari_Ext_A,
+	Diacriticals,
+	Diacriticals_Ext,
+	Diacriticals_For_Symbols,
+	Diacriticals_Sup,
+	Dingbats,
+	Dives_Akuru,
+	Dogra,
+	Domino,
+	Duployan,
+	Early_Dynastic_Cuneiform,
+	Egyptian_Hieroglyph_Format_Controls,
+	Egyptian_Hieroglyphs,
+	Egyptian_Hieroglyphs_Ext_A,
+	Elbasan,
+	Elymaic,
+	Emoticons,
+	Enclosed_Alphanum,
+	Enclosed_Alphanum_Sup,
+	Enclosed_CJK,
+	Enclosed_Ideographic_Sup,
+	Ethiopic,
+	Ethiopic_Ext,
+	Ethiopic_Ext_A,
+	Ethiopic_Ext_B,
+	Ethiopic_Sup,
+	Garay,
+	Geometric_Shapes,
+	Geometric_Shapes_Ext,
+	Georgian,
+	Georgian_Ext,
+	Georgian_Sup,
+	Glagolitic,
+	Glagolitic_Sup,
+	Gothic,
+	Grantha,
+	Greek,
+	Greek_Ext,
+	Gujarati,
+	Gunjala_Gondi,
+	Gurmukhi,
+	Gurung_Khema,
+	Half_And_Full_Forms,
+	Half_Marks,
+	Hangul,
+	Hanifi_Rohingya,
+	Hanunoo,
+	Hatran,
+	Hebrew,
+	High_PU_Surrogates,
+	High_Surrogates,
+	Hiragana,
+	IDC,
+	Ideographic_Symbols,
+	Imperial_Aramaic,
+	Indic_Number_Forms,
+	Indic_Siyaq_Numbers,
+	Inscriptional_Pahlavi,
+	Inscriptional_Parthian,
+	IPA_Ext,
+	Jamo,
+	Jamo_Ext_A,
+	Jamo_Ext_B,
+	Javanese,
+	Kaithi,
+	Kaktovik_Numerals,
+	Kana_Ext_A,
+	Kana_Ext_B,
+	Kana_Sup,
+	Kanbun,
+	Kangxi,
+	Kannada,
+	Katakana,
+	Katakana_Ext,
+	Kawi,
+	Kayah_Li,
+	Kharoshthi,
+	Khitan_Small_Script,
+	Khmer,
+	Khmer_Symbols,
+	Khojki,
+	Khudawadi,
+	Kirat_Rai,
+	Lao,
+	Latin_1_Sup,
+	Latin_Ext_A,
+	Latin_Ext_Additional,
+	Latin_Ext_B,
+	Latin_Ext_C,
+	Latin_Ext_D,
+	Latin_Ext_E,
+	Latin_Ext_F,
+	Latin_Ext_G,
+	Lepcha,
+	Letterlike_Symbols,
+	Limbu,
+	Linear_A,
+	Linear_B_Ideograms,
+	Linear_B_Syllabary,
+	Lisu,
+	Lisu_Sup,
+	Low_Surrogates,
+	Lycian,
+	Lydian,
+	Mahajani,
+	Mahjong,
+	Makasar,
+	Malayalam,
+	Mandaic,
+	Manichaean,
+	Marchen,
+	Masaram_Gondi,
+	Math_Alphanum,
+	Math_Operators,
+	Mayan_Numerals,
+	Medefaidrin,
+	Meetei_Mayek,
+	Meetei_Mayek_Ext,
+	Mende_Kikakui,
+	Meroitic_Cursive,
+	Meroitic_Hieroglyphs,
+	Miao,
+	Misc_Arrows,
+	Misc_Math_Symbols_A,
+	Misc_Math_Symbols_B,
+	Misc_Pictographs,
+	Misc_Symbols,
+	Misc_Symbols_Sup,
+	Misc_Technical,
+	Modi,
+	Modifier_Letters,
+	Modifier_Tone_Letters,
+	Mongolian,
+	Mongolian_Sup,
+	Mro,
+	Multani,
+	Music,
+	Myanmar,
+	Myanmar_Ext_A,
+	Myanmar_Ext_B,
+	Myanmar_Ext_C,
+	Nabataean,
+	Nag_Mundari,
+	Nandinagari,
+	NB,
+	New_Tai_Lue,
+	Newa,
+	NKo,
+	Number_Forms,
+	Nushu,
+	Nyiakeng_Puachue_Hmong,
+	OCR,
+	Ogham,
+	Ol_Chiki,
+	Ol_Onal,
+	Old_Hungarian,
+	Old_Italic,
+	Old_North_Arabian,
+	Old_Permic,
+	Old_Persian,
+	Old_Sogdian,
+	Old_South_Arabian,
+	Old_Turkic,
+	Old_Uyghur,
+	Oriya,
+	Ornamental_Dingbats,
+	Osage,
+	Osmanya,
+	Ottoman_Siyaq_Numbers,
+	Pahawh_Hmong,
+	Palmyrene,
+	Pau_Cin_Hau,
+	Phags_Pa,
+	Phaistos,
+	Phoenician,
+	Phonetic_Ext,
+	Phonetic_Ext_Sup,
+	Playing_Cards,
+	Psalter_Pahlavi,
+	PUA,
+	Punctuation,
+	Rejang,
+	Rumi,
+	Runic,
+	Samaritan,
+	Saurashtra,
+	Sharada,
+	Sharada_Sup,
+	Shavian,
+	Shorthand_Format_Controls,
+	Siddham,
+	Sidetic,
+	Sinhala,
+	Sinhala_Archaic_Numbers,
+	Small_Forms,
+	Small_Kana_Ext,
+	Sogdian,
+	Sora_Sompeng,
+	Soyombo,
+	Specials,
+	Sundanese,
+	Sundanese_Sup,
+	Sunuwar,
+	Sup_Arrows_A,
+	Sup_Arrows_B,
+	Sup_Arrows_C,
+	Sup_Math_Operators,
+	Sup_PUA_A,
+	Sup_PUA_B,
+	Sup_Punctuation,
+	Sup_Symbols_And_Pictographs,
+	Super_And_Sub,
+	Sutton_SignWriting,
+	Syloti_Nagri,
+	Symbols_And_Pictographs_Ext_A,
+	Symbols_For_Legacy_Computing,
+	Symbols_For_Legacy_Computing_Sup,
+	Syriac,
+	Syriac_Sup,
+	Tagalog,
+	Tagbanwa,
+	Tags,
+	Tai_Le,
+	Tai_Tham,
+	Tai_Viet,
+	Tai_Xuan_Jing,
+	Tai_Yo,
+	Takri,
+	Tamil,
+	Tamil_Sup,
+	Tangsa,
+	Tangut,
+	Tangut_Components,
+	Tangut_Components_Sup,
+	Tangut_Sup,
+	Telugu,
+	Thaana,
+	Thai,
+	Tibetan,
+	Tifinagh,
+	Tirhuta,
+	Todhri,
+	Tolong_Siki,
+	Toto,
+	Transport_And_Map,
+	Tulu_Tigalari,
+	UCAS,
+	UCAS_Ext,
+	UCAS_Ext_A,
+	Ugaritic,
+	Vai,
+	Vedic_Ext,
+	Vertical_Forms,
+	Vithkuqi,
+	VS,
+	VS_Sup,
+	Wancho,
+	Warang_Citi,
+	Yezidi,
+	Yi_Radicals,
+	Yi_Syllables,
+	Yijing,
+	Zanabazar_Square,
+	Znamenny_Music,
+}
+
+Combining_Class :: distinct byte
+
+Paired_Bracket_Type :: enum {
+	Unknown,
+	Open,
+	Close,
+	None,
+}
+
+Bidi_Class :: enum {
+	Unknown, //
+	L,       // Left-to-Right  LRM
+	R,       // Right-to-Left  RLM
+	AL,      // Right-to-Left Arabic ALM
+	EN,      // European Number
+	ES,      // European Number Separator
+	ET,      // European Number Terminator
+	AN,      // Arabic Number
+	CS,      // Common Number Separator
+	NSM,     // Nonspacing Mark
+	BN,      // Boundary Neutral
+	B,       // Paragraph Separator
+	S,       // Segment Separator
+	WS,      // Whitespace
+	ON,      // Other Neutrals
+	LRE,     // Left-to-Right Embedding  LRE
+	LRO,     // Left-to-Right Override   LRO
+	RLE,     // Right-to-Left Embedding  RLE
+	RLO,     // Right-to-Left Override   RLO
+	PDF,     // Pop Directional Format   PDF
+	LRI,     // Left-to-Right Isolate    LRI
+	RLI,     // Right-to-Left Isolate    RLI
+	FSI,     // First Strong Isolate     FSI
+	PDI,     // Pop Directional Isolate  PDI
+}
+
+Bidi :: struct {
+	bc:  Bidi_Class,
+	bmg: Maybe(rune), // mirrored glyph
+	m:   bool, // Bidi mirrored
+	c:   bool, // Bidi control property
+	pb:  Paired_Bracket_Type, // bidi paired bracket type
+	bpb: rune, // bidi paired bracket properties
+}
+
+// Tag describing the kind of decomposition a code point has.
+// `Nil` is explicitly the zero value and is a different member from `none`.
+// NOTE(review): the lower-case member names look like the short value aliases
+// the UCD uses for Decomposition_Type (e.g. `can`, `com`, `nb`) - confirm
+// against the data source before relying on the spelling.
+Decomposition_Type :: enum {
+	Nil = 0,
+	can,
+	com,
+	enc,
+	fin,
+	font,
+	fra,
+	init,
+	iso,
+	med,
+	nar,
+	nb,
+	sml,
+	sqr,
+	sub,
+	sup,
+	vert,
+	wid,
+	none,
+}
+
+// Three-valued flag. `False` is the zero value; `Maybe` is -1 and `True` is 1.
+// Used by the `nfc_quick_check` and `nfkc_quick_check` fields of `Decomposition`.
+Trinary_Bool :: enum {
+	Maybe = -1,
+	False =  0,
+	True  =  1,
+}
+
+// Growable list of runes; the type of the `dm` (decomposition mapping) fields
+// of `Decomposition`, `Char` and `Char_Range`.
+Decomposition_Mapping :: distinct [dynamic]rune 
+
+// Decomposition and normalization data for one code point.
+// The NFC/NFKC quick-check fields are three-valued (`Trinary_Bool`), while the
+// NFD/NFKD quick-check fields are plain booleans.
+Decomposition :: struct {
+	dt:               Decomposition_Type, // Decomposition type
+	dm:               Decomposition_Mapping, // Decomposition Mapping
+	ce:               bool, // Composition Exclusion
+	comp_ex:          bool, // Full Composition Exclusion
+	nfc_quick_check:  Trinary_Bool,
+	nfd_quick_check:  bool,
+	nfkc_quick_check: Trinary_Bool,
+	nfkd_quick_check: bool,
+}
+
+// Kind of numeric value a code point carries; `None` is the zero value.
+// The trailing comments appear to be the two-letter short codes for each
+// member - TODO confirm against the data source.
+Numeric_Type :: enum {
+	None     = 0, // None
+	Decimal,      // De
+	Digit,        // Di
+	Numeric,      // Nu
+}
+
+/*
+A numeric value stored as a `numerator` / `denominator` pair.
+
+Note: The value is NaN when both the numerator and the denominator are 0.
+*/
+Numberic_Value :: struct {
+	numerator:   int,
+	denominator: int,
+}
+
+// Record describing a single code point (`cp`).
+// `load_unicode_data` currently fills in only `cp`, `name`, `gc` and `nt`;
+// every other field keeps its zero value.
+// NOTE(review): `bm` is presumably the "Bidi mirrored" flag and `name1` the
+// Unicode 1 name - confirm once those fields are populated.
+Char :: struct {
+	cp:    rune,
+	name:  string,
+	gc:    General_Category,
+	ccc:   Combining_Class,
+	bc:    Bidi_Class,
+	dt:    Decomposition_Type,
+	dm:    Decomposition_Mapping,
+	nt:    Numeric_Type,
+	nv:    Numberic_Value,
+	bm:    bool,
+	name1: string,
+	sum:   string, // Simple uppercase mapping
+	slm:   string, // Simple lowercase mapping
+	stm:   string, // Simple titlecase_mapping
+}
+
+// Record describing a run of code points, from `first_cp` to `last_cp`, that
+// share one set of properties. It has the same property fields as `Char`.
+// `load_unicode_data` currently fills in only `first_cp`, `last_cp`, `name`,
+// `gc` and `nt`; every other field keeps its zero value.
+Char_Range :: struct {
+	first_cp: rune,
+	last_cp:  rune,
+	name:     string,
+	gc:       General_Category,
+	ccc:      Combining_Class,
+	bc:       Bidi_Class,
+	dt:       Decomposition_Type,
+	dm:       Decomposition_Mapping,
+	nt:       Numeric_Type,
+	nv:       Numberic_Value,
+	bm:       bool,
+	name1:    string,
+	sum:      string, // Simple uppercase mapping
+	slm:      string, // Simple lowercase mapping
+	stm:      string, // Simple titlecase_mapping
+}
+
+// One entry of `Unicode_Data`: either a single code point (`Char`) or a run of
+// code points (`Char_Range`).
+Chars :: union {
+	Char,
+	Char_Range,
+}
+
+// Growable list of `Chars` entries. It is built by `load_unicode_data` and
+// released with `destroy_unicode_data`.
+Unicode_Data :: distinct [dynamic]Chars
+
+
+// Binary properties that a code point range can be tagged with.
+// `Unknown` is the first member and therefore the zero value.
+// NOTE(review): `ucd.odin` indexes `Prop_List` with `PropList_Property` (no
+// underscore after "Prop") - confirm whether that name is meant to be this
+// enum or a separate declaration.
+Prop_List_Property :: enum {
+	Unknown,
+	White_Space,
+	Bidi_Control,
+	Join_Control,
+	Dash,	
+	Hyphen, 
+	Quotation_Mark,
+	Terminal_Punctuation,
+	Other_Math,
+	Hex_Digit,	
+	ASCII_Hex_Digit,
+	Other_Alphabetic,
+	Ideographic,
+	Diacritic,
+	Extender,
+	Other_Lowercase,
+	Other_Uppercase,
+	Noncharacter_Code_Point,
+	Other_Grapheme_Extend,
+	IDS_Binary_Operator,
+	IDS_Trinary_Operator,
+	IDS_Unary_Operator,
+	Radical,
+	Unified_Ideograph,
+	Other_Default_Ignorable_Code_Point,
+	Deprecated,
+	Soft_Dotted,
+	Logical_Order_Exception,
+	Other_ID_Start,
+	Other_ID_Continue,
+	ID_Compat_Math_Continue,
+	ID_Compat_Math_Start,
+	Sentence_Terminal,
+	Variation_Selector,
+	Pattern_White_Space,
+	Pattern_Syntax,
+	Prepended_Concatenation_Mark,
+	Regional_Indicator,
+	Modifier_Combining_Mark,
+}
+
+/*
+Errors reported while loading Unicode Character Database files.
+
+`None` deliberately occupies the value 0. `UCD_Error` is a variant of the
+`#shared_nil` union `Error`, where a variant's zero value is normalised to
+`nil`. Without a dedicated zero member, the first real error would compare
+equal to `nil` and be silently treated as success.
+*/
+UCD_Error :: enum {
+	None = 0,
+
+	XML_LOAD_ERROR,
+	XML_Not_UCD,
+	Nil_XML_Document,
+	Element_Not_Repertoire,
+	Extra_Fields,
+	Unknown_Property,
+	Unknown_Bidi_Class,
+
+	NO_REPERTOIRE,
+	UNEXPECTED_STRING,
+	Invalid_Hex_Number,
+	Invalid_General_Category,
+	UnicodeData_6_Too_Long,
+	UnicodeData_6_Invalid,
+	UnicodeData_7_Too_Long,
+	UnicodeData_7_Invalid,
+}
+
+
+// Error type returned by the loaders in this package: either a `UCD_Error`
+// or an `os.Error`.
+// Because the union is `#shared_nil`, the zero value of either variant is
+// treated as `nil` (no error), so a variant's zero member must never stand for
+// a real failure.
+Error :: union #shared_nil {
+	UCD_Error,
+	os.Error,
+}
+
+// A `first`..`last` pair of 16-bit values; the element type of
+// `Dynamic_Range.ranges_16`.
+Range_u16 :: struct {
+	first: u16,
+	last:  u16,
+}
+
+// A `first`..`last` pair of 32-bit values; the element type of
+// `Dynamic_Range.ranges_32`.
+Range_i32 :: struct {
+	first: i32,
+	last:  i32,
+}
+
+// A `first`..`last` pair of runes. This is the input form accepted by
+// `append_to_dynamic_range`, which narrows it to `u16` or `i32` storage.
+Range_Rune :: struct {
+	first: rune,
+	last:  rune,
+}
+
+// A set of code points split over four growable lists: single values and
+// first/last pairs, each in a 16-bit and a 32-bit flavour.
+// `append_to_dynamic_range` decides which list an entry goes into and
+// `destroy_dynamic_range` releases all four.
+Dynamic_Range :: struct {
+	single_16: [dynamic]u16,
+	ranges_16: [dynamic]Range_u16,
+	single_32: [dynamic]i32,
+	ranges_32: [dynamic]Range_i32,
+}
+
+/*
+Files `range` into the one list of `dr` that matches its shape.
+
+- `first == last` and `first <= 0xFFFF`     -> `single_16`
+- `first == last` otherwise                 -> `single_32`
+- `first <= 0xFFFF` and `last <= 0xFFFF`    -> `ranges_16`
+- anything else                             -> `ranges_32`
+
+A list that is still empty when it is needed is first created with `allocator`
+(initial capacity 512 for the single lists, 128 for the range lists).
+
+Inputs:
+- dr: The `Dynamic_Range` to extend
+- range: The first and last code point to record
+- allocator: Used when a list has to be created
+*/
+append_to_dynamic_range :: proc(dr: ^Dynamic_Range, range: Range_Rune, allocator := context.allocator) {
+	SINGLE_CAPACITY :: 512
+	RANGE_CAPACITY  :: 128
+
+	is_single  := range.first == range.last
+	fits_in_16 := range.first <= 0xFFFF && range.last <= 0xFFFF
+
+	switch {
+	case is_single && fits_in_16:
+		if len(dr.single_16) == 0 {
+			dr.single_16 = make([dynamic]u16, 0, SINGLE_CAPACITY, allocator)
+		}
+		append(&dr.single_16, u16(range.first))
+
+	case is_single:
+		if len(dr.single_32) == 0 {
+			dr.single_32 = make([dynamic]i32, 0, SINGLE_CAPACITY, allocator)
+		}
+		append(&dr.single_32, i32(range.first))
+
+	case fits_in_16:
+		if len(dr.ranges_16) == 0 {
+			dr.ranges_16 = make([dynamic]Range_u16, 0, RANGE_CAPACITY, allocator)
+		}
+		append(&dr.ranges_16, Range_u16{first = u16(range.first), last = u16(range.last)})
+
+	case:
+		if len(dr.ranges_32) == 0 {
+			dr.ranges_32 = make([dynamic]Range_i32, 0, RANGE_CAPACITY, allocator)
+		}
+		append(&dr.ranges_32, Range_i32{first = i32(range.first), last = i32(range.last)})
+	}
+}
+
+/*
+Releases the four lists held by a `Dynamic_Range`.
+
+Inputs:
+- dr: The `Dynamic_Range` whose lists are deleted
+*/
+destroy_dynamic_range :: proc(dr: Dynamic_Range) {
+	// The lists are independent of each other, so the order does not matter.
+	delete(dr.single_16)
+	delete(dr.single_32)
+	delete(dr.ranges_16)
+	delete(dr.ranges_32)
+}
+
+/*
+Releases every `Dynamic_Range` in a per-category table.
+
+Inputs:
+- gcr: The table, indexed by `General_Category`, to destroy
+*/
+destroy_general_category_ranges :: proc(gcr: [General_Category]Dynamic_Range) {
+	for category in General_Category {
+		destroy_dynamic_range(gcr[category])
+	}
+}
--- a/core/unicode/tools/ucd/ucd.odin
+++ b/core/unicode/tools/ucd/ucd.odin
@@ -0,0 +1,290 @@
+package ucd
+
+import "core:strings"
+import "core:os"
+import "core:strconv"
+
+/*
+Decodes a hexadecimal code point, or a `first..last` pair of them.
+
+Inputs:
+- str: Either `"XXXX"` or `"XXXX..YYYY"`, where both parts are hexadecimal
+
+Returns:
+- cp1: The first (or only) code point
+- cp2: The last code point; equal to `cp1` when nothing follows the `..`
+- err: `.Invalid_Hex_Number` when a part does not parse as hexadecimal or lies
+  outside `0..=0x10FFFF`; all three results are then `0, 0, err`
+*/
+decode_rune :: proc(str: string) -> (cp1, cp2: rune, err: Error) {
+	MAX_CODE_POINT :: 0x10FFFF
+
+	head, _, tail := strings.partition(str, "..")
+
+	// `parse_int` accepts a sign and arbitrarily large values. Reject anything
+	// that is not a code point before narrowing to `rune`, otherwise the cast
+	// would silently produce a bogus (possibly negative) value.
+	first, first_ok := strconv.parse_int(head, 16)
+	if !first_ok || first < 0 || first > MAX_CODE_POINT {
+		return 0, 0, .Invalid_Hex_Number
+	}
+	cp1 = rune(first)
+
+	// No second part: a single code point is reported as a one-element range.
+	if len(tail) == 0 {
+		return cp1, cp1, nil
+	}
+
+	last, last_ok := strconv.parse_int(tail, 16)
+	if !last_ok || last < 0 || last > MAX_CODE_POINT {
+		return 0, 0, .Invalid_Hex_Number
+	}
+	cp2 = rune(last)
+	return
+}
+
+/*
+Parses a semicolon-separated, `UnicodeData.txt`-style file into a list of
+`Char` and `Char_Range` entries.
+
+Only the code point(s), name, general category and numeric type are filled in.
+A `<..., First>` line records the start of a range and the matching
+`<..., Last>` line emits one `Char_Range` for the whole span.
+
+Inputs:
+- filename: Path of the file to read
+- allocator: Used for the file buffer, the result list and the cloned names
+
+Returns:
+- unicode_data: The parsed entries; release with `destroy_unicode_data`
+- err: An `os.Error` from reading the file, `.Invalid_Hex_Number` for a bad
+  code point, or `.Extra_Fields` when a line has more than 15 fields
+*/
+load_unicode_data :: proc(filename: string, allocator := context.allocator) -> (unicode_data: Unicode_Data, err: Error) {
+	// Read with the caller's allocator and release exactly this buffer on exit.
+	// Calling `free_all(context.temp_allocator)` here would also wipe temporary
+	// allocations that belong to the caller.
+	data := os.read_entire_file(filename, allocator) or_return
+	defer delete(data, allocator)
+
+	// Back the result list with `allocator` too, so it matches the cloned names
+	// instead of silently falling back to `context.allocator` on first append.
+	unicode_data = make(Unicode_Data, allocator)
+
+	// Start of a `<..., First>` / `<..., Last>` pair; carried across lines.
+	first_cp: rune
+
+	str := string(data)
+	line_loop: for _line in strings.split_lines_iterator(&str) {
+		// Ignore any comments
+		line, _, _ := strings.partition(_line, "#")
+
+		// Skip empty lines
+		if len(line) == 0 { continue }
+
+		is_range := false
+		cp:    rune
+		name:  string
+		gc:    General_Category
+		num_6: string
+		num_7: string
+		nt := Numeric_Type.None
+
+		field_num := 0
+		for _field in strings.split_iterator(&line, ";") {
+			defer field_num += 1
+			field := strings.trim_space(_field)
+
+			switch field_num {
+			case 0: // Code point
+				cp, _ = decode_rune(field) or_return
+
+			case 1: // Name
+				if len(field) > 9 && field[0] == '<' && strings.ends_with(field, ", First>") {
+					// Remember the start; the entry is emitted by the `Last` line.
+					first_cp = cp
+					continue line_loop
+				}
+
+				if len(field) > 9 && field[0] == '<' && strings.ends_with(field, ", Last>") {
+					// Strip the leading '<' and the 7-byte ", Last>" suffix.
+					name = strings.clone(field[1:len(field)-7], allocator)
+					is_range = true
+				} else {
+					name = strings.clone(field[:], allocator)
+				}
+
+			case 2: // General_Category
+				// NOTE: This is currently ignoring a possible error; it should probably be fixed
+				gc, _ = string_to_general_category(field)
+
+			case 3: // Canonical_Combining_Class
+			case 4: // Bidi Class
+			case 5: // Decomposition_Type and Decomposition_Mapping
+			// Numeric_Type and Numeric_Value
+			case 6:
+				num_6 = field
+
+			case 7:
+				num_7 = field
+
+			case 8:
+				// The numeric type follows from which of fields 6, 7 and 8 are set.
+				switch {
+				case num_6 != "" && num_7 != "" && field != "" :
+					nt = .Decimal
+
+				case num_6 == "" && num_7 != "" && field != "" :
+					nt = .Digit
+
+				case num_6 == "" && num_7 == "" && field != "" :
+					nt = .Numeric
+
+				case:
+					nt = .None
+				}
+
+			case 9:  // Bidi mirrored
+			case 10: // Unicode 1 Name (Obsolete as of 6.2.0)
+			case 11: // should be null
+			case 12:
+			case 13:
+			case 14:
+			case:
+				err = .Extra_Fields
+				return
+			}
+		}
+
+		if is_range {
+			append(&unicode_data, Char_Range {
+				gc       = gc,
+				first_cp = first_cp,
+				last_cp  = cp,
+				name     = name,
+				nt       = nt,
+			})
+		} else {
+			append(&unicode_data, Char{
+				gc   = gc,
+				cp   = cp,
+				name = name,
+				nt   = nt,
+			})
+		}
+	}
+	return
+}
+
+/*
+Releases a `Unicode_Data` list together with the name string of every entry.
+
+Inputs:
+- unicode_data: The list to destroy
+- allocator: The allocator the names were cloned with, i.e. the one that was
+  passed to `load_unicode_data` (default: `context.allocator`)
+*/
+destroy_unicode_data :: proc(unicode_data: Unicode_Data, allocator := context.allocator) {
+	for point in unicode_data {
+		// Strings do not remember their allocator, so it has to be passed explicitly.
+		switch p in point {
+		case Char:
+			delete(p.name, allocator)
+		case Char_Range:
+			delete(p.name, allocator)
+		}
+	}
+	delete(unicode_data)
+}
+
+
+/*
+Groups the entries of `ud` into code point ranges per general category.
+
+Consecutive `Char` entries are merged into one range for as long as each code
+point is exactly one past the previous one and the category stays the same.
+A `Char_Range` entry closes any open run and is then added as a range of its own.
+
+Inputs:
+- ud: The entries to group, visited in stored order
+- allocator: Passed on to `append_to_dynamic_range`
+
+Returns:
+- lst: One `Dynamic_Range` per `General_Category`
+*/
+gc_ranges :: proc(ud: ^Unicode_Data, allocator := context.allocator) -> (lst: [General_Category]Dynamic_Range) {
+	// `first == -1` marks "no run is currently open".
+	run := Range_Rune{first = -1, last = -1}
+	run_gc: General_Category
+
+	for entry in ud {
+		switch v in entry {
+		case Char:
+			if run.first != -1 {
+				extends_run := v.cp == run.last + 1 && v.gc == run_gc
+				if !extends_run {
+					append_to_dynamic_range(&lst[run_gc], run, allocator)
+					run = Range_Rune{first = -1, last = -1}
+				}
+			}
+
+			// u32(-1) is the largest u32, so an unopened run always takes `v.cp`.
+			run.first = rune(min(u32(run.first), u32(v.cp)))
+			run.last  = v.cp
+			run_gc    = v.gc
+
+		case Char_Range:
+			if run.first != -1 {
+				append_to_dynamic_range(&lst[run_gc], run, allocator)
+			}
+
+			whole := Range_Rune{first = v.first_cp, last = v.last_cp}
+			append_to_dynamic_range(&lst[v.gc], whole, allocator)
+			run = Range_Rune{first = -1, last = -1}
+		}
+	}
+
+	// Flush the run that was still open when the input ended.
+	if run.first != -1 {
+		append_to_dynamic_range(&lst[run_gc], run, allocator)
+	}
+
+	return
+}
+
+
+/*
+Collects the code points whose numeric type is `.Decimal` or `.Digit` but whose
+general category is not `.Nd`.
+
+Consecutive qualifying `Char` entries are merged into one range. A `Char_Range`
+entry closes any open run and, if it qualifies, is added as a range of its own.
+
+Inputs:
+- ud: The entries to scan, visited in stored order
+- allocator: Passed on to `append_to_dynamic_range`
+
+Returns:
+- The collected code points as a `Dynamic_Range`
+*/
+extra_digits :: proc(ud: ^Unicode_Data, allocator := context.allocator) -> (Dynamic_Range) {
+	// `first == -1` marks "no run is currently open".
+	run := Range_Rune{first = -1, last = -1}
+
+	result: Dynamic_Range
+	for entry in ud {
+		switch v in entry {
+
+		case Char:
+			qualifies := v.gc != .Nd && (v.nt == .Decimal || v.nt == .Digit)
+
+			if run.first != -1 {
+				extends_run := qualifies && v.cp == run.last + 1
+				if !extends_run {
+					append_to_dynamic_range(&result, run, allocator)
+					run = Range_Rune{first = -1, last = -1}
+				}
+			}
+
+			if qualifies {
+				// u32(-1) is the largest u32, so an unopened run always takes `v.cp`.
+				run.first = rune(min(u32(run.first), u32(v.cp)))
+				run.last  = v.cp
+			}
+
+		case Char_Range:
+			qualifies := v.gc != .Nd && (v.nt == .Decimal || v.nt == .Digit)
+
+			if run.first != -1 {
+				append_to_dynamic_range(&result, run, allocator)
+			}
+
+			if qualifies {
+				whole := Range_Rune{first = v.first_cp, last = v.last_cp}
+				append_to_dynamic_range(&result, whole, allocator)
+			}
+			run = Range_Rune{first = -1, last = -1}
+		}
+	}
+
+	// Flush the run that was still open when the input ended.
+	if run.first != -1 {
+		append_to_dynamic_range(&result, run, allocator)
+	}
+
+	return result
+}
+
+/*
+Data contained in the Unicode file PropList.txt
+
+A `Prop_List` is the data contained in the Unicode Database (UCD) file `PropList.txt`.
+It holds one `Dynamic_Range` per property.
+It is created with the procedure `load_property_list` and destroyed with the procedure `destroy_property_list`.
+*/
+Prop_List :: [PropList_Property]Dynamic_Range
+
+/*
+Releases every `Dynamic_Range` of a `Prop_List` created by `load_property_list`.
+
+Inputs:
+- props: The Prop_List to destroy
+*/
+destroy_property_list :: proc(props: Prop_List) {
+	for ranges in props {
+		// Frees ranges_16, ranges_32, single_16 and single_32.
+		destroy_dynamic_range(ranges)
+	}
+}
+
+
+
+/*
+Parses a `PropList.txt`-style file into one `Dynamic_Range` per property.
+
+Each non-empty line, after removing a trailing `#` comment, is expected to be
+`code point or range ; property`.
+
+Inputs:
+- filename: Path of the file to read
+- allocator: Used for the file buffer and for the range lists
+
+Returns:
+- props: The parsed ranges; release with `destroy_property_list`
+- err: An `os.Error` from reading the file, an error from `decode_rune` or
+  `string_to_proplist_property`, or `.Extra_Fields` for a line with more than
+  two fields
+*/
+load_property_list :: proc(filename: string, allocator := context.allocator) -> (props: Prop_List, err: Error) {
+	data := os.read_entire_file(filename, allocator) or_return
+	// A bare `delete(data)` would free through `context.allocator`, which is the
+	// wrong allocator whenever the caller passed a different one.
+	defer delete(data, allocator)
+
+	str := string(data)
+	for _line in strings.split_lines_iterator(&str) {
+		// Drop the trailing comment; comment-only and blank lines become empty.
+		line, _, _ := strings.partition(_line, "#")
+		if len(line) == 0 {
+			continue
+		}
+
+		rr:   Range_Rune
+		prop: PropList_Property
+
+		i := 0
+		for _field in strings.split_iterator(&line, ";") {
+			defer i += 1
+			field := strings.trim_space(_field)
+
+			switch i {
+			// Code point or code point range
+			case 0: rr.first, rr.last = decode_rune(field) or_return
+			case 1: prop = string_to_proplist_property(field) or_return
+			case:   return {}, .Extra_Fields
+			}
+		}
+
+		append_to_dynamic_range(&props[prop], rr, allocator)
+	}
+
+	return
+}
--- a/tests/core/assets/UCD/PropList.txt
+++ b/tests/core/assets/UCD/PropList.txt
--- a/tests/core/assets/UCD/UnicodeData.txt
+++ b/tests/core/assets/UCD/UnicodeData.txt
--- a/tests/core/assets/UCD/license.txt
+++ b/tests/core/assets/UCD/license.txt
@@ -0,0 +1,39 @@
+UNICODE LICENSE V3
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright © 1991-2026 Unicode, Inc.
+
+NOTICE TO USER: Carefully read the following legal agreement. BY
+DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
+SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
+DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of data files and any associated documentation (the "Data Files") or
+software and any associated documentation (the "Software") to deal in the
+Data Files or Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, and/or sell
+copies of the Data Files or Software, and to permit persons to whom the
+Data Files or Software are furnished to do so, provided that either (a)
+this copyright and permission notice appear with all copies of the Data
+Files or Software, or (b) this copyright and permission notice appear in
+associated Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+THIRD PARTY RIGHTS.
+
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
+BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
+OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
+FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall
+not be used in advertising or otherwise to promote the sale, use or other
+dealings in these Data Files or Software without prior written
+authorization of the copyright holder.