Files
Odin/core/unicode/tools/ucd/ucd.odin
2026-03-09 19:25:00 -04:00

291 lines
6.2 KiB
Odin

package ucd
import "core:strings"
import "core:os"
import "core:strconv"
/*
Decodes the code point field of a UCD data line.

The field is either a single hexadecimal code point (`0041`) or an inclusive
range written as `first..last` (`0041..005A`).

Inputs:
- str: The field text; the callers in this file pass it already trimmed

Returns:
- cp1: The first (or only) code point
- cp2: The last code point of a range, or `cp1` again when there is no `..last` part
- err: `.Invalid_Hex_Number` if a part is not a hexadecimal number or lies outside `0..=0x10FFFF`
*/
decode_rune :: proc(str: string) -> (cp1, cp2: rune, err: Error) {
	// Largest code point defined by Unicode. `strconv.parse_int` also accepts
	// signed values and values beyond the code point range, which the `rune`
	// conversion would silently wrap, so every parsed value is range checked.
	MAX_CODE_POINT :: 0x10FFFF

	head, _, tail := strings.partition(str, "..")

	first, first_ok := strconv.parse_int(head, 16)
	if !first_ok || first < 0 || first > MAX_CODE_POINT {
		return 0, 0, .Invalid_Hex_Number
	}
	cp1 = rune(first)

	// No `..last` part: a single code point is reported as a one element range
	if len(tail) == 0 {
		return cp1, cp1, nil
	}

	last, last_ok := strconv.parse_int(tail, 16)
	if !last_ok || last < 0 || last > MAX_CODE_POINT {
		return 0, 0, .Invalid_Hex_Number
	}
	cp2 = rune(last)
	return
}
/*
Loads a file in the `UnicodeData.txt` format: one entry per line, fields separated by `;`.

Everything after a `#` on a line is ignored. A line whose name field has the
form `<..., First>` only records its code point; the following `<..., Last>`
line then produces a single `Char_Range`. Every other line produces a `Char`.

Inputs:
- filename: Path of the file to read
- allocator: Used for the returned array and for the names stored in it

Returns:
- unicode_data: The loaded entries, in file order; `nil` when `err` is set
- err: An error from reading the file, from `decode_rune`, or `.Extra_Fields` when a line has more than 15 fields

NOTE: The file contents are read into `context.temp_allocator`, and
`free_all(context.temp_allocator)` runs when this procedure returns. That
releases every temporary allocation, including ones made by the caller.
*/
load_unicode_data :: proc(filename: string, allocator := context.allocator) -> (unicode_data: Unicode_Data, err: Error) {
	data := os.read_entire_file(filename, context.temp_allocator) or_return
	defer free_all(context.temp_allocator)

	// Bind the array to `allocator` up front; a zero-valued dynamic array
	// would otherwise pick up `context.allocator` on its first `append`.
	unicode_data = make(Unicode_Data, allocator)

	// Never hand back a half-built table: on any error release what was loaded so far.
	defer if err != nil {
		for entry in unicode_data {
			switch e in entry {
			case Char:       delete(e.name, allocator)
			case Char_Range: delete(e.name, allocator)
			}
		}
		delete(unicode_data)
		unicode_data = nil
	}

	// Code point of the most recent `<..., First>` line, consumed by the matching `<..., Last>` line
	first_cp: rune

	str := string(data)
	line_loop: for _line in strings.split_lines_iterator(&str) {
		// Ignore any comments
		line, _, _ := strings.partition(_line, "#")

		// Skip empty lines
		if len(line) == 0 { continue }

		is_range := false
		cp: rune
		name: string
		gc: General_Category

		// Fields 6 and 7 are kept until field 8 is seen, where the numeric type is decided
		num_6: string
		num_7: string
		nt := Numeric_Type.None

		field_num := 0
		for _field in strings.split_iterator(&line, ";") {
			defer field_num += 1
			field := strings.trim_space(_field)

			switch field_num {
			case 0: // Code point
				cp, _ = decode_rune(field) or_return

			case 1: // Name
				if len(field) > 9 && field[0] == '<' && strings.ends_with(field, ", First>") {
					// Start of a range; nothing is emitted until the `Last` line
					first_cp = cp
					continue line_loop
				}

				if len(field) > 9 && field[0] == '<' && strings.ends_with(field, ", Last>") {
					// Strip the leading `<` and the trailing `, Last>` (7 bytes)
					name = strings.clone(field[1:len(field)-7], allocator)
					is_range = true
				} else {
					name = strings.clone(field[:], allocator)
				}

			case 2: // General_Category
				// NOTE: This is currently ignoring a possible error; it should probably be fixed
				gc, _ = string_to_general_category(field)

			case 3, 4, 5:
				// Canonical_Combining_Class, Bidi Class,
				// Decomposition_Type and Decomposition_Mapping: not used

			// Numeric_Type and Numeric_Value
			case 6:
				num_6 = field

			case 7:
				num_7 = field

			case 8:
				// The numeric type follows from which of fields 6, 7 and 8 are filled in
				switch {
				case num_6 != "" && num_7 != "" && field != "":
					nt = .Decimal
				case num_6 == "" && num_7 != "" && field != "":
					nt = .Digit
				case num_6 == "" && num_7 == "" && field != "":
					nt = .Numeric
				case:
					nt = .None
				}

			case 9, 10, 11, 12, 13, 14:
				// Bidi mirrored, Unicode 1 Name (obsolete as of 6.2.0), a field
				// that should be null, and three further fields: not used

			case:
				// This line's name is not stored in `unicode_data` yet, so the
				// deferred clean-up above cannot see it; release it here.
				delete(name, allocator)
				err = .Extra_Fields
				return
			}
		}

		if is_range {
			append(&unicode_data, Char_Range{
				gc       = gc,
				first_cp = first_cp,
				last_cp  = cp,
				name     = name,
				nt       = nt,
			})
		} else {
			append(&unicode_data, Char{
				gc   = gc,
				cp   = cp,
				name = name,
				nt   = nt,
			})
		}
	}

	return
}
/*
Destroys a `Unicode_Data` created by `load_unicode_data`.

Inputs:
- unicode_data: The data to destroy
- allocator: The allocator that was passed to `load_unicode_data`; the entry
  names were cloned with it, so they have to be freed with it as well
*/
destroy_unicode_data :: proc(unicode_data: Unicode_Data, allocator := context.allocator) {
	for point in unicode_data {
		switch p in point {
		case Char:
			delete(p.name, allocator)
		case Char_Range:
			delete(p.name, allocator)
		}
	}

	delete(unicode_data)
}
/*
Groups the entries of `ud` into code point ranges, one `Dynamic_Range` per `General_Category`.

Consecutive `Char` entries with the same category and adjacent code points are
merged into one range. A `Char_Range` entry is always emitted as a range of its own.

Inputs:
- ud: The data to group
- allocator: Passed on to `append_to_dynamic_range`

Returns:
- lst: The collected ranges, indexed by general category
*/
gc_ranges :: proc(ud: ^Unicode_Data, allocator := context.allocator) -> (lst: [General_Category]Dynamic_Range) {
	// Marks "no run is currently open"
	UNSET :: -1

	run := Range_Rune{
		first = UNSET,
		last  = UNSET,
	}
	run_gc: General_Category

	for entry in ud {
		switch e in entry {
		case Char:
			if run.first != UNSET {
				extends_run := e.cp == run.last + 1 && e.gc == run_gc
				if !extends_run {
					// The open run ends here; store it and start over
					append_to_dynamic_range(&lst[run_gc], run, allocator)
					run.first = UNSET
					run.last  = UNSET
				}
			}

			// `u32(UNSET)` is the largest `u32`, so `min` picks `e.cp` for a fresh run
			run.first = rune(min(u32(run.first), u32(e.cp)))
			run.last  = e.cp
			run_gc    = e.gc

		case Char_Range:
			// Store whatever run is still open before the explicit range
			if run.first != UNSET {
				append_to_dynamic_range(&lst[run_gc], run, allocator)
			}

			append_to_dynamic_range(&lst[e.gc], Range_Rune{first = e.first_cp, last = e.last_cp}, allocator)

			run.first = UNSET
			run.last  = UNSET
		}
	}

	// Store the run left open by the final entries, if any
	if run.first != UNSET {
		append_to_dynamic_range(&lst[run_gc], run, allocator)
	}

	return
}
/*
Collects the code points whose numeric type is `.Decimal` or `.Digit` but whose
general category is not `.Nd`.

Adjacent qualifying `Char` entries are merged into one range. A qualifying
`Char_Range` entry is emitted as a range of its own.

Inputs:
- ud: The data to scan
- allocator: Passed on to `append_to_dynamic_range`

Returns:
- The collected ranges
*/
extra_digits :: proc(ud: ^Unicode_Data, allocator := context.allocator) -> (Dynamic_Range) {
	// Marks "no run is currently open"
	UNSET :: -1

	run := Range_Rune{
		first = UNSET,
		last  = UNSET,
	}
	extra: Dynamic_Range

	for entry in ud {
		switch e in entry {
		case Char:
			is_extra := (e.nt == .Decimal || e.nt == .Digit) && e.gc != .Nd

			// An open run only survives a qualifying, directly adjacent code point
			if run.first != UNSET && !(is_extra && e.cp == run.last + 1) {
				append_to_dynamic_range(&extra, run, allocator)
				run.first = UNSET
				run.last  = UNSET
			}

			if is_extra {
				// `u32(UNSET)` is the largest `u32`, so `min` picks `e.cp` for a fresh run
				run.first = rune(min(u32(run.first), u32(e.cp)))
				run.last  = e.cp
			}

		case Char_Range:
			is_extra := (e.nt == .Decimal || e.nt == .Digit) && e.gc != .Nd

			// Store whatever run is still open before looking at the explicit range
			if run.first != UNSET {
				append_to_dynamic_range(&extra, run, allocator)
			}

			if is_extra {
				append_to_dynamic_range(&extra, Range_Rune{first = e.first_cp, last = e.last_cp}, allocator)
			}

			run.first = UNSET
			run.last  = UNSET
		}
	}

	// Store the run left open by the final entries, if any
	if run.first != UNSET {
		append_to_dynamic_range(&extra, run, allocator)
	}

	return extra
}
/*
Data contained in the Unicode file PropList.txt

A `Prop_List` is the data contained in the Unicode Database (UCD) file `PropList.txt`.
It is created with the procedure `load_property_list` and destroyed with the procedure `destroy_property_list`.

It holds one `Dynamic_Range` for every `Prop_List_Property`.
*/
Prop_List :: [Prop_List_Property]Dynamic_Range
/*
Releases the storage held by a `Prop_List` that was created by `load_property_list`.

For every property the four arrays of its `Dynamic_Range` are deleted.

Inputs:
- props: The Prop_List to destroy
*/
destroy_property_list :: proc(props: Prop_List) {
	for property in Prop_List_Property {
		ranges := props[property]

		delete(ranges.ranges_16)
		delete(ranges.ranges_32)
		delete(ranges.single_16)
		delete(ranges.single_32)
	}
}
/*
Loads a file in the `PropList.txt` format into a `Prop_List`.

Every data line has the form `code point or range ; property`. Everything after
a `#` on a line is ignored, as are lines that are empty once the comment is removed.

Inputs:
- filename: Path of the file to read
- allocator: Used for the file contents while loading and passed on to `append_to_dynamic_range`

Returns:
- props: The loaded ranges, indexed by property; empty when `err` is set
- err: An error from reading the file, from `decode_rune`, from `string_to_proplist_property`, or `.Extra_Fields` when a line has more than two fields
*/
load_property_list :: proc(filename: string, allocator := context.allocator) -> (props: Prop_List, err: Error) {
	data := os.read_entire_file(filename, allocator) or_return
	// The buffer came from `allocator`, so it has to be handed back to `allocator`
	defer delete(data, allocator)

	// Never hand back a half-built list: on any error release what was loaded so far.
	defer if err != nil {
		destroy_property_list(props)
		props = {}
	}

	str := string(data)
	for _line in strings.split_lines_iterator(&str) {
		// Ignore any comments
		line, _, _ := strings.partition(_line, "#")
		if len(line) == 0 {
			continue
		}

		rr: Range_Rune
		prop: Prop_List_Property

		i := 0
		for _field in strings.split_iterator(&line, ";") {
			defer i += 1
			field := strings.trim_space(_field)

			switch i {
			case 0: // Code point or code point range
				rr.first, rr.last = decode_rune(field) or_return
			case 1: // Property name
				prop = string_to_proplist_property(field) or_return
			case:
				// Only `err` is set here: `props` must keep its contents so the
				// deferred clean-up above can release them.
				err = .Extra_Fields
				return
			}
		}

		append_to_dynamic_range(&props[prop], rr, allocator)
	}

	return
}