mirror of
https://github.com/odin-lang/Odin.git
synced 2026-04-19 13:00:28 +00:00
Added program to generate Unicode Table
This adds a program that will generate tables for use by the `core/unicode` package. The generated table file will be `core/unicode/generated.odin`. It may be better to incorporate this into `generate_entity_table.odin`. This can easily be accomplished if desired.
This commit is contained in:
committed by
Jeroen van Rijn
parent
c4f5f9e55a
commit
8f579d1f3b
325
core/unicode/tools/generate_unicode.odin
Normal file
325
core/unicode/tools/generate_unicode.odin
Normal file
@@ -0,0 +1,325 @@
|
||||
package main
|
||||
import "core:fmt"
|
||||
import path "core:path/filepath"
|
||||
import "core:os"
|
||||
import "core:strings"
|
||||
import "base:runtime"
|
||||
import "core:mem"
|
||||
import "core:io"
|
||||
import "core:log"
|
||||
import "ucd"
|
||||
|
||||
// Table 2-3. Types of Code Points
|
||||
// Table 4-4. General_Category Values page 229
|
||||
|
||||
// Reference https://www.unicode.org/reports/tr44/
|
||||
|
||||
|
||||
/*
Formats a ucd.Dynamic_Range as a set of fixed-length array declarations and
writes them to an io.Writer. The value of the parameter `name` is used as a
prefix for the array names (`<name>_singles16`, `<name>_ranges16`,
`<name>_singles32`, `<name>_ranges32`). If an array contained in `range` is
empty, no corresponding fixed-length array is written.

Single values are packed several to a line, wrapping once a line would grow
past 80 characters (the leading tab is counted as one character). Ranges are
written one `first, last` pair per line.

Inputs:
- writer: The io.Writer to be written to.
- name: Prefix to add to any array that is written to `writer`
- range: The ucd.Dynamic_Range to format and write to writer.

Returns:
- The sum of the byte counts reported by the individual write calls.
*/
write_range_arrays :: proc(
	writer: io.Writer,
	name: string,
	range : ucd.Dynamic_Range,
) -> int {
	n_written : int

	if len(range.single_16) > 0 {
		n_written += fmt.wprintln(writer, "@(rodata)")
		n_written += fmt.wprintf(writer, "%s_singles16 := [?]u16{{", name)
		// Start above the wrap limit so the first value opens a fresh line.
		line_length := 100
		for v in range.single_16 {
			str_buffer : [32]byte
			// `%04X` zero-pads to four digits. A plain width (`%4X`) pads with
			// spaces, which would emit e.g. `0x  B5` - not a valid literal.
			str := fmt.bprintf(str_buffer[:], " 0x%04X,",v)

			if line_length + len(str) > 80 {
				n_written += fmt.wprintf(writer, "\n")
				line_length = fmt.wprintf(writer, "\t0x%04X,",v)
				n_written += line_length
			} else {
				temp, _ := io.write_string(writer, str)
				line_length += temp
				n_written += temp
			}
		}
		n_written += fmt.wprintln(writer, "\n}\n")
	}

	if len(range.ranges_16) > 0 {
		n_written += fmt.wprintln(writer, "@(rodata)")
		n_written += fmt.wprintfln(writer, "%s_ranges16 := [?]u16{{", name)
		for v in range.ranges_16 {
			n_written += fmt.wprintfln(writer, "\t0x%04X, 0x%04X,", v.first, v.last)
		}
		n_written += fmt.wprintln(writer, "}\n")
	}

	if len(range.single_32) > 0 {
		n_written += fmt.wprintln(writer, "@(rodata)")
		n_written += fmt.wprintf(writer, "%s_singles32 := [?]i32{{", name)
		// Start above the wrap limit so the first value opens a fresh line.
		line_length := 100
		for v in range.single_32 {
			str_buffer : [32]byte
			// Zero-padded for the same reason as the 16-bit table; values wider
			// than four digits are printed in full.
			str := fmt.bprintf(str_buffer[:], " 0x%04X,",v)

			if line_length + len(str) > 80 {
				n_written += fmt.wprint(writer, "\n")
				line_length = fmt.wprintf(writer, "\t0x%04X,",v)
				n_written += line_length
			} else {
				temp, _ := io.write_string(writer, str)
				line_length += temp
				n_written += temp
			}
		}
		n_written += fmt.wprintln(writer, "\n}\n")
	}

	if len(range.ranges_32) > 0 {
		n_written += fmt.wprintln(writer, "@(rodata)")
		n_written += fmt.wprintfln(writer, "%s_ranges32 := [?]i32{{", name)
		for v in range.ranges_32 {
			n_written += fmt.wprintfln(writer, "\t0x%04X, 0x%04X,", v.first, v.last)
		}
		n_written += fmt.wprintln(writer, "}\n")
	}

	return n_written
}
|
||||
|
||||
/*
Writes the arrays for `range` (via `write_range_arrays`) followed by a
`<prefix>_ranges := Range{...}` declaration that slices each array that was
written.

The prefix is `name` with ASCII upper-case letters folded to lower case. When
`name` is a ucd.General_Category its `%s` formatting is used as the text.

Inputs:
- writer: The io.Writer to be written to.
- name: Either a string (at most 128 bytes, checked with `assert`) or a
  ucd.General_Category.
- range: The ucd.Dynamic_Range to format and write to writer.

Returns:
- n_written: The sum of the byte counts reported by the write calls.
*/
write_range :: proc(
	writer: io.Writer,
	name: union{string,
		ucd.General_Category},
	range: ucd.Dynamic_Range,
) -> (n_written: int) {
	scratch: [128]byte
	prefix: string

	// Copy the name into `scratch` so it can be lower-cased in place.
	switch value in name {
	case ucd.General_Category:
		prefix = fmt.bprintf(scratch[:], "%s", value)

	case string:
		assert(len(value) <= len(scratch))
		runtime.mem_copy(&scratch[0], raw_data(value), len(value))
		prefix = transmute(string) scratch[0:len(value)]
	}

	// `prefix` aliases the front of `scratch`, so editing the bytes edits the string.
	for i in 0..<len(prefix) {
		c := scratch[i]
		if 'A' <= c && c <= 'Z' {
			scratch[i] = c + ('a' - 'A')
		}
	}

	n_written += write_range_arrays(writer, prefix, range)

	has_single_16 := len(range.single_16) > 0
	has_ranges_16 := len(range.ranges_16) > 0
	has_single_32 := len(range.single_32) > 0
	has_ranges_32 := len(range.ranges_32) > 0

	n_written += fmt.wprintfln(writer, "%s_ranges := Range{{", prefix)
	if has_single_16 {
		n_written += fmt.wprintfln(writer, "\tsingle_16 = %s_singles16[:],", prefix)
	}
	if has_ranges_16 {
		n_written += fmt.wprintfln(writer, "\tranges_16 = %s_ranges16[:],", prefix)
	}
	if has_single_32 {
		n_written += fmt.wprintfln(writer, "\tsingle_32 = %s_singles32[:],", prefix)
	}
	if has_ranges_32 {
		n_written += fmt.wprintfln(writer, "\tranges_32 = %s_ranges32[:],", prefix)
	}
	n_written += fmt.wprintln(writer, "}\n")

	return
}
|
||||
|
||||
GENERATED :: `/*
|
||||
------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
|
||||
*/
|
||||
`
|
||||
|
||||
MESSAGE :: `/*
|
||||
This file is generated from UnicodeData.txt and PropList.txt. These files
|
||||
are part of the Unicode Database (UCD) and are covered by the license
|
||||
listed further down. They may be downloaded from the following locations;
|
||||
|
||||
https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
|
||||
https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
|
||||
https://www.unicode.org/license.txt
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
UNICODE LICENSE V3
|
||||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright © 1991-2026 Unicode, Inc.
|
||||
|
||||
NOTICE TO USER: Carefully read the following legal agreement. BY
|
||||
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
|
||||
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
||||
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
|
||||
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of data files and any associated documentation (the "Data Files") or
|
||||
software and any associated documentation (the "Software") to deal in the
|
||||
Data Files or Software without restriction, including without limitation
|
||||
the rights to use, copy, modify, merge, publish, distribute, and/or sell
|
||||
copies of the Data Files or Software, and to permit persons to whom the
|
||||
Data Files or Software are furnished to do so, provided that either (a)
|
||||
this copyright and permission notice appear with all copies of the Data
|
||||
Files or Software, or (b) this copyright and permission notice appear in
|
||||
associated Documentation.
|
||||
|
||||
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
||||
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
|
||||
THIRD PARTY RIGHTS.
|
||||
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
|
||||
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
|
||||
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
|
||||
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
|
||||
FILES OR SOFTWARE.
|
||||
|
||||
Except as contained in this notice, the name of a copyright holder shall
|
||||
not be used in advertising or otherwise to promote the sale, use or other
|
||||
dealings in these Data Files or Software without prior written
|
||||
authorization of the copyright holder.
|
||||
|
||||
*/
|
||||
`
|
||||
|
||||
/*
Entry point of the table generator.

Loads `UnicodeData.txt` and `PropList.txt` from `tests/core/assets/UCD` under
ODIN_ROOT, renders the selected tables into an in-memory string builder and
writes the result to `core/unicode/generated.odin` under ODIN_ROOT.

All allocations go through a tracking allocator; anything still allocated when
`main` returns is reported on stderr. Load and write failures are logged; a
load failure aborts before any output is produced.
*/
main :: proc() {
	track: mem.Tracking_Allocator

	mem.tracking_allocator_init(&track, context.allocator)
	defer {
		if len(track.allocation_map) > 0 {
			fmt.eprintf("=== %v allocations not freed: ===\n", len(track.allocation_map))
			for _, entry in track.allocation_map {
				fmt.eprintf("- %v bytes @ %v\n", entry.size, entry.location)
			}
		}
		mem.tracking_allocator_destroy(&track)
	}

	context.allocator = mem.tracking_allocator(&track)

	context.logger = log.create_console_logger()
	defer log.destroy_console_logger(context.logger)

	ucd_path, _ := path.join({ODIN_ROOT,
		"tests","core","assets","UCD","UnicodeData.txt"}, context.allocator)
	defer delete(ucd_path)

	unicode_data, ucd_err := ucd.load_unicode_data(ucd_path)
	// Registered before the error check so the data is still destroyed on the
	// failure path, exactly as it was when execution fell through.
	defer ucd.destroy_unicode_data(unicode_data)
	if ucd_err != nil {
		log.errorf("Error loading Unicode data. %s", ucd_err)
		// Do not generate tables from data that failed to load.
		return
	}

	general_category_ranges := ucd.gc_ranges(&unicode_data)
	defer ucd.destroy_general_category_ranges(general_category_ranges)

	extra_digits := ucd.extra_digits(&unicode_data)
	defer ucd.destroy_dynamic_range(extra_digits)

	proplist_path, _ := path.join({ODIN_ROOT,
		"tests","core","assets","UCD","PropList.txt"}, context.allocator)
	defer delete(proplist_path)
	proplist, proplist_err := ucd.load_protperty_list(proplist_path)
	if proplist_err != nil {
		log.errorf("Error loading PropList.txt. %s", proplist_err)
		return
	}
	defer ucd.destroy_protperty_list(proplist)

	// The whole file is assembled in memory and written out in one call.
	sb := strings.builder_make_len_cap(0, 1024*32)
	defer strings.builder_destroy(&sb)

	writer := strings.to_writer(&sb)

	fmt.wprintfln(writer, "package unicode\n")
	fmt.wprintln(writer, GENERATED)
	fmt.wprintln(writer, MESSAGE)

	Range_Type :: "Range :: struct {\n" +
		"\tsingle_16 : []u16,\n" +
		"\tranges_16 : []u16,\n" +
		"\tsingle_32 : []i32,\n" +
		"\tranges_32 : []i32,\n" +
		"}\n"

	fmt.wprintfln(writer, "%s", Range_Type)

	// List of the general categories to skip when generating the code for
	// core/unicode/generated.odin.
	to_exclude := [?]ucd.General_Category{
		.Cc, // Control, a C0 or C1 control code
		.Cf, // Format, a format control character
		.Cn, // Unassigned, a reserved unassigned code point or a noncharacter
		.Co, // Private_Use, a private-use character
		.Cs, // Surrogate, a surrogate code point
		// .Ll, // Lowercase_Letter, a lowercase letter
		// .Lm, // Modifier_Letter, a modifier letter
		// .Lo, // Other_Letter, other letters, including syllables and ideographs
		// .Lt, // Titlecase_Letter, a digraph encoded as a single character, with first part uppercase
		// .Lu, // Uppercase_Letter, an uppercase letter
		.Mc, // Spacing_Mark, a spacing combining mark (positive advance width)
		.Me, // Enclosing_Mark, an enclosing combining mark
		.Mn, // Nonspacing_Mark, a nonspacing combining mark (zero advance width)
		//.Nd, // Decimal_Number, a decimal digit
		//.Nl, // Letter_Number, a letterlike numeric character
		//.No, // Other_Number, a numeric character of other type
		.Pc, // Connector_Punctuation, a connecting punctuation mark, like a tie
		.Pd, // Dash_Punctuation, a dash or hyphen punctuation mark
		.Pe, // Close_Punctuation, a closing punctuation mark (of a pair)
		.Pf, // Final_Punctuation, a final quotation mark
		.Pi, // Initial_Punctuation, an initial quotation mark
		.Po, // Other_Punctuation, a punctuation mark of other type
		.Ps, // Open_Punctuation, an opening punctuation mark (of a pair)
		.Sc, // Currency_Symbol, a currency sign
		.Sk, // Modifier_Symbol, a non-letterlike modifier symbol
		.Sm, // Math_Symbol, a symbol of mathematical use
		.So, // Other_Symbol, a symbol of other type
		.Zl, // Line_Separator, U+2028 LINE SEPARATOR only
		.Zp, // Paragraph_Separator, U+2029 PARAGRAPH SEPARATOR only
		.Zs, // Space_Separator, a space character (of various non-zero widths)
	}

	// `i` is the category that indexes `general_category_ranges`; `gc` is its range data.
	write_loop : for gc, i in general_category_ranges {
		for excluded in to_exclude {
			if i == excluded do continue write_loop
		}
		write_range(writer, i, gc)
	}

	write_range(writer, "extra_digits", extra_digits )

	write_range(writer,"other_lowercase", proplist[.Other_Lowercase])
	write_range(writer,"other_uppercase", proplist[.Other_Uppercase])

	file_name, _ := path.join({ODIN_ROOT, "core", "unicode", "generated.odin"}, context.allocator)
	defer delete(file_name)

	str := strings.to_string(sb)

	write_error := os.write_entire_file_from_string(file_name, str)
	if write_error != nil {
		log.errorf("Error writting %s. %s", file_name, write_error)
	}
}
|
||||
|
||||
70
core/unicode/tools/ucd/iterator.odin
Normal file
70
core/unicode/tools/ucd/iterator.odin
Normal file
@@ -0,0 +1,70 @@
|
||||
package ucd
|
||||
|
||||
/*
State for walking a []byte one line at a time without allocating; every line
handed out is a sub-slice of `data`.

Lines are separated by '\n'. A '#' starts a comment that runs to the end of the
line and is not included in the returned line. A final line without a trailing
'\n' is still returned.
*/
Line_Iterator :: struct {
	index:        int,    // Offset in `data` at which the next line starts
	data:         []byte, // The bytes being iterated over
	line_counter: int,    // Number of lines handed out so far
}

/*
Returns the next line of `it.data`.

Inputs:
- it: The iterator state; `index` and `line_counter` are advanced.

Returns:
- line: The bytes of the line up to, but excluding, the first '#' or '\n'.
- line_number: The 1-based number of the returned line, 0 when exhausted.
- more: false once `it.index` has reached the end of the data.
*/
line_iterator :: proc(it: ^Line_Iterator) -> (line: []byte, line_number: int, more: bool) {
	if it.index >= len(it.data) {
		return
	}
	more = true
	it.line_counter += 1
	line_number = it.line_counter

	data   := it.data
	begin  := it.index
	cursor := begin

	// The payload ends at the newline or at the start of a comment.
	for cursor < len(data) {
		c := data[cursor]
		if c == '\n' || c == '#' {
			break
		}
		cursor += 1
	}
	line = data[begin:cursor]

	// Skip the comment text, if any; this is a no-op when already on '\n'.
	for cursor < len(data) && data[cursor] != '\n' {
		cursor += 1
	}

	// Step over the newline so the next call starts on the following line.
	if cursor < len(data) && data[cursor] == '\n' {
		cursor += 1
	}

	it.index = cursor
	return
}
|
||||
|
||||
/*
State for walking the ';'-separated fields of a single line without
allocating; every field handed out is a sub-slice of `line`.
*/
Field_Iterator :: struct {
	index:         int,    // Offset in `line` where scanning resumes
	field_counter: int,    // Number of fields handed out so far
	line:          []byte, // The line being split
}

/*
Returns the next ';'-separated field of `it.line` with leading and trailing
spaces removed. Scanning stops at a '#', which starts a comment.

An empty field (including one made up only of spaces) is returned as an empty
slice. A line ending in ';' yields a final empty field.

Inputs:
- it: The iterator state; `index` and `field_counter` are advanced.

Returns:
- field: The trimmed bytes of the field.
- field_count: The 0-based index of the returned field.
- valid: false once the end of the line, a '\n' or a '#' has been reached.
*/
field_iterator :: proc(it: ^Field_Iterator) -> (field: []byte, field_count: int, valid: bool) {
	valid = it.index < len(it.line) && it.line[it.index] != '\n' && it.line[it.index] != '#'
	if !valid do return

	// Step over the separator that ended the previous field. Checking the
	// field counter as well as the index matters when the very first field is
	// empty: the index is then still 0 and, without this, the same empty field
	// would be returned forever.
	if (it.index != 0 || it.field_counter != 0) && it.line[it.index] == ';' {
		it.index += 1
	}

	start := it.index
	for it.index < len(it.line) && it.line[it.index] != ';' && it.line[it.index] != '#' {
		it.index += 1
	}
	field = it.line[start:it.index]

	// Trim from both ends; a field consisting only of spaces becomes empty.
	for len(field) > 0 && field[0] == ' ' {
		field = field[1:]
	}
	for len(field) > 0 && field[len(field)-1] == ' ' {
		field = field[:len(field)-1]
	}

	field_count = it.field_counter
	it.field_counter += 1
	return
}
|
||||
396
core/unicode/tools/ucd/string_to.odin
Normal file
396
core/unicode/tools/ucd/string_to.odin
Normal file
@@ -0,0 +1,396 @@
|
||||
package ucd
|
||||
|
||||
/*
Maps a two-letter General_Category abbreviation (for example "Lu") to the
matching General_Category enumerant. The comparison is exact and
case-sensitive.

Inputs:
- str: The abbreviation to convert.

Returns:
- gc: The matching category. On failure this is the zero value, which is the
  first enumerant, so callers must check `err` rather than `gc`.
- err: UCD_Error.Invalid_General_Category when `str` is not recognised.
*/
string_to_general_category :: proc "contextless"(
	str: string,
) -> (gc: General_Category, err: Error) {
	switch str {
	// Letters
	case "Lu": gc = .Lu
	case "Ll": gc = .Ll
	case "Lt": gc = .Lt
	case "Lm": gc = .Lm
	case "Lo": gc = .Lo

	// Marks
	case "Mn": gc = .Mn
	case "Mc": gc = .Mc
	case "Me": gc = .Me

	// Numbers
	case "Nd": gc = .Nd
	case "Nl": gc = .Nl
	case "No": gc = .No

	// Punctuation
	case "Pc": gc = .Pc
	case "Pd": gc = .Pd
	case "Ps": gc = .Ps
	case "Pe": gc = .Pe
	case "Pi": gc = .Pi
	case "Pf": gc = .Pf
	case "Po": gc = .Po

	// Symbols
	case "Sm": gc = .Sm
	case "Sc": gc = .Sc
	case "Sk": gc = .Sk
	case "So": gc = .So

	// Separators
	case "Zs": gc = .Zs
	case "Zl": gc = .Zl
	case "Zp": gc = .Zp

	// Other
	case "Cc": gc = .Cc
	case "Cf": gc = .Cf
	case "Cs": gc = .Cs
	case "Co": gc = .Co
	case "Cn": gc = .Cn

	case:
		err = UCD_Error.Invalid_General_Category
	}
	return
}
|
||||
|
||||
|
||||
/*
Maps a version string ("1.1" through "17.0", or "unassigned") to the matching
Age enumerant. The comparison is exact.

Inputs:
- str: The version string to convert.

Returns:
- age: The matching Age.
- err: Never assigned by this procedure; it is always the zero value.

Any string not listed below reaches `unreachable()`, so callers must only pass
one of the listed strings.
*/
string_to_age :: proc "contextless" (
	str: string,
) -> (age: Age, err: Error) {
	switch str {
	case "1.1":  age = .Age_1_1

	case "2.0":  age = .Age_2_0
	case "2.1":  age = .Age_2_1

	case "3.0":  age = .Age_3_0
	case "3.1":  age = .Age_3_1
	case "3.2":  age = .Age_3_2

	case "4.0":  age = .Age_4_0
	case "4.1":  age = .Age_4_1

	case "5.0":  age = .Age_5_0
	case "5.1":  age = .Age_5_1
	case "5.2":  age = .Age_5_2

	case "6.0":  age = .Age_6_0
	case "6.1":  age = .Age_6_1
	case "6.2":  age = .Age_6_2
	case "6.3":  age = .Age_6_3

	case "7.0":  age = .Age_7_0
	case "8.0":  age = .Age_8_0
	case "9.0":  age = .Age_9_0
	case "10.0": age = .Age_10_0
	case "11.0": age = .Age_11_0

	case "12.0": age = .Age_12_0
	case "12.1": age = .Age_12_1

	case "13.0": age = .Age_13_0
	case "14.0": age = .Age_14_0

	case "15.0": age = .Age_15_0
	case "15.1": age = .Age_15_1

	case "16.0": age = .Age_16_0
	case "17.0": age = .Age_17_0

	case "unassigned":
		age = .Age_Unassigned

	case:
		// NOTE: Should this return an error instead?
		unreachable()
	}
	return
}
|
||||
|
||||
|
||||
/*
Maps a paired-bracket type code to the matching Paired_Brack_Type enumerant:
"o" is Open, "c" is Close and "n" is None. The comparison is exact.

Inputs:
- str: The one-letter code to convert.

Returns:
- The matching Paired_Brack_Type.

Any other string reaches `unreachable()`; there is no error return.
*/
string_to_paired_bracket_type :: proc "contextless"(str: string) -> Paired_Brack_Type {
	switch str {
	case "o": return .Open
	case "c": return .Close
	case "n": return .None
	case:
		// TODO: Add error for this
		unreachable()
	}
}
|
||||
|
||||
/*
Maps a Bidi_Class abbreviation (for example "AL" or "NSM") to the enumerant of
the same name. The comparison is exact and case-sensitive.

Inputs:
- str: The abbreviation to convert.

Returns:
- The matching Bidi_Class.

Any string not listed below reaches `unreachable()`; there is no error return.
*/
string_to_bidi_class :: proc "contextless"(str: string) -> Bidi_Class {
	switch str {
	case "AL":  return .AL
	case "AN":  return .AN
	case "B":   return .B
	case "BN":  return .BN
	case "CS":  return .CS
	case "EN":  return .EN
	case "ES":  return .ES
	case "ET":  return .ET
	case "FSI": return .FSI
	case "L":   return .L
	case "LRE": return .LRE
	case "LRI": return .LRI
	case "LRO": return .LRO
	case "NSM": return .NSM
	case "ON":  return .ON
	case "PDF": return .PDF
	case "PDI": return .PDI
	case "R":   return .R
	case "RLE": return .RLE
	case "RLI": return .RLI
	case "RLO": return .RLO
	case "S":   return .S
	case "WS":  return .WS
	case:
		// TODO: Add error for this
		unreachable()
	}
}
|
||||
|
||||
/*
Maps a property name (for example "White_Space") to the PropList_Property
enumerant of the same name. The comparison is exact and case-sensitive.

Inputs:
- str: The property name to convert.

Returns:
- prop: The matching property; left at its zero value on failure.
- err: .Unknown_Property when `str` is not one of the names listed below.
*/
string_to_proplist_property :: proc(str: string) -> (
	prop: PropList_Property,
	err: UCD_Error,
) {
	switch str {
	case "White_Space":                        prop = .White_Space
	case "Bidi_Control":                       prop = .Bidi_Control
	case "Join_Control":                       prop = .Join_Control
	case "Dash":                               prop = .Dash
	case "Hyphen":                             prop = .Hyphen
	case "Quotation_Mark":                     prop = .Quotation_Mark
	case "Terminal_Punctuation":               prop = .Terminal_Punctuation
	case "Other_Math":                         prop = .Other_Math
	case "Hex_Digit":                          prop = .Hex_Digit
	case "ASCII_Hex_Digit":                    prop = .ASCII_Hex_Digit
	case "Other_Alphabetic":                   prop = .Other_Alphabetic
	case "Ideographic":                        prop = .Ideographic
	case "Diacritic":                          prop = .Diacritic
	case "Extender":                           prop = .Extender
	case "Other_Lowercase":                    prop = .Other_Lowercase
	case "Other_Uppercase":                    prop = .Other_Uppercase
	case "Noncharacter_Code_Point":            prop = .Noncharacter_Code_Point
	case "Other_Grapheme_Extend":              prop = .Other_Grapheme_Extend
	case "IDS_Binary_Operator":                prop = .IDS_Binary_Operator
	case "IDS_Trinary_Operator":               prop = .IDS_Trinary_Operator
	case "IDS_Unary_Operator":                 prop = .IDS_Unary_Operator
	case "Radical":                            prop = .Radical
	case "Unified_Ideograph":                  prop = .Unified_Ideograph
	case "Other_Default_Ignorable_Code_Point": prop = .Other_Default_Ignorable_Code_Point
	case "Deprecated":                         prop = .Deprecated
	case "Soft_Dotted":                        prop = .Soft_Dotted
	case "Logical_Order_Exception":            prop = .Logical_Order_Exception
	case "Other_ID_Start":                     prop = .Other_ID_Start
	case "Other_ID_Continue":                  prop = .Other_ID_Continue
	case "ID_Compat_Math_Continue":            prop = .ID_Compat_Math_Continue
	case "ID_Compat_Math_Start":               prop = .ID_Compat_Math_Start
	case "Sentence_Terminal":                  prop = .Sentence_Terminal
	case "Variation_Selector":                 prop = .Variation_Selector
	case "Pattern_White_Space":                prop = .Pattern_White_Space
	case "Pattern_Syntax":                     prop = .Pattern_Syntax
	case "Prepended_Concatenation_Mark":       prop = .Prepended_Concatenation_Mark
	case "Regional_Indicator":                 prop = .Regional_Indicator
	case "Modifier_Combining_Mark":            prop = .Modifier_Combining_Mark
	case:
		err = .Unknown_Property
	}
	return
}
|
||||
702
core/unicode/tools/ucd/types.odin
Normal file
702
core/unicode/tools/ucd/types.odin
Normal file
@@ -0,0 +1,702 @@
|
||||
package ucd
|
||||
|
||||
import "core:os"
|
||||
|
||||
// Version identifiers, stored in a single byte. `Nil` is the zero value and
// `Age_Unassigned` is what the string "unassigned" parses to. Enumerants are
// listed oldest first and grouped by major version; the order defines their
// numeric values and must not change.
Age :: enum byte {
	Nil = 0,

	Age_1_1,
	Age_2_0, Age_2_1,
	Age_3_0, Age_3_1, Age_3_2,
	Age_4_0, Age_4_1,
	Age_5_0, Age_5_1, Age_5_2,
	Age_6_0, Age_6_1, Age_6_2, Age_6_3,
	Age_7_0,
	Age_8_0,
	Age_9_0,
	Age_10_0,
	Age_11_0,
	Age_12_0, Age_12_1,
	Age_13_0,
	Age_14_0,
	Age_15_0, Age_15_1,
	Age_16_0,
	Age_17_0,

	Age_Unassigned,
}
|
||||
|
||||
// General_Category values, named by their two-letter abbreviation and listed
// in alphabetical order. The first enumerant, `Cc`, is the zero value. The
// order defines the numeric values and must not change.
General_Category :: enum {
	// Other
	Cc, // Control, a C0 or C1 control code
	Cf, // Format, a format control character
	Cn, // Unassigned, a reserved unassigned code point or a noncharacter
	Co, // Private_Use, a private-use character
	Cs, // Surrogate, a surrogate code point

	// Letters
	Ll, // Lowercase_Letter, a lowercase letter
	Lm, // Modifier_Letter, a modifier letter
	Lo, // Other_Letter, other letters, including syllables and ideographs
	Lt, // Titlecase_Letter, a digraph encoded as a single character, with first part uppercase
	Lu, // Uppercase_Letter, an uppercase letter

	// Marks
	Mc, // Spacing_Mark, a spacing combining mark (positive advance width)
	Me, // Enclosing_Mark, an enclosing combining mark
	Mn, // Nonspacing_Mark, a nonspacing combining mark (zero advance width)

	// Numbers
	Nd, // Decimal_Number, a decimal digit
	Nl, // Letter_Number, a letterlike numeric character
	No, // Other_Number, a numeric character of other type

	// Punctuation
	Pc, // Connector_Punctuation, a connecting punctuation mark, like a tie
	Pd, // Dash_Punctuation, a dash or hyphen punctuation mark
	Pe, // Close_Punctuation, a closing punctuation mark (of a pair)
	Pf, // Final_Punctuation, a final quotation mark
	Pi, // Initial_Punctuation, an initial quotation mark
	Po, // Other_Punctuation, a punctuation mark of other type
	Ps, // Open_Punctuation, an opening punctuation mark (of a pair)

	// Symbols
	Sc, // Currency_Symbol, a currency sign
	Sk, // Modifier_Symbol, a non-letterlike modifier symbol
	Sm, // Math_Symbol, a symbol of mathematical use
	So, // Other_Symbol, a symbol of other type

	// Separators
	Zl, // Line_Separator, U+2028 LINE SEPARATOR only
	Zp, // Paragraph_Separator, U+2029 PARAGRAPH SEPARATOR only
	Zs, // Space_Separator, a space character (of various non-zero widths)
}
|
||||
|
||||
// Block identifiers. `Nil` is the zero value. The order of the enumerants
// defines their numeric values and must not change.
// NOTE(review): the names look like the Unicode block short aliases
// (e.g. `NB`, `PUA`, `Latin_1_Sup`) - confirm against the code that parses them.
Block :: enum {
	Nil = 0,
	Adlam,
	Aegean_Numbers,
	Ahom,
	Alchemical,
	Alphabetic_PF,
	Anatolian_Hieroglyphs,
	Ancient_Greek_Music,
	Ancient_Greek_Numbers,
	Ancient_Symbols,
	Arabic,
	Arabic_Ext_A,
	Arabic_Ext_B,
	Arabic_Ext_C,
	Arabic_Math,
	Arabic_PF_A,
	Arabic_PF_B,
	Arabic_Sup,
	Armenian,
	Arrows,
	ASCII,
	Avestan,
	Balinese,
	Bamum,
	Bamum_Sup,
	Bassa_Vah,
	Batak,
	Bengali,
	Beria_Erfe,
	Bhaiksuki,
	Block_Elements,
	Bopomofo,
	Bopomofo_Ext,
	Box_Drawing,
	Brahmi,
	Braille,
	Buginese,
	Buhid,
	Byzantine_Music,
	Carian,
	Caucasian_Albanian,
	Chakma,
	Cham,
	Cherokee,
	Cherokee_Sup,
	Chess_Symbols,
	Chorasmian,
	CJK,
	CJK_Compat,
	CJK_Compat_Forms,
	CJK_Compat_Ideographs,
	CJK_Compat_Ideographs_Sup,
	CJK_Ext_A,
	CJK_Ext_B,
	CJK_Ext_C,
	CJK_Ext_D,
	CJK_Ext_E,
	CJK_Ext_F,
	CJK_Ext_G,
	CJK_Ext_H,
	CJK_Ext_I,
	CJK_Ext_J,
	CJK_Radicals_Sup,
	CJK_Strokes,
	CJK_Symbols,
	Compat_Jamo,
	Control_Pictures,
	Coptic,
	Coptic_Epact_Numbers,
	Counting_Rod,
	Cuneiform,
	Cuneiform_Numbers,
	Currency_Symbols,
	Cypriot_Syllabary,
	Cypro_Minoan,
	Cyrillic,
	Cyrillic_Ext_A,
	Cyrillic_Ext_B,
	Cyrillic_Ext_C,
	Cyrillic_Ext_D,
	Cyrillic_Sup,
	Deseret,
	Devanagari,
	Devanagari_Ext,
	Devanagari_Ext_A,
	Diacriticals,
	Diacriticals_Ext,
	Diacriticals_For_Symbols,
	Diacriticals_Sup,
	Dingbats,
	Dives_Akuru,
	Dogra,
	Domino,
	Duployan,
	Early_Dynastic_Cuneiform,
	Egyptian_Hieroglyph_Format_Controls,
	Egyptian_Hieroglyphs,
	Egyptian_Hieroglyphs_Ext_A,
	Elbasan,
	Elymaic,
	Emoticons,
	Enclosed_Alphanum,
	Enclosed_Alphanum_Sup,
	Enclosed_CJK,
	Enclosed_Ideographic_Sup,
	Ethiopic,
	Ethiopic_Ext,
	Ethiopic_Ext_A,
	Ethiopic_Ext_B,
	Ethiopic_Sup,
	Garay,
	Geometric_Shapes,
	Geometric_Shapes_Ext,
	Georgian,
	Georgian_Ext,
	Georgian_Sup,
	Glagolitic,
	Glagolitic_Sup,
	Gothic,
	Grantha,
	Greek,
	Greek_Ext,
	Gujarati,
	Gunjala_Gondi,
	Gurmukhi,
	Gurung_Khema,
	Half_And_Full_Forms,
	Half_Marks,
	Hangul,
	Hanifi_Rohingya,
	Hanunoo,
	Hatran,
	Hebrew,
	High_PU_Surrogates,
	High_Surrogates,
	Hiragana,
	IDC,
	Ideographic_Symbols,
	Imperial_Aramaic,
	Indic_Number_Forms,
	Indic_Siyaq_Numbers,
	Inscriptional_Pahlavi,
	Inscriptional_Parthian,
	IPA_Ext,
	Jamo,
	Jamo_Ext_A,
	Jamo_Ext_B,
	Javanese,
	Kaithi,
	Kaktovik_Numerals,
	Kana_Ext_A,
	Kana_Ext_B,
	Kana_Sup,
	Kanbun,
	Kangxi,
	Kannada,
	Katakana,
	Katakana_Ext,
	Kawi,
	Kayah_Li,
	Kharoshthi,
	Khitan_Small_Script,
	Khmer,
	Khmer_Symbols,
	Khojki,
	Khudawadi,
	Kirat_Rai,
	Lao,
	Latin_1_Sup,
	Latin_Ext_A,
	Latin_Ext_Additional,
	Latin_Ext_B,
	Latin_Ext_C,
	Latin_Ext_D,
	Latin_Ext_E,
	Latin_Ext_F,
	Latin_Ext_G,
	Lepcha,
	Letterlike_Symbols,
	Limbu,
	Linear_A,
	Linear_B_Ideograms,
	Linear_B_Syllabary,
	Lisu,
	Lisu_Sup,
	Low_Surrogates,
	Lycian,
	Lydian,
	Mahajani,
	Mahjong,
	Makasar,
	Malayalam,
	Mandaic,
	Manichaean,
	Marchen,
	Masaram_Gondi,
	Math_Alphanum,
	Math_Operators,
	Mayan_Numerals,
	Medefaidrin,
	Meetei_Mayek,
	Meetei_Mayek_Ext,
	Mende_Kikakui,
	Meroitic_Cursive,
	Meroitic_Hieroglyphs,
	Miao,
	Misc_Arrows,
	Misc_Math_Symbols_A,
	Misc_Math_Symbols_B,
	Misc_Pictographs,
	Misc_Symbols,
	Misc_Symbols_Sup,
	Misc_Technical,
	Modi,
	Modifier_Letters,
	Modifier_Tone_Letters,
	Mongolian,
	Mongolian_Sup,
	Mro,
	Multani,
	Music,
	Myanmar,
	Myanmar_Ext_A,
	Myanmar_Ext_B,
	Myanmar_Ext_C,
	Nabataean,
	Nag_Mundari,
	Nandinagari,
	NB,
	New_Tai_Lue,
	Newa,
	NKo,
	Number_Forms,
	Nushu,
	Nyiakeng_Puachue_Hmong,
	OCR,
	Ogham,
	Ol_Chiki,
	Ol_Onal,
	Old_Hungarian,
	Old_Italic,
	Old_North_Arabian,
	Old_Permic,
	Old_Persian,
	Old_Sogdian,
	Old_South_Arabian,
	Old_Turkic,
	Old_Uyghur,
	Oriya,
	Ornamental_Dingbats,
	Osage,
	Osmanya,
	Ottoman_Siyaq_Numbers,
	Pahawh_Hmong,
	Palmyrene,
	Pau_Cin_Hau,
	Phags_Pa,
	Phaistos,
	Phoenician,
	Phonetic_Ext,
	Phonetic_Ext_Sup,
	Playing_Cards,
	Psalter_Pahlavi,
	PUA,
	Punctuation,
	Rejang,
	Rumi,
	Runic,
	Samaritan,
	Saurashtra,
	Sharada,
	Sharada_Sup,
	Shavian,
	Shorthand_Format_Controls,
	Siddham,
	Sidetic,
	Sinhala,
	Sinhala_Archaic_Numbers,
	Small_Forms,
	Small_Kana_Ext,
	Sogdian,
	Sora_Sompeng,
	Soyombo,
	Specials,
	Sundanese,
	Sundanese_Sup,
	Sunuwar,
	Sup_Arrows_A,
	Sup_Arrows_B,
	Sup_Arrows_C,
	Sup_Math_Operators,
	Sup_PUA_A,
	Sup_PUA_B,
	Sup_Punctuation,
	Sup_Symbols_And_Pictographs,
	Super_And_Sub,
	Sutton_SignWriting,
	Syloti_Nagri,
	Symbols_And_Pictographs_Ext_A,
	Symbols_For_Legacy_Computing,
	Symbols_For_Legacy_Computing_Sup,
	Syriac,
	Syriac_Sup,
	Tagalog,
	Tagbanwa,
	Tags,
	Tai_Le,
	Tai_Tham,
	Tai_Viet,
	Tai_Xuan_Jing,
	Tai_Yo,
	Takri,
	Tamil,
	Tamil_Sup,
	Tangsa,
	Tangut,
	Tangut_Components,
	Tangut_Components_Sup,
	Tangut_Sup,
	Telugu,
	Thaana,
	Thai,
	Tibetan,
	Tifinagh,
	Tirhuta,
	Todhri,
	Tolong_Siki,
	Toto,
	Transport_And_Map,
	Tulu_Tigalari,
	UCAS,
	UCAS_Ext,
	UCAS_Ext_A,
	Ugaritic,
	Vai,
	Vedic_Ext,
	Vertical_Forms,
	Vithkuqi,
	VS,
	VS_Sup,
	Wancho,
	Warang_Citi,
	Yezidi,
	Yi_Radicals,
	Yi_Syllables,
	Yijing,
	Zanabazar_Square,
	Znamenny_Music,
}
|
||||
|
||||
// Combining class value stored per code point (the `ccc` field of `Char` and
// `Char_Range`). One byte wide.
Combining_Class :: distinct byte
|
||||
|
||||
// Bidi paired bracket type; this is the type of the `bpt` field of `Bidi`.
Paired_Brack_Type :: enum {
	Nil,   // Zero value of the enum
	Open,
	Close,
	None,  // NOTE(review): presumably the explicit "not a bracket" value, as opposed to the unset `Nil` - confirm
}
|
||||
|
||||
// Bidirectional class of a code point. Used for the `bc` field of `Bidi`,
// `Char` and `Char_Range`. `Nil` is the zero value.
Bidi_Class :: enum {
	Nil, //
	L,   // Left-to-Right              LRM
	R,   // Right-to-Left              RLM
	AL,  // Right-to-Left Arabic       ALM
	EN,  // European Number
	ES,  // European Number Separator
	ET,  // European Number Terminator
	AN,  // Arabic Number
	CS,  // Common Number Separator
	NSM, // Nonspacing Mark
	BN,  // Boundary Neutral
	B,   // Paragraph Separator
	S,   // Segment Separator
	WS,  // Whitespace
	ON,  // Other Neutrals
	LRE, // Left-to-Right Embedding    LRE
	LRO, // Left-to-Right Override     LRO
	RLE, // Right-to-Left Embedding    RLE
	RLO, // Right-to-Left Override     RLO
	PDF, // Pop Directional Format     PDF
	LRI, // Left-to-Right Isolate      LRI
	RLI, // Right-to-Left Isolate      RLI
	FSI, // First Strong Isolate       FSI
	PDI, // Pop Directional Isolate    PDI
}
|
||||
|
||||
|
||||
// Bidirectional properties of a single code point.
Bidi :: struct {
	bc:  Bidi_Class,        // Bidi class
	bmg: Maybe(rune),       // Mirrored glyph, if there is one
	m:   bool,              // Bidi mirrored
	c:   bool,              // Bidi control property
	bpt: Paired_Brack_Type, // Bidi paired bracket type
	bpb: rune,              // Bidi paired bracket properties
}
|
||||
|
||||
|
||||
// Decomposition type of a code point (the `dt` field of `Decomposition`,
// `Char` and `Char_Range`).
//
// NOTE(review): the member names look like the short `dt` attribute values
// used by the Unicode Character Database - confirm against the UCD documentation.
Decomposition_Type :: enum {
	Nil = 0, // Zero value
	can,
	com,
	enc,
	fin,
	font,
	fra,
	init,
	iso,
	med,
	nar,
	nb,
	sml,
	sqr,
	sub,
	sup,
	vert,
	wid,
	none,
}
|
||||
|
||||
// Three-state boolean. Used by `Decomposition` for the NFC and NFKC quick
// check fields, which need a "maybe" answer in addition to yes/no.
Trinary_Bool :: enum {
	Maybe = -1,
	False =  0, // Zero value
	True  =  1,
}
|
||||
|
||||
// Sequence of runes a code point decomposes to (the `dm` field of
// `Decomposition`, `Char` and `Char_Range`). Being a dynamic array, it owns
// its own storage.
Decomposition_Mapping :: distinct [dynamic]rune
|
||||
|
||||
// Decomposition and normalization properties of a single code point.
Decomposition :: struct {
	dt:      Decomposition_Type,    // Decomposition type
	dm:      Decomposition_Mapping, // Decomposition mapping
	ce:      bool,                  // Composition exclusion
	comp_ex: bool,                  // Full composition exclusion

	// Quick check results. The NFC/NFKC checks are three-state, the
	// NFD/NFKD checks are plain booleans.
	nfc_quick_check:  Trinary_Bool,
	nfd_quick_check:  bool,
	nfkc_quick_check: Trinary_Bool,
	nfkd_quick_check: bool,
}
|
||||
|
||||
// Numeric type of a code point (the `nt` field of `Char` and `Char_Range`).
// The trailing comments give the two-letter short form of each value.
Numeric_Type :: enum {
	None = 0, // None (zero value)
	Decimal,  // De
	Digit,    // Di
	Numeric,  // Nu
}
|
||||
|
||||
/*
Numeric value of a code point, stored as a fraction.

Note: The value is NaN when both the numerator and the denominator are 0.
*/
Numberic_Value :: struct {
	numerator:   int,
	denominator: int,
}
|
||||
|
||||
// Properties of one individual code point. One of the two variants of `Chars`.
Char :: struct {
	cp:    rune,                  // The code point itself
	name:  string,                // Character name
	gc:    General_Category,      // General category
	ccc:   Combining_Class,       // Combining class
	bc:    Bidi_Class,            // Bidi class
	dt:    Decomposition_Type,    // Decomposition type
	dm:    Decomposition_Mapping, // Decomposition mapping
	nt:    Numeric_Type,          // Numeric type
	nv:    Numberic_Value,        // Numeric value
	bm:    bool,                  // NOTE(review): presumably "Bidi mirrored" - confirm
	name1: string,                // NOTE(review): presumably the Unicode 1 name - confirm
	sum:   string,                // Simple uppercase mapping
	slm:   string,                // Simple lowercase mapping
	stm:   string,                // Simple titlecase mapping
}
|
||||
|
||||
// Properties shared by every code point in the inclusive range
// `first_cp ..= last_cp`. One of the two variants of `Chars`.
Char_Range :: struct {
	first_cp: rune,                  // First code point of the range
	last_cp:  rune,                  // Last code point of the range (inclusive)
	name:     string,                // Name of the range
	gc:       General_Category,      // General category
	ccc:      Combining_Class,       // Combining class
	bc:       Bidi_Class,            // Bidi class
	dt:       Decomposition_Type,    // Decomposition type
	dm:       Decomposition_Mapping, // Decomposition mapping
	nt:       Numeric_Type,          // Numeric type
	nv:       Numberic_Value,        // Numeric value
	bm:       bool,                  // NOTE(review): presumably "Bidi mirrored" - confirm
	name1:    string,                // NOTE(review): presumably the Unicode 1 name - confirm
	sum:      string,                // Simple uppercase mapping
	slm:      string,                // Simple lowercase mapping
	stm:      string,                // Simple titlecase mapping
}
|
||||
|
||||
// One entry of `Unicode_Data`: either a single code point (`Char`) or a
// contiguous run of code points sharing the same properties (`Char_Range`).
Chars :: union {
	Char,
	Char_Range,
}
|
||||
|
||||
// List of `Chars` entries (single code points and code point ranges), as
// produced by `load_unicode_data` and released with `destroy_unicode_data`.
Unicode_Data :: distinct [dynamic]Chars
|
||||
|
||||
|
||||
// Properties that can be looked up in a `PropList`; the enum is used as the
// index of the `[PropList_Property]Dynamic_Range` enumerated array.
// `White_Space` is the zero value.
PropList_Property :: enum {
	White_Space,
	Bidi_Control,
	Join_Control,
	Dash,
	Hyphen,
	Quotation_Mark,
	Terminal_Punctuation,
	Other_Math,
	Hex_Digit,
	ASCII_Hex_Digit,
	Other_Alphabetic,
	Ideographic,
	Diacritic,
	Extender,
	Other_Lowercase,
	Other_Uppercase,
	Noncharacter_Code_Point,
	Other_Grapheme_Extend,
	IDS_Binary_Operator,
	IDS_Trinary_Operator,
	IDS_Unary_Operator,
	Radical,
	Unified_Ideograph,
	Other_Default_Ignorable_Code_Point,
	Deprecated,
	Soft_Dotted,
	Logical_Order_Exception,
	Other_ID_Start,
	Other_ID_Continue,
	ID_Compat_Math_Continue,
	ID_Compat_Math_Start,
	Sentence_Terminal,
	Variation_Selector,
	Pattern_White_Space,
	Pattern_Syntax,
	Prepended_Concatenation_Mark,
	Regional_Indicator,
	Modifier_Combining_Mark,
}
|
||||
|
||||
/*
Errors specific to loading Unicode Character Database files.

`UCD_Error` is a variant of the `#shared_nil` union `Error`. In such a union
the zero value of every variant is normalised to `nil`, so the zero value of
this enum must mean "no error". `None` is reserved for that purpose; without
it the first real error (`XML_LOAD_ERROR`) would silently compare equal to
`nil` and could never be reported.
*/
UCD_Error :: enum {
	None = 0, // No error; collapses to `nil` inside `Error`

	XML_LOAD_ERROR,
	XML_Not_UCD,
	Nil_XML_Document,
	Element_Not_Repertoire,
	Extra_Fields,     // A line contained more fields than expected
	Unknown_Property,

	NO_REPERTOIRE,
	UNEXPECTED_STRING,
	Invalid_Hex_Number, // A code point field contained a non-hexadecimal character
	Invalid_General_Category,
	UnicodeData_6_Too_Long,
	UnicodeData_6_Invalid,
	UnicodeData_7_Too_Long,
	UnicodeData_7_Invalid,
}
|
||||
|
||||
|
||||
// Error type returned by the loaders in this package: either a UCD specific
// error or an error from `core:os`.
//
// Because the union is `#shared_nil`, a variant holding its zero value is
// normalised to `nil`, i.e. the zero value of each variant means "no error".
Error :: union #shared_nil {
	UCD_Error,
	os.Error,
}
|
||||
|
||||
// Inclusive code point range whose end points both fit in 16 bits.
Range_u16 :: struct {
	first, last: u16,
}
|
||||
|
||||
// Inclusive code point range stored with 32-bit end points; used for ranges
// that do not fit in a `Range_u16`.
Range_i32 :: struct {
	first, last: i32,
}
|
||||
|
||||
// Inclusive code point range expressed in runes. A single code point is
// represented with `first == last`.
Range_Rune :: struct {
	first, last: rune,
}
|
||||
|
||||
/*
A set of code points, split by shape and width into four growable arrays:
single code points and ranges, each in a 16-bit and a 32-bit flavour.

Filled with `append_to_dynamic_range` and released with
`destroy_dynamic_range`.
*/
Dynamic_Range :: struct {
	single_16: [dynamic]u16,       // Single code points <= 0xFFFF
	ranges_16: [dynamic]Range_u16, // Ranges with both end points <= 0xFFFF
	single_32: [dynamic]i32,       // Single code points > 0xFFFF
	ranges_32: [dynamic]Range_i32, // All remaining ranges
}
|
||||
|
||||
/*
Appends `range` to the array of `dr` that matches its shape and width.

- `first == last` and `first <= 0xFFFF`: stored in `single_16`
- `first == last` otherwise:             stored in `single_32`
- both end points `<= 0xFFFF`:           stored in `ranges_16`
- anything else:                         stored in `ranges_32`

An array that is still empty is created with `allocator` (and a fixed
initial capacity) right before the first element is appended.

Inputs:
- dr: The Dynamic_Range to append to.
- range: The inclusive code point range to add.
- allocator: Allocator used when one of the arrays has to be created.
*/
append_to_dynamic_range :: proc(
	dr: ^Dynamic_Range,
	range: Range_Rune,
	allocator := context.allocator,
) {
	is_single := range.first == range.last
	fits_16   := range.first <= 0xFFFF && range.last <= 0xFFFF

	switch {
	case is_single && fits_16:
		if len(dr.single_16) == 0 {
			dr.single_16 = make([dynamic]u16, 0, 512, allocator)
		}
		append(&dr.single_16, u16(range.first))

	case is_single:
		if len(dr.single_32) == 0 {
			dr.single_32 = make([dynamic]i32, 0, 512, allocator)
		}
		append(&dr.single_32, i32(range.first))

	case fits_16:
		if len(dr.ranges_16) == 0 {
			dr.ranges_16 = make([dynamic]Range_u16, 0, 128, allocator)
		}
		append(&dr.ranges_16, Range_u16{first = u16(range.first), last = u16(range.last)})

	case:
		if len(dr.ranges_32) == 0 {
			dr.ranges_32 = make([dynamic]Range_i32, 0, 128, allocator)
		}
		append(&dr.ranges_32, Range_i32{first = i32(range.first), last = i32(range.last)})
	}
}
|
||||
|
||||
/*
Releases the four dynamic arrays held by a `Dynamic_Range`.

Inputs:
- dr: The Dynamic_Range whose arrays are deleted.
*/
destroy_dynamic_range :: proc(dr: Dynamic_Range) {
	delete(dr.single_16)
	delete(dr.single_32)
	delete(dr.ranges_16)
	delete(dr.ranges_32)
}
|
||||
|
||||
/*
Releases every `Dynamic_Range` of a per-general-category table, such as the
one returned by `gc_ranges`.

Inputs:
- gcr: The table to destroy; each entry is passed to `destroy_dynamic_range`.
*/
destroy_general_category_ranges :: proc(gcr: [General_Category]Dynamic_Range) {
	for category in General_Category {
		destroy_dynamic_range(gcr[category])
	}
}
|
||||
307
core/unicode/tools/ucd/ucd.odin
Normal file
307
core/unicode/tools/ucd/ucd.odin
Normal file
@@ -0,0 +1,307 @@
|
||||
package ucd
|
||||
|
||||
import "core:strings"
|
||||
import "core:os"
|
||||
|
||||
/*
Loads a semicolon separated Unicode data file (the `UnicodeData.txt` layout)
into a `Unicode_Data` list.

Every non-empty line yields one entry. A pair of lines named `<..., First>`
and `<..., Last>` is folded into a single `Char_Range`; every other line
becomes a `Char`. Only the code point(s), name, general category and numeric
type are filled in; the remaining fields are left at their zero value.

The file contents are read with `context.temp_allocator`, and the whole
temporary allocator is released with `free_all` before returning. The result
array and the cloned names are allocated with `allocator`.

Inputs:
- filename: Path of the file to load.
- allocator: Allocator for the returned array and the names stored in it.

Returns:
- unicode_data: The parsed entries; release with `destroy_unicode_data`.
- err: The `os` error if the file could not be read, or
  `UCD_Error.Extra_Fields` if a line has more than 15 fields.
*/
load_unicode_data :: proc(
	filename: string,
	allocator := context.allocator,
) -> (unicode_data : Unicode_Data, err: Error) {

	data, os_error := os.read_entire_file(filename, context.temp_allocator)
	if os_error != nil {
		err = os_error
		return
	}
	defer free_all(context.temp_allocator)

	// Bind the result to the caller's allocator. Without this, `append`
	// would grow the array with `context.allocator` while the names below
	// are cloned with `allocator`.
	unicode_data = make(Unicode_Data, allocator)

	line_iter := Line_Iterator{data = data}

	// Start of a `<..., First>` / `<..., Last>` pair; it has to survive from
	// the "First" line to the "Last" line, hence it lives outside the loop.
	first_cp: rune

	line_loop: for line in line_iterator(&line_iter) {
		// Skip empty lines
		if len(line) == 0 {
			continue
		}

		field_iter := Field_Iterator{line = line}
		is_range := false
		cp: rune
		name: string
		gc: General_Category

		// Fields 6 and 7 are remembered so that field 8 can derive the numeric type.
		num_6: string
		num_7: string
		nt := Numeric_Type.None

		for field, field_num in field_iterator(&field_iter) {
			switch field_num {
			case 0: // Code point: upper case hexadecimal, parsing stops at the first other character
				cp = 0
				digits: for c in field {
					switch c {
					case '0'..='9':
						cp = cp*16 + rune(c - '0')
					case 'A'..='F':
						cp = cp*16 + rune(c - 'A' + 10)
					case:
						break digits
					}
				}

			case 1: // Name
				if len(field) > 9 && field[0] == '<' && strings.ends_with(transmute(string) field, ", First>") {
					// Start of a range: remember it and wait for the matching "Last" line.
					first_cp = cp
					continue line_loop
				}

				if len(field) > 9 && field[0] == '<' && strings.ends_with(transmute(string) field, ", Last>") {
					// Keep only the text between '<' and ", Last>".
					name = strings.clone_from_bytes(field[1:len(field)-7], allocator)
					is_range = true
				} else {
					name = strings.clone_from_bytes(field[:], allocator)
				}

			case 2: // General_Category
				// NOTE: This is currently ignoring a possible error; it should probably be fixed
				gc, _ = string_to_general_category(transmute(string)field)

			case 3: // Canonical_Combining_Class
			case 4: // Bidi Class
			case 5: // Decomposition_Type and Decomposition_Mapping

			// Numeric_Type and Numberic_Value
			case 6:
				num_6 = transmute(string)field

			case 7:
				num_7 = transmute(string)field

			case 8:
				// The numeric type follows from which of fields 6, 7 and 8 are filled in.
				has_6 := num_6 != ""
				has_7 := num_7 != ""
				has_8 := transmute(string)field != ""

				switch {
				case has_6 && has_7 && has_8:
					nt = .Decimal
				case !has_6 && has_7 && has_8:
					nt = .Digit
				case !has_6 && !has_7 && has_8:
					nt = .Numeric
				case:
					nt = .None
				}

			case 9:  // Bidi mirrored
			case 10: // Unicode 1 Name (Obsolete as of 6.2.0)
			case 11: // should be null
			case 12:
			case 13:
			case 14:

			case:
				// Malformed input must not reach `unreachable()`; report it
				// the same way `load_protperty_list` does. The name cloned
				// for this line has not been stored yet, so release it here.
				delete(name, allocator)
				err = UCD_Error.Extra_Fields
				return
			}
		}

		if is_range {
			cr: Char_Range
			cr.first_cp = first_cp
			cr.last_cp  = cp
			cr.name     = name
			cr.gc       = gc
			cr.nt       = nt
			append(&unicode_data, cr)
		} else {
			c: Char
			c.cp   = cp
			c.name = name
			c.gc   = gc
			c.nt   = nt
			append(&unicode_data, c)
		}
	}
	return
}
|
||||
|
||||
/*
Releases a `Unicode_Data` list created by `load_unicode_data`.

The names stored in the entries were cloned with the allocator handed to
`load_unicode_data`, so the same allocator has to be used to free them.
The array itself is released through `delete`.

Inputs:
- unicode_data: The list to destroy.
- allocator: The allocator that was passed to `load_unicode_data`
  (defaults to `context.allocator`, matching the loader's default).
*/
destroy_unicode_data :: proc(unicode_data: Unicode_Data, allocator := context.allocator) {
	for point in unicode_data {
		switch p in point {
		case Char:
			delete(p.name, allocator)
		case Char_Range:
			delete(p.name, allocator)
		}
	}
	delete(unicode_data)
}
|
||||
|
||||
|
||||
/*
Groups the code points of `ud` into ranges per general category.

Consecutive `Char` entries that are adjacent (`cp == previous + 1`) and share
the same general category are merged into one run; a run is written out with
`append_to_dynamic_range` as soon as it is interrupted. A `Char_Range` entry
always interrupts the current run and is written out as a range of its own.

Inputs:
- ud: The Unicode data to scan, in file order.
- allocator: Allocator handed to `append_to_dynamic_range`.

Returns:
- lst: One `Dynamic_Range` per `General_Category`; release with
  `destroy_general_category_ranges`.
*/
gc_ranges :: proc(ud: ^Unicode_Data, allocator := context.allocator) -> (lst: [General_Category]Dynamic_Range) {
	// `first == EMPTY` marks "no run in progress".
	EMPTY : rune : -1

	run := Range_Rune{first = EMPTY, last = EMPTY}
	run_gc: General_Category

	for entry in ud {
		switch e in entry {
		case Char:
			extends_run := e.cp == run.last + 1 && e.gc == run_gc
			if run.first != EMPTY && !extends_run {
				append_to_dynamic_range(&lst[run_gc], run, allocator)
				run = Range_Rune{first = EMPTY, last = EMPTY}
			}

			// Reinterpreted as u32, EMPTY is the largest possible value, so
			// `min` picks `e.cp` for a fresh run and keeps the existing
			// start otherwise.
			run.first = transmute(rune) min(transmute(u32)run.first, transmute(u32)e.cp)
			run.last  = e.cp
			run_gc    = e.gc

		case Char_Range:
			if run.first != EMPTY {
				append_to_dynamic_range(&lst[run_gc], run, allocator)
			}

			whole := Range_Rune{first = e.first_cp, last = e.last_cp}
			append_to_dynamic_range(&lst[e.gc], whole, allocator)
			run = Range_Rune{first = EMPTY, last = EMPTY}
		}
	}

	// Write out the run that was still open at the end of the data.
	if run.first != EMPTY {
		append_to_dynamic_range(&lst[run_gc], run, allocator)
	}

	return
}
|
||||
|
||||
|
||||
/*
Collects the code points that have a numeric type of `Decimal` or `Digit`
but whose general category is not `Nd`.

Adjacent qualifying `Char` entries are merged into runs; a `Char_Range`
entry always ends the current run and, if it qualifies itself, is added as
a range of its own.

Inputs:
- ud: The Unicode data to scan, in file order.
- allocator: Allocator handed to `append_to_dynamic_range`.

Returns:
- The collected code points; release with `destroy_dynamic_range`.
*/
extra_digits :: proc(ud: ^Unicode_Data, allocator := context.allocator) -> (Dynamic_Range) {
	// `first == EMPTY` marks "no run in progress".
	EMPTY : rune : -1

	run := Range_Rune{first = EMPTY, last = EMPTY}
	result: Dynamic_Range

	for entry in ud {
		switch e in entry {
		case Char:
			qualifies  := e.gc != .Nd && (e.nt == .Decimal || e.nt == .Digit)
			contiguous := e.cp == run.last + 1

			if run.first != EMPTY && !(contiguous && qualifies) {
				append_to_dynamic_range(&result, run, allocator)
				run = Range_Rune{first = EMPTY, last = EMPTY}
			}

			if qualifies {
				// Reinterpreted as u32, EMPTY is the largest possible value,
				// so `min` picks `e.cp` for a fresh run and keeps the
				// existing start otherwise.
				run.first = transmute(rune) min(transmute(u32)run.first, transmute(u32)e.cp)
				run.last  = e.cp
			}

		case Char_Range:
			qualifies := e.gc != .Nd && (e.nt == .Decimal || e.nt == .Digit)

			if run.first != EMPTY {
				append_to_dynamic_range(&result, run, allocator)
			}

			if qualifies {
				whole := Range_Rune{first = e.first_cp, last = e.last_cp}
				append_to_dynamic_range(&result, whole, allocator)
			}
			run = Range_Rune{first = EMPTY, last = EMPTY}
		}
	}

	// Write out the run that was still open at the end of the data.
	if run.first != EMPTY {
		append_to_dynamic_range(&result, run, allocator)
	}

	return result
}
|
||||
|
||||
/*
Data contained in the Unicode file PropList.txt

A `PropList` is the data contained in the Unicode Character Database (UCD)
file PropList.txt: one `Dynamic_Range` of code points per
`PropList_Property`. It is created with the procedure `load_protperty_list`
and destroyed with the procedure `destroy_protperty_list`.
*/
PropList :: [PropList_Property]Dynamic_Range
|
||||
|
||||
/*
This procedure destroys a property list created by `load_protperty_list`.

Each entry is released with `destroy_dynamic_range`, the same helper that is
used for every other `Dynamic_Range` in this package.

Inputs:
- props: The property list to destroy
*/
destroy_protperty_list :: proc(
	props: [PropList_Property]Dynamic_Range,
){
	for r in props {
		destroy_dynamic_range(r)
	}
}
|
||||
|
||||
/*
Loads a Unicode property list file (the `PropList.txt` layout) into one
`Dynamic_Range` per `PropList_Property`.

Every non-empty line is expected to have two fields: a code point (`XXXX`)
or an inclusive code point range (`XXXX..YYYY`) in upper case hexadecimal,
followed by the property name.

Inputs:
- filename: Path of the file to load.
- allocator: Allocator for the file contents (released before returning)
  and for the returned ranges.

Returns:
- props: The code points of each property; release with
  `destroy_protperty_list`.
- err: The `os` error if the file could not be read,
  `UCD_Error.Invalid_Hex_Number` for a malformed code point field,
  `UCD_Error.Extra_Fields` for a line with more than two fields, or the
  error reported by `string_to_proplist_property`.
*/
load_protperty_list :: proc (
	filename : string,
	allocator := context.allocator,
) -> (props: [PropList_Property]Dynamic_Range, err: Error) {

	data, os_error := os.read_entire_file(filename, allocator)
	if os_error != nil {
		err = os_error
		return
	}
	// The buffer was allocated with `allocator`, so it has to be freed with it too.
	defer delete(data, allocator)

	line_iter := Line_Iterator{
		data = data,
	}
	for line in line_iterator(&line_iter) {
		if len(line) == 0 {
			continue
		}
		field_iter := Field_Iterator{line = line}

		is_range: bool
		rr: Range_Rune
		prop: PropList_Property

		for field, i in field_iterator(&field_iter) {
			switch i {
			case 0: // Code point or code point range
				for c in field {
					digit: rune
					switch c {
					case '0'..='9':
						digit = rune(c - '0')
					case 'A'..='F':
						digit = rune(c - 'A' + 10)
					case '.':
						// First dot of the ".." separator: from here on the
						// digits belong to the end of the range, which is
						// accumulated from scratch.
						if !is_range {
							is_range = true
							rr.last  = 0
						}
						continue
					case:
						err = UCD_Error.Invalid_Hex_Number
						return
					}

					if is_range {
						rr.last = rr.last*16 + digit
					} else {
						// Until a separator shows up this is a single code
						// point, so `last` mirrors `first`.
						rr.first = rr.first*16 + digit
						rr.last  = rr.first
					}
				}

			case 1: // Property name
				prop, err = string_to_proplist_property(transmute(string)field)
				if err != nil {
					return
				}

			case:
				err = UCD_Error.Extra_Fields
				return
			}
		}

		append_to_dynamic_range(&props[prop], rr, allocator)
	}

	return
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user