Added program to generate Unicode Table

This adds a program that generates the tables used by the `core/unicode`
package. The generated file is `core/unicode/generated.odin`.

It may be better to incorporate this into `generate_entity_table.odin`.
This can easily be accomplished if desired.
This commit is contained in:
StudebakerGuy
2026-03-07 17:34:37 -05:00
committed by Jeroen van Rijn
parent c4f5f9e55a
commit 8f579d1f3b
5 changed files with 1800 additions and 0 deletions

View File

@@ -0,0 +1,325 @@
package main
import "core:fmt"
import path "core:path/filepath"
import "core:os"
import "core:strings"
import "base:runtime"
import "core:mem"
import "core:io"
import "core:log"
import "ucd"
// Table 2-3. Types of Code Points
// Table 4-4. General_Category Values page 229
// Reference https://www.unicode.org/reports/tr44/
/*
Emits the contents of a ucd.Dynamic_Range as up to four `@(rodata)` fixed-length
array declarations. Every emitted array is named `<name>_singles16`,
`<name>_ranges16`, `<name>_singles32` or `<name>_ranges32`; a member of `range`
that holds no elements produces no declaration at all.

Single code points are packed several to a line, range pairs are written one
pair per line.

Inputs:
- writer: The io.Writer that receives the generated text.
- name: Prefix used for the name of every emitted array.
- range: The ucd.Dynamic_Range to format.

Returns:
- The sum of the byte counts reported by the individual write calls.
*/
write_range_arrays :: proc(
	writer: io.Writer,
	name: string,
	range : ucd.Dynamic_Range,
) -> int {
	total : int

	if len(range.single_16) > 0 {
		total += fmt.wprintln(writer, "@(rodata)")
		total += fmt.wprintf(writer, "%s_singles16 := [?]u16{{", name)
		// Start beyond the limit so the very first value opens a fresh line.
		column := 100
		for value in range.single_16 {
			scratch : [32]byte
			entry := fmt.bprintf(scratch[:], " 0x%4X,", value)
			if column + len(entry) <= 80 {
				// Still room on the current line.
				written, _ := io.write_string(writer, entry)
				column += written
				total += written
				continue
			}
			total += fmt.wprintf(writer, "\n")
			column = fmt.wprintf(writer, "\t0x%4X,", value)
			total += column
		}
		total += fmt.wprintln(writer, "\n}\n")
	}

	if len(range.ranges_16) > 0 {
		total += fmt.wprintln(writer, "@(rodata)")
		total += fmt.wprintfln(writer, "%s_ranges16 := [?]u16{{", name)
		for pair in range.ranges_16 {
			total += fmt.wprintfln(writer, "\t0x%4X, 0x%4X,", pair.first, pair.last)
		}
		total += fmt.wprintln(writer, "}\n")
	}

	if len(range.single_32) > 0 {
		total += fmt.wprintln(writer, "@(rodata)")
		total += fmt.wprintf(writer, "%s_singles32 := [?]i32{{", name)
		// Start beyond the limit so the very first value opens a fresh line.
		column := 100
		for value in range.single_32 {
			scratch : [32]byte
			entry := fmt.bprintf(scratch[:], " 0x%4X,", value)
			if column + len(entry) <= 80 {
				// Still room on the current line.
				written, _ := io.write_string(writer, entry)
				column += written
				total += written
				continue
			}
			total += fmt.wprint(writer, "\n")
			column = fmt.wprintf(writer, "\t0x%4X,", value)
			total += column
		}
		total += fmt.wprintln(writer, "\n}\n")
	}

	if len(range.ranges_32) > 0 {
		total += fmt.wprintln(writer, "@(rodata)")
		total += fmt.wprintfln(writer, "%s_ranges32 := [?]i32{{", name)
		for pair in range.ranges_32 {
			total += fmt.wprintfln(writer, "\t0x%4X, 0x%4X,", pair.first, pair.last)
		}
		total += fmt.wprintln(writer, "}\n")
	}

	return total
}
/*
Writes the arrays for `range` (via `write_range_arrays`) followed by a
`<prefix>_ranges := Range{...}` declaration that slices every array that was
emitted.

The prefix is `name` converted to ASCII lower case. `name` is either a plain
string or a ucd.General_Category, in which case the enum value is formatted
with "%s".

Inputs:
- writer: The io.Writer that receives the generated text.
- name: Source of the identifier prefix; a string must fit in 128 bytes.
- range: The ucd.Dynamic_Range to write.

Returns:
- n_written: The sum of the byte counts reported by the write calls.
*/
write_range :: proc(
	writer: io.Writer,
	name: union{string, ucd.General_Category},
	range: ucd.Dynamic_Range,
) -> (n_written: int) {
	scratch: [128]byte
	prefix: string

	// Either way the prefix ends up stored at the start of `scratch`.
	switch v in name {
	case string:
		assert(len(v) <= len(scratch))
		runtime.mem_copy(&scratch[0], raw_data(v), len(v))
		prefix = transmute(string) scratch[0:len(v)]
	case ucd.General_Category:
		prefix = fmt.bprintf(scratch[:], "%s", v)
	}

	// Lower-case in place; `prefix` aliases `scratch`, so it sees the change.
	for i in 0..<len(prefix) {
		c := scratch[i]
		if c >= 'A' && c <= 'Z' {
			scratch[i] = c + ('a' - 'A')
		}
	}

	has_single_16 := len(range.single_16) > 0
	has_ranges_16 := len(range.ranges_16) > 0
	has_single_32 := len(range.single_32) > 0
	has_ranges_32 := len(range.ranges_32) > 0

	n_written = write_range_arrays(writer, prefix, range)
	n_written += fmt.wprintfln(writer, "%s_ranges := Range{{", prefix)
	if has_single_16 {
		n_written += fmt.wprintfln(writer, "\tsingle_16 = %s_singles16[:],", prefix)
	}
	if has_ranges_16 {
		n_written += fmt.wprintfln(writer, "\tranges_16 = %s_ranges16[:],", prefix)
	}
	if has_single_32 {
		n_written += fmt.wprintfln(writer, "\tsingle_32 = %s_singles32[:],", prefix)
	}
	if has_ranges_32 {
		n_written += fmt.wprintfln(writer, "\tranges_32 = %s_ranges32[:],", prefix)
	}
	n_written += fmt.wprintln(writer, "}\n")
	return
}
// Banner comment written by `main` near the top of the generated file, right
// after the package line, to warn readers not to edit it by hand.
GENERATED :: `/*
------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
*/
`
// Comment block written by `main` after the GENERATED banner. It names the
// UCD input files, where to download them, and reproduces the Unicode license
// text. It is emitted verbatim, so any edit here changes the generated file.
MESSAGE :: `/*
This file is generated from UnicodeData.txt and PropList.txt. These files
are part of the Unicode Database (UCD) and are covered by the license
listed further down. They may be downloaded from the following locations;
https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
https://www.unicode.org/license.txt
------------------------------------------------------------------------------
UNICODE LICENSE V3
COPYRIGHT AND PERMISSION NOTICE
Copyright © 1991-2026 Unicode, Inc.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
*/
`
/*
Generates `core/unicode/generated.odin`.

Loads UnicodeData.txt and PropList.txt from `tests/core/assets/UCD` below
ODIN_ROOT, renders the selected general-category tables, the extra-digit table
and the Other_Lowercase / Other_Uppercase property tables into a string
builder, and writes the result in one go.

If either input file fails to load the error is logged and nothing is written,
so a broken input can never overwrite a good generated file with empty tables.
All allocations go through a tracking allocator and anything still allocated on
exit is reported on stderr.
*/
main :: proc() {
	track: mem.Tracking_Allocator
	mem.tracking_allocator_init(&track, context.allocator)
	defer {
		if len(track.allocation_map) > 0 {
			fmt.eprintf("=== %v allocations not freed: ===\n", len(track.allocation_map))
			for _, entry in track.allocation_map {
				fmt.eprintf("- %v bytes @ %v\n", entry.size, entry.location)
			}
		}
		mem.tracking_allocator_destroy(&track)
	}
	context.allocator = mem.tracking_allocator(&track)
	context.logger = log.create_console_logger()
	defer log.destroy_console_logger(context.logger)

	ucd_path, _ := path.join({ODIN_ROOT,
		"tests","core","assets","UCD","UnicodeData.txt"}, context.allocator)
	defer delete(ucd_path)

	unicode_data, ucd_err := ucd.load_unicode_data(ucd_path)
	// Registered before the error check so a partially built result is freed too.
	defer ucd.destroy_unicode_data(unicode_data)
	if ucd_err != nil {
		log.errorf("Error loading Unicode data. %s", ucd_err)
		// Do not go on to overwrite the generated file with empty tables.
		return
	}

	general_category_ranges := ucd.gc_ranges(&unicode_data)
	defer ucd.destroy_general_category_ranges(general_category_ranges)

	extra_digits := ucd.extra_digits(&unicode_data)
	defer ucd.destroy_dynamic_range(extra_digits)

	proplist_path, _ := path.join({ODIN_ROOT,
		"tests","core","assets","UCD","PropList.txt"}, context.allocator)
	defer delete(proplist_path)

	proplist, proplist_err := ucd.load_protperty_list(proplist_path)
	// Registered before the error check so a partially built result is freed too.
	defer ucd.destroy_protperty_list(proplist)
	if proplist_err != nil {
		log.errorf("Error loading PropList.txt. %s", proplist_err)
		return
	}

	sb := strings.builder_make_len_cap(0, 1024*32)
	defer strings.builder_destroy(&sb)
	writer := strings.to_writer(&sb)

	fmt.wprintfln(writer, "package unicode\n")
	fmt.wprintln(writer, GENERATED)
	fmt.wprintln(writer, MESSAGE)

	// Declaration of the `Range` type that every generated table refers to.
	Range_Type :: "Range :: struct {\n" +
		"\tsingle_16 : []u16,\n" +
		"\tranges_16 : []u16,\n" +
		"\tsingle_32 : []i32,\n" +
		"\tranges_32 : []i32,\n" +
		"}\n"
	fmt.wprintfln(writer, "%s", Range_Type)

	// List of the general categories to skip when generating the code for
	// core/unicode/generated.odin. Commented-out entries are the categories
	// that ARE generated.
	to_exclude := [?]ucd.General_Category{
		.Cc, // Control, a C0 or C1 control code
		.Cf, // Format, a format control character
		.Cn, // Unassigned, a reserved unassigned code point or a noncharacter
		.Co, // Private_Use, a private-use character
		.Cs, // Surrogate, a surrogate code point
		// .Ll, // Lowercase_Letter, a lowercase letter
		// .Lm, // Modifier_Letter, a modifier letter
		// .Lo, // Other_Letter, other letters, including syllables and ideographs
		// .Lt, // Titlecase_Letter, a digraph encoded as a single character, with first part uppercase
		// .Lu, // Uppercase_Letter, an uppercase letter
		.Mc, // Spacing_Mark, a spacing combining mark (positive advance width)
		.Me, // Enclosing_Mark, an enclosing combining mark
		.Mn, // Nonspacing_Mark, a nonspacing combining mark (zero advance width)
		//.Nd, // Decimal_Number, a decimal digit
		//.Nl, // Letter_Number, a letterlike numeric character
		//.No, // Other_Number, a numeric character of other type
		.Pc, // Connector_Punctuation, a connecting punctuation mark, like a tie
		.Pd, // Dash_Punctuation, a dash or hyphen punctuation mark
		.Pe, // Close_Punctuation, a closing punctuation mark (of a pair)
		.Pf, // Final_Punctuation, a final quotation mark
		.Pi, // Initial_Punctuation, an initial quotation mark
		.Po, // Other_Punctuation, a punctuation mark of other type
		.Ps, // Open_Punctuation, an opening punctuation mark (of a pair)
		.Sc, // Currency_Symbol, a currency sign
		.Sk, // Modifier_Symbol, a non-letterlike modifier symbol
		.Sm, // Math_Symbol, a symbol of mathematical use
		.So, // Other_Symbol, a symbol of other type
		.Zl, // Line_Separator, U+2028 LINE SEPARATOR only
		.Zp, // Paragraph_Separator, U+2029 PARAGRAPH SEPARATOR only
		.Zs, // Space_Separator, a space character (of various non-zero widths)
	}

	write_loop: for ranges, category in general_category_ranges {
		for excluded in to_exclude {
			if category == excluded do continue write_loop
		}
		write_range(writer, category, ranges)
	}
	write_range(writer, "extra_digits", extra_digits)
	write_range(writer, "other_lowercase", proplist[.Other_Lowercase])
	write_range(writer, "other_uppercase", proplist[.Other_Uppercase])

	file_name, _ := path.join({ODIN_ROOT, "core", "unicode", "generated.odin"}, context.allocator)
	defer delete(file_name)

	str := strings.to_string(sb)
	write_error := os.write_entire_file_from_string(file_name, str)
	if write_error != nil {
		log.errorf("Error writting %s. %s", file_name, write_error)
	}
}

View File

@@ -0,0 +1,70 @@
package ucd
/*
Non-allocating iterator over the lines of a byte slice.

A line ends at the first '\n' or at the end of `data`, whichever comes first;
the terminator is never part of the returned line. A '#' starts a comment: the
returned line stops just before the '#' and the rest of that physical line is
skipped.
*/
Line_Iterator :: struct {
	index:        int,    // offset in `data` at which the next line starts
	data:         []byte, // bytes to iterate over
	line_counter: int,    // number of lines handed out so far
}

/*
Returns the next line of `it.data` and advances the iterator.

Inputs:
- it: The iterator state; `index` and `line_counter` are updated.

Returns:
- line: Sub-slice of `it.data` holding the line without comment and newline.
- line_number: 1-based number of the returned line.
- more: false once `it.index` has reached the end of the data; the other
  results are then zero values.
*/
line_iterator :: proc(it: ^Line_Iterator) -> (line: []byte, line_number: int, more: bool) {
	total := len(it.data)
	if it.index >= total {
		return
	}
	more = true
	it.line_counter += 1
	line_number = it.line_counter

	begin := it.index
	cursor := begin

	// The payload ends at a newline, at a comment marker, or at end of data.
	for cursor < total {
		c := it.data[cursor]
		if c == '\n' || c == '#' {
			break
		}
		cursor += 1
	}
	line = it.data[begin:cursor]

	// Drop whatever is left of the physical line (only a comment can be left).
	for cursor < total && it.data[cursor] != '\n' {
		cursor += 1
	}
	// Step over the newline itself, if there is one.
	if cursor < total {
		cursor += 1
	}
	it.index = cursor
	return
}
/*
Non-allocating iterator over the ';'-separated fields of one line.
*/
Field_Iterator :: struct {
	index:         int,    // offset in `line` at which scanning resumes
	field_counter: int,    // zero-based number of the next field to return
	line:          []byte, // the line being split
}

/*
Returns the next ';'-separated field of `it.line` with leading and trailing
spaces removed, and advances the iterator.

A field ends at ';', at '#', or at the end of the line. Empty fields are
returned as empty slices, including a leading one (line starts with ';') and a
trailing one (line ends with ';'). A field consisting only of spaces is
returned empty.

Inputs:
- it: The iterator state; `index` and `field_counter` are updated.

Returns:
- field: Sub-slice of `it.line` holding the trimmed field.
- field_count: Zero-based number of the returned field.
- valid: false when the iterator is at the end of the line, at a '\n', or at
  a '#'; the other results are then zero values.
*/
field_iterator :: proc(it: ^Field_Iterator) -> (field: []byte, field_count: int, valid: bool) {
	n := len(it.line)
	valid = it.index < n && it.line[it.index] != '\n' && it.line[it.index] != '#'
	if !valid do return

	// After the first field the iterator rests on the separator that ended the
	// previous field. Keying this on the field number rather than on the
	// offset keeps a line that starts with ';' from stalling at offset 0.
	if it.field_counter != 0 && it.line[it.index] == ';' do it.index += 1

	start := it.index
	for it.index < n && it.line[it.index] != ';' && it.line[it.index] != '#' do it.index += 1
	field = it.line[start:it.index]

	// Trim spaces on both sides; an all-space field shrinks to empty.
	for len(field) > 0 && field[0] == ' ' do field = field[1:]
	for len(field) > 0 && field[len(field)-1] == ' ' do field = field[:len(field)-1]

	field_count = it.field_counter
	it.field_counter += 1
	return
}

View File

@@ -0,0 +1,396 @@
package ucd
/*
Converts a two-letter general category abbreviation such as "Lu" into the
matching General_Category value. The comparison is case sensitive.

Inputs:
- str: The abbreviation to convert.

Returns:
- gc: The matching category; the zero value when `str` is not recognised.
- err: UCD_Error.Invalid_General_Category when `str` is not recognised.
*/
string_to_general_category :: proc "contextless"(
	str: string,
) -> (gc: General_Category, err: Error) {
	switch str {
	// Other
	case "Cc": return .Cc, nil
	case "Cf": return .Cf, nil
	case "Cn": return .Cn, nil
	case "Co": return .Co, nil
	case "Cs": return .Cs, nil
	// Letter
	case "Ll": return .Ll, nil
	case "Lm": return .Lm, nil
	case "Lo": return .Lo, nil
	case "Lt": return .Lt, nil
	case "Lu": return .Lu, nil
	// Mark
	case "Mc": return .Mc, nil
	case "Me": return .Me, nil
	case "Mn": return .Mn, nil
	// Number
	case "Nd": return .Nd, nil
	case "Nl": return .Nl, nil
	case "No": return .No, nil
	// Punctuation
	case "Pc": return .Pc, nil
	case "Pd": return .Pd, nil
	case "Pe": return .Pe, nil
	case "Pf": return .Pf, nil
	case "Pi": return .Pi, nil
	case "Po": return .Po, nil
	case "Ps": return .Ps, nil
	// Symbol
	case "Sc": return .Sc, nil
	case "Sk": return .Sk, nil
	case "Sm": return .Sm, nil
	case "So": return .So, nil
	// Separator
	case "Zl": return .Zl, nil
	case "Zp": return .Zp, nil
	case "Zs": return .Zs, nil
	}
	// Nothing matched: `gc` still holds its zero value.
	return gc, UCD_Error.Invalid_General_Category
}
/*
Converts a version string such as "5.2", or the literal "unassigned", into the
matching Age value. The comparison is exact.

Inputs:
- str: The string to convert.

Returns:
- age: The matching value; the zero value (.Nil) when `str` is not recognised.
- err: UCD_Error.UNEXPECTED_STRING when `str` is not recognised. Unknown input
  is reported through this result rather than by aborting the program, so a
  data file newer than this table can be handled by the caller.
*/
string_to_age :: proc "contextless" (
	str: string,
) -> (age: Age, err: Error) {
	switch str {
	case "1.1":        age = .Age_1_1
	case "2.0":        age = .Age_2_0
	case "2.1":        age = .Age_2_1
	case "3.0":        age = .Age_3_0
	case "3.1":        age = .Age_3_1
	case "3.2":        age = .Age_3_2
	case "4.0":        age = .Age_4_0
	case "4.1":        age = .Age_4_1
	case "5.0":        age = .Age_5_0
	case "5.1":        age = .Age_5_1
	case "5.2":        age = .Age_5_2
	case "6.0":        age = .Age_6_0
	case "6.1":        age = .Age_6_1
	case "6.2":        age = .Age_6_2
	case "6.3":        age = .Age_6_3
	case "7.0":        age = .Age_7_0
	case "8.0":        age = .Age_8_0
	case "9.0":        age = .Age_9_0
	case "10.0":       age = .Age_10_0
	case "11.0":       age = .Age_11_0
	case "12.0":       age = .Age_12_0
	case "12.1":       age = .Age_12_1
	case "13.0":       age = .Age_13_0
	case "14.0":       age = .Age_14_0
	case "15.0":       age = .Age_15_0
	case "15.1":       age = .Age_15_1
	case "16.0":       age = .Age_16_0
	case "17.0":       age = .Age_17_0
	case "unassigned": age = .Age_Unassigned
	case:
		err = UCD_Error.UNEXPECTED_STRING
	}
	return
}
/*
Converts the one-letter code "o", "c" or "n" into the matching
Paired_Brack_Type value.

Inputs:
- str: The code to convert.

Returns:
- .Open for "o", .Close for "c", .None for "n". Any other input reaches
  `unreachable()`.
*/
string_to_paired_bracket_type :: proc "contextless"(str: string) -> Paired_Brack_Type {
	kind: Paired_Brack_Type
	switch str {
	case "o": kind = .Open
	case "c": kind = .Close
	case "n": kind = .None
	case:
		// TODO: Add error for this
		unreachable()
	}
	return kind
}
/*
Converts a bidirectional class abbreviation such as "AL" or "NSM" into the
matching Bidi_Class value. The comparison is case sensitive.

Inputs:
- str: The abbreviation to convert.

Returns:
- The matching Bidi_Class. Any unrecognised input reaches `unreachable()`.
*/
string_to_bidi_class :: proc "contextless"(str: string) -> Bidi_Class {
	class: Bidi_Class
	switch str {
	case "AL":  class = .AL
	case "AN":  class = .AN
	case "B":   class = .B
	case "BN":  class = .BN
	case "CS":  class = .CS
	case "EN":  class = .EN
	case "ES":  class = .ES
	case "ET":  class = .ET
	case "FSI": class = .FSI
	case "L":   class = .L
	case "LRE": class = .LRE
	case "LRI": class = .LRI
	case "LRO": class = .LRO
	case "NSM": class = .NSM
	case "ON":  class = .ON
	case "PDF": class = .PDF
	case "PDI": class = .PDI
	case "R":   class = .R
	case "RLE": class = .RLE
	case "RLI": class = .RLI
	case "RLO": class = .RLO
	case "S":   class = .S
	case "WS":  class = .WS
	case:
		// TODO: Add error for this
		unreachable()
	}
	return class
}
/*
Converts a property name as spelled in PropList.txt (for example
"White_Space") into the matching PropList_Property value. The comparison is
case sensitive.

Inputs:
- str: The property name to convert.

Returns:
- prop: The matching property; the zero value when `str` is not recognised.
- err: .Unknown_Property when `str` is not recognised, otherwise the zero
  value of UCD_Error.
*/
string_to_proplist_property :: proc(str: string) -> (
	prop: PropList_Property,
	err: UCD_Error,
) {
	switch str {
	case "White_Space":                        prop = .White_Space
	case "Bidi_Control":                       prop = .Bidi_Control
	case "Join_Control":                       prop = .Join_Control
	case "Dash":                               prop = .Dash
	case "Hyphen":                             prop = .Hyphen
	case "Quotation_Mark":                     prop = .Quotation_Mark
	case "Terminal_Punctuation":               prop = .Terminal_Punctuation
	case "Other_Math":                         prop = .Other_Math
	case "Hex_Digit":                          prop = .Hex_Digit
	case "ASCII_Hex_Digit":                    prop = .ASCII_Hex_Digit
	case "Other_Alphabetic":                   prop = .Other_Alphabetic
	case "Ideographic":                        prop = .Ideographic
	case "Diacritic":                          prop = .Diacritic
	case "Extender":                           prop = .Extender
	case "Other_Lowercase":                    prop = .Other_Lowercase
	case "Other_Uppercase":                    prop = .Other_Uppercase
	case "Noncharacter_Code_Point":            prop = .Noncharacter_Code_Point
	case "Other_Grapheme_Extend":              prop = .Other_Grapheme_Extend
	case "IDS_Binary_Operator":                prop = .IDS_Binary_Operator
	case "IDS_Trinary_Operator":               prop = .IDS_Trinary_Operator
	case "IDS_Unary_Operator":                 prop = .IDS_Unary_Operator
	case "Radical":                            prop = .Radical
	case "Unified_Ideograph":                  prop = .Unified_Ideograph
	case "Other_Default_Ignorable_Code_Point": prop = .Other_Default_Ignorable_Code_Point
	case "Deprecated":                         prop = .Deprecated
	case "Soft_Dotted":                        prop = .Soft_Dotted
	case "Logical_Order_Exception":            prop = .Logical_Order_Exception
	case "Other_ID_Start":                     prop = .Other_ID_Start
	case "Other_ID_Continue":                  prop = .Other_ID_Continue
	case "ID_Compat_Math_Continue":            prop = .ID_Compat_Math_Continue
	case "ID_Compat_Math_Start":               prop = .ID_Compat_Math_Start
	case "Sentence_Terminal":                  prop = .Sentence_Terminal
	case "Variation_Selector":                 prop = .Variation_Selector
	case "Pattern_White_Space":                prop = .Pattern_White_Space
	case "Pattern_Syntax":                     prop = .Pattern_Syntax
	case "Prepended_Concatenation_Mark":       prop = .Prepended_Concatenation_Mark
	case "Regional_Indicator":                 prop = .Regional_Indicator
	case "Modifier_Combining_Mark":            prop = .Modifier_Combining_Mark
	case:
		err = .Unknown_Property
	}
	return
}

View File

@@ -0,0 +1,702 @@
package ucd
import "core:os"
// Values produced by `string_to_age`: one member per recognised version string
// ("1.1" -> Age_1_1, ..., "17.0" -> Age_17_0) plus Age_Unassigned for the
// literal "unassigned". Nil (0) is the zero value and is never produced from a
// recognised string.
// NOTE(review): presumably this mirrors the UCD `Age` property - confirm.
Age :: enum byte {
Nil = 0,
Age_1_1,
Age_2_0,
Age_2_1,
Age_3_0,
Age_3_1,
Age_3_2,
Age_4_0,
Age_4_1,
Age_5_0,
Age_5_1,
Age_5_2,
Age_6_0,
Age_6_1,
Age_6_2,
Age_6_3,
Age_7_0,
Age_8_0,
Age_9_0,
Age_10_0,
Age_11_0,
Age_12_0,
Age_12_1,
Age_13_0,
Age_14_0,
Age_15_0,
Age_15_1,
Age_16_0,
Age_17_0,
Age_Unassigned,
}
// General category of a code point, named by its two-letter abbreviation.
// `string_to_general_category` converts the abbreviation into a member.
// There is no explicit "nil" member: the zero value is Cc.
General_Category :: enum {
Cc, // Control, a C0 or C1 control code
Cf, // Format, a format control character
Cn, // Unassigned, a reserved unassigned code point or a noncharacter
Co, // Private_Use, a private-use character
Cs, // Surrogate, a surrogate code point
Ll, // Lowercase_Letter, a lowercase letter
Lm, // Modifier_Letter, a modifier letter
Lo, // Other_Letter, other letters, including syllables and ideographs
Lt, // Titlecase_Letter, a digraph encoded as a single character, with first part uppercase
Lu, // Uppercase_Letter, an uppercase letter
Mc, // Spacing_Mark, a spacing combining mark (positive advance width)
Me, // Enclosing_Mark, an enclosing combining mark
Mn, // Nonspacing_Mark, a nonspacing combining mark (zero advance width)
Nd, // Decimal_Number, a decimal digit
Nl, // Letter_Number, a letterlike numeric character
No, // Other_Number, a numeric character of other type
Pc, // Connector_Punctuation, a connecting punctuation mark, like a tie
Pd, // Dash_Punctuation, a dash or hyphen punctuation mark
Pe, // Close_Punctuation, a closing punctuation mark (of a pair)
Pf, // Final_Punctuation, a final quotation mark
Pi, // Initial_Punctuation, an initial quotation mark
Po, // Other_Punctuation, a punctuation mark of other type
Ps, // Open_Punctuation, an opening punctuation mark (of a pair)
Sc, // Currency_Symbol, a currency sign
Sk, // Modifier_Symbol, a non-letterlike modifier symbol
Sm, // Math_Symbol, a symbol of mathematical use
So, // Other_Symbol, a symbol of other type
Zl, // Line_Separator, U+2028 LINE SEPARATOR only
Zp, // Paragraph_Separator, U+2029 PARAGRAPH SEPARATOR only
Zs, // Space_Separator, a space character (of various non-zero widths)
}
// Identifiers for blocks of code points; Nil (0) is the zero value.
// NOTE(review): the member names look like the short block aliases from the
// UCD (PropertyValueAliases.txt) - confirm. Nothing in the visible code reads
// or produces this type yet.
Block :: enum {
Nil = 0,
Adlam,
Aegean_Numbers,
Ahom,
Alchemical,
Alphabetic_PF,
Anatolian_Hieroglyphs,
Ancient_Greek_Music,
Ancient_Greek_Numbers,
Ancient_Symbols,
Arabic,
Arabic_Ext_A,
Arabic_Ext_B,
Arabic_Ext_C,
Arabic_Math,
Arabic_PF_A,
Arabic_PF_B,
Arabic_Sup,
Armenian,
Arrows,
ASCII,
Avestan,
Balinese,
Bamum,
Bamum_Sup,
Bassa_Vah,
Batak,
Bengali,
Beria_Erfe,
Bhaiksuki,
Block_Elements,
Bopomofo,
Bopomofo_Ext,
Box_Drawing,
Brahmi,
Braille,
Buginese,
Buhid,
Byzantine_Music,
Carian,
Caucasian_Albanian,
Chakma,
Cham,
Cherokee,
Cherokee_Sup,
Chess_Symbols,
Chorasmian,
CJK,
CJK_Compat,
CJK_Compat_Forms,
CJK_Compat_Ideographs,
CJK_Compat_Ideographs_Sup,
CJK_Ext_A,
CJK_Ext_B,
CJK_Ext_C,
CJK_Ext_D,
CJK_Ext_E,
CJK_Ext_F,
CJK_Ext_G,
CJK_Ext_H,
CJK_Ext_I,
CJK_Ext_J,
CJK_Radicals_Sup,
CJK_Strokes,
CJK_Symbols,
Compat_Jamo,
Control_Pictures,
Coptic,
Coptic_Epact_Numbers,
Counting_Rod,
Cuneiform,
Cuneiform_Numbers,
Currency_Symbols,
Cypriot_Syllabary,
Cypro_Minoan,
Cyrillic,
Cyrillic_Ext_A,
Cyrillic_Ext_B,
Cyrillic_Ext_C,
Cyrillic_Ext_D,
Cyrillic_Sup,
Deseret,
Devanagari,
Devanagari_Ext,
Devanagari_Ext_A,
Diacriticals,
Diacriticals_Ext,
Diacriticals_For_Symbols,
Diacriticals_Sup,
Dingbats,
Dives_Akuru,
Dogra,
Domino,
Duployan,
Early_Dynastic_Cuneiform,
Egyptian_Hieroglyph_Format_Controls,
Egyptian_Hieroglyphs,
Egyptian_Hieroglyphs_Ext_A,
Elbasan,
Elymaic,
Emoticons,
Enclosed_Alphanum,
Enclosed_Alphanum_Sup,
Enclosed_CJK,
Enclosed_Ideographic_Sup,
Ethiopic,
Ethiopic_Ext,
Ethiopic_Ext_A,
Ethiopic_Ext_B,
Ethiopic_Sup,
Garay,
Geometric_Shapes,
Geometric_Shapes_Ext,
Georgian,
Georgian_Ext,
Georgian_Sup,
Glagolitic,
Glagolitic_Sup,
Gothic,
Grantha,
Greek,
Greek_Ext,
Gujarati,
Gunjala_Gondi,
Gurmukhi,
Gurung_Khema,
Half_And_Full_Forms,
Half_Marks,
Hangul,
Hanifi_Rohingya,
Hanunoo,
Hatran,
Hebrew,
High_PU_Surrogates,
High_Surrogates,
Hiragana,
IDC,
Ideographic_Symbols,
Imperial_Aramaic,
Indic_Number_Forms,
Indic_Siyaq_Numbers,
Inscriptional_Pahlavi,
Inscriptional_Parthian,
IPA_Ext,
Jamo,
Jamo_Ext_A,
Jamo_Ext_B,
Javanese,
Kaithi,
Kaktovik_Numerals,
Kana_Ext_A,
Kana_Ext_B,
Kana_Sup,
Kanbun,
Kangxi,
Kannada,
Katakana,
Katakana_Ext,
Kawi,
Kayah_Li,
Kharoshthi,
Khitan_Small_Script,
Khmer,
Khmer_Symbols,
Khojki,
Khudawadi,
Kirat_Rai,
Lao,
Latin_1_Sup,
Latin_Ext_A,
Latin_Ext_Additional,
Latin_Ext_B,
Latin_Ext_C,
Latin_Ext_D,
Latin_Ext_E,
Latin_Ext_F,
Latin_Ext_G,
Lepcha,
Letterlike_Symbols,
Limbu,
Linear_A,
Linear_B_Ideograms,
Linear_B_Syllabary,
Lisu,
Lisu_Sup,
Low_Surrogates,
Lycian,
Lydian,
Mahajani,
Mahjong,
Makasar,
Malayalam,
Mandaic,
Manichaean,
Marchen,
Masaram_Gondi,
Math_Alphanum,
Math_Operators,
Mayan_Numerals,
Medefaidrin,
Meetei_Mayek,
Meetei_Mayek_Ext,
Mende_Kikakui,
Meroitic_Cursive,
Meroitic_Hieroglyphs,
Miao,
Misc_Arrows,
Misc_Math_Symbols_A,
Misc_Math_Symbols_B,
Misc_Pictographs,
Misc_Symbols,
Misc_Symbols_Sup,
Misc_Technical,
Modi,
Modifier_Letters,
Modifier_Tone_Letters,
Mongolian,
Mongolian_Sup,
Mro,
Multani,
Music,
Myanmar,
Myanmar_Ext_A,
Myanmar_Ext_B,
Myanmar_Ext_C,
Nabataean,
Nag_Mundari,
Nandinagari,
NB,
New_Tai_Lue,
Newa,
NKo,
Number_Forms,
Nushu,
Nyiakeng_Puachue_Hmong,
OCR,
Ogham,
Ol_Chiki,
Ol_Onal,
Old_Hungarian,
Old_Italic,
Old_North_Arabian,
Old_Permic,
Old_Persian,
Old_Sogdian,
Old_South_Arabian,
Old_Turkic,
Old_Uyghur,
Oriya,
Ornamental_Dingbats,
Osage,
Osmanya,
Ottoman_Siyaq_Numbers,
Pahawh_Hmong,
Palmyrene,
Pau_Cin_Hau,
Phags_Pa,
Phaistos,
Phoenician,
Phonetic_Ext,
Phonetic_Ext_Sup,
Playing_Cards,
Psalter_Pahlavi,
PUA,
Punctuation,
Rejang,
Rumi,
Runic,
Samaritan,
Saurashtra,
Sharada,
Sharada_Sup,
Shavian,
Shorthand_Format_Controls,
Siddham,
Sidetic,
Sinhala,
Sinhala_Archaic_Numbers,
Small_Forms,
Small_Kana_Ext,
Sogdian,
Sora_Sompeng,
Soyombo,
Specials,
Sundanese,
Sundanese_Sup,
Sunuwar,
Sup_Arrows_A,
Sup_Arrows_B,
Sup_Arrows_C,
Sup_Math_Operators,
Sup_PUA_A,
Sup_PUA_B,
Sup_Punctuation,
Sup_Symbols_And_Pictographs,
Super_And_Sub,
Sutton_SignWriting,
Syloti_Nagri,
Symbols_And_Pictographs_Ext_A,
Symbols_For_Legacy_Computing,
Symbols_For_Legacy_Computing_Sup,
Syriac,
Syriac_Sup,
Tagalog,
Tagbanwa,
Tags,
Tai_Le,
Tai_Tham,
Tai_Viet,
Tai_Xuan_Jing,
Tai_Yo,
Takri,
Tamil,
Tamil_Sup,
Tangsa,
Tangut,
Tangut_Components,
Tangut_Components_Sup,
Tangut_Sup,
Telugu,
Thaana,
Thai,
Tibetan,
Tifinagh,
Tirhuta,
Todhri,
Tolong_Siki,
Toto,
Transport_And_Map,
Tulu_Tigalari,
UCAS,
UCAS_Ext,
UCAS_Ext_A,
Ugaritic,
Vai,
Vedic_Ext,
Vertical_Forms,
Vithkuqi,
VS,
VS_Sup,
Wancho,
Warang_Citi,
Yezidi,
Yi_Radicals,
Yi_Syllables,
Yijing,
Zanabazar_Square,
Znamenny_Music,
}
// Numeric combining class, stored in one byte (used for the `ccc` field of
// Char and Char_Range).
Combining_Class :: distinct byte
// Bracket kind returned by `string_to_paired_bracket_type`:
// "o" -> Open, "c" -> Close, "n" -> None. Nil is the zero value and is not the
// same as None.
Paired_Brack_Type :: enum {
Nil,
Open,
Close,
None,
}
// Bidirectional class of a code point, named by its abbreviation.
// `string_to_bidi_class` converts the abbreviation into a member; Nil is the
// zero value and has no abbreviation.
Bidi_Class :: enum {
Nil, // Zero value, no class set
L, // Left-to-Right LRM
R, // Right-to-Left RLM
AL, // Right-to-Left Arabic ALM
EN, // European Number
ES, // European Number Separator
ET, // European Number Terminator
AN, // Arabic Number
CS, // Common Number Separator
NSM, // Nonspacing Mark
BN, // Boundary Neutral
B, // Paragraph Separator
S, // Segment Separator
WS, // Whitespace
ON, // Other Neutrals
LRE, // Left-to-Right Embedding LRE
LRO, // Left-to-Right Override LRO
RLE, // Right-to-Left Embedding RLE
RLO, // Right-to-Left Override RLO
PDF, // Pop Directional Format PDF
LRI, // Left-to-Right Isolate LRI
RLI, // Right-to-Left Isolate RLI
FSI, // First Strong Isolate FSI
PDI, // Pop Directional Isolate PDI
}
// Bidirectional properties of one code point, grouped in a single record.
Bidi :: struct {
bc: Bidi_Class, // bidi class
bmg: Maybe(rune), // mirrored glyph
m: bool, // Bidi mirrored
c: bool, // Bidi control property
bpt : Paired_Brack_Type, // bidi paired bracket type
bpb : rune, // bidi paired bracket properties
}
// Kind of decomposition; Nil (0) is the zero value.
// NOTE(review): the member names look like the UCD short aliases for
// Decomposition_Type (can = canonical, com = compat, ...) - confirm.
Decomposition_Type :: enum {
Nil = 0,
can,
com,
enc,
fin,
font,
fra,
init,
iso,
med,
nar,
nb,
sml,
sqr,
sub,
sup,
vert,
wid,
none,
}
// Three-state boolean: False is the zero value, Maybe is -1 and True is 1.
Trinary_Bool :: enum {
Maybe = -1,
False = 0,
True = 1,
}
// Sequence of code points a character decomposes into. It is a dynamic array,
// so a non-empty value owns memory.
Decomposition_Mapping :: distinct [dynamic]rune
// Decomposition and normalization properties of one code point.
Decomposition :: struct {
dt: Decomposition_Type, // Decomposition type
dm: Decomposition_Mapping, // Decomposition Mapping
ce: bool, // Composition Exclusion
comp_ex: bool, // Full Composition Exclusion
nfc_quick_check: Trinary_Bool,
nfd_quick_check: bool,
nfkc_quick_check: Trinary_Bool,
nfkd_quick_check: bool,
}
// Numeric type of a code point. `load_unicode_data` derives it from fields
// 6, 7 and 8 of a UnicodeData.txt line: Decimal when all three are non-empty,
// Digit when only 7 and 8 are, Numeric when only 8 is, None otherwise.
Numeric_Type :: enum {
None = 0, // None
Decimal, // De
Digit, // Di
Numeric, // Nu
}
/*
A numeric value stored as a fraction.
Note: Value is NaN when numerator and denominator are 0
*/
Numberic_Value :: struct {
numerator: int,
denominator: int,
}
// Properties of a single code point.
// `load_unicode_data` currently fills in only `cp`, `name`, `gc` and `nt`; the
// remaining fields keep their zero values.
Char :: struct {
cp: rune, // The code point
name: string, // Name; allocated by `load_unicode_data`, freed by `destroy_unicode_data`
gc: General_Category,
ccc: Combining_Class,
bc: Bidi_Class,
dt: Decomposition_Type,
dm: Decomposition_Mapping,
nt: Numeric_Type,
nv: Numberic_Value,
bm: bool,
name1: string,
sum: string, // Simple uppercase mapping
slm: string, // Simple lowercase mapping
stm: string, // Simple titlecase_mapping
}
// Properties shared by every code point in the inclusive range
// `first_cp`..`last_cp`. `load_unicode_data` creates one from a pair of
// "<..., First>" / "<..., Last>" lines and currently fills in only `first_cp`,
// `last_cp`, `name`, `gc` and `nt`.
Char_Range :: struct {
first_cp: rune, // First code point of the range
last_cp: rune, // Last code point of the range (inclusive)
name: string, // Range name without the surrounding "<" and ", Last>"
gc: General_Category,
ccc: Combining_Class,
bc: Bidi_Class,
dt: Decomposition_Type,
dm: Decomposition_Mapping,
nt: Numeric_Type,
nv: Numberic_Value,
bm: bool,
name1: string,
sum: string, // Simple uppercase mapping
slm: string, // Simple lowercase mapping
stm: string, // Simple titlecase_mapping
}
// One entry of the loaded data: a single code point or a range of them.
Chars :: union {
Char,
Char_Range,
}
// The loaded contents of UnicodeData.txt in file order. Created by
// `load_unicode_data` and released with `destroy_unicode_data`.
Unicode_Data :: distinct [dynamic]Chars
// Properties recognised in PropList.txt. The member names are spelled exactly
// like the property names that `string_to_proplist_property` accepts.
// There is no explicit "nil" member: the zero value is White_Space.
PropList_Property :: enum {
White_Space,
Bidi_Control,
Join_Control,
Dash,
Hyphen,
Quotation_Mark,
Terminal_Punctuation,
Other_Math,
Hex_Digit,
ASCII_Hex_Digit,
Other_Alphabetic,
Ideographic,
Diacritic,
Extender,
Other_Lowercase,
Other_Uppercase,
Noncharacter_Code_Point,
Other_Grapheme_Extend,
IDS_Binary_Operator,
IDS_Trinary_Operator,
IDS_Unary_Operator,
Radical,
Unified_Ideograph,
Other_Default_Ignorable_Code_Point,
Deprecated,
Soft_Dotted,
Logical_Order_Exception,
Other_ID_Start,
Other_ID_Continue,
ID_Compat_Math_Continue,
ID_Compat_Math_Start,
Sentence_Terminal,
Variation_Selector,
Pattern_White_Space,
Pattern_Syntax,
Prepended_Concatenation_Mark,
Regional_Indicator,
Modifier_Combining_Mark,
}
/*
Errors specific to reading the Unicode Character Database.

`None` is the zero value and means "no error". It must stay first: this enum
is a variant of `Error`, a `#shared_nil` union, in which the zero value of
every variant is treated as nil. Without a dedicated zero member the first real
error would compare equal to nil and could never be reported.
*/
UCD_Error :: enum {
	None = 0,
	XML_LOAD_ERROR,
	XML_Not_UCD,
	Nil_XML_Document,
	Element_Not_Repertoire,
	Extra_Fields,
	Unknown_Property,
	NO_REPERTOIRE,
	UNEXPECTED_STRING,
	Invalid_Hex_Number,
	Invalid_General_Category,
	UnicodeData_6_Too_Long,
	UnicodeData_6_Invalid,
	UnicodeData_7_Too_Long,
	UnicodeData_7_Invalid,
}
// Error type returned by the loaders in this package: either a UCD_Error or an
// os.Error. Because of `#shared_nil`, a variant holding its zero value makes
// the whole union compare equal to nil.
Error :: union #shared_nil {
UCD_Error,
os.Error,
}
// Range of code points that both fit in 16 bits.
Range_u16 :: struct {
first: u16,
last: u16,
}
// Range of code points stored as 32-bit values.
Range_i32 :: struct {
first: i32,
last: i32,
}
// Range of code points as runes; the input type of `append_to_dynamic_range`,
// which treats `first == last` as a single code point.
Range_Rune :: struct {
first: rune,
last: rune,
}
// Growable set of code points, split four ways by `append_to_dynamic_range`:
// single code points and ranges are kept apart, and each is stored in the
// 16-bit member when every value is <= 0xFFFF and in the 32-bit member
// otherwise. Release with `destroy_dynamic_range`.
Dynamic_Range :: struct {
single_16 : [dynamic]u16,
ranges_16 : [dynamic]Range_u16,
single_32 : [dynamic]i32,
ranges_32 : [dynamic]Range_i32,
}
/*
Adds `range` to the member of `dr` that matches its shape:

- `first == last` and `first <= 0xFFFF`: appended to `single_16`
- `first == last` otherwise: appended to `single_32`
- both ends `<= 0xFFFF`: appended to `ranges_16`
- anything else: appended to `ranges_32`

A member that is still empty is created with `allocator` before the first
element is appended to it.

Inputs:
- dr: The Dynamic_Range to add to.
- range: The code point range to add.
- allocator: Allocator used when a member has to be created.
*/
append_to_dynamic_range :: proc(
	dr: ^Dynamic_Range,
	range: Range_Rune,
	allocator := context.allocator,
) {
	MAX_16 :: 0xFFFF
	is_single := range.first == range.last

	switch {
	case is_single && range.first <= MAX_16:
		if len(dr.single_16) == 0 {
			dr.single_16 = make([dynamic]u16, 0, 512, allocator)
		}
		append(&dr.single_16, cast(u16)range.first)

	case is_single:
		if len(dr.single_32) == 0 {
			dr.single_32 = make([dynamic]i32, 0, 512, allocator)
		}
		append(&dr.single_32, cast(i32)range.first)

	case range.first <= MAX_16 && range.last <= MAX_16:
		if len(dr.ranges_16) == 0 {
			dr.ranges_16 = make([dynamic]Range_u16, 0, 128, allocator)
		}
		append(&dr.ranges_16, Range_u16{cast(u16)range.first, cast(u16)range.last})

	case:
		if len(dr.ranges_32) == 0 {
			dr.ranges_32 = make([dynamic]Range_i32, 0, 128, allocator)
		}
		append(&dr.ranges_32, Range_i32{cast(i32)range.first, cast(i32)range.last})
	}
}
/*
Releases the four dynamic arrays held by a Dynamic_Range.

Inputs:
- dr: The Dynamic_Range whose arrays are deleted.
*/
destroy_dynamic_range :: proc (
	dr: Dynamic_Range,
){
	// Singles first, then ranges; the four arrays are independent.
	delete(dr.single_16)
	delete(dr.single_32)
	delete(dr.ranges_16)
	delete(dr.ranges_32)
}
/*
Releases every Dynamic_Range of a per-category table such as the one returned
by `gc_ranges`.

Inputs:
- gcr: The table to release.
*/
destroy_general_category_ranges :: proc(
	gcr: [General_Category]Dynamic_Range,
){
	for category in General_Category {
		destroy_dynamic_range(gcr[category])
	}
}

View File

@@ -0,0 +1,307 @@
package ucd
import "core:strings"
import "core:os"
/*
Loads a file in the UnicodeData.txt format.

Every line yields one `Char`. A pair of "<Name, First>" / "<Name, Last>" lines
yields one `Char_Range` named "Name". Only the code point(s), name, general
category and numeric type are filled in; the other fields are parsed past and
left at their zero values.

Every allocation, including the file buffer that is released before returning,
is made with `allocator`. The shared temporary allocator is left untouched.

Inputs:
- filename: Path of the file to read.
- allocator: Allocator for the returned array and for the names.

Returns:
- unicode_data: The entries in file order. Release with `destroy_unicode_data`,
  also when `err` is set, because entries parsed before the failure are kept.
- err: An os.Error when the file cannot be read,
  UCD_Error.Invalid_General_Category for an unknown category, or
  UCD_Error.Extra_Fields for a line with more than 15 fields.
*/
load_unicode_data :: proc(
	filename: string,
	allocator := context.allocator,
) -> (unicode_data : Unicode_Data, err: Error) {
	data, os_error := os.read_entire_file(filename, allocator)
	if os_error != nil {
		err = os_error
		return
	}
	defer delete(data, allocator)

	// Bind the result to the requested allocator before the first append.
	unicode_data = make(Unicode_Data, allocator)

	line_iter := Line_Iterator{data = data}
	first_cp: rune // start of a range, remembered from its "<..., First>" line

	line_loop: for line in line_iterator(&line_iter) {
		// Skip empty lines
		if len(line) == 0 do continue

		field_iter := Field_Iterator{line = line}
		is_range := false
		cp: rune
		name: string
		gc: General_Category
		num_6: string // field 6, still pointing into `data`
		num_7: string // field 7, still pointing into `data`
		nt := Numeric_Type.None

		for field, field_num in field_iterator(&field_iter) {
			text := transmute(string)field
			switch field_num {
			case 0: // Code point: upper-case hex, parsing stops at the first other byte
				cp = 0
				for c in field {
					is_digit := c >= '0' && c <= '9'
					is_hex_letter := c >= 'A' && c <= 'F'
					if !is_digit && !is_hex_letter do break
					cp *= 16
					if is_digit {
						cp += cast(rune)(c - '0')
					} else {
						cp += cast(rune)(c - 'A' + 10)
					}
				}
			case 1: // Name
				bracketed := len(field) > 9 && field[0] == '<'
				if bracketed && strings.ends_with(text, ", First>") {
					// The entry is emitted when the matching "Last" line arrives.
					first_cp = cp
					continue line_loop
				}
				if bracketed && strings.ends_with(text, ", Last>") {
					// Strip the leading "<" and the trailing ", Last>" (7 bytes).
					name = strings.clone_from_bytes(field[1:len(field)-7], allocator)
					is_range = true
				} else {
					name = strings.clone_from_bytes(field[:], allocator)
				}
			case 2: // General_Category
				gc_err: Error
				gc, gc_err = string_to_general_category(text)
				if gc_err != nil {
					// This entry is never appended, so its name must be freed here.
					delete(name, allocator)
					err = gc_err
					return
				}
			case 3: // Canonical_Combining_Class
			case 4: // Bidi Class
			case 5: // Decomposition_Type and Decomposition_Mapping
			case 6: // Numeric_Type and Numberic_Value are spread over fields 6-8
				num_6 = text
			case 7:
				num_7 = text
			case 8:
				switch {
				case num_6 != "" && num_7 != "" && text != "":
					nt = .Decimal
				case num_6 == "" && num_7 != "" && text != "":
					nt = .Digit
				case num_6 == "" && num_7 == "" && text != "":
					nt = .Numeric
				case:
					nt = .None
				}
			case 9: // Bidi mirrored
			case 10: // Unicode 1 Name (Obsolete as of 6.2.0)
			case 11: // should be null
			case 12:
			case 13:
			case 14:
			case:
				// More fields than the format defines: report instead of aborting.
				delete(name, allocator)
				err = UCD_Error.Extra_Fields
				return
			}
		}

		if is_range {
			cr: Char_Range
			cr.gc = gc
			cr.first_cp = first_cp
			cr.last_cp = cp
			cr.name = name
			cr.nt = nt
			append(&unicode_data, cr)
		} else {
			c: Char
			c.gc = gc
			c.cp = cp
			c.name = name
			c.nt = nt
			append(&unicode_data, c)
		}
	}
	return
}
/*
Releases a Unicode_Data created by `load_unicode_data`.

Inputs:
- unicode_data: The data to release.
- allocator: The allocator that was passed to `load_unicode_data`; the entry
  names were allocated with it. Defaults to `context.allocator`, which matches
  the default of `load_unicode_data`.
*/
destroy_unicode_data :: proc(unicode_data: Unicode_Data, allocator := context.allocator) {
	for point in unicode_data {
		switch p in point {
		case Char:
			delete(p.name, allocator)
		case Char_Range:
			delete(p.name, allocator)
		}
	}
	// The array itself remembers the allocator it was created with.
	delete(unicode_data)
}
/*
Groups the entries of `ud` into per-category code point ranges.

Consecutive `Char` entries form one range as long as each code point is
exactly one above the previous one and the general category stays the same. A
`Char_Range` entry is added as its own range and ends any run in progress.

Inputs:
- ud: The loaded Unicode data, walked in its stored order.
- allocator: Allocator used for the arrays of the result.

Returns:
- lst: One Dynamic_Range per General_Category. Release with
  `destroy_general_category_ranges`.
*/
gc_ranges :: proc(ud: ^Unicode_Data, allocator := context.allocator) -> (lst: [General_Category]Dynamic_Range) {
	UNSET :: -1 // marks "no run in progress"

	run := Range_Rune{first = UNSET, last = UNSET}
	run_gc: General_Category

	for entry in ud {
		switch e in entry {
		case Char:
			// Flush the current run when this code point cannot extend it.
			if run.first != UNSET && (e.cp != run.last + 1 || e.gc != run_gc) {
				append_to_dynamic_range(&lst[run_gc], run, allocator)
				run = Range_Rune{first = UNSET, last = UNSET}
			}
			// As u32 the UNSET marker is the largest value, so `min` picks
			// `e.cp` for a fresh run and keeps the existing start otherwise.
			run.first = transmute(rune) min(transmute(u32)run.first, transmute(u32)e.cp)
			run.last = e.cp
			run_gc = e.gc

		case Char_Range:
			if run.first != UNSET {
				append_to_dynamic_range(&lst[run_gc], run, allocator)
			}
			whole := Range_Rune{first = e.first_cp, last = e.last_cp}
			append_to_dynamic_range(&lst[e.gc], whole, allocator)
			run = Range_Rune{first = UNSET, last = UNSET}
		}
	}

	// Flush the run that was still open when the data ended.
	if run.first != UNSET {
		append_to_dynamic_range(&lst[run_gc], run, allocator)
	}
	return
}
/*
Collects the code points whose numeric type is Decimal or Digit but whose
general category is not Nd.

Consecutive qualifying `Char` entries with adjacent code points are merged
into one range. A qualifying `Char_Range` entry is added as its own range; any
`Char_Range` ends a run in progress.

Inputs:
- ud: The loaded Unicode data, walked in its stored order.
- allocator: Allocator used for the arrays of the result.

Returns:
- The collected code points. Release with `destroy_dynamic_range`.
*/
extra_digits :: proc(ud: ^Unicode_Data, allocator := context.allocator) -> (Dynamic_Range) {
	UNSET :: -1 // marks "no run in progress"

	run := Range_Rune{first = UNSET, last = UNSET}
	result: Dynamic_Range

	for entry in ud {
		switch e in entry {
		case Char:
			qualifies := e.gc != .Nd && (e.nt == .Decimal || e.nt == .Digit)
			// Flush the current run when this code point cannot extend it.
			if run.first != UNSET && (e.cp != run.last + 1 || !qualifies) {
				append_to_dynamic_range(&result, run, allocator)
				run = Range_Rune{first = UNSET, last = UNSET}
			}
			if qualifies {
				// As u32 the UNSET marker is the largest value, so `min` picks
				// `e.cp` for a fresh run and keeps the existing start otherwise.
				run.first = transmute(rune) min(transmute(u32)run.first, transmute(u32)e.cp)
				run.last = e.cp
			}

		case Char_Range:
			qualifies := e.gc != .Nd && (e.nt == .Decimal || e.nt == .Digit)
			if run.first != UNSET {
				append_to_dynamic_range(&result, run, allocator)
			}
			if qualifies {
				whole := Range_Rune{first = e.first_cp, last = e.last_cp}
				append_to_dynamic_range(&result, whole, allocator)
			}
			run = Range_Rune{first = UNSET, last = UNSET}
		}
	}

	// Flush the run that was still open when the data ended.
	if run.first != UNSET {
		append_to_dynamic_range(&result, run, allocator)
	}
	return result
}
/*
Data contained in the Unicode file PropList.txt.
A `PropList` holds the data contained in the Unicode Database (UCD) file
PropList.txt: one Dynamic_Range of code points per PropList_Property. It is
created with the procedure `load_protperty_list` and destroyed with the
procedure `destroy_protperty_list`.
*/
PropList ::[PropList_Property]Dynamic_Range
/*
Releases a property table created by `load_protperty_list`.

Inputs:
- props: The property table to destroy
*/
destroy_protperty_list :: proc(
	props: [PropList_Property]Dynamic_Range,
){
	for ranges in props {
		destroy_dynamic_range(ranges)
	}
}
/*
Loads a file in the PropList.txt format.

Every data line has the form `CP ; Property` or `FIRST..LAST ; Property`,
where the code points are upper-case hexadecimal. A single code point is
stored as a range whose first and last value are equal, so that
`append_to_dynamic_range` files it under the single-value arrays.

Every allocation, including the file buffer that is released before returning,
is made with `allocator`.

Inputs:
- filename: Path of the file to read.
- allocator: Allocator for the arrays of the result.

Returns:
- props: One Dynamic_Range per property. Release with
  `destroy_protperty_list`, also when `err` is set, because lines parsed
  before the failure are kept.
- err: An os.Error when the file cannot be read,
  UCD_Error.Invalid_Hex_Number for a malformed code point field,
  UCD_Error.Unknown_Property for an unknown property name, or
  UCD_Error.Extra_Fields for a line with more than two fields.
*/
load_protperty_list :: proc (
	filename : string,
	allocator := context.allocator,
) -> (props: [PropList_Property]Dynamic_Range, err: Error) {
	data, os_error := os.read_entire_file(filename, allocator)
	if os_error != nil {
		err = os_error
		return
	}
	// Free with the allocator the buffer was read with.
	defer delete(data, allocator)

	line_iter := Line_Iterator{data = data}
	for line in line_iterator(&line_iter) {
		if len(line) == 0 do continue

		field_iter := Field_Iterator{line = line}
		is_range: bool // true once the ".." separator has been seen
		rr: Range_Rune
		prop: PropList_Property

		for field, i in field_iterator(&field_iter) {
			switch i {
			case 0: // Code point or code point range
				for c in field {
					digit: rune
					switch {
					case c >= '0' && c <= '9':
						digit = cast(rune)(c - '0')
					case c >= 'A' && c <= 'F':
						digit = cast(rune)(c - 'A' + 10)
					case c == '.':
						if !is_range {
							// The digits that follow belong to the end of the
							// range, which has to start again from zero.
							is_range = true
							rr.last = 0
						}
						continue
					case:
						err = UCD_Error.Invalid_Hex_Number
						return
					}

					if is_range {
						rr.last = rr.last*16 + digit
					} else {
						// Until a ".." shows up this is a single code point,
						// so keep both ends equal.
						rr.first = rr.first*16 + digit
						rr.last = rr.first
					}
				}
			case 1: // Property name
				prop, err = string_to_proplist_property(transmute(string)field)
				if err != nil {
					return
				}
			case:
				err = UCD_Error.Extra_Fields
				return
			}
		}
		append_to_dynamic_range(&props[prop], rr, allocator)
	}
	return
}