mirror of
https://github.com/odin-lang/Odin.git
synced 2026-05-03 03:24:41 +00:00
Replace iterators; fixes line ending handling
This commit is contained in:
325
core/unicode/tools/ucd/generate_unicode.odin
Normal file
325
core/unicode/tools/ucd/generate_unicode.odin
Normal file
@@ -0,0 +1,325 @@
|
||||
package ucd
|
||||
|
||||
import "core:fmt"
|
||||
import path "core:path/filepath"
|
||||
import "core:os"
|
||||
import "core:strings"
|
||||
import "base:runtime"
|
||||
import "core:mem"
|
||||
import "core:io"
|
||||
import "core:log"
|
||||
|
||||
// Table 2-3. Types of Code Points
|
||||
// Table 4-4. General_Category Values page 229
|
||||
|
||||
// Reference https://www.unicode.org/reports/tr44/
|
||||
|
||||
|
||||
/*
Writes the non-empty parts of a Dynamic_Range to an io.Writer as `@(rodata)`
fixed-length array declarations. The value of `name` is used as the prefix of
every emitted array:

	<name>_singles16 := [?]u16{ ... }   from range.single_16
	<name>_ranges16  := [?]u16{ ... }   from range.ranges_16 (first, last per line)
	<name>_singles32 := [?]i32{ ... }   from range.single_32
	<name>_ranges32  := [?]i32{ ... }   from range.ranges_32 (first, last per line)

A part of `range` that is empty produces no output at all.

Inputs:
- writer: The io.Writer to be written to.
- name: Prefix to add to any array that is written to `writer`.
- range: The Dynamic_Range to format and write to `writer`.

Returns:
- The sum of the counts reported by the individual write calls.
*/
write_range_arrays :: proc(
	writer: io.Writer,
	name: string,
	range : Dynamic_Range,
) -> int {
	total := 0

	if len(range.single_16) != 0 {
		total += fmt.wprintln(writer, "@(rodata)")
		total += fmt.wprintf(writer, "%s_singles16 := [?]u16{{", name)

		// Start beyond the limit so that the very first value opens a new line.
		column := 100
		for value in range.single_16 {
			scratch: [32]byte
			item := fmt.bprintf(scratch[:], " 0x%4X,", value)

			// The value still fits on the current line: append it and move on.
			if column + len(item) <= 80 {
				written, _ := io.write_string(writer, item)
				column += written
				total += written
				continue
			}

			// Otherwise wrap and restart the column count from the tab-indented value.
			total += fmt.wprint(writer, "\n")
			column = fmt.wprintf(writer, "\t0x%4X,", value)
			total += column
		}
		total += fmt.wprintln(writer, "\n}\n")
	}

	if len(range.ranges_16) != 0 {
		total += fmt.wprintln(writer, "@(rodata)")
		total += fmt.wprintfln(writer, "%s_ranges16 := [?]u16{{", name)
		for pair in range.ranges_16 {
			total += fmt.wprintfln(writer, "\t0x%4X, 0x%4X,", pair.first, pair.last)
		}
		total += fmt.wprintln(writer, "}\n")
	}

	if len(range.single_32) != 0 {
		total += fmt.wprintln(writer, "@(rodata)")
		total += fmt.wprintf(writer, "%s_singles32 := [?]i32{{", name)

		// Same wrapping scheme as for the 16 bit singles above.
		column := 100
		for value in range.single_32 {
			scratch: [32]byte
			item := fmt.bprintf(scratch[:], " 0x%4X,", value)

			if column + len(item) <= 80 {
				written, _ := io.write_string(writer, item)
				column += written
				total += written
				continue
			}

			total += fmt.wprint(writer, "\n")
			column = fmt.wprintf(writer, "\t0x%4X,", value)
			total += column
		}
		total += fmt.wprintln(writer, "\n}\n")
	}

	if len(range.ranges_32) != 0 {
		total += fmt.wprintln(writer, "@(rodata)")
		total += fmt.wprintfln(writer, "%s_ranges32 := [?]i32{{", name)
		for pair in range.ranges_32 {
			total += fmt.wprintfln(writer, "\t0x%4X, 0x%4X,", pair.first, pair.last)
		}
		total += fmt.wprintln(writer, "}\n")
	}

	return total
}
|
||||
|
||||
/*
Writes a complete `Range` value for `range` to `writer`: first the backing
arrays (via write_range_arrays) and then a `<name>_ranges := Range{ ... }`
literal whose fields slice those arrays. Fields whose source data is empty
are left out of the literal.

Inputs:
- writer: The io.Writer to be written to.
- name: Either a string or a General_Category. Its text, with ASCII upper
  case letters converted to lower case, is used as the identifier prefix.
- range: The Dynamic_Range to write.

Returns:
- n_written: The sum of the counts reported by the individual write calls.
*/
write_range :: proc(
	writer: io.Writer,
	name: union{string,
	General_Category},
	range: Dynamic_Range,
) -> (n_written: int) {
	scratch: [128]byte
	prefix: string

	// Get the textual form of `name` into `scratch` so it can be modified in place.
	switch value in name {
	case string:
		assert(len(value) <= len(scratch))
		runtime.mem_copy(&scratch[0], raw_data(value), len(value))
		prefix = string(scratch[:len(value)])

	case General_Category:
		prefix = fmt.bprintf(scratch[:], "%s", value)
	}

	// ASCII lower-casing of the prefix, done directly on the backing bytes.
	for i in 0..<len(prefix) {
		switch scratch[i] {
		case 'A'..='Z':
			scratch[i] += ('a' - 'A')
		}
	}

	n_written = write_range_arrays(writer, prefix, range)

	has_single_16 := len(range.single_16) > 0
	has_ranges_16 := len(range.ranges_16) > 0
	has_single_32 := len(range.single_32) > 0
	has_ranges_32 := len(range.ranges_32) > 0

	n_written += fmt.wprintfln(writer, "%s_ranges := Range{{", prefix)
	if has_single_16 {
		n_written += fmt.wprintfln(writer, "\tsingle_16 = %s_singles16[:],", prefix)
	}
	if has_ranges_16 {
		n_written += fmt.wprintfln(writer, "\tranges_16 = %s_ranges16[:],", prefix)
	}
	if has_single_32 {
		n_written += fmt.wprintfln(writer, "\tsingle_32 = %s_singles32[:],", prefix)
	}
	if has_ranges_32 {
		n_written += fmt.wprintfln(writer, "\tranges_32 = %s_ranges32[:],", prefix)
	}
	n_written += fmt.wprintln(writer, "}\n")

	return
}
|
||||
|
||||
// Banner comment that `main` writes near the top of the generated file to
// mark it as machine generated.
GENERATED :: `/*
|
||||
------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
|
||||
*/
|
||||
`
|
||||
|
||||
// Header comment that `main` writes into the generated file after GENERATED.
// It names the UCD input files and reproduces the Unicode License V3 text.
MESSAGE :: `/*
|
||||
This file is generated from UnicodeData.txt and PropList.txt. These files
|
||||
are part of the Unicode Database (UCD) and are covered by the license
|
||||
listed further down. They may be downloaded from the following locations;
|
||||
|
||||
https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
|
||||
https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
|
||||
https://www.unicode.org/license.txt
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
UNICODE LICENSE V3
|
||||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright © 1991-2026 Unicode, Inc.
|
||||
|
||||
NOTICE TO USER: Carefully read the following legal agreement. BY
|
||||
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
|
||||
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
||||
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
|
||||
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of data files and any associated documentation (the "Data Files") or
|
||||
software and any associated documentation (the "Software") to deal in the
|
||||
Data Files or Software without restriction, including without limitation
|
||||
the rights to use, copy, modify, merge, publish, distribute, and/or sell
|
||||
copies of the Data Files or Software, and to permit persons to whom the
|
||||
Data Files or Software are furnished to do so, provided that either (a)
|
||||
this copyright and permission notice appear with all copies of the Data
|
||||
Files or Software, or (b) this copyright and permission notice appear in
|
||||
associated Documentation.
|
||||
|
||||
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
||||
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
|
||||
THIRD PARTY RIGHTS.
|
||||
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
|
||||
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
|
||||
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
|
||||
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
|
||||
FILES OR SOFTWARE.
|
||||
|
||||
Except as contained in this notice, the name of a copyright holder shall
|
||||
not be used in advertising or otherwise to promote the sale, use or other
|
||||
dealings in these Data Files or Software without prior written
|
||||
authorization of the copyright holder.
|
||||
|
||||
*/
|
||||
`
|
||||
|
||||
/*
Entry point of the generator.

Loads `UnicodeData.txt` and `PropList.txt` from
`ODIN_ROOT/tests/core/assets/UCD`, builds the range tables for the general
categories that are not excluded below, the extra digits and the
Other_Lowercase / Other_Uppercase properties, and writes the resulting Odin
source to `ODIN_ROOT/core/unicode/generated.odin`.

If either input file fails to load, the error is logged and nothing is
written. All allocations go through a tracking allocator and any that are
still live on exit are reported on stderr.
*/
main :: proc() {
	track: mem.Tracking_Allocator

	mem.tracking_allocator_init(&track, context.allocator)
	defer {
		if len(track.allocation_map) > 0 {
			fmt.eprintf("=== %v allocations not freed: ===\n", len(track.allocation_map))
			for _, entry in track.allocation_map {
				fmt.eprintf("- %v bytes @ %v\n", entry.size, entry.location)
			}
		}
		mem.tracking_allocator_destroy(&track)
	}

	context.allocator = mem.tracking_allocator(&track)

	context.logger = log.create_console_logger()
	defer log.destroy_console_logger(context.logger)

	ucd_path, _ := path.join({ODIN_ROOT,
		"tests","core","assets","UCD","UnicodeData.txt"}, context.allocator)
	defer delete(ucd_path)

	unicode_data, ucd_err := load_unicode_data(ucd_path)
	// Registered before the error check so that whatever was returned is
	// still released when bailing out below.
	defer destroy_unicode_data(unicode_data)
	if ucd_err != nil {
		log.errorf("Error loading Unicode data. %s", ucd_err)
		// Without the Unicode data there is nothing meaningful to generate;
		// stop here instead of overwriting the output with tables built from it.
		return
	}

	general_category_ranges := gc_ranges(&unicode_data)
	defer destroy_general_category_ranges(general_category_ranges)

	extra_digit_ranges := extra_digits(&unicode_data)
	defer destroy_dynamic_range(extra_digit_ranges)

	proplist_path, _ := path.join({ODIN_ROOT,
		"tests","core","assets","UCD","PropList.txt"}, context.allocator)
	defer delete(proplist_path)

	proplist, proplist_err := load_protperty_list(proplist_path)
	if proplist_err != nil {
		log.errorf("Error loading PropList.txt. %s", proplist_err)
		return
	}
	defer destroy_protperty_list(proplist)

	// The whole output is assembled in memory and written to disk in one go.
	sb := strings.builder_make_len_cap(0, 1024*32)
	defer strings.builder_destroy(&sb)

	writer := strings.to_writer(&sb)

	fmt.wprintfln(writer, "package unicode\n")
	fmt.wprintln(writer, GENERATED)
	fmt.wprintln(writer, MESSAGE)

	// Declaration of the `Range` type used by every `<name>_ranges` value
	// emitted by write_range.
	Range_Type :: "Range :: struct {\n" +
		"\tsingle_16 : []u16,\n" +
		"\tranges_16 : []u16,\n" +
		"\tsingle_32 : []i32,\n" +
		"\tranges_32 : []i32,\n" +
		"}\n"

	fmt.wprintfln(writer, "%s", Range_Type)

	// List of the general categories to skip when generating the code for
	// core/unicode/generated.odin.
	to_exclude := [?]General_Category{
		.Cc, // Control, a C0 or C1 control code
		.Cf, // Format, a format control character
		.Cn, // Unassigned, a reserved unassigned code point or a noncharacter
		.Co, // Private_Use, a private-use character
		.Cs, // Surrogate, a surrogate code point
		// .Ll, // Lowercase_Letter, a lowercase letter
		// .Lm, // Modifier_Letter, a modifier letter
		// .Lo, // Other_Letter, other letters, including syllables and ideographs
		// .Lt, // Titlecase_Letter, a digraph encoded as a single character, with first part uppercase
		// .Lu, // Uppercase_Letter, an uppercase letter
		// .Mc, // Spacing_Mark, a spacing combining mark (positive advance width)
		// .Me, // Enclosing_Mark, an enclosing combining mark
		// .Mn, // Nonspacing_Mark, a nonspacing combining mark (zero advance width)
		//.Nd, // Decimal_Number, a decimal digit
		//.Nl, // Letter_Number, a letterlike numeric character
		//.No, // Other_Number, a numeric character of other type
		// .Pc, // Connector_Punctuation, a connecting punctuation mark, like a tie
		// .Pd, // Dash_Punctuation, a dash or hyphen punctuation mark
		// .Pe, // Close_Punctuation, a closing punctuation mark (of a pair)
		// .Pf, // Final_Punctuation, a final quotation mark
		// .Pi, // Initial_Punctuation, an initial quotation mark
		// .Po, // Other_Punctuation, a punctuation mark of other type
		// .Ps, // Open_Punctuation, an opening punctuation mark (of a pair)
		// .Sc, // Currency_Symbol, a currency sign
		// .Sk, // Modifier_Symbol, a non-letterlike modifier symbol
		// .Sm, // Math_Symbol, a symbol of mathematical use
		// .So, // Other_Symbol, a symbol of other type
		.Zl, // Line_Separator, U+2028 LINE SEPARATOR only
		.Zp, // Paragraph_Separator, U+2029 PARAGRAPH SEPARATOR only
		//.Zs, // Space_Separator, a space character (of various non-zero widths)
	}

	write_loop: for gc, i in general_category_ranges {
		for excluded in to_exclude {
			if i == excluded {
				continue write_loop
			}
		}
		write_range(writer, i, gc)
	}

	write_range(writer, "extra_digits", extra_digit_ranges)

	write_range(writer, "other_lowercase", proplist[.Other_Lowercase])
	write_range(writer, "other_uppercase", proplist[.Other_Uppercase])

	file_name, _ := path.join({ODIN_ROOT, "core", "unicode", "generated.odin"}, context.allocator)
	defer delete(file_name)

	str := strings.to_string(sb)

	write_error := os.write_entire_file_from_string(file_name, str)
	if write_error != nil {
		log.errorf("Error writting %s. %s", file_name, write_error)
	}
}
|
||||
|
||||
@@ -1,70 +0,0 @@
|
||||
package ucd
|
||||
|
||||
/*
An iterator that allows simple iterating over the lines of a slice of bytes,
[]byte, without allocating.

Lines are separated by '\n'; a final line without a terminating '\n' is
returned as well. Everything from a '#' up to the end of the line is treated
as a comment and is not part of the returned line. A '\r' directly in front
of the line break (CRLF input) is removed from the returned line.
*/
Line_Iterator :: struct {
	index:        int,    // current location in data
	data:         []byte, // Data over which to iterate
	line_counter: int,    // line number storage
}

/*
Returns the next line of `it.data`.

Inputs:
- it: The iterator state; `index` and `line_counter` are advanced.

Returns:
- line: Sub-slice of `it.data` holding the line without comment and line ending.
- line_number: 1-based number of the returned line.
- more: false once all data has been consumed; `line` and `line_number` are
  then left at their zero values.
*/
line_iterator :: proc(it: ^Line_Iterator) -> (line: []byte, line_number: int, more: bool) {
	more = it.index < len(it.data)
	if !more {
		return
	}
	it.line_counter += 1
	line_number = it.line_counter

	// The payload of the line ends at the line break or at the start of a comment.
	start := it.index
	for it.index < len(it.data) && it.data[it.index] != '\n' && it.data[it.index] != '#' {
		it.index += 1
	}
	line = it.data[start:it.index]

	// Skip over a comment, if any; this is a no-op when already at the line break.
	for it.index < len(it.data) && it.data[it.index] != '\n' {
		it.index += 1
	}
	// Step over the '\n' itself so that the next call starts on the next line.
	if it.index < len(it.data) {
		it.index += 1
	}

	// CRLF input: do not hand the carriage return to the caller.
	if n := len(line); n > 0 && line[n-1] == '\r' {
		line = line[:n-1]
	}
	return
}
|
||||
|
||||
/*
Iterator state for walking the ';' separated fields of a single line without
allocating.
*/
Field_Iterator :: struct {
	index:         int,    // current location in line
	field_counter: int,    // number of fields returned so far
	line:          []byte, // The line whose fields are iterated
}

/*
Returns the next ';' separated field of `it.line` with leading and trailing
spaces removed. Iteration stops at the end of the line, at a '\n' or at the
start of a '#' comment.

Inputs:
- it: The iterator state; `index` and `field_counter` are advanced.

Returns:
- field: Sub-slice of `it.line` holding the trimmed field. Empty fields and
  fields consisting only of spaces are returned as empty slices.
- field_count: 0-based index of the returned field.
- valid: false once there are no more fields.
*/
field_iterator :: proc(it: ^Field_Iterator) -> (field: []byte, field_count: int, valid: bool) {
	valid = it.index < len(it.line) && it.line[it.index] != '\n' && it.line[it.index] != '#'
	if !valid {
		return
	}

	// After the first field the iterator rests on the separator that ended the
	// previous field. The decision is based on the field counter rather than on
	// the index, because a line that starts with ';' has an empty first field
	// and the index is then still 0 on the second call.
	if it.field_counter != 0 && it.line[it.index] == ';' {
		it.index += 1
	}

	start := it.index
	for it.index < len(it.line) && it.line[it.index] != ';' && it.line[it.index] != '#' {
		it.index += 1
	}
	field = it.line[start:it.index]

	// Remove leading spaces
	for len(field) > 0 && field[0] == ' ' {
		field = field[1:]
	}

	// Remove trailing spaces
	for len(field) > 0 && field[len(field)-1] == ' ' {
		field = field[:len(field)-1]
	}

	field_count = it.field_counter
	it.field_counter += 1
	return
}
|
||||
@@ -15,14 +15,17 @@ load_unicode_data :: proc(
|
||||
}
|
||||
defer free_all(context.temp_allocator)
|
||||
|
||||
line_iter := Line_Iterator{data = data }
|
||||
// line_iter := Line_Iterator{data = data }
|
||||
first_cp: rune
|
||||
|
||||
line_loop: for line, line_num in line_iterator(&line_iter) {
|
||||
// Skip empty lines
|
||||
str := string(data)
|
||||
line_no := 1
|
||||
line_loop: for _line in strings.split_lines_iterator(&str) {
|
||||
defer line_no += 1
|
||||
line, _, _ := strings.partition(_line, "#")
|
||||
if len(line) == 0 do continue
|
||||
|
||||
field_iter := Field_Iterator{line = line}
|
||||
// field_iter := Field_Iterator{line = line}
|
||||
is_range := false
|
||||
cp: rune
|
||||
name: string
|
||||
@@ -33,7 +36,11 @@ load_unicode_data :: proc(
|
||||
nt := Numeric_Type.None
|
||||
nv : Numberic_Value
|
||||
|
||||
for field, field_num in field_iterator(&field_iter) {
|
||||
field_num := 0
|
||||
for field in strings.split_iterator(&line, ";") {
|
||||
defer field_num += 1
|
||||
field := strings.trim_space(field)
|
||||
|
||||
switch field_num {
|
||||
case 0: // Code point
|
||||
cp = 0
|
||||
@@ -52,10 +59,10 @@ load_unicode_data :: proc(
|
||||
}
|
||||
|
||||
if len(field) > 9 && field[0] == '<' && strings.ends_with(transmute(string) field, ", Last>") {
|
||||
name = strings.clone_from_bytes(field[1:len(field)-7], allocator)
|
||||
name = strings.clone(field[1:len(field)-7], allocator)
|
||||
is_range = true
|
||||
} else {
|
||||
name = strings.clone_from_bytes(field[:], allocator)
|
||||
name = strings.clone(field[:], allocator)
|
||||
}
|
||||
|
||||
case 2: // General_Category
|
||||
@@ -236,6 +243,8 @@ destroy_protperty_list :: proc(
|
||||
}
|
||||
}
|
||||
|
||||
import "core:fmt"
|
||||
|
||||
load_protperty_list :: proc (
|
||||
filename : string,
|
||||
allocator := context.allocator,
|
||||
@@ -251,16 +260,26 @@ load_protperty_list :: proc (
|
||||
line_iter := Line_Iterator{
|
||||
data = data
|
||||
}
|
||||
for line in line_iterator(&line_iter) {
|
||||
|
||||
str := string(data)
|
||||
line_no := 1
|
||||
for _line in strings.split_lines_iterator(&str) {
|
||||
defer line_no += 1
|
||||
line, _, _ := strings.partition(_line, "#")
|
||||
if len(line) == 0 do continue
|
||||
field_iter := Field_Iterator{ line = line}
|
||||
fmt.printfln("%d: %q", line_no, line)
|
||||
|
||||
is_range: bool
|
||||
|
||||
rr : Range_Rune
|
||||
|
||||
prop: PropList_Property
|
||||
for field, i in field_iterator(&field_iter) {
|
||||
i := 0
|
||||
for field in strings.split_iterator(&line, ";") {
|
||||
defer i += 1
|
||||
field := strings.trim_space(field)
|
||||
fmt.printfln("%d: %q", i, field)
|
||||
|
||||
switch i {
|
||||
case 0: // Code point or code point range
|
||||
for c in field {
|
||||
@@ -302,7 +321,4 @@ load_protperty_list :: proc (
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user