Replace iterators; fixes line ending handling

2026-05-03 03:24:41 +00:00 · 2026-03-09 16:25:35 +01:00
parent 1d0510d27f
commit 3f330752cd
3 changed files with 47 additions and 101 deletions
--- a/core/unicode/tools/ucd/generate_unicode.odin
+++ b/core/unicode/tools/ucd/generate_unicode.odin
@@ -0,0 +1,325 @@
+package ucd
+
+import "core:fmt"
+import path "core:path/filepath"
+import "core:os"
+import "core:strings"
+import "base:runtime"
+import "core:mem"
+import "core:io"
+import "core:log"
+
+// Table 2-3. Types of Code Points
+// Table 4-4. General_Category Values page 229
+
+// Reference https://www.unicode.org/reports/tr44/
+
+
+/*
+Writes each non-empty dynamic array of a Dynamic_Range to an io.Writer as a
+`@(rodata)` fixed length array whose name starts with `name`. Empty dynamic
+arrays of `range` produce no output.
+
+Inputs:
+- writer: The io.Writer that receives the generated declarations.
+- name: Prefix for the name of every array written to `writer`.
+- range: The Dynamic_Range whose arrays are formatted and written.
+
+Returns: the sum of the byte counts reported by the individual write calls.
+*/
+write_range_arrays :: proc(
+	writer: io.Writer,
+	name: string,
+	range : Dynamic_Range,
+) -> int {
+	total := 0
+	if len(range.single_16) != 0 {
+		total += fmt.wprintln(writer, "@(rodata)")
+		total += fmt.wprintf(writer, "%s_singles16 := [?]u16{{", name)
+		// Starting beyond the limit forces a line break before the first value.
+		column := 100
+		for value in range.single_16 {
+			scratch: [32]byte
+			entry := fmt.bprintf(scratch[:], " 0x%4X,",value)
+			if column + len(entry) <= 80 {
+				written, _ := io.write_string(writer, entry)
+				column += written
+				total += written
+				continue // the value fitted on the current row
+			}
+			total += fmt.wprintf(writer, "\n")
+			column = fmt.wprintf(writer, "\t0x%4X,",value) // opens a new row
+			total += column
+		}
+		total += fmt.wprintln(writer, "\n}\n")
+	}
+
+	if len(range.ranges_16) != 0 {
+		total += fmt.wprintln(writer, "@(rodata)")
+		total += fmt.wprintfln(writer, "%s_ranges16 := [?]u16{{", name)
+		for pair in range.ranges_16 {
+			total += fmt.wprintfln(writer, "\t0x%4X, 0x%4X,", pair.first, pair.last)
+		}
+		total += fmt.wprintln(writer, "}\n")
+	}
+
+	if len(range.single_32) != 0 {
+		total += fmt.wprintln(writer, "@(rodata)")
+		total += fmt.wprintf(writer, "%s_singles32 := [?]i32{{", name)
+		column := 100 // same forced initial line break as for the 16 bit values
+		for value in range.single_32 {
+			scratch: [32]byte
+			entry := fmt.bprintf(scratch[:], " 0x%4X,",value)
+			if column + len(entry) <= 80 {
+				written, _ := io.write_string(writer, entry)
+				column += written
+				total += written
+				continue
+			}
+			total += fmt.wprint(writer, "\n")
+			column = fmt.wprintf(writer, "\t0x%4X,",value)
+			total += column
+		}
+		total += fmt.wprintln(writer, "\n}\n")
+	}
+
+	if len(range.ranges_32) != 0 {
+		total += fmt.wprintln(writer, "@(rodata)")
+		total += fmt.wprintfln(writer, "%s_ranges32 := [?]i32{{", name)
+		for pair in range.ranges_32 {
+			total += fmt.wprintfln(writer, "\t0x%4X, 0x%4X,", pair.first, pair.last)
+		}
+		total += fmt.wprintln(writer, "}\n")
+	}
+
+	return total
+}
+
+/*
+Writes the arrays of `range` via write_range_arrays, then a `<prefix>_ranges :=
+Range{...}` literal slicing every array that was written. `<prefix>` is `name`
+with ASCII upper case letters lowered. Returns the accumulated write counts.
+*/
+write_range :: proc(
+	writer: io.Writer,
+	name: union{string, General_Category},
+	range: Dynamic_Range,
+) -> (n_written: int) {
+	buffer: [128]byte
+	prefix: string
+	switch value in name {
+	case string:
+		assert(len(value) <= len(buffer))
+		runtime.mem_copy(&buffer[0], raw_data(value), len(value))
+		prefix = string(buffer[:len(value)])
+	case General_Category:
+		prefix = fmt.bprintf(buffer[:], "%s", value)
+	}
+
+	// `prefix` is backed by `buffer`, so lowering the bytes in place lowers the prefix.
+	for &ch in buffer[:len(prefix)] {
+		if 'A' <= ch && ch <= 'Z' {
+			ch += 'a' - 'A'
+		}
+	}
+	n_written = write_range_arrays(writer, prefix, range)
+	n_written += fmt.wprintfln(writer, "%s_ranges := Range{{", prefix)
+	if len(range.single_16) != 0 {
+		n_written += fmt.wprintfln(writer, "\tsingle_16 = %s_singles16[:],", prefix)
+	}
+	if len(range.ranges_16) != 0 {
+		n_written += fmt.wprintfln(writer, "\tranges_16 = %s_ranges16[:],", prefix)
+	}
+	if len(range.single_32) != 0 {
+		n_written += fmt.wprintfln(writer, "\tsingle_32 = %s_singles32[:],", prefix)
+	}
+	if len(range.ranges_32) != 0 {
+		n_written += fmt.wprintfln(writer, "\tranges_32 = %s_ranges32[:],", prefix)
+	}
+	n_written += fmt.wprintln(writer, "}\n")
+	return
+}
+// Do-not-edit banner that `main` writes right after the package line of the generated file.
+GENERATED :: `/*
+	------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
+*/
+`
+// Provenance notice and Unicode license text that `main` writes into the generated file.
+MESSAGE :: `/* 
+	This file is generated from UnicodeData.txt and PropList.txt. These files
+	are part of the Unicode Database (UCD) and are covered by the license
+	listed further down. They may be downloaded from the following locations;
+
+	https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
+	https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+	https://www.unicode.org/license.txt
+
+	------------------------------------------------------------------------------
+	UNICODE LICENSE V3
+	
+	COPYRIGHT AND PERMISSION NOTICE
+	
+	Copyright © 1991-2026 Unicode, Inc.
+	
+	NOTICE TO USER: Carefully read the following legal agreement. BY
+	DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
+	SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+	TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
+	DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
+	
+	Permission is hereby granted, free of charge, to any person obtaining a
+	copy of data files and any associated documentation (the "Data Files") or
+	software and any associated documentation (the "Software") to deal in the
+	Data Files or Software without restriction, including without limitation
+	the rights to use, copy, modify, merge, publish, distribute, and/or sell
+	copies of the Data Files or Software, and to permit persons to whom the
+	Data Files or Software are furnished to do so, provided that either (a)
+	this copyright and permission notice appear with all copies of the Data
+	Files or Software, or (b) this copyright and permission notice appear in
+	associated Documentation.
+	
+	THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+	KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+	THIRD PARTY RIGHTS.
+	
+	IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
+	BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
+	OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+	WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+	ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
+	FILES OR SOFTWARE.
+	
+	Except as contained in this notice, the name of a copyright holder shall
+	not be used in advertising or otherwise to promote the sale, use or other
+	dealings in these Data Files or Software without prior written
+	authorization of the copyright holder.
+
+*/
+`
+
+/*
+Generates `core/unicode/generated.odin` from UnicodeData.txt and PropList.txt,
+both read from `ODIN_ROOT/tests/core/assets/UCD`.
+
+The output is assembled in a string builder and written to disk in one call.
+If either input file fails to load, the error is logged and nothing is written.
+Allocations still live when `main` returns are reported on stderr.
+*/
+main :: proc() {
+	track: mem.Tracking_Allocator
+	mem.tracking_allocator_init(&track, context.allocator)
+	defer {
+		if len(track.allocation_map) > 0 {
+			fmt.eprintf("=== %v allocations not freed: ===\n", len(track.allocation_map))
+			for _, entry in track.allocation_map {
+				fmt.eprintf("- %v bytes @ %v\n", entry.size, entry.location)
+			}
+		}
+		mem.tracking_allocator_destroy(&track)
+	}
+	context.allocator = mem.tracking_allocator(&track)
+
+	context.logger = log.create_console_logger()
+	defer log.destroy_console_logger(context.logger)
+
+	ucd_path, _ := path.join({ODIN_ROOT,
+		"tests","core","assets","UCD","UnicodeData.txt"}, context.allocator)
+	defer delete(ucd_path)
+
+	unicode_data, ucd_err := load_unicode_data(ucd_path)
+	if ucd_err != nil {
+		log.errorf("Error loading Unicode data. %s", ucd_err)
+		// Bail out rather than overwrite generated.odin with incomplete tables.
+		return
+	}
+	defer destroy_unicode_data(unicode_data)
+
+	general_category_ranges := gc_ranges(&unicode_data)
+	defer destroy_general_category_ranges(general_category_ranges)
+
+	extra_digit_ranges := extra_digits(&unicode_data)
+	defer destroy_dynamic_range(extra_digit_ranges)
+
+	proplist_path, _ := path.join({ODIN_ROOT,
+		"tests","core","assets","UCD","PropList.txt"}, context.allocator)
+	defer delete(proplist_path)
+
+	proplist, proplist_err := load_protperty_list(proplist_path)
+	if proplist_err != nil {
+		log.errorf("Error loading PropList.txt. %s", proplist_err)
+		return
+	}
+	defer destroy_protperty_list(proplist)
+
+	sb := strings.builder_make_len_cap(0, 1024*32)
+	defer strings.builder_destroy(&sb)
+	writer := strings.to_writer(&sb)
+
+	fmt.wprintfln(writer, "package unicode\n")
+	fmt.wprintln(writer, GENERATED)
+	fmt.wprintln(writer, MESSAGE)
+
+	Range_Type :: "Range :: struct {\n" +
+		"\tsingle_16 : []u16,\n" +
+		"\tranges_16 : []u16,\n" +
+		"\tsingle_32 : []i32,\n" +
+		"\tranges_32 : []i32,\n" +
+		"}\n"
+	fmt.wprintfln(writer, "%s", Range_Type)
+
+	// General categories that get no tables in core/unicode/generated.odin.
+	// Uncomment an entry to exclude that category as well.
+	to_exclude := [?]General_Category{
+		.Cc, // Control, a C0 or C1 control code
+		.Cf, // Format, a format control character
+		.Cn, // Unassigned, a reserved unassigned code point or a noncharacter
+		.Co, // Private_Use, a private-use character
+		.Cs, // Surrogate, a surrogate code point
+		// .Ll, // Lowercase_Letter, a lowercase letter
+		// .Lm, // Modifier_Letter, a modifier letter
+		// .Lo, // Other_Letter, other letters, including syllables and ideographs
+		// .Lt, // Titlecase_Letter, a digraph encoded as a single character, with first part uppercase
+		// .Lu, // Uppercase_Letter, an uppercase letter
+		// .Mc, // Spacing_Mark, a spacing combining mark (positive advance width)
+		// .Me, // Enclosing_Mark, an enclosing combining mark
+		// .Mn, // Nonspacing_Mark, a nonspacing combining mark (zero advance width)
+		// .Nd, // Decimal_Number, a decimal digit
+		// .Nl, // Letter_Number, a letterlike numeric character
+		// .No, // Other_Number, a numeric character of other type
+		// .Pc, // Connector_Punctuation, a connecting punctuation mark, like a tie
+		// .Pd, // Dash_Punctuation, a dash or hyphen punctuation mark
+		// .Pe, // Close_Punctuation, a closing punctuation mark (of a pair)
+		// .Pf, // Final_Punctuation, a final quotation mark
+		// .Pi, // Initial_Punctuation, an initial quotation mark
+		// .Po, // Other_Punctuation, a punctuation mark of other type
+		// .Ps, // Open_Punctuation, an opening punctuation mark (of a pair)
+		// .Sc, // Currency_Symbol, a currency sign
+		// .Sk, // Modifier_Symbol, a non-letterlike modifier symbol
+		// .Sm, // Math_Symbol, a symbol of mathematical use
+		// .So, // Other_Symbol, a symbol of other type
+		.Zl, // Line_Separator, U+2028 LINE SEPARATOR only
+		.Zp, // Paragraph_Separator, U+2029 PARAGRAPH SEPARATOR only
+		// .Zs, // Space_Separator, a space character (of various non-zero widths)
+	}
+
+	write_loop: for ranges, category in general_category_ranges {
+		for excluded in to_exclude {
+			if category == excluded do continue write_loop
+		}
+		write_range(writer, category, ranges)
+	}
+
+	write_range(writer, "extra_digits", extra_digit_ranges)
+	write_range(writer, "other_lowercase", proplist[.Other_Lowercase])
+	write_range(writer, "other_uppercase", proplist[.Other_Uppercase])
+
+	file_name, _ := path.join({ODIN_ROOT, "core", "unicode", "generated.odin"}, context.allocator)
+	defer delete(file_name)
+
+	write_error := os.write_entire_file_from_string(file_name, strings.to_string(sb))
+	if write_error != nil {
+		log.errorf("Error writing %s. %s", file_name, write_error)
+	}
+}
+
--- a/core/unicode/tools/ucd/iterator.odin
+++ b/core/unicode/tools/ucd/iterator.odin
@@ -1,70 +0,0 @@
-package ucd
-
-/*
-An iterator that allows simple iterating over the lines of of a slice of bytes, []byte,
-without allocating. Each line must end in a new line, i.e., '\n'
-*/
-Line_Iterator :: struct {
-	index: int, // current location in data
-	data: []byte, // Data over which to iterate
-	line_counter: int, // line number storage  
-}
-
-line_iterator :: proc(it: ^Line_Iterator) -> (line: []byte, line_number: int,  more: bool) {
-	more = it.index < len(it.data)
-	if more {
-		it.line_counter += 1
-		line_number = it.line_counter
-	} else {
-		return
-	}	
-	start:= it.index
-	for it.index < len(it.data) && it.data[it.index] != '\n' && it.data[it.index] != '#' do it.index += 1
-	line = it.data[start:it.index]
-	//index = start
-
-	if it.index < len(it.data) && it.data[it.index] == '#' {
-		for it.index < len(it.data) && it.data[it.index] != '\n' do it.index += 1
-	}
-	if it.index < len(it.data) && it.data[it.index] == '\n' do it.index += 1
-	return
-}
-
-Field_Iterator :: struct {
-	index: int,
-	field_counter: int,
-	line: []byte,
-}
-
-field_iterator :: proc(it: ^Field_Iterator) -> (field: []byte, field_count: int,  valid: bool) {
-	valid = it.index < len(it.line) && it.line[it.index] != '\n' && it.line[it.index] != '#'
-	if !valid do return
-
-	if it.index < len(it.line) && it.index != 0 && it.line[it.index] == ';' do it. index += 1
-
-	start := it.index
-	for it.index < len(it.line) && it.line[it.index] != ';'  && it.line[it.index] != '#' do it.index += 1
-
-	field = it.line[start:it.index]	
-	temp := field
-
-	// Remove leading spaces
-	for b, i in temp {
-		if b != ' ' {
-			field = temp[i:]
-			break
-		}
-	}
-
-	// Remove trailing spaces
-	temp = field
-	for b, i in temp {
-		if b != ' ' {
-			field = temp[0:i+1]
-		}
-	}
-
-	field_count = it.field_counter
-	it.field_counter += 1
-	return
-}
--- a/core/unicode/tools/ucd/ucd.odin
+++ b/core/unicode/tools/ucd/ucd.odin
@@ -15,14 +15,17 @@ load_unicode_data :: proc(
 	}
 	defer free_all(context.temp_allocator)

-	line_iter := Line_Iterator{data = data }
+	// line_iter := Line_Iterator{data = data }
 	first_cp: rune

-	line_loop: for line, line_num in line_iterator(&line_iter) {
-		// Skip empty lines
+	str := string(data)
+	line_no := 1
+	line_loop: for _line in strings.split_lines_iterator(&str) {
+		defer line_no += 1
+		line, _, _ := strings.partition(_line, "#")
 		if len(line) == 0 do continue

-		field_iter := Field_Iterator{line = line}
+		// field_iter := Field_Iterator{line = line}
 		is_range := false
 		cp: rune
 		name: string
@@ -33,7 +36,11 @@ load_unicode_data :: proc(
 		nt := Numeric_Type.None
 		nv : Numberic_Value

-		for field, field_num in field_iterator(&field_iter) {
+		field_num := 0
+		for field in strings.split_iterator(&line, ";") {
+			defer field_num += 1
+			field := strings.trim_space(field)
+
 			switch field_num {
 			case 0: // Code point
 				cp = 0
@@ -52,10 +59,10 @@ load_unicode_data :: proc(
 				}
 				
 				if len(field) > 9 && field[0] == '<' && strings.ends_with(transmute(string) field, ", Last>") {
-					name = strings.clone_from_bytes(field[1:len(field)-7], allocator)
+					name = strings.clone(field[1:len(field)-7], allocator)
 					is_range = true
 				} else {
-					name = strings.clone_from_bytes(field[:], allocator)
+					name = strings.clone(field[:], allocator)
 				}

 			case 2: // General_Category
@@ -236,6 +243,8 @@ destroy_protperty_list :: proc(
 	}
 }

+import "core:fmt"
+
 load_protperty_list :: proc (
 	filename : string,
 	allocator := context.allocator,
@@ -251,16 +260,26 @@ load_protperty_list :: proc (
 	line_iter := Line_Iterator{
 		data = data
 	}
-	for line in line_iterator(&line_iter) {
+
+	str := string(data)
+	line_no := 1
+	for _line in strings.split_lines_iterator(&str) {
+		defer line_no += 1
+		line, _, _ := strings.partition(_line, "#")
 		if len(line) == 0 do continue
-		field_iter := Field_Iterator{ line = line}
+		fmt.printfln("%d: %q", line_no, line)

 		is_range: bool

 		rr : Range_Rune

 		prop: PropList_Property 
-		for field, i in field_iterator(&field_iter) {
+		i := 0
+		for field in strings.split_iterator(&line, ";") {
+			defer i += 1
+			field := strings.trim_space(field)
+			fmt.printfln("%d: %q", i, field)
+
 			switch i {
 			case 0: // Code point or code point range
 				for c in field {
@@ -302,7 +321,4 @@ load_protperty_list :: proc (
 	}

 	return
-}
-
-
-
+}