Merge pull request #6314 from Kelimion/update_unicode_xml

Update unicode xml
2026-05-26 05:38:14 +00:00 · 2026-02-21 14:14:59 +01:00
parent 11d2d37277 82b3917300
commit 13e0f7cc35
4 changed files with 209 additions and 121 deletions
--- a/core/encoding/entity/generated.odin
+++ b/core/encoding/entity/generated.odin
@@ -6,14 +6,14 @@ package encoding_unicode_entity

 /*
 	This file is generated from "https://github.com/w3c/xml-entities/blob/gh-pages/unicode.xml".
-	
+
 	UPDATE:
-		- Ensure the XML file was downloaded using "tests\core\download_assets.py".
+		- Ensure the XML file was downloaded using "tests\core\download_assets.py", given the path to the "tests\assets" directory.
 		- Run "core/unicode/tools/generate_entity_table.odin"

 	Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity

-		Copyright David Carlisle 1999-2023
+		Copyright David Carlisle 1999-2025

 		Use and distribution of this code are permitted under the terms of the
 		W3C Software Notice and License.
--- a/core/unicode/tools/generate_entity_table.odin
+++ b/core/unicode/tools/generate_entity_table.odin
@@ -8,14 +8,53 @@ import      "core:strconv"
 import      "core:slice"
 import      "core:fmt"

+GENERATED :: `/*
+	------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
+*/`
+
+TABLE_FILE_PROLOG :: `/*
+	This file is generated from "https://github.com/w3c/xml-entities/blob/gh-pages/unicode.xml".
+
+	UPDATE:
+		- Ensure the XML file was downloaded using "tests\core\download_assets.py", given the path to the "tests\assets" directory.
+		- Run "core/unicode/tools/generate_entity_table.odin"
+
+	Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
+
+		Copyright David Carlisle 1999-2025
+
+		Use and distribution of this code are permitted under the terms of the
+		W3C Software Notice and License.
+		http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231.html
+
+
+
+		This file is a collection of information about how to map
+		Unicode entities to LaTeX, and various SGML/XML entity
+		sets (ISO and MathML/HTML). A Unicode character may be mapped
+		to several entities.
+
+		Originally designed by Sebastian Rahtz in conjunction with
+		Barbara Beeton for the STIX project
+
+	See also: LICENSE_table.md
+*/
+`
+
 // Silent error handler for the parser.
 Error_Handler :: proc(pos: xml.Pos, fmt: string, args: ..any) {}

 OPTIONS :: xml.Options{ flags = { .Ignore_Unsupported, }, expected_doctype = "unicode", }

 Entity :: struct {
-	name:        string,
 	codepoints:  [2]rune,
+	name:        string, // &name;
+	description: string,
+}
+
+Character :: struct {
+	codepoint:   rune,
+	category:    string,
 	description: string,
 }

@@ -24,16 +63,8 @@ main :: proc() {
 	defer delete(filename)

 	if err_xml != .None {
-		fmt.eprintfln("Join path error for unicode.xml: %v", err_xml);
-		os.exit(1);
-	}
-
-	generated_filename, err_generated := path.join({ODIN_ROOT, "core", "encoding", "entity", "generated.odin"}, context.allocator)
-	defer delete(generated_filename)
-
-	if err_generated != .None {
-		fmt.eprintfln("Join path error for generated.odin: %v", err_generated);
-		os.exit(1);
+		fmt.eprintfln("Join path error for unicode.xml: %v", err_xml)
+		os.exit(1)
 	}

 	doc, err := xml.load_from_file(filename, OPTIONS, Error_Handler)
@@ -49,10 +80,6 @@ main :: proc() {

 	fmt.printfln("%q loaded and parsed.", filename)

-	generated_buf: strings.Builder
-	defer strings.builder_destroy(&generated_buf)
-	w := strings.to_writer(&generated_buf)
-
 	charlist_id, charlist_ok := xml.find_child_by_ident(doc, 0, "charlist")
 	if !charlist_ok {
 		fmt.eprintln("Could not locate top-level `<charlist>` tag.")
@@ -63,94 +90,130 @@ main :: proc() {

 	fmt.printfln("Found `<charlist>` with %v children.", len(charlist.value))

+	// These are for `core:encoding/entity`, and only keep track of codepoints which have
+	// one or more <entity> children pointing to it.
+	//
+	// This means that this array can have the same codepoint appear more than once, e.g.
+	// `Aring` and `angst` are both a capital A with a circle. The latter is the Angstrom symbol.
+	entities: [dynamic]Entity
+	defer delete(entities)
 	entity_map: map[string]Entity
 	defer delete(entity_map)

-	names: [dynamic]string
-	defer delete(names)
-
 	min_name_length := max(int)
 	max_name_length := min(int)
 	shortest_name: string
 	longest_name:  string

-	count := 0
+	// This is for `core:unicode`'s tables and has all children of `<charlist>`
+	characters: [dynamic]Character
+	defer delete(characters)
+
 	for char_id in charlist.value {
 		id := char_id.(xml.Element_ID)
 		char := doc.elements[id]

 		if char.ident != "character" {
-			fmt.eprintfln("Expected `<character>`, got `<%v>`", char.ident)
+			fmt.eprintfln("Expected `<charlist>` child to be `<character>`, got `<%v>`", char.ident)
 			os.exit(1)
 		}

-		if codepoint_string, ok := xml.find_attribute_val_by_key(doc, id, "dec"); !ok {
-			fmt.eprintln("`<character id=\"...\">` attribute not found.")
+		// `dec` is the codepoint, or codepoints separated by a `-`.
+		codepoint_string, ok := xml.find_attribute_val_by_key(doc, id, "dec")
+		if !ok {
+			fmt.eprintln("`<character dec=\"...\">` attribute not found.")
 			os.exit(1)
-		} else {
-			r1, _, r2 := strings.partition(codepoint_string, "-")
+		}

-			codepoint, codepoint2: int
-			codepoint, _ = strconv.parse_int(r1)
-			if r2 != "" {
-				codepoint2, _ = strconv.parse_int(r2)
-			}
+		r1, _, r2 := strings.partition(codepoint_string, "-")

-			desc, desc_ok := xml.find_child_by_ident(doc, id, "description")
-			assert(desc_ok)
-			description := ""
-			if len(doc.elements[desc].value) == 1 {
-				description = doc.elements[desc].value[0].(string)
-			}
+		codepoint, codepoint2: int
+		codepoint, _ = strconv.parse_int(r1)
+		if r2 != "" {
+			codepoint2, _ = strconv.parse_int(r2)
+		}

-			// For us to be interested in this codepoint, it has to have at least one entity.
-			nth := 0
-			for {
-				character_entity := xml.find_child_by_ident(doc, id, "entity", nth) or_break
-				nth += 1
-				name := xml.find_attribute_val_by_key(doc, character_entity, "id") or_continue
-				if len(name) == 0 {
-					/*
-						Invalid name. Skip.
-					*/
-					continue
-				}
+		// This is the description we add to `core:encoding/entity`'s generated table
+		desc, desc_ok := xml.find_child_by_ident(doc, id, "description")
+		assert(desc_ok)
+		description := ""
+		if len(doc.elements[desc].value) == 1 {
+			description = doc.elements[desc].value[0].(string)
+		}

-				if name == "\"\"" {
-					fmt.printfln("%#v", char)
-					fmt.printfln("%#v", character_entity)
-				}
-
-				if len(name) > max_name_length { longest_name  = name }
-				if len(name) < min_name_length { shortest_name = name }
-
-				min_name_length = min(min_name_length, len(name))
-				max_name_length = max(max_name_length, len(name))
-
-				e := Entity{
-					name        = name,
-					codepoints  = {rune(codepoint), rune(codepoint2)},
+		// For us to be interested in a character for `core:unicode`, it has to have `<unicodedata category="..">`
+		//
+		// Not present for e.g. MULTIPLE CHARACTER OPERATOR: arccos
+		// and some maths characters without a character category
+		if unicodedata, unicodedata_ok := xml.find_child_by_ident(doc, id, "unicodedata"); unicodedata_ok {
+			// Not present for some math characters, e.g. codepoint: 10913-824, desc: "DOUBLE NESTED LESS-THAN with slash"
+			if category_string, category_ok := xml.find_attribute_val_by_key(doc, unicodedata, "category"); category_ok {
+				// These should only consist of a single rune.
+				assert(codepoint2 == 0)
+				append(&characters, Character{
+					codepoint   = rune(codepoint),
 					description = description,
-				}
-
-				if name in entity_map {
-					continue
-				}
-
-				entity_map[name] = e
-				append(&names, name)
-				count += 1
+					category    = category_string,
+				})
 			}
 		}
+
+		// For us to be interested in this codepoint for `core:encoding/entity`, it has to have at least one `<entity>`.
+		nth := 0
+		for {
+			character_entity := xml.find_child_by_ident(doc, id, "entity", nth) or_break
+			nth += 1
+			name := xml.find_attribute_val_by_key(doc, character_entity, "id") or_continue
+			if len(name) == 0 {
+				// Invalid name. Skip.
+				continue
+			}
+
+			if len(name) > max_name_length { longest_name  = name }
+			if len(name) < min_name_length { shortest_name = name }
+
+			min_name_length = min(min_name_length, len(name))
+			max_name_length = max(max_name_length, len(name))
+
+			if name in entity_map {
+				continue
+			}
+
+			e := Entity{
+				name        = name,
+				codepoints  = {rune(codepoint), rune(codepoint2)},
+				description = description,
+			}
+
+			entity_map[name] = e
+			append(&entities, e)
+		}
 	}

-	// Sort by name.
-	slice.sort(names[:])
+	write_encoding_entitities_table(entities[:], shortest_name, longest_name, min_name_length, max_name_length)
+	fmt.println()
+	write_unicode_category_tables(characters[:])

-	fmt.printfln("Found %v unique `&name;` -> rune mappings.", count)
+	// Not a library, no need to clean up.
+}
+
+write_encoding_entitities_table :: proc(entities: []Entity, shortest_name, longest_name: string, min_name_length, max_name_length: int) {
+	fmt.printfln("Found %v unique `&name;` -> rune mappings.", len(entities))
 	fmt.printfln("Shortest name: %v (%v)", shortest_name, min_name_length)
 	fmt.printfln("Longest name:  %v (%v)", longest_name,  max_name_length)

+	generated_filename, err_generated := path.join({ODIN_ROOT, "core", "encoding", "entity", "generated.odin"}, context.allocator)
+	defer delete(generated_filename)
+
+	if err_generated != .None {
+		fmt.eprintfln("Join path error for generated.odin: %v", err_generated)
+		os.exit(1)
+	}
+
+	generated_buf: strings.Builder
+	defer strings.builder_destroy(&generated_buf)
+	w := strings.to_writer(&generated_buf)
+
 	// Generate table.
 	fmt.wprintln(w, "package encoding_unicode_entity")
 	fmt.wprintln(w, "")
@@ -192,19 +255,21 @@ named_xml_entity_to_rune :: proc(name: string) -> (decoded: [2]rune, rune_count:
 	prefix := '?'
 	should_close := false

-	for v in names {
-		if rune(v[0]) != prefix {
+	slice.sort_by(entities, proc(a, b: Entity) -> bool {
+		return a.name < b.name
+	})
+
+	for e in entities {
+		if rune(e.name[0]) != prefix {
 			if should_close {
 				fmt.wprintln(w, "\t\t}\n")
 			}

-			prefix = rune(v[0])
+			prefix = rune(e.name[0])
 			fmt.wprintfln(w, "\tcase '%v':", prefix)
 			fmt.wprintln(w, "\t\tswitch name {")
 		}

-		e := entity_map[v]
-
 		fmt.wprintf(w, "\t\tcase \"%v\":", e.name)
 		for i := len(e.name); i < max_name_length; i += 1 {
 			fmt.wprintf(w, " ")
@@ -224,8 +289,10 @@ named_xml_entity_to_rune :: proc(name: string) -> (decoded: [2]rune, rune_count:
 	fmt.wprintln(w, GENERATED)

 	fmt.println()
-	fmt.println(strings.to_string(generated_buf))
-	fmt.println()
+	when ODIN_DEBUG {
+		fmt.println(strings.to_string(generated_buf))
+		fmt.println()
+	}

 	written := os.write_entire_file(generated_filename, transmute([]byte)strings.to_string(generated_buf))

@@ -234,45 +301,66 @@ named_xml_entity_to_rune :: proc(name: string) -> (decoded: [2]rune, rune_count:
 	} else {
 		fmt.printfln("Failed to write generated \"%v\".", generated_filename)
 	}
-	// Not a library, no need to clean up.
 }

-GENERATED :: `/*
-	------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
-*/`
+write_unicode_category_tables :: proc(characters: []Character) {
+	fmt.printfln("Found %v codepoints with a category.", len(characters))

-TABLE_FILE_PROLOG :: `/*
-	This file is generated from "https://github.com/w3c/xml-entities/blob/gh-pages/unicode.xml".
-	
-	UPDATE:
-		- Ensure the XML file was downloaded using "tests\core\download_assets.py".
-		- Run "core/unicode/tools/generate_entity_table.odin"
+	// Sort by `category`, then by `codepoint` within each category (lexicographic; `&&` here would not be a valid ordering).
+	slice.sort_by(characters, proc(a, b: Character) -> bool {
+		return a.category < b.category || (a.category == b.category && a.codepoint < b.codepoint)
+	})

-	Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
+	nd_range_start := rune(-1)
+	nd_range_end   := rune(-1)
+	nd_last: rune
+	for c in characters {
+		// Find contiguous ranges for the `Nd` category
+		if c.category == "Nd" {
+			defer nd_last = c.codepoint

-		Copyright David Carlisle 1999-2023
-
-		Use and distribution of this code are permitted under the terms of the
-		W3C Software Notice and License.
-		http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231.html
-
-
-
-		This file is a collection of information about how to map
-		Unicode entities to LaTeX, and various SGML/XML entity
-		sets (ISO and MathML/HTML). A Unicode character may be mapped
-		to several entities.
-
-		Originally designed by Sebastian Rahtz in conjunction with
-		Barbara Beeton for the STIX project
-
-	See also: LICENSE_table.md
-*/
-`
-
-is_dotted_name :: proc(name: string) -> (dotted: bool) {
-	for r in name {
-		if r == '.' { return true}
+			// New range start
+			if c.codepoint != nd_last + 1 {
+				nd_range_end = nd_last
+				if nd_range_start != rune(-1) {
+					// Found a range
+					// fmt.printfln("%r (%d) - %r (%d) // %s", nd_range_start, nd_range_start, nd_range_end, nd_range_end, c.description)
+				}
+				nd_range_start = c.codepoint
+			}
+		}
 	}
-	return false
-}
+
+	/*
+	Lu	Letter, Uppercase
+	Ll	Letter, Lowercase
+	Lt	Letter, Titlecase
+	Lm	Letter, Modifier
+	Lo	Letter, Other
+	Mn	Mark, Nonspacing
+	Mc	Mark, Spacing Combining
+	Me	Mark, Enclosing
+	Nd	Number, Decimal Digit
+	Nl	Number, Letter
+	No	Number, Other
+	Pc	Punctuation, Connector
+	Pd	Punctuation, Dash
+	Ps	Punctuation, Open
+	Pe	Punctuation, Close
+	Pi	Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
+	Pf	Punctuation, Final quote (may behave like Ps or Pe depending on usage)
+	Po	Punctuation, Other
+	Sm	Symbol, Math
+	Sc	Symbol, Currency
+	Sk	Symbol, Modifier
+	So	Symbol, Other
+	Zs	Separator, Space
+	Zl	Separator, Line
+	Zp	Separator, Paragraph
+	Cc	Other, Control
+	Cf	Other, Format
+	Cs	Other, Surrogate
+	Co	Other, Private Use
+	Cn	Other, Not Assigned (no characters in the file have this property)
+	*/
+}
--- a/tests/core/download_assets.py
+++ b/tests/core/download_assets.py
@@ -286,7 +286,7 @@ HMAC_DIGESTS = {
 	'emblem-1024-progressive.jpg': "7a6f4b112bd7189320c58dcddb9129968bcf268798c1e0c4f2243c10b3e3d9a6962c9f142d9fd65f8fb31e9a1e899008cae22b3ffde713250d315499b412e160",
 	'emblem-1024-gray.jpg':        "4c25aaab92451e0452cdb165833b2b5a51978c2571de9d053950944667847666ba198d3001291615acda098ebe45b7d2d53c210c492f077b04a6bfe386f8a5fd",

-	'unicode.xml':                 "e0cdc94f07fdbb15eea811ed2ae6dcf494a83d197dafe6580c740270feb0d8f5f7146d4a7d4c2d2ea25f8bd9678bc986123484b39399819a6b7262687959d1ae",
+	'unicode.xml':                 "ba3a0f730efd6cbb89a54dafddf8902ace1b4e22e96d5c8e47cfcd1b9b79705d1985ea4abd25c5ede1c47a5a7d9456b25b093c1ec9d7705f0c6bf52a515c9aa9",

 	'a128cbc_hs256_test.json':           "4d2c8e5c9c669dba8f4ca88398efaa03003a1f0350a50df6ad94fbc5d93cb9b9ab6f96b727e1bf85298e98b5d5cf2538d38fab745ceca65cd35bf8dd7562ce87",
 	'a192cbc_hs384_test.json':           "3de5725108dc43dfe49571de089488de035631ff378c7708bd51fcdc854f306ca47cf731c65e45662759c8aed232f5111101d6c33836d9c2f8f700e775b8aa82",
--- a/tests/core/encoding/xml/test_core_xml.odin
+++ b/tests/core/encoding/xml/test_core_xml.odin
@@ -186,7 +186,7 @@ xml_test_unicode :: proc(t: ^testing.T) {
 			expected_doctype = "",
 		},
 		err       = .None,
-		crc32     = 0x73070b55,
+		crc32     = 0x738664b1,
 	})
 }