Merge pull request #6314 from Kelimion/update_unicode_xml

Update unicode xml
This commit is contained in:
Jeroen van Rijn
2026-02-21 14:14:59 +01:00
committed by GitHub
4 changed files with 209 additions and 121 deletions

View File

@@ -6,14 +6,14 @@ package encoding_unicode_entity
/*
This file is generated from "https://github.com/w3c/xml-entities/blob/gh-pages/unicode.xml".
UPDATE:
- Ensure the XML file was downloaded using "tests\core\download_assets.py".
- Ensure the XML file was downloaded using "tests\core\download_assets.py", given the path to the "tests\assets" directory.
- Run "core/unicode/tools/generate_entity_table.odin"
Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
Copyright David Carlisle 1999-2023
Copyright David Carlisle 1999-2025
Use and distribution of this code are permitted under the terms of the
W3C Software Notice and License.

View File

@@ -8,14 +8,53 @@ import "core:strconv"
import "core:slice"
import "core:fmt"
GENERATED :: `/*
------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
*/`
TABLE_FILE_PROLOG :: `/*
This file is generated from "https://github.com/w3c/xml-entities/blob/gh-pages/unicode.xml".
UPDATE:
- Ensure the XML file was downloaded using "tests\core\download_assets.py", given the path to the "tests\assets" directory.
- Run "core/unicode/tools/generate_entity_table.odin"
Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
Copyright David Carlisle 1999-2025
Use and distribution of this code are permitted under the terms of the
W3C Software Notice and License.
http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231.html
This file is a collection of information about how to map
Unicode entities to LaTeX, and various SGML/XML entity
sets (ISO and MathML/HTML). A Unicode character may be mapped
to several entities.
Originally designed by Sebastian Rahtz in conjunction with
Barbara Beeton for the STIX project
See also: LICENSE_table.md
*/
`
// Silent error handler for the parser.
//
// The body is empty, so every message reported through it is discarded.
// `main` passes this handler to `xml.load_from_file` when loading `unicode.xml`.
Error_Handler :: proc(pos: xml.Pos, fmt: string, args: ..any) {}
// Parser options `main` passes to `xml.load_from_file`: sets the `.Ignore_Unsupported` flag
// and names "unicode" as the expected doctype.
OPTIONS :: xml.Options{ flags = { .Ignore_Unsupported, }, expected_doctype = "unicode", }
Entity :: struct {
name: string,
codepoints: [2]rune,
name: string, // &name;
description: string,
}
// One `<character>` entry destined for `core:unicode`'s tables.
// `main` only appends one when the entry has a `<unicodedata category="..">` child.
Character :: struct {
// Codepoint parsed from the `dec` attribute; `main` asserts that such entries have a single rune.
codepoint: rune,
// Value of the `category` attribute of `<unicodedata>`, e.g. "Nd".
category: string,
// Text of the `<description>` child, or "" when that element does not hold exactly one value.
description: string,
}
@@ -24,16 +63,8 @@ main :: proc() {
defer delete(filename)
if err_xml != .None {
fmt.eprintfln("Join path error for unicode.xml: %v", err_xml);
os.exit(1);
}
generated_filename, err_generated := path.join({ODIN_ROOT, "core", "encoding", "entity", "generated.odin"}, context.allocator)
defer delete(generated_filename)
if err_generated != .None {
fmt.eprintfln("Join path error for generated.odin: %v", err_generated);
os.exit(1);
fmt.eprintfln("Join path error for unicode.xml: %v", err_xml)
os.exit(1)
}
doc, err := xml.load_from_file(filename, OPTIONS, Error_Handler)
@@ -49,10 +80,6 @@ main :: proc() {
fmt.printfln("%q loaded and parsed.", filename)
generated_buf: strings.Builder
defer strings.builder_destroy(&generated_buf)
w := strings.to_writer(&generated_buf)
charlist_id, charlist_ok := xml.find_child_by_ident(doc, 0, "charlist")
if !charlist_ok {
fmt.eprintln("Could not locate top-level `<charlist>` tag.")
@@ -63,94 +90,130 @@ main :: proc() {
fmt.printfln("Found `<charlist>` with %v children.", len(charlist.value))
// These are for `core:encoding/entity`, and only keep track of codepoints which have
// one or more <entity> children pointing to it.
//
// This means that this array can have the same codepoint appear more than once, e.g.
// `Aring` and `angst` are both a capital A with a circle. The latter is the Angstrom symbol.
entities: [dynamic]Entity
defer delete(entities)
entity_map: map[string]Entity
defer delete(entity_map)
names: [dynamic]string
defer delete(names)
min_name_length := max(int)
max_name_length := min(int)
shortest_name: string
longest_name: string
count := 0
// This is for `core:unicode`'s tables and has all children of `<charlist>`
characters: [dynamic]Character
defer delete(characters)
for char_id in charlist.value {
id := char_id.(xml.Element_ID)
char := doc.elements[id]
if char.ident != "character" {
fmt.eprintfln("Expected `<character>`, got `<%v>`", char.ident)
fmt.eprintfln("Expected `<charlist>` child to be `<character>`, got `<%v>`", char.ident)
os.exit(1)
}
if codepoint_string, ok := xml.find_attribute_val_by_key(doc, id, "dec"); !ok {
fmt.eprintln("`<character id=\"...\">` attribute not found.")
// `dec` is the codepoint, or codepoints separated by a `-`.
codepoint_string, ok := xml.find_attribute_val_by_key(doc, id, "dec")
if !ok {
fmt.eprintln("`<character dec=\"...\">` attribute not found.")
os.exit(1)
} else {
r1, _, r2 := strings.partition(codepoint_string, "-")
}
codepoint, codepoint2: int
codepoint, _ = strconv.parse_int(r1)
if r2 != "" {
codepoint2, _ = strconv.parse_int(r2)
}
r1, _, r2 := strings.partition(codepoint_string, "-")
desc, desc_ok := xml.find_child_by_ident(doc, id, "description")
assert(desc_ok)
description := ""
if len(doc.elements[desc].value) == 1 {
description = doc.elements[desc].value[0].(string)
}
codepoint, codepoint2: int
codepoint, _ = strconv.parse_int(r1)
if r2 != "" {
codepoint2, _ = strconv.parse_int(r2)
}
// For us to be interested in this codepoint, it has to have at least one entity.
nth := 0
for {
character_entity := xml.find_child_by_ident(doc, id, "entity", nth) or_break
nth += 1
name := xml.find_attribute_val_by_key(doc, character_entity, "id") or_continue
if len(name) == 0 {
/*
Invalid name. Skip.
*/
continue
}
// This is the description we add to `core:encoding/entity`'s generated table
desc, desc_ok := xml.find_child_by_ident(doc, id, "description")
assert(desc_ok)
description := ""
if len(doc.elements[desc].value) == 1 {
description = doc.elements[desc].value[0].(string)
}
if name == "\"\"" {
fmt.printfln("%#v", char)
fmt.printfln("%#v", character_entity)
}
if len(name) > max_name_length { longest_name = name }
if len(name) < min_name_length { shortest_name = name }
min_name_length = min(min_name_length, len(name))
max_name_length = max(max_name_length, len(name))
e := Entity{
name = name,
codepoints = {rune(codepoint), rune(codepoint2)},
// For us to be interested in a character for `core:unicode`, it has to have `<unicodedata category="..">`
//
// Not present for e.g. MULTIPLE CHARACTER OPERATOR: arccos
// and some maths characters without a character category
if unicodedata, unicodedata_ok := xml.find_child_by_ident(doc, id, "unicodedata"); unicodedata_ok {
// Not present for some math characters, e.g. codepoint: 10913-824, desc: "DOUBLE NESTED LESS-THAN with slash"
if category_string, category_ok := xml.find_attribute_val_by_key(doc, unicodedata, "category"); category_ok {
// These should only consist of a single rune.
assert(codepoint2 == 0)
append(&characters, Character{
codepoint = rune(codepoint),
description = description,
}
if name in entity_map {
continue
}
entity_map[name] = e
append(&names, name)
count += 1
category = category_string,
})
}
}
// For us to be interested in this codepoint for `core:encoding/entity`, it has to have at least one `<entity>`.
nth := 0
for {
character_entity := xml.find_child_by_ident(doc, id, "entity", nth) or_break
nth += 1
name := xml.find_attribute_val_by_key(doc, character_entity, "id") or_continue
if len(name) == 0 {
// Invalid name. Skip.
continue
}
if len(name) > max_name_length { longest_name = name }
if len(name) < min_name_length { shortest_name = name }
min_name_length = min(min_name_length, len(name))
max_name_length = max(max_name_length, len(name))
if name in entity_map {
continue
}
e := Entity{
name = name,
codepoints = {rune(codepoint), rune(codepoint2)},
description = description,
}
entity_map[name] = e
append(&entities, e)
}
}
// Sort by name.
slice.sort(names[:])
write_encoding_entitities_table(entities[:], shortest_name, longest_name, min_name_length, max_name_length)
fmt.println()
write_unicode_category_tables(characters[:])
fmt.printfln("Found %v unique `&name;` -> rune mappings.", count)
// Not a library, no need to clean up.
}
write_encoding_entitities_table :: proc(entities: []Entity, shortest_name, longest_name: string, min_name_length, max_name_length: int) {
fmt.printfln("Found %v unique `&name;` -> rune mappings.", len(entities))
fmt.printfln("Shortest name: %v (%v)", shortest_name, min_name_length)
fmt.printfln("Longest name: %v (%v)", longest_name, max_name_length)
generated_filename, err_generated := path.join({ODIN_ROOT, "core", "encoding", "entity", "generated.odin"}, context.allocator)
defer delete(generated_filename)
if err_generated != .None {
fmt.eprintfln("Join path error for generated.odin: %v", err_generated)
os.exit(1)
}
generated_buf: strings.Builder
defer strings.builder_destroy(&generated_buf)
w := strings.to_writer(&generated_buf)
// Generate table.
fmt.wprintln(w, "package encoding_unicode_entity")
fmt.wprintln(w, "")
@@ -192,19 +255,21 @@ named_xml_entity_to_rune :: proc(name: string) -> (decoded: [2]rune, rune_count:
prefix := '?'
should_close := false
for v in names {
if rune(v[0]) != prefix {
slice.sort_by(entities, proc(a, b: Entity) -> bool {
return a.name < b.name
})
for e in entities {
if rune(e.name[0]) != prefix {
if should_close {
fmt.wprintln(w, "\t\t}\n")
}
prefix = rune(v[0])
prefix = rune(e.name[0])
fmt.wprintfln(w, "\tcase '%v':", prefix)
fmt.wprintln(w, "\t\tswitch name {")
}
e := entity_map[v]
fmt.wprintf(w, "\t\tcase \"%v\":", e.name)
for i := len(e.name); i < max_name_length; i += 1 {
fmt.wprintf(w, " ")
@@ -224,8 +289,10 @@ named_xml_entity_to_rune :: proc(name: string) -> (decoded: [2]rune, rune_count:
fmt.wprintln(w, GENERATED)
fmt.println()
fmt.println(strings.to_string(generated_buf))
fmt.println()
when ODIN_DEBUG {
fmt.println(strings.to_string(generated_buf))
fmt.println()
}
written := os.write_entire_file(generated_filename, transmute([]byte)strings.to_string(generated_buf))
@@ -234,45 +301,66 @@ named_xml_entity_to_rune :: proc(name: string) -> (decoded: [2]rune, rune_count:
} else {
fmt.printfln("Failed to write generated \"%v\".", generated_filename)
}
// Not a library, no need to clean up.
}
GENERATED :: `/*
------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
*/`
write_unicode_category_tables :: proc(characters: []Character) {
fmt.printfln("Found %v codepoints with a category.", len(characters))
TABLE_FILE_PROLOG :: `/*
This file is generated from "https://github.com/w3c/xml-entities/blob/gh-pages/unicode.xml".
UPDATE:
- Ensure the XML file was downloaded using "tests\core\download_assets.py".
- Run "core/unicode/tools/generate_entity_table.odin"
// Sort by `category`, then by `codepoint` within each category.
//
// The comparator has to be a strict weak ordering. Requiring both fields to be smaller
// (`a.category < b.category && a.codepoint < b.codepoint`) is not one: two characters
// from different categories can each compare as "not less" than the other, so the
// categories would not reliably end up grouped together.
slice.sort_by(characters, proc(a, b: Character) -> bool {
	if a.category != b.category {
		return a.category < b.category
	}
	// Same category: order by codepoint so contiguous ranges are adjacent.
	return a.codepoint < b.codepoint
})
Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
nd_range_start := rune(-1)
nd_range_end := rune(-1)
nd_last: rune
for c in characters {
// Find contiguous ranges for the `Nd` category
if c.category == "Nd" {
defer nd_last = c.codepoint
Copyright David Carlisle 1999-2023
Use and distribution of this code are permitted under the terms of the
W3C Software Notice and License.
http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231.html
This file is a collection of information about how to map
Unicode entities to LaTeX, and various SGML/XML entity
sets (ISO and MathML/HTML). A Unicode character may be mapped
to several entities.
Originally designed by Sebastian Rahtz in conjunction with
Barbara Beeton for the STIX project
See also: LICENSE_table.md
*/
`
is_dotted_name :: proc(name: string) -> (dotted: bool) {
for r in name {
if r == '.' { return true}
// New range start
if c.codepoint != nd_last + 1 {
nd_range_end = nd_last
if nd_range_start != rune(-1) {
// Found a range
// fmt.printfln("%r (%d) - %r (%d) // %s", nd_range_start, nd_range_start, nd_range_end, nd_range_end, c.description)
}
nd_range_start = c.codepoint
}
}
}
return false
}
/*
Lu Letter, Uppercase
Ll Letter, Lowercase
Lt Letter, Titlecase
Lm Letter, Modifier
Lo Letter, Other
Mn Mark, Nonspacing
Mc Mark, Spacing Combining
Me Mark, Enclosing
Nd Number, Decimal Digit
Nl Number, Letter
No Number, Other
Pc Punctuation, Connector
Pd Punctuation, Dash
Ps Punctuation, Open
Pe Punctuation, Close
Pi Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
Pf Punctuation, Final quote (may behave like Ps or Pe depending on usage)
Po Punctuation, Other
Sm Symbol, Math
Sc Symbol, Currency
Sk Symbol, Modifier
So Symbol, Other
Zs Separator, Space
Zl Separator, Line
Zp Separator, Paragraph
Cc Other, Control
Cf Other, Format
Cs Other, Surrogate
Co Other, Private Use
Cn Other, Not Assigned (no characters in the file have this property)
*/
}

View File

@@ -286,7 +286,7 @@ HMAC_DIGESTS = {
'emblem-1024-progressive.jpg': "7a6f4b112bd7189320c58dcddb9129968bcf268798c1e0c4f2243c10b3e3d9a6962c9f142d9fd65f8fb31e9a1e899008cae22b3ffde713250d315499b412e160",
'emblem-1024-gray.jpg': "4c25aaab92451e0452cdb165833b2b5a51978c2571de9d053950944667847666ba198d3001291615acda098ebe45b7d2d53c210c492f077b04a6bfe386f8a5fd",
'unicode.xml': "e0cdc94f07fdbb15eea811ed2ae6dcf494a83d197dafe6580c740270feb0d8f5f7146d4a7d4c2d2ea25f8bd9678bc986123484b39399819a6b7262687959d1ae",
'unicode.xml': "ba3a0f730efd6cbb89a54dafddf8902ace1b4e22e96d5c8e47cfcd1b9b79705d1985ea4abd25c5ede1c47a5a7d9456b25b093c1ec9d7705f0c6bf52a515c9aa9",
'a128cbc_hs256_test.json': "4d2c8e5c9c669dba8f4ca88398efaa03003a1f0350a50df6ad94fbc5d93cb9b9ab6f96b727e1bf85298e98b5d5cf2538d38fab745ceca65cd35bf8dd7562ce87",
'a192cbc_hs384_test.json': "3de5725108dc43dfe49571de089488de035631ff378c7708bd51fcdc854f306ca47cf731c65e45662759c8aed232f5111101d6c33836d9c2f8f700e775b8aa82",

View File

@@ -186,7 +186,7 @@ xml_test_unicode :: proc(t: ^testing.T) {
expected_doctype = "",
},
err = .None,
crc32 = 0x73070b55,
crc32 = 0x738664b1,
})
}