mirror of
https://github.com/odin-lang/Odin.git
synced 2026-04-06 14:48:21 +00:00
Merge pull request #6314 from Kelimion/update_unicode_xml
Update unicode xml
This commit is contained in:
@@ -6,14 +6,14 @@ package encoding_unicode_entity
|
||||
|
||||
/*
|
||||
This file is generated from "https://github.com/w3c/xml-entities/blob/gh-pages/unicode.xml".
|
||||
|
||||
|
||||
UPDATE:
|
||||
- Ensure the XML file was downloaded using "tests\core\download_assets.py".
|
||||
- Ensure the XML file was downloaded using "tests\core\download_assets.py", given the path to the "tests\assets" directory.
|
||||
- Run "core/unicode/tools/generate_entity_table.odin"
|
||||
|
||||
Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
|
||||
|
||||
Copyright David Carlisle 1999-2023
|
||||
Copyright David Carlisle 1999-2025
|
||||
|
||||
Use and distribution of this code are permitted under the terms of the
|
||||
W3C Software Notice and License.
|
||||
|
||||
@@ -8,14 +8,53 @@ import "core:strconv"
|
||||
import "core:slice"
|
||||
import "core:fmt"
|
||||
|
||||
GENERATED :: `/*
|
||||
------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
|
||||
*/`
|
||||
|
||||
TABLE_FILE_PROLOG :: `/*
|
||||
This file is generated from "https://github.com/w3c/xml-entities/blob/gh-pages/unicode.xml".
|
||||
|
||||
UPDATE:
|
||||
- Ensure the XML file was downloaded using "tests\core\download_assets.py", given the path to the "tests\assets" directory.
|
||||
- Run "core/unicode/tools/generate_entity_table.odin"
|
||||
|
||||
Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
|
||||
|
||||
Copyright David Carlisle 1999-2025
|
||||
|
||||
Use and distribution of this code are permitted under the terms of the
|
||||
W3C Software Notice and License.
|
||||
http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231.html
|
||||
|
||||
|
||||
|
||||
This file is a collection of information about how to map
|
||||
Unicode entities to LaTeX, and various SGML/XML entity
|
||||
sets (ISO and MathML/HTML). A Unicode character may be mapped
|
||||
to several entities.
|
||||
|
||||
Originally designed by Sebastian Rahtz in conjunction with
|
||||
Barbara Beeton for the STIX project
|
||||
|
||||
See also: LICENSE_table.md
|
||||
*/
|
||||
`
|
||||
|
||||
// Silent error handler for the parser: the body is empty, so the position,
// format string and arguments it receives are discarded.
// It is the handler passed to `xml.load_from_file` when loading the XML file.
|
||||
Error_Handler :: proc(pos: xml.Pos, fmt: string, args: ..any) {}
|
||||
|
||||
// Options passed to `xml.load_from_file`: sets the `.Ignore_Unsupported` flag
// and sets `expected_doctype` to "unicode".
OPTIONS :: xml.Options{ flags = { .Ignore_Unsupported, }, expected_doctype = "unicode", }
|
||||
|
||||
Entity :: struct {
|
||||
name: string,
|
||||
codepoints: [2]rune,
|
||||
name: string, // &name;
|
||||
description: string,
|
||||
}
|
||||
|
||||
// One `<character>` child of `<charlist>` that carries a `<unicodedata category="..">`
// attribute; collected for `core:unicode`'s tables.
Character :: struct {
|
||||
// Codepoint parsed from the `<character dec="...">` attribute.
codepoint: rune,
|
||||
// Value of the `category` attribute of the `<unicodedata>` child.
category: string,
|
||||
// Text of the `<description>` child, or "" when that child does not hold exactly one value.
description: string,
|
||||
}
|
||||
|
||||
@@ -24,16 +63,8 @@ main :: proc() {
|
||||
defer delete(filename)
|
||||
|
||||
if err_xml != .None {
|
||||
fmt.eprintfln("Join path error for unicode.xml: %v", err_xml);
|
||||
os.exit(1);
|
||||
}
|
||||
|
||||
generated_filename, err_generated := path.join({ODIN_ROOT, "core", "encoding", "entity", "generated.odin"}, context.allocator)
|
||||
defer delete(generated_filename)
|
||||
|
||||
if err_generated != .None {
|
||||
fmt.eprintfln("Join path error for generated.odin: %v", err_generated);
|
||||
os.exit(1);
|
||||
fmt.eprintfln("Join path error for unicode.xml: %v", err_xml)
|
||||
os.exit(1)
|
||||
}
|
||||
|
||||
doc, err := xml.load_from_file(filename, OPTIONS, Error_Handler)
|
||||
@@ -49,10 +80,6 @@ main :: proc() {
|
||||
|
||||
fmt.printfln("%q loaded and parsed.", filename)
|
||||
|
||||
generated_buf: strings.Builder
|
||||
defer strings.builder_destroy(&generated_buf)
|
||||
w := strings.to_writer(&generated_buf)
|
||||
|
||||
charlist_id, charlist_ok := xml.find_child_by_ident(doc, 0, "charlist")
|
||||
if !charlist_ok {
|
||||
fmt.eprintln("Could not locate top-level `<charlist>` tag.")
|
||||
@@ -63,94 +90,130 @@ main :: proc() {
|
||||
|
||||
fmt.printfln("Found `<charlist>` with %v children.", len(charlist.value))
|
||||
|
||||
// These are for `core:encoding/entity`, and only keep track of codepoints which have
// one or more `<entity>` children pointing to them.
|
||||
//
|
||||
// This means that this array can have the same codepoint appear more than once, e.g.
|
||||
// `Aring` and `angst` are both a capital A with a circle. The latter is the Angstrom symbol.
|
||||
entities: [dynamic]Entity
|
||||
defer delete(entities)
|
||||
entity_map: map[string]Entity
|
||||
defer delete(entity_map)
|
||||
|
||||
names: [dynamic]string
|
||||
defer delete(names)
|
||||
|
||||
min_name_length := max(int)
|
||||
max_name_length := min(int)
|
||||
shortest_name: string
|
||||
longest_name: string
|
||||
|
||||
count := 0
|
||||
// This is for `core:unicode`'s tables and has all children of `<charlist>`
|
||||
characters: [dynamic]Character
|
||||
defer delete(characters)
|
||||
|
||||
for char_id in charlist.value {
|
||||
id := char_id.(xml.Element_ID)
|
||||
char := doc.elements[id]
|
||||
|
||||
if char.ident != "character" {
|
||||
fmt.eprintfln("Expected `<character>`, got `<%v>`", char.ident)
|
||||
fmt.eprintfln("Expected `<charlist>` child to be `<character>`, got `<%v>`", char.ident)
|
||||
os.exit(1)
|
||||
}
|
||||
|
||||
if codepoint_string, ok := xml.find_attribute_val_by_key(doc, id, "dec"); !ok {
|
||||
fmt.eprintln("`<character id=\"...\">` attribute not found.")
|
||||
// `dec` is the codepoint, or codepoints separated by a `-`.
|
||||
codepoint_string, ok := xml.find_attribute_val_by_key(doc, id, "dec")
|
||||
if !ok {
|
||||
fmt.eprintln("`<character dec=\"...\">` attribute not found.")
|
||||
os.exit(1)
|
||||
} else {
|
||||
r1, _, r2 := strings.partition(codepoint_string, "-")
|
||||
}
|
||||
|
||||
codepoint, codepoint2: int
|
||||
codepoint, _ = strconv.parse_int(r1)
|
||||
if r2 != "" {
|
||||
codepoint2, _ = strconv.parse_int(r2)
|
||||
}
|
||||
r1, _, r2 := strings.partition(codepoint_string, "-")
|
||||
|
||||
desc, desc_ok := xml.find_child_by_ident(doc, id, "description")
|
||||
assert(desc_ok)
|
||||
description := ""
|
||||
if len(doc.elements[desc].value) == 1 {
|
||||
description = doc.elements[desc].value[0].(string)
|
||||
}
|
||||
codepoint, codepoint2: int
|
||||
codepoint, _ = strconv.parse_int(r1)
|
||||
if r2 != "" {
|
||||
codepoint2, _ = strconv.parse_int(r2)
|
||||
}
|
||||
|
||||
// For us to be interested in this codepoint, it has to have at least one entity.
|
||||
nth := 0
|
||||
for {
|
||||
character_entity := xml.find_child_by_ident(doc, id, "entity", nth) or_break
|
||||
nth += 1
|
||||
name := xml.find_attribute_val_by_key(doc, character_entity, "id") or_continue
|
||||
if len(name) == 0 {
|
||||
/*
|
||||
Invalid name. Skip.
|
||||
*/
|
||||
continue
|
||||
}
|
||||
// This is the description we add to `core:encoding/entity`'s generated table
|
||||
desc, desc_ok := xml.find_child_by_ident(doc, id, "description")
|
||||
assert(desc_ok)
|
||||
description := ""
|
||||
if len(doc.elements[desc].value) == 1 {
|
||||
description = doc.elements[desc].value[0].(string)
|
||||
}
|
||||
|
||||
if name == "\"\"" {
|
||||
fmt.printfln("%#v", char)
|
||||
fmt.printfln("%#v", character_entity)
|
||||
}
|
||||
|
||||
if len(name) > max_name_length { longest_name = name }
|
||||
if len(name) < min_name_length { shortest_name = name }
|
||||
|
||||
min_name_length = min(min_name_length, len(name))
|
||||
max_name_length = max(max_name_length, len(name))
|
||||
|
||||
e := Entity{
|
||||
name = name,
|
||||
codepoints = {rune(codepoint), rune(codepoint2)},
|
||||
// For us to be interested in a character for `core:unicode`, it has to have `<unicodedata category="..">`
|
||||
//
|
||||
// Not present for e.g. MULTIPLE CHARACTER OPERATOR: arccos
|
||||
// and some maths characters without a character category
|
||||
if unicodedata, unicodedata_ok := xml.find_child_by_ident(doc, id, "unicodedata"); unicodedata_ok {
|
||||
// Not present for some math characters, e.g. codepoint: 10913-824, desc: "DOUBLE NESTED LESS-THAN with slash"
|
||||
if category_string, category_ok := xml.find_attribute_val_by_key(doc, unicodedata, "category"); category_ok {
|
||||
// These should only consist of a single rune.
|
||||
assert(codepoint2 == 0)
|
||||
append(&characters, Character{
|
||||
codepoint = rune(codepoint),
|
||||
description = description,
|
||||
}
|
||||
|
||||
if name in entity_map {
|
||||
continue
|
||||
}
|
||||
|
||||
entity_map[name] = e
|
||||
append(&names, name)
|
||||
count += 1
|
||||
category = category_string,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// For us to be interested in this codepoint for `core:encoding/entity`, it has to have at least one `<entity>`.
|
||||
nth := 0
|
||||
for {
|
||||
character_entity := xml.find_child_by_ident(doc, id, "entity", nth) or_break
|
||||
nth += 1
|
||||
name := xml.find_attribute_val_by_key(doc, character_entity, "id") or_continue
|
||||
if len(name) == 0 {
|
||||
// Invalid name. Skip.
|
||||
continue
|
||||
}
|
||||
|
||||
if len(name) > max_name_length { longest_name = name }
|
||||
if len(name) < min_name_length { shortest_name = name }
|
||||
|
||||
min_name_length = min(min_name_length, len(name))
|
||||
max_name_length = max(max_name_length, len(name))
|
||||
|
||||
if name in entity_map {
|
||||
continue
|
||||
}
|
||||
|
||||
e := Entity{
|
||||
name = name,
|
||||
codepoints = {rune(codepoint), rune(codepoint2)},
|
||||
description = description,
|
||||
}
|
||||
|
||||
entity_map[name] = e
|
||||
append(&entities, e)
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by name.
|
||||
slice.sort(names[:])
|
||||
write_encoding_entitities_table(entities[:], shortest_name, longest_name, min_name_length, max_name_length)
|
||||
fmt.println()
|
||||
write_unicode_category_tables(characters[:])
|
||||
|
||||
fmt.printfln("Found %v unique `&name;` -> rune mappings.", count)
|
||||
// Not a library, no need to clean up.
|
||||
}
|
||||
|
||||
write_encoding_entitities_table :: proc(entities: []Entity, shortest_name, longest_name: string, min_name_length, max_name_length: int) {
|
||||
fmt.printfln("Found %v unique `&name;` -> rune mappings.", len(entities))
|
||||
fmt.printfln("Shortest name: %v (%v)", shortest_name, min_name_length)
|
||||
fmt.printfln("Longest name: %v (%v)", longest_name, max_name_length)
|
||||
|
||||
generated_filename, err_generated := path.join({ODIN_ROOT, "core", "encoding", "entity", "generated.odin"}, context.allocator)
|
||||
defer delete(generated_filename)
|
||||
|
||||
if err_generated != .None {
|
||||
fmt.eprintfln("Join path error for generated.odin: %v", err_generated)
|
||||
os.exit(1)
|
||||
}
|
||||
|
||||
generated_buf: strings.Builder
|
||||
defer strings.builder_destroy(&generated_buf)
|
||||
w := strings.to_writer(&generated_buf)
|
||||
|
||||
// Generate table.
|
||||
fmt.wprintln(w, "package encoding_unicode_entity")
|
||||
fmt.wprintln(w, "")
|
||||
@@ -192,19 +255,21 @@ named_xml_entity_to_rune :: proc(name: string) -> (decoded: [2]rune, rune_count:
|
||||
prefix := '?'
|
||||
should_close := false
|
||||
|
||||
for v in names {
|
||||
if rune(v[0]) != prefix {
|
||||
slice.sort_by(entities, proc(a, b: Entity) -> bool {
|
||||
return a.name < b.name
|
||||
})
|
||||
|
||||
for e in entities {
|
||||
if rune(e.name[0]) != prefix {
|
||||
if should_close {
|
||||
fmt.wprintln(w, "\t\t}\n")
|
||||
}
|
||||
|
||||
prefix = rune(v[0])
|
||||
prefix = rune(e.name[0])
|
||||
fmt.wprintfln(w, "\tcase '%v':", prefix)
|
||||
fmt.wprintln(w, "\t\tswitch name {")
|
||||
}
|
||||
|
||||
e := entity_map[v]
|
||||
|
||||
fmt.wprintf(w, "\t\tcase \"%v\":", e.name)
|
||||
for i := len(e.name); i < max_name_length; i += 1 {
|
||||
fmt.wprintf(w, " ")
|
||||
@@ -224,8 +289,10 @@ named_xml_entity_to_rune :: proc(name: string) -> (decoded: [2]rune, rune_count:
|
||||
fmt.wprintln(w, GENERATED)
|
||||
|
||||
fmt.println()
|
||||
fmt.println(strings.to_string(generated_buf))
|
||||
fmt.println()
|
||||
when ODIN_DEBUG {
|
||||
fmt.println(strings.to_string(generated_buf))
|
||||
fmt.println()
|
||||
}
|
||||
|
||||
written := os.write_entire_file(generated_filename, transmute([]byte)strings.to_string(generated_buf))
|
||||
|
||||
@@ -234,45 +301,66 @@ named_xml_entity_to_rune :: proc(name: string) -> (decoded: [2]rune, rune_count:
|
||||
} else {
|
||||
fmt.printfln("Failed to write generated \"%v\".", generated_filename)
|
||||
}
|
||||
// Not a library, no need to clean up.
|
||||
}
|
||||
|
||||
GENERATED :: `/*
|
||||
------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
|
||||
*/`
|
||||
write_unicode_category_tables :: proc(characters: []Character) {
|
||||
fmt.printfln("Found %v codepoints with a category.", len(characters))
|
||||
|
||||
TABLE_FILE_PROLOG :: `/*
|
||||
This file is generated from "https://github.com/w3c/xml-entities/blob/gh-pages/unicode.xml".
|
||||
|
||||
UPDATE:
|
||||
- Ensure the XML file was downloaded using "tests\core\download_assets.py".
|
||||
- Run "core/unicode/tools/generate_entity_table.odin"
|
||||
// Sort by `category`, then by `codepoint` within a category.
//
// The comparator has to order by the primary key first and only consult the
// secondary key on a tie. Requiring both fields to be smaller at the same time
// (`a.category < b.category && a.codepoint < b.codepoint`) leaves most pairs
// unordered and is not a valid "less than" for sorting, so the result would not
// be grouped by category nor ascending by codepoint inside a category. The
// contiguous-range scan over the `Nd` category that follows relies on that order.
slice.sort_by(characters, proc(a, b: Character) -> bool {
	if a.category != b.category {
		return a.category < b.category
	}
	return a.codepoint < b.codepoint
})
|
||||
|
||||
Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
|
||||
nd_range_start := rune(-1)
|
||||
nd_range_end := rune(-1)
|
||||
nd_last: rune
|
||||
for c in characters {
|
||||
// Find contiguous ranges for the `Nd` category
|
||||
if c.category == "Nd" {
|
||||
defer nd_last = c.codepoint
|
||||
|
||||
Copyright David Carlisle 1999-2023
|
||||
|
||||
Use and distribution of this code are permitted under the terms of the
|
||||
W3C Software Notice and License.
|
||||
http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231.html
|
||||
|
||||
|
||||
|
||||
This file is a collection of information about how to map
|
||||
Unicode entities to LaTeX, and various SGML/XML entity
|
||||
sets (ISO and MathML/HTML). A Unicode character may be mapped
|
||||
to several entities.
|
||||
|
||||
Originally designed by Sebastian Rahtz in conjunction with
|
||||
Barbara Beeton for the STIX project
|
||||
|
||||
See also: LICENSE_table.md
|
||||
*/
|
||||
`
|
||||
|
||||
is_dotted_name :: proc(name: string) -> (dotted: bool) {
|
||||
for r in name {
|
||||
if r == '.' { return true}
|
||||
// New range start
|
||||
if c.codepoint != nd_last + 1 {
|
||||
nd_range_end = nd_last
|
||||
if nd_range_start != rune(-1) {
|
||||
// Found a range
|
||||
// fmt.printfln("%r (%d) - %r (%d) // %s", nd_range_start, nd_range_start, nd_range_end, nd_range_end, c.description)
|
||||
}
|
||||
nd_range_start = c.codepoint
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
/*
|
||||
Lu Letter, Uppercase
|
||||
Ll Letter, Lowercase
|
||||
Lt Letter, Titlecase
|
||||
Lm Letter, Modifier
|
||||
Lo Letter, Other
|
||||
Mn Mark, Nonspacing
|
||||
Mc Mark, Spacing Combining
|
||||
Me Mark, Enclosing
|
||||
Nd Number, Decimal Digit
|
||||
Nl Number, Letter
|
||||
No Number, Other
|
||||
Pc Punctuation, Connector
|
||||
Pd Punctuation, Dash
|
||||
Ps Punctuation, Open
|
||||
Pe Punctuation, Close
|
||||
Pi Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
|
||||
Pf Punctuation, Final quote (may behave like Ps or Pe depending on usage)
|
||||
Po Punctuation, Other
|
||||
Sm Symbol, Math
|
||||
Sc Symbol, Currency
|
||||
Sk Symbol, Modifier
|
||||
So Symbol, Other
|
||||
Zs Separator, Space
|
||||
Zl Separator, Line
|
||||
Zp Separator, Paragraph
|
||||
Cc Other, Control
|
||||
Cf Other, Format
|
||||
Cs Other, Surrogate
|
||||
Co Other, Private Use
|
||||
Cn Other, Not Assigned (no characters in the file have this property)
|
||||
*/
|
||||
}
|
||||
@@ -286,7 +286,7 @@ HMAC_DIGESTS = {
|
||||
'emblem-1024-progressive.jpg': "7a6f4b112bd7189320c58dcddb9129968bcf268798c1e0c4f2243c10b3e3d9a6962c9f142d9fd65f8fb31e9a1e899008cae22b3ffde713250d315499b412e160",
|
||||
'emblem-1024-gray.jpg': "4c25aaab92451e0452cdb165833b2b5a51978c2571de9d053950944667847666ba198d3001291615acda098ebe45b7d2d53c210c492f077b04a6bfe386f8a5fd",
|
||||
|
||||
'unicode.xml': "e0cdc94f07fdbb15eea811ed2ae6dcf494a83d197dafe6580c740270feb0d8f5f7146d4a7d4c2d2ea25f8bd9678bc986123484b39399819a6b7262687959d1ae",
|
||||
'unicode.xml': "ba3a0f730efd6cbb89a54dafddf8902ace1b4e22e96d5c8e47cfcd1b9b79705d1985ea4abd25c5ede1c47a5a7d9456b25b093c1ec9d7705f0c6bf52a515c9aa9",
|
||||
|
||||
'a128cbc_hs256_test.json': "4d2c8e5c9c669dba8f4ca88398efaa03003a1f0350a50df6ad94fbc5d93cb9b9ab6f96b727e1bf85298e98b5d5cf2538d38fab745ceca65cd35bf8dd7562ce87",
|
||||
'a192cbc_hs384_test.json': "3de5725108dc43dfe49571de089488de035631ff378c7708bd51fcdc854f306ca47cf731c65e45662759c8aed232f5111101d6c33836d9c2f8f700e775b8aa82",
|
||||
|
||||
@@ -186,7 +186,7 @@ xml_test_unicode :: proc(t: ^testing.T) {
|
||||
expected_doctype = "",
|
||||
},
|
||||
err = .None,
|
||||
crc32 = 0x73070b55,
|
||||
crc32 = 0x738664b1,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user