mirror of
https://github.com/odin-lang/Odin.git
synced 2025-12-28 17:04:34 +00:00
[core:encoding/entity] Add new package to decode &<entity>; entities.
Includes a generator that produces the lookup table for named entities.
This commit is contained in:
21
core/encoding/entity/LICENSE_table.md
Normal file
21
core/encoding/entity/LICENSE_table.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# License
|
||||
|
||||
By obtaining, using and/or copying this work, you (the licensee) agree that you have read, understood, and will comply with the following terms and conditions.
|
||||
|
||||
Permission to copy, modify, and distribute this software and its documentation, with or without modification, for any purpose and without fee or royalty is hereby granted, provided that you include the following on ALL copies of the software and documentation or portions thereof, including modifications:
|
||||
|
||||
The full text of this NOTICE in a location viewable to users of the redistributed or derivative work.
|
||||
Any pre-existing intellectual property disclaimers, notices, or terms and conditions. If none exist, the W3C Software Short Notice should be included (hypertext is preferred, text is permitted) within the body of any redistributed or derivative code.
|
||||
|
||||
Notice of any changes or modifications to the files, including the date changes were made. (We recommend you provide URIs to the location from which the code is derived.)
|
||||
|
||||
# Disclaimers
|
||||
|
||||
THIS SOFTWARE AND DOCUMENTATION IS PROVIDED "AS IS," AND COPYRIGHT HOLDERS MAKE NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS.
|
||||
|
||||
COPYRIGHT HOLDERS WILL NOT BE LIABLE FOR ANY DIRECT, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF ANY USE OF THE SOFTWARE OR DOCUMENTATION.
|
||||
|
||||
The name and trademarks of copyright holders may NOT be used in advertising or publicity pertaining to the software without specific, written prior permission. Title to copyright in this software and any associated documentation will at all times remain with copyright holders.
|
||||
|
||||
# Notes
|
||||
This version: http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
|
||||
358
core/encoding/entity/entity.odin
Normal file
358
core/encoding/entity/entity.odin
Normal file
@@ -0,0 +1,358 @@
|
||||
package unicode_entity
|
||||
/*
|
||||
A unicode entity encoder/decoder
|
||||
|
||||
Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
|
||||
Made available under Odin's BSD-3 license.
|
||||
|
||||
This code has several procedures to map unicode runes to/from different textual encodings.
|
||||
- SGML/XML/HTML entity
|
||||
-- &#<decimal>;
|
||||
-- &#x<hexadecimal>;
|
||||
-- &<entity name>; (If the lookup tables are compiled in).
|
||||
Reference: https://www.w3.org/2003/entities/2007xml/unicode.xml
|
||||
|
||||
- URL encode / decode %hex entity
|
||||
Reference: https://datatracker.ietf.org/doc/html/rfc3986/#section-2.1
|
||||
|
||||
List of contributors:
|
||||
Jeroen van Rijn: Initial implementation.
|
||||
*/
|
||||
|
||||
import "core:unicode/utf8"
|
||||
import "core:unicode"
|
||||
import "core:strings"
|
||||
|
||||
MAX_RUNE_CODEPOINT :: int(unicode.MAX_RUNE)
|
||||
|
||||
write_rune :: strings.write_rune_builder
|
||||
write_string :: strings.write_string_builder
|
||||
|
||||
/*
Error codes shared by the tokenizer (`advance`) and the decoder procedures in this file.
*/
Error :: enum u8 {
|
||||
// No error.
None = 0,
|
||||
// `advance` was handed a nil tokenizer pointer.
Tokenizer_Is_Nil,
|
||||
|
||||
// `advance` read a 0 byte from the input.
Illegal_NUL_Character,
|
||||
// `advance` could not decode a multi-byte UTF-8 sequence.
Illegal_UTF_Encoding,
|
||||
// `advance` found a byte order mark at an offset other than 0.
Illegal_BOM,
|
||||
|
||||
// `_handle_xml_special` reached the end of the input while scanning a CDATA section.
CDATA_Not_Terminated,
|
||||
// `_handle_xml_special` reached the end of the input while scanning a comment.
Comment_Not_Terminated,
|
||||
// `_extract_xml_entity` found no `;` closing an `&...` entity.
Invalid_Entity_Encoding,
|
||||
}
|
||||
|
||||
/*
Cursor over `src` that is stepped one rune at a time by `advance`.
*/
Tokenizer :: struct {
|
||||
// Most recently read rune; `advance` sets it to -1 once the input is exhausted.
r: rune,
|
||||
// Width in bytes of `r` as found in `src`.
w: int,
|
||||
|
||||
// The input being scanned.
src: string,
|
||||
// Byte offset in `src` at which `r` starts.
offset: int,
|
||||
// Byte offset in `src` of the next rune `advance` will read.
read_offset: int,
|
||||
}
|
||||
|
||||
CDATA_START :: "<![CDATA["
|
||||
CDATA_END :: "]]>"
|
||||
|
||||
COMMENT_START :: "<!--"
|
||||
COMMENT_END :: "-->"
|
||||
|
||||
/*
|
||||
Default: CDATA and comments are passed through unchanged.
|
||||
*/
|
||||
XML_Decode_Option :: enum u8 {
|
||||
/*
|
||||
CDATA is unboxed.
|
||||
*/
|
||||
CDATA_Unbox,
|
||||
|
||||
/*
|
||||
Unboxed CDATA is decoded as well.
|
||||
Ignored if `.CDATA_Unbox` is not given.
|
||||
*/
|
||||
CDATA_Decode,
|
||||
|
||||
/*
|
||||
Comments are stripped.
|
||||
*/
|
||||
Comment_Strip,
|
||||
}
|
||||
XML_Decode_Options :: bit_set[XML_Decode_Option; u8]
|
||||
|
||||
/*
	Decode a string that may include SGML/XML/HTML entities.
	The caller has to free the result.

	Inputs:
		input     - The text to decode.
		options   - Controls CDATA unboxing/decoding and comment stripping. By default both
		            CDATA sections and comments are passed through unchanged.
		allocator - Used for the scratch builder and for the returned string.

	Returns:
		decoded   - The decoded text. Entities that can't be decoded are passed through as-is.
		err       - `.None` on success, otherwise the tokenizer or CDATA/comment error that
		            stopped decoding. `decoded` is empty in that case.
*/
decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) {
	context.allocator = allocator

	if len(input) == 0 { return "", .None }

	builder := strings.make_builder()
	defer strings.destroy_builder(&builder)

	t := Tokenizer{src=input}

	/*
		Only true while we're inside a CDATA section that is being unboxed _and_ decoded.
		Every other kind of CDATA section is consumed in one go by `_handle_xml_special`.
	*/
	in_data := false

	loop: for {
		advance(&t) or_return
		if t.r < 0 { break loop }

		switch t.r {
		case '<':
			/*
				Might be the start of a CDATA tag or comment.

				We don't need to check if we need to write a `<`, because if it isn't CDATA or a comment,
				it couldn't have been part of an XML tag body to be decoded here.
			*/
			in_data = _handle_xml_special(&t, &builder, options) or_return

		case ']':
			/*
				If we're unboxing _and_ decoding CDATA, we have to check for the end tag.
				`has_prefix` also covers a terminator that sits at the very end of the input.
			*/
			if in_data && strings.has_prefix(t.src[t.offset:], CDATA_END) {
				in_data = false
				// `advance` already consumed the first `]`; skip the remaining `]>`.
				t.read_offset += len(CDATA_END) - 1
				continue
			}

			/*
				An ordinary `]`, inside or outside of CDATA. It is content and must be kept.
			*/
			write_rune(&builder, ']')

		case:
			if in_data && .CDATA_Decode not_in options {
				/*
					Unboxed, but undecoded.
				*/
				write_rune(&builder, t.r)
				continue
			}

			if t.r != '&' {
				write_rune(&builder, t.r)
				continue
			}

			entity, entity_err := _extract_xml_entity(&t)
			if entity_err != .None {
				/*
					We read to the end of the string without closing the entity.
					Pass through as-is. `entity` still includes the leading `&` in this case.
				*/
				write_string(&builder, entity)
				continue
			}

			if decoded_rune, ok := xml_decode_entity(entity); ok {
				write_rune(&builder, decoded_rune)
			} else {
				/*
					Decode failed. Pass through original.
				*/
				write_string(&builder, "&")
				write_string(&builder, entity)
				write_string(&builder, ";")
			}
		}
	}

	return strings.clone(strings.to_string(builder), allocator), err
}
|
||||
|
||||
/*
	Step the tokenizer to the next rune of `t.src`.

	On success `t.r`/`t.w` hold the rune and its width, `t.offset` its start and
	`t.read_offset` the position after it. At the end of the input `t.r` becomes -1 and
	`t.offset` is set to `len(t.src)`.

	Returns `.Tokenizer_Is_Nil`, `.Illegal_NUL_Character`, `.Illegal_UTF_Encoding` or
	`.Illegal_BOM` on failure. In the last three cases `t.read_offset` is left untouched.
*/
advance :: proc(t: ^Tokenizer) -> (err: Error) {
	if t == nil { return .Tokenizer_Is_Nil }

	// End of input.
	if t.read_offset >= len(t.src) {
		t.offset = len(t.src)
		t.r      = -1
		return .None
	}

	// Optimistically treat the next byte as a single-byte rune.
	t.offset = t.read_offset
	t.r, t.w = rune(t.src[t.read_offset]), 1

	if t.r == 0 {
		return .Illegal_NUL_Character
	}

	if t.r >= utf8.RUNE_SELF {
		// Not ASCII after all, decode the full sequence.
		t.r, t.w = utf8.decode_rune_in_string(t.src[t.read_offset:])

		if t.r == utf8.RUNE_ERROR && t.w == 1 {
			return .Illegal_UTF_Encoding
		}
		if t.r == utf8.RUNE_BOM && t.offset > 0 {
			// A byte order mark is only tolerated at the very start.
			return .Illegal_BOM
		}
	}

	t.read_offset += t.w
	return .None
}
|
||||
|
||||
/*
	Decode the body of an `&<body>;` entity, i.e. the text between `&` and `;`.

	Supported forms:
		`#<decimal>`      - numeric character reference, base 10.
		`#x<hexadecimal>` - numeric character reference, base 16 (`x` or `X`).
		anything else     - looked up by name using `named_xml_entity_to_rune`.

	Returns the decoded rune and `true`, or `-1, false` if the entity is empty, has no digits,
	contains a character that isn't a digit of the chosen base, or exceeds `MAX_RUNE_CODEPOINT`.
*/
xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
	entity := entity
	if len(entity) == 0 { return -1, false }

	switch entity[0] {
	case '#':
		base := 10
		val  := 0
		entity = entity[1:]

		if len(entity) == 0 { return -1, false }

		if entity[0] == 'x' || entity[0] == 'X' {
			base = 16
			entity = entity[1:]

			/*
				`&#x;` has a radix prefix but no digits. Without this check the loop below
				wouldn't run and we'd claim to have successfully decoded rune 0.
			*/
			if len(entity) == 0 { return -1, false }
		}

		for len(entity) > 0 {
			r := entity[0]
			switch r {
			case '0'..'9':
				val *= base
				val += int(r - '0')

			case 'a'..'f':
				if base == 10 { return -1, false }
				val *= base
				val += int(r - 'a' + 10)

			case 'A'..'F':
				if base == 10 { return -1, false }
				val *= base
				val += int(r - 'A' + 10)

			case:
				return -1, false
			}

			// Checked after every digit, so `val` can't overflow on long inputs.
			if val > MAX_RUNE_CODEPOINT { return -1, false }
			entity = entity[1:]
		}
		return rune(val), true

	case:
		/*
			Named entity.
		*/
		return named_xml_entity_to_rune(entity)
	}
}
|
||||
|
||||
/*
	Private XML helper to extract an `&<stuff>;` entity.

	Expects the tokenizer to be positioned on the `&`. Consumes bytes up to and including the
	first `;` and returns the text between `&` and `;`.

	If the input ends before a `;` is seen, everything from the `&` to the end of the input is
	returned (including the `&`) together with `.Invalid_Entity_Encoding`.
*/
@(private="file")
_extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
	assert(t != nil && t.r == '&')

	/*
		We only ever compare against `;`, which is ASCII, so scanning byte-wise is fine
		even if the entity contains multi-byte characters.
	*/
	ampersand := t.offset

	for t.read_offset < len(t.src) {
		c := t.src[t.read_offset]
		t.read_offset += 1

		if c == ';' {
			// Strip the leading `&` and the trailing `;`.
			return t.src[ampersand + 1 : t.read_offset - 1], .None
		}
	}

	return t.src[ampersand : t.read_offset], .Invalid_Entity_Encoding
}
|
||||
|
||||
/*
	Private XML helper for CDATA and comments.

	Expects the tokenizer to be positioned on a `<`.

	- `<![CDATA[ ... ]]>`
		With both `.CDATA_Unbox` and `.CDATA_Decode`: only the start tag is consumed and
		`in_data = true` is returned, so that the caller decodes the contents itself.
		Otherwise the section is consumed entirely and written to `builder`, without its
		start and end tags if `.CDATA_Unbox` is given.

	- `<!-- ... -->`
		Consumed entirely. Written to `builder` unless `.Comment_Strip` is given.

	- Anything else
		Nothing is written and nothing but the `<` is consumed.

	Returns `.CDATA_Not_Terminated` / `.Comment_Not_Terminated` (with `in_data = true`) if the
	input ends before the end tag, or any error reported by `advance`.
*/
@(private="file")
_handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) {
	assert(t != nil && t.r == '<')

	/*
		`has_prefix` takes care of inputs that are too short to hold a start or end tag,
		and unlike a manual length check it also recognizes tags at the very end of the input.
	*/
	start := t.offset
	rest  := t.src[start:]

	if strings.has_prefix(rest, CDATA_START) {
		// `advance` already consumed the `<`.
		t.read_offset += len(CDATA_START) - 1

		if .CDATA_Unbox in options && .CDATA_Decode in options {
			/*
				We're unboxing _and_ decoding CDATA
			*/
			return true, .None
		}

		/*
			CDATA is passed through. Scan until end of CDATA.
		*/
		for {
			advance(t) or_return
			if t.r < 0 { return true, .CDATA_Not_Terminated }

			if !strings.has_prefix(t.src[t.offset:], CDATA_END) { continue }

			// `advance` consumed the first byte of the end tag, skip the rest.
			t.read_offset += len(CDATA_END) - 1

			cdata := t.src[start : t.read_offset]
			if .CDATA_Unbox in options {
				cdata = cdata[len(CDATA_START):]
				cdata = cdata[:len(cdata) - len(CDATA_END)]
			}

			write_string(builder, cdata)
			return false, .None
		}

	} else if strings.has_prefix(rest, COMMENT_START) {
		/*
			`advance` already consumed the `<`, hence the `- 1`.
			Skipping one byte more would swallow the first `-` of `-->` in an empty comment.
		*/
		t.read_offset += len(COMMENT_START) - 1

		/*
			Comment is passed through by default. Scan until end of Comment.
		*/
		for {
			advance(t) or_return
			if t.r < 0 { return true, .Comment_Not_Terminated }

			if !strings.has_prefix(t.src[t.offset:], COMMENT_END) { continue }

			t.read_offset += len(COMMENT_END) - 1

			if .Comment_Strip not_in options {
				comment := t.src[start : t.read_offset]
				write_string(builder, comment)
			}
			return false, .None
		}
	}

	return false, .None
}
|
||||
122
core/encoding/entity/example/entity_example.odin
Normal file
122
core/encoding/entity/example/entity_example.odin
Normal file
@@ -0,0 +1,122 @@
|
||||
package unicode_entity_example
|
||||
|
||||
import "core:encoding/xml"
|
||||
import "core:encoding/entity"
|
||||
import "core:strings"
|
||||
import "core:mem"
|
||||
import "core:fmt"
|
||||
import "core:time"
|
||||
|
||||
OPTIONS :: xml.Options{
|
||||
flags = {
|
||||
.Ignore_Unsupported, .Intern_Comments,
|
||||
},
|
||||
expected_doctype = "",
|
||||
}
|
||||
|
||||
/*
	Render `doc` with `xml.print` into a temporary builder and print the result to stdout.
*/
doc_print :: proc(doc: ^xml.Document) {
	sb: strings.Builder
	defer strings.destroy_builder(&sb)

	xml.print(strings.to_writer(&sb), doc)

	fmt.println(strings.to_string(sb))
}
|
||||
|
||||
/*
	Parse the `unicode.xml` test asset (embedded at compile time), print the document and
	report how long parsing took.
*/
_entities :: proc() {
	DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")

	doc: ^xml.Document
	err: xml.Error

	elapsed: time.Duration
	{
		// Only the parse itself is timed.
		time.SCOPED_TICK_DURATION(&elapsed)
		doc, err = xml.parse(DOC, OPTIONS)
	}
	defer xml.destroy(doc)

	doc_print(doc)

	ms         := time.duration_milliseconds(elapsed)
	per_second := f64(1000.0) / ms
	speed      := per_second * f64(len(DOC)) / 1_024.0 / 1_024.0

	fmt.printf("Parse time: %.2f ms (%.2f MiB/s).\n", ms, speed)
	fmt.printf("Error: %v\n", err)
}
|
||||
|
||||
/*
	Decode `val` using `options` and print the input next to the decoded output.

	`label` names the demonstrated option set. `parse_err` is the result of parsing the
	document `val` was taken from and is only echoed.
*/
@(private="file")
_print_decoded :: proc(label: string, val: string, options: entity.XML_Decode_Options, parse_err: xml.Error) {
	using fmt

	println()
	replaced, decode_err := entity.decode_xml(val, options)
	defer delete(replaced)

	printf("Before: '%v', Err: %v\n", val, parse_err)
	printf("%v: '%v'\nOK: %v\n", label, replaced, decode_err)
	println()
}

/*
	Parse the embedded `test.html`, print it, and run `entity.decode_xml` with various
	options on the values of some of its `<div>`s. The first two demos are disabled.
*/
_main :: proc() {
	doc, err := xml.parse(#load("test.html"))
	defer xml.destroy(doc)
	doc_print(doc)

	if false {
		_print_decoded("Passthrough", doc.root.children[1].children[2].value, {}, err)
	}

	if false {
		_print_decoded("CDATA_Unbox", doc.root.children[1].children[2].value, { .CDATA_Unbox }, err)
	}

	if true {
		_print_decoded("CDATA_Decode", doc.root.children[1].children[2].value, { .CDATA_Unbox, .CDATA_Decode }, err)
	}

	if true {
		_print_decoded("Comment_Strip", doc.root.children[1].children[1].value, { .Comment_Strip }, err)
	}
}
|
||||
|
||||
/*
	Run the example under a tracking allocator and report anything still allocated afterwards.
*/
main :: proc() {
	track: mem.Tracking_Allocator
	mem.tracking_allocator_init(&track, context.allocator)
	context.allocator = mem.tracking_allocator(&track)

	_main()
	//_entities()

	if len(track.allocation_map) == 0 { return }

	fmt.println()
	for _, leak in track.allocation_map {
		fmt.printf("%v Leaked %v bytes.\n", leak.location, leak.size)
	}
}
|
||||
26
core/encoding/entity/example/test.html
Normal file
26
core/encoding/entity/example/test.html
Normal file
@@ -0,0 +1,26 @@
|
||||
<html>
|
||||
<head>
|
||||
<title>Entity Reference Test</title>
|
||||
<style>
|
||||
body {
|
||||
background: #000; color: #eee;
|
||||
width: 40%;
|
||||
margin-left: auto;
|
||||
margin-right: auto;
|
||||
font-size: 14pt;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Entity Reference Test</h1>
|
||||
<div id="test_cdata_in_comment" foo="">
|
||||
Foozle]! © <!-- <![CDATA[ ® ]]> -->42&;1234&
|
||||
</div>
|
||||
<div id="test_cdata_unwrap_and_passthrough">
|
||||
Foozle]! © <![CDATA[BOX ® /BOX]]>42&;1234&
|
||||
</div>
|
||||
<div>
|
||||
| | | fj ` \ ® ϱ ∳
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
7493
core/encoding/entity/generated.odin
Normal file
7493
core/encoding/entity/generated.odin
Normal file
File diff suppressed because it is too large
Load Diff
@@ -519,8 +519,6 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attr, offset: int, err: Error)
|
||||
_ = expect(t, .Eq) or_return
|
||||
value := expect(t, .String) or_return
|
||||
|
||||
error(t, t.offset, "String: %v\n", value)
|
||||
|
||||
attr.key = strings.intern_get(&doc.intern, key.text)
|
||||
attr.val = strings.intern_get(&doc.intern, value.text)
|
||||
|
||||
|
||||
287
core/unicode/tools/generate_entity_table.odin
Normal file
287
core/unicode/tools/generate_entity_table.odin
Normal file
@@ -0,0 +1,287 @@
|
||||
package xml_example
|
||||
|
||||
import "core:encoding/xml"
|
||||
import "core:os"
|
||||
import "core:path"
|
||||
import "core:mem"
|
||||
import "core:strings"
|
||||
import "core:strconv"
|
||||
import "core:slice"
|
||||
import "core:fmt"
|
||||
|
||||
/*
|
||||
Silent error handler for the parser.
|
||||
*/
|
||||
Error_Handler :: proc(pos: xml.Pos, fmt: string, args: ..any) {}
|
||||
|
||||
OPTIONS :: xml.Options{ flags = { .Ignore_Unsupported, }, expected_doctype = "unicode", }
|
||||
|
||||
/*
One named entity as collected from `unicode.xml` by `generate_encoding_entity_table`.
*/
Entity :: struct {
|
||||
// Value of the `id` attribute of an `<entity>` tag.
name: string,
|
||||
// Parsed from the `dec` attribute of the enclosing `<character>` tag.
codepoint: rune,
|
||||
// Value of the `<description>` child of the `<character>` tag, or "" if it has none.
description: string,
|
||||
}
|
||||
|
||||
/*
	Generate `core/encoding/entity/generated.odin` from `tests/core/assets/XML/unicode.xml`.

	Every `<entity id="...">` child of every `<character dec="...">` in the top-level
	`<charlist>` becomes a `name -> rune` mapping. The first mapping seen for a name wins.
	The mappings are emitted, sorted by name, as the body of `named_xml_entity_to_rune`,
	a switch on the first byte of the name with a nested switch on the full name.

	The generated source is also printed to stdout. Exits the process with status 1 if the
	XML can't be loaded or doesn't have the expected structure.
*/
generate_encoding_entity_table :: proc() {
	using fmt

	filename := path.join(ODIN_ROOT, "tests", "core", "assets", "XML", "unicode.xml")
	defer delete(filename)

	generated_filename := path.join(ODIN_ROOT, "core", "encoding", "entity", "generated.odin")
	defer delete(generated_filename)

	doc, err := xml.parse(filename, OPTIONS, Error_Handler)
	defer xml.destroy(doc)

	if err != .None {
		printf("Load/Parse error: %v\n", err)
		if err == .File_Error {
			printf("\"%v\" not found. Did you run \"tests\\download_assets.py\"?", filename)
		}
		os.exit(1)
	}

	printf("\"%v\" loaded and parsed.\n", filename)

	generated_buf: strings.Builder
	defer strings.destroy_builder(&generated_buf)
	w := strings.to_writer(&generated_buf)

	charlist, charlist_ok := xml.find_child_by_ident(doc.root, "charlist")
	if !charlist_ok {
		eprintln("Could not locate top-level `<charlist>` tag.")
		os.exit(1)
	}

	printf("Found `<charlist>` with %v children.\n", len(charlist.children))

	/*
		`names` holds the keys of `entity_map` so they can be sorted.
		The strings themselves are owned by `doc`, only the containers are ours.
	*/
	entity_map: map[string]Entity
	names:      [dynamic]string

	min_name_length := max(int)
	max_name_length := min(int)
	shortest_name:  string
	longest_name:   string

	count := 0
	for char in charlist.children {
		if char.ident != "character" {
			eprintf("Expected `<character>`, got `<%v>`\n", char.ident)
			os.exit(1)
		}

		if codepoint_string, ok := xml.find_attribute_val_by_key(char, "dec"); !ok {
			eprintln("`<character dec=\"...\">` attribute not found.")
			os.exit(1)
		} else {
			codepoint := strconv.atoi(codepoint_string)

			desc, desc_ok := xml.find_child_by_ident(char, "description")
			description := desc.value if desc_ok else ""

			/*
				For us to be interested in this codepoint, it has to have at least one entity.
			*/
			nth := 0
			for {
				character_entity, entity_ok := xml.find_child_by_ident(char, "entity", nth)
				if !entity_ok { break }

				nth += 1
				if name, name_ok := xml.find_attribute_val_by_key(character_entity, "id"); name_ok {

					if len(name) == 0 {
						/*
							Invalid name. Skip.
						*/
						continue
					}

					if name == "\"\"" {
						printf("%#v\n", char)
						printf("%#v\n", character_entity)
					}

					if len(name) > max_name_length { longest_name  = name }
					if len(name) < min_name_length { shortest_name = name }

					min_name_length = min(min_name_length, len(name))
					max_name_length = max(max_name_length, len(name))

					e := Entity{
						name        = name,
						codepoint   = rune(codepoint),
						description = description,
					}

					// First mapping for a name wins.
					if _, seen := entity_map[name]; seen {
						continue
					}

					entity_map[name] = e
					append(&names, name)
					count += 1
				}
			}
		}
	}

	/*
		Sort by name.
	*/
	slice.sort(names[:])

	printf("Found %v unique `&name;` -> rune mappings.\n", count)
	printf("Shortest name: %v (%v)\n", shortest_name, min_name_length)
	printf("Longest name: %v (%v)\n", longest_name, max_name_length)

	// println(rune_to_string(1234))

	/*
		Generate table.
	*/
	wprintln(w, "package unicode_entity")
	wprintln(w, "")
	wprintln(w, GENERATED)
	wprintln(w, "")
	wprintf (w, TABLE_FILE_PROLOG)
	wprintln(w, "")

	wprintf (w, "// `&%v;`\n", shortest_name)
	wprintf (w, "XML_NAME_TO_RUNE_MIN_LENGTH :: %v\n", min_name_length)
	wprintf (w, "// `&%v;`\n", longest_name)
	wprintf (w, "XML_NAME_TO_RUNE_MAX_LENGTH :: %v\n", max_name_length)
	wprintln(w, "")

	/*
		The emitted guard uses `max(1, ...)`, so that `name[0]` is never evaluated for an
		empty name, even if the generated minimum length were zero or negative.
	*/
	wprintln(w,
`
/*
	Input:
		entity_name - a string, like "copy" that describes a user-encoded Unicode entity as used in XML.

	Output:
		"decoded" - The decoded rune if found by name, or -1 otherwise.
		"ok" - true if found, false if not.

	IMPORTANT: XML processors (including browsers) treat these names as case-sensitive. So do we.
*/
named_xml_entity_to_rune :: proc(name: string) -> (decoded: rune, ok: bool) {
	/*
		Early out if the name is too short or too long.
		max as a precaution in case the generated table has a bogus value.
	*/
	if len(name) < max(1, XML_NAME_TO_RUNE_MIN_LENGTH) || len(name) > XML_NAME_TO_RUNE_MAX_LENGTH {
		return -1, false
	}

	switch rune(name[0]) {
`)

	/*
		`names` is sorted, so all names sharing a first byte are adjacent.
		Open a new outer case whenever that byte changes, closing the previous inner switch.
	*/
	prefix := '?'
	should_close := false

	for v in names {
		if rune(v[0]) != prefix {
			if should_close {
				wprintln(w, "\t\t}\n")
			}

			prefix = rune(v[0])
			wprintf (w, "\tcase '%v':\n", prefix)
			wprintln(w, "\t\tswitch name {")
		}

		e := entity_map[v]

		wprintf(w, "\t\t\tcase \"%v\": \n", e.name)
		wprintf(w, "\t\t\t\t// %v\n", e.description)
		wprintf(w, "\t\t\t\treturn %v, true\n", rune_to_string(e.codepoint))

		should_close = true
	}
	wprintln(w, "\t\t}")
	wprintln(w, "\t}")
	wprintln(w, "\treturn -1, false")
	wprintln(w, "}\n")
	wprintln(w, GENERATED)

	println()
	println(strings.to_string(generated_buf))
	println()

	written := os.write_entire_file(generated_filename, transmute([]byte)strings.to_string(generated_buf))

	if written {
		fmt.printf("Successfully written generated \"%v\".", generated_filename)
	} else {
		fmt.printf("Failed to write generated \"%v\".", generated_filename)
	}

	/*
		Only the containers are freed here. The name strings were handed to us by the XML
		document and are never allocated by this procedure, so they must not be freed
		individually, and `names` must not be touched again once it has been deleted.
	*/
	delete(entity_map)
	delete(names)
}
|
||||
|
||||
GENERATED :: `/*
|
||||
------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
|
||||
*/`
|
||||
|
||||
TABLE_FILE_PROLOG :: `/*
|
||||
This file is generated from "https://www.w3.org/2003/entities/2007xml/unicode.xml".
|
||||
|
||||
UPDATE:
|
||||
- Ensure the XML file was downloaded using "tests\core\download_assets.py".
|
||||
- Run "core/unicode/tools/generate_entity_table.odin"
|
||||
|
||||
Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
|
||||
|
||||
Copyright © 2021 World Wide Web Consortium, (Massachusetts Institute of Technology,
|
||||
European Research Consortium for Informatics and Mathematics, Keio University, Beihang).
|
||||
|
||||
All Rights Reserved.
|
||||
|
||||
This work is distributed under the W3C® Software License [1] in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
|
||||
[1] http://www.w3.org/Consortium/Legal/copyright-software
|
||||
|
||||
See also: LICENSE_table.md
|
||||
*/
|
||||
`
|
||||
|
||||
/*
	Format `r` as the Odin expression `rune(0x<hex>)`.

	The value is printed as 8 hex digits, after which leading `00` pairs are dropped
	as long as more than 2 digits remain. Uses the temporary allocator.
*/
rune_to_string :: proc(r: rune) -> (res: string) {
	hex := fmt.tprintf("%08x", int(r))

	for len(hex) > 2 {
		if hex[0] != '0' || hex[1] != '0' { break }
		hex = hex[2:]
	}

	res = fmt.tprintf("rune(0x%v)", hex)
	return
}
|
||||
|
||||
/*
	Reports whether `name` contains a `.`.
*/
is_dotted_name :: proc(name: string) -> (dotted: bool) {
	// `.` is ASCII, so looking at individual bytes is equivalent to looking at runes.
	for i in 0..<len(name) {
		if name[i] == '.' {
			dotted = true
			break
		}
	}
	return
}
|
||||
|
||||
/*
	Run the generator under a tracking allocator and report anything still allocated afterwards.
*/
main :: proc() {
	track: mem.Tracking_Allocator
	mem.tracking_allocator_init(&track, context.allocator)
	context.allocator = mem.tracking_allocator(&track)

	generate_encoding_entity_table()

	leaked := len(track.allocation_map)
	if leaked > 0 {
		fmt.println()
		for _, leak in track.allocation_map {
			fmt.printf("%v Leaked %v bytes.\n", leak.location, leak.size)
		}
	}
	fmt.println("Done and cleaned up!")
}
|
||||
Reference in New Issue
Block a user