[core:encoding/entity] Add new package to decode &<entity>; entities.

Includes a generator that produces the lookup table for named entities.
This commit is contained in:
Jeroen van Rijn
2021-12-02 20:12:12 +01:00
parent 5807214406
commit 2dd67dba89
7 changed files with 8307 additions and 2 deletions

View File

@@ -0,0 +1,21 @@
# License
By obtaining, using and/or copying this work, you (the licensee) agree that you have read, understood, and will comply with the following terms and conditions.
Permission to copy, modify, and distribute this software and its documentation, with or without modification, for any purpose and without fee or royalty is hereby granted, provided that you include the following on ALL copies of the software and documentation or portions thereof, including modifications:
The full text of this NOTICE in a location viewable to users of the redistributed or derivative work.
Any pre-existing intellectual property disclaimers, notices, or terms and conditions. If none exist, the W3C Software Short Notice should be included (hypertext is preferred, text is permitted) within the body of any redistributed or derivative code.
Notice of any changes or modifications to the files, including the date changes were made. (We recommend you provide URIs to the location from which the code is derived.)
# Disclaimers
THIS SOFTWARE AND DOCUMENTATION IS PROVIDED "AS IS," AND COPYRIGHT HOLDERS MAKE NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS.
COPYRIGHT HOLDERS WILL NOT BE LIABLE FOR ANY DIRECT, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF ANY USE OF THE SOFTWARE OR DOCUMENTATION.
The name and trademarks of copyright holders may NOT be used in advertising or publicity pertaining to the software without specific, written prior permission. Title to copyright in this software and any associated documentation will at all times remain with copyright holders.
# Notes
This version: http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231

View File

@@ -0,0 +1,358 @@
package unicode_entity
/*
A unicode entity encoder/decoder
Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
This code has several procedures to map unicode runes to/from different textual encodings.
- SGML/XML/HTML entity
-- &#<decimal>;
-- &#x<hexadecimal>;
-- &<entity name>; (If the lookup tables are compiled in).
Reference: https://www.w3.org/2003/entities/2007xml/unicode.xml
- URL encode / decode %hex entity
Reference: https://datatracker.ietf.org/doc/html/rfc3986/#section-2.1
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
import "core:unicode/utf8"
import "core:unicode"
import "core:strings"
// `unicode.MAX_RUNE` converted to `int`; `xml_decode_entity` rejects numeric entities above this value.
MAX_RUNE_CODEPOINT :: int(unicode.MAX_RUNE)
// Local shorthands for the `strings` builder writers used throughout this file.
write_rune :: strings.write_rune_builder
write_string :: strings.write_string_builder
/*
Errors reported by the procedures in this file. `.None` means success.
*/
Error :: enum u8 {
None = 0,
// `advance` was called with a nil tokenizer.
Tokenizer_Is_Nil,
// `advance` read a zero byte from the input.
Illegal_NUL_Character,
// `advance` could not decode a multi-byte UTF-8 sequence.
Illegal_UTF_Encoding,
// `advance` found a byte order mark anywhere other than offset 0.
Illegal_BOM,
// The input ended while scanning for `CDATA_END`.
CDATA_Not_Terminated,
// The input ended while scanning for `COMMENT_END`.
Comment_Not_Terminated,
// An `&` was never followed by a closing `;`.
Invalid_Entity_Encoding,
}
/*
Cursor over `src` that is moved forward one rune at a time by `advance`.
*/
Tokenizer :: struct {
// Most recently read rune; `advance` sets it to -1 once the input is exhausted.
r: rune,
// Width in bytes of `r` as stored in `src`.
w: int,
// The input being scanned.
src: string,
// Byte offset in `src` at which `r` starts.
offset: int,
// Byte offset in `src` of the next byte `advance` will read.
read_offset: int,
}
// Delimiters that open and close a CDATA section.
CDATA_START :: "<![CDATA["
CDATA_END :: "]]>"
// Delimiters that open and close a comment.
COMMENT_START :: "<!--"
COMMENT_END :: "-->"
/*
Options accepted by `decode_xml`.
Default (no options set): CDATA sections and comments are passed through unchanged.
*/
XML_Decode_Option :: enum u8 {
/*
CDATA is unboxed: the `<![CDATA[` and `]]>` delimiters are removed and only the contents are written.
*/
CDATA_Unbox,
/*
Unboxed CDATA is decoded as well, i.e. entities inside it are replaced.
Ignored if `.CDATA_Unbox` is not given.
*/
CDATA_Decode,
/*
Comments are stripped from the output instead of being passed through.
*/
Comment_Strip,
}
// Set of `XML_Decode_Option` flags, stored in a `u8`.
XML_Decode_Options :: bit_set[XML_Decode_Option; u8]
/*
	Decode a string that may include SGML/XML/HTML entities.

	Inputs:
	- input:     The text to decode.
	- options:   See `XML_Decode_Option`. By default CDATA and comments are passed through unchanged.
	- allocator: Used for the temporary builder and for the returned string.

	Returns:
	- decoded: A newly allocated string. The caller has to free the result.
	- err:     `.None` on success, otherwise the first error reported by the tokenizer
	           or by CDATA/comment handling. On error `decoded` is empty.

	Entities that are not terminated by `;` or that cannot be decoded are passed through as-is.
*/
decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) {
	context.allocator = allocator

	if len(input) == 0 { return "", .None }

	builder := strings.make_builder()
	defer strings.destroy_builder(&builder)

	t := Tokenizer{src=input}

	/*
		`in_data` is true only while we are inside a CDATA section that is being unboxed _and_ decoded.
		Every other CDATA section or comment is consumed in its entirety by `_handle_xml_special`.
	*/
	in_data := false

	loop: for {
		advance(&t) or_return
		if t.r < 0 { break loop }

		switch t.r {
		case '<':
			if in_data {
				/*
					Inside a CDATA section `<` is ordinary character data.
					It must neither be dropped nor end the section.
				*/
				write_rune(&builder, '<')
			} else {
				/*
					Might be the start of a CDATA tag or comment.
					We don't need to check if we need to write a `<`, because if it isn't CDATA or a comment,
					it couldn't have been part of an XML tag body to be decoded here.
				*/
				in_data = _handle_xml_special(&t, &builder, options) or_return
			}

		case ']':
			/*
				If we're unboxing _and_ decoding CDATA, we have to check for the end tag.
				`has_prefix` also matches a terminator that sits at the very end of the input.
			*/
			if in_data && strings.has_prefix(t.src[t.offset:], CDATA_END) {
				in_data = false
				// `read_offset` is already one past the first `]`; skip the remaining `]>`.
				t.read_offset += len(CDATA_END) - 1
				continue
			}
			/*
				Either we're not in CDATA, or this `]` is not part of the terminator.
				In both cases it is literal text.
			*/
			write_rune(&builder, ']')

		case:
			if in_data && .CDATA_Decode not_in options {
				/*
					Unboxed, but undecoded.
				*/
				write_rune(&builder, t.r)
				continue
			}

			if t.r == '&' {
				if entity, entity_err := _extract_xml_entity(&t); entity_err != .None {
					/*
						We read to the end of the string without closing the entity.
						Pass through as-is. `entity` still includes the leading `&` in this case.
					*/
					write_string(&builder, entity)
				} else {
					if decoded, ok := xml_decode_entity(entity); ok {
						write_rune(&builder, decoded)
					} else {
						/*
							Decode failed. Pass through original.
						*/
						write_string(&builder, "&")
						write_string(&builder, entity)
						write_string(&builder, ";")
					}
				}
			} else {
				write_rune(&builder, t.r)
			}
		}
	}

	return strings.clone(strings.to_string(builder), allocator), err
}
/*
	Move the tokenizer forward by one rune.

	On success `t.r` holds the rune that was read, `t.w` its width in bytes, `t.offset` its
	starting byte offset, and `t.read_offset` points just past it.
	At end of input `t.r` is set to -1, `t.offset` to `len(t.src)` and `.None` is returned.

	Errors: `.Tokenizer_Is_Nil`, `.Illegal_NUL_Character`, `.Illegal_UTF_Encoding`, `.Illegal_BOM`.
	When one of the last three is returned, `t.read_offset` has not been advanced.
*/
advance :: proc(t: ^Tokenizer) -> (err: Error) {
	if t == nil { return .Tokenizer_Is_Nil }

	#no_bounds_check {
		// End of input.
		if t.read_offset >= len(t.src) {
			t.offset = len(t.src)
			t.r      = -1
			return .None
		}

		// Start by assuming a single-byte rune.
		t.offset = t.read_offset
		t.r      = rune(t.src[t.read_offset])
		t.w      = 1

		if t.r == 0 {
			return .Illegal_NUL_Character
		}

		if t.r >= utf8.RUNE_SELF {
			// Lead byte of a multi-byte sequence: decode it properly.
			t.r, t.w = utf8.decode_rune_in_string(t.src[t.read_offset:])

			if t.r == utf8.RUNE_ERROR && t.w == 1 {
				return .Illegal_UTF_Encoding
			}
			if t.r == utf8.RUNE_BOM && t.offset > 0 {
				return .Illegal_BOM
			}
		}

		t.read_offset += t.w
	}
	return .None
}
/*
	Decode the body of a single entity, i.e. the text between `&` and `;`.

	Accepted forms:
	- `#<decimal>`
	- `#x<hexadecimal>` or `#X<hexadecimal>`
	- anything else is looked up as a named entity using `named_xml_entity_to_rune`.

	Returns the decoded rune and `true`, or `-1, false` if the entity is empty, has no digits,
	contains a character that is invalid for its base, or exceeds `MAX_RUNE_CODEPOINT`.
*/
xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
	entity := entity
	if len(entity) == 0 { return -1, false }

	switch entity[0] {
	case '#':
		base := 10
		val  := 0
		entity = entity[1:]

		// `&#;` has no digits.
		if len(entity) == 0 { return -1, false }

		if entity[0] == 'x' || entity[0] == 'X' {
			base = 16
			entity = entity[1:]

			/*
				`&#x;` has a radix marker but no digits.
				Without this check the loop below would not run and we'd return `rune(0), true`.
			*/
			if len(entity) == 0 { return -1, false }
		}

		for len(entity) > 0 {
			r := entity[0]
			switch r {
			case '0'..'9':
				val *= base
				val += int(r - '0')

			case 'a'..'f':
				if base == 10 { return -1, false }
				val *= base
				val += int(r - 'a' + 10)

			case 'A'..'F':
				if base == 10 { return -1, false }
				val *= base
				val += int(r - 'A' + 10)

			case:
				return -1, false
			}

			// Checked after every digit, so `val` can't grow without bound on long inputs.
			if val > MAX_RUNE_CODEPOINT { return -1, false }
			entity = entity[1:]
		}
		return rune(val), true

	case:
		/*
			Named entity.
		*/
		return named_xml_entity_to_rune(entity)
	}
}
/*
	Private XML helper to extract an `&<stuff>;` entity.

	Expects the tokenizer to be positioned on the `&`. Consumes bytes up to and including the
	first `;`, or to the end of the input if there is none.

	Returns the text between `&` and `;` and `.None` if a `;` was found.
	Otherwise returns everything from the `&` to the end of the input, together with
	`.Invalid_Entity_Encoding`.
*/
@(private="file")
_extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
	assert(t != nil && t.r == '&')

	/*
		We scan bytes rather than runes. `;` is ASCII, so that's all we need to compare against.
	*/
	ampersand := t.offset
	limit     := len(t.src)
	semicolon := -1

	#no_bounds_check {
		for semicolon < 0 && t.read_offset < limit {
			if t.src[t.read_offset] == ';' {
				semicolon = t.read_offset
			}
			t.read_offset += 1
		}
	}

	if semicolon < 0 {
		// Unterminated: hand back the raw text, `&` included.
		return t.src[ampersand : t.read_offset], .Invalid_Entity_Encoding
	}
	return t.src[ampersand + 1 : semicolon], .None
}
/*
	Private XML helper for CDATA and comments.

	Expects the tokenizer to be positioned on a `<`.

	- If a CDATA section starts here and both `.CDATA_Unbox` and `.CDATA_Decode` are set, only the
	  opening delimiter is consumed and `in_data = true` is returned. The caller then decodes the
	  contents and watches for `CDATA_END` itself.
	- Any other CDATA section is consumed in full and written to `builder`, with its delimiters
	  removed if `.CDATA_Unbox` is set.
	- A comment is consumed in full and written to `builder` unless `.Comment_Strip` is set.
	- If neither starts here, nothing is consumed or written.

	Errors: `.CDATA_Not_Terminated` or `.Comment_Not_Terminated` if the input ends before the
	closing delimiter is found, plus anything `advance` reports.
*/
@(private="file")
_handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) {
	assert(t != nil && t.r == '<')

	/*
		`has_prefix` does its own length check, so each delimiter is tested against its own length
		and a delimiter that ends exactly at the end of the input is still recognized.
	*/
	rest := t.src[t.offset:]

	if strings.has_prefix(rest, CDATA_START) {
		// `read_offset` is already one past the `<`.
		t.read_offset += len(CDATA_START) - 1

		if .CDATA_Unbox in options && .CDATA_Decode in options {
			/*
				We're unboxing _and_ decoding CDATA
			*/
			return true, .None
		}

		/*
			CDATA is passed through.
		*/
		offset := t.offset

		/*
			Scan until end of CDATA.
		*/
		for {
			advance(t) or_return
			if t.r < 0 { return true, .CDATA_Not_Terminated }

			if strings.has_prefix(t.src[t.offset:], CDATA_END) {
				t.read_offset += len(CDATA_END) - 1

				cdata := t.src[offset : t.read_offset]
				if .CDATA_Unbox in options {
					cdata = cdata[len(CDATA_START):]
					cdata = cdata[:len(cdata) - len(CDATA_END)]
				}
				write_string(builder, cdata)
				return false, .None
			}
		}

	} else if strings.has_prefix(rest, COMMENT_START) {
		/*
			`read_offset` is already one past the `<`, hence the `- 1`.
			Skipping a full `len(COMMENT_START)` would step over the first byte of the comment body
			and miss the terminator of an empty comment, `<!---->`.
		*/
		t.read_offset += len(COMMENT_START) - 1

		/*
			Comment is passed through by default.
		*/
		offset := t.offset

		/*
			Scan until end of Comment.
		*/
		for {
			advance(t) or_return
			if t.r < 0 { return true, .Comment_Not_Terminated }

			if strings.has_prefix(t.src[t.offset:], COMMENT_END) {
				t.read_offset += len(COMMENT_END) - 1

				if .Comment_Strip not_in options {
					comment := t.src[offset : t.read_offset]
					write_string(builder, comment)
				}
				return false, .None
			}
		}
	}
	return false, .None
}

View File

@@ -0,0 +1,122 @@
package unicode_entity_example
import "core:encoding/xml"
import "core:encoding/entity"
import "core:strings"
import "core:mem"
import "core:fmt"
import "core:time"
// Parser options used by `_entities` below: two parser flags and no expected doctype.
OPTIONS :: xml.Options{
flags = {
.Ignore_Unsupported, .Intern_Comments,
},
expected_doctype = "",
}
/*
	Render `doc` with `xml.print` into a temporary string builder and print the result to stdout.
*/
doc_print :: proc(doc: ^xml.Document) {
	sb: strings.Builder
	defer strings.destroy_builder(&sb)

	xml.print(strings.to_writer(&sb), doc)

	rendered := strings.to_string(sb)
	fmt.println(rendered)
}
/*
	Parse the embedded `unicode.xml` test asset, print the resulting document, and report
	how long parsing took, the throughput in MiB/s, and the parser's error value.
*/
_entities :: proc() {
	DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")

	document:  ^xml.Document
	parse_err: xml.Error
	elapsed:   time.Duration

	{
		// Only the parse itself is timed; the duration is stored when this scope ends.
		time.SCOPED_TICK_DURATION(&elapsed)
		document, parse_err = xml.parse(DOC, OPTIONS)
	}
	defer xml.destroy(document)

	doc_print(document)

	elapsed_ms := time.duration_milliseconds(elapsed)
	mib_per_s  := (f64(1000.0) / elapsed_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0

	fmt.printf("Parse time: %.2f ms (%.2f MiB/s).\n", elapsed_ms, mib_per_s)
	fmt.printf("Error: %v\n", parse_err)
}
/*
	Parse the embedded `test.html`, print the document, and run `entity.decode_xml` on the values
	of selected elements with different option sets.

	The `if false` / `if true` guards are manual switches for the individual demonstrations.
	NOTE(review): the second value returned by `decode_xml` is an `Error`, even though it is
	printed under the label "OK".
*/
_main :: proc() {
	doc, err := xml.parse(#load("test.html"))
	defer xml.destroy(doc)
	doc_print(doc)

	// Default options: CDATA and comments pass through.
	if false {
		source := doc.root.children[1].children[2].value

		fmt.println()
		result, decode_err := entity.decode_xml(source)
		defer delete(result)

		fmt.printf("Before: '%v', Err: %v\n", source, err)
		fmt.printf("Passthrough: '%v'\nOK: %v\n", result, decode_err)
		fmt.println()
	}

	// CDATA is unboxed, its contents are not decoded.
	if false {
		source := doc.root.children[1].children[2].value

		fmt.println()
		result, decode_err := entity.decode_xml(source, { .CDATA_Unbox })
		defer delete(result)

		fmt.printf("Before: '%v', Err: %v\n", source, err)
		fmt.printf("CDATA_Unbox: '%v'\nOK: %v\n", result, decode_err)
		fmt.println()
	}

	// CDATA is unboxed and its contents are decoded.
	if true {
		source := doc.root.children[1].children[2].value

		fmt.println()
		result, decode_err := entity.decode_xml(source, { .CDATA_Unbox, .CDATA_Decode })
		defer delete(result)

		fmt.printf("Before: '%v', Err: %v\n", source, err)
		fmt.printf("CDATA_Decode: '%v'\nOK: %v\n", result, decode_err)
		fmt.println()
	}

	// Comments are stripped.
	if true {
		source := doc.root.children[1].children[1].value

		fmt.println()
		result, decode_err := entity.decode_xml(source, { .Comment_Strip })
		defer delete(result)

		fmt.printf("Before: '%v', Err: %v\n", source, err)
		fmt.printf("Comment_Strip: '%v'\nOK: %v\n", result, decode_err)
		fmt.println()
	}
}
/*
	Entry point. Runs `_main` under a tracking allocator and lists every allocation
	that is still live afterwards.
*/
main :: proc() {
	tracker: mem.Tracking_Allocator
	mem.tracking_allocator_init(&tracker, context.allocator)
	context.allocator = mem.tracking_allocator(&tracker)

	_main()
	//_entities()

	// Nothing left in the map means nothing leaked.
	if len(tracker.allocation_map) == 0 {
		return
	}

	fmt.println()
	for _, entry in tracker.allocation_map {
		fmt.printf("%v Leaked %v bytes.\n", entry.location, entry.size)
	}
}

View File

@@ -0,0 +1,26 @@
<html>
<head>
<title>Entity Reference Test</title>
<style>
body {
background: #000; color: #eee;
width: 40%;
margin-left: auto;
margin-right: auto;
font-size: 14pt;
}
</style>
</head>
<body>
<h1>Entity Reference Test</h1>
<div id="test_cdata_in_comment" foo="">
Foozle]!&#32;&copy;&#x20;<!-- <![CDATA[&#32;&reg;&#x20;]]> -->42&;1234&
</div>
<div id="test_cdata_unwrap_and_passthrough">
Foozle]!&#32;&copy;&#x20;<![CDATA[BOX&#32;&reg;&#x20;/BOX]]>42&;1234&
</div>
<div>
&verbar; &vert; &VerticalLine; &fjlig; &grave; &bsol; &reg; &rhov; &CounterClockwiseContourIntegral;
</div>
</body>
</html>

File diff suppressed because it is too large Load Diff

View File

@@ -519,8 +519,6 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attr, offset: int, err: Error)
_ = expect(t, .Eq) or_return
value := expect(t, .String) or_return
error(t, t.offset, "String: %v\n", value)
attr.key = strings.intern_get(&doc.intern, key.text)
attr.val = strings.intern_get(&doc.intern, value.text)

View File

@@ -0,0 +1,287 @@
package xml_example
import "core:encoding/xml"
import "core:os"
import "core:path"
import "core:mem"
import "core:strings"
import "core:strconv"
import "core:slice"
import "core:fmt"
/*
Silent error handler for the parser.
It has an empty body, so the position, format string and arguments it receives are discarded.
Passed to `xml.parse` by `generate_encoding_entity_table`.
*/
Error_Handler :: proc(pos: xml.Pos, fmt: string, args: ..any) {}
// Parser options for loading `unicode.xml`: one parser flag, and the doctype is expected to be "unicode".
OPTIONS :: xml.Options{ flags = { .Ignore_Unsupported, }, expected_doctype = "unicode", }
/*
One named entity collected from `unicode.xml`.
*/
Entity :: struct {
// Entity name, taken from the `id` attribute of an `<entity>` tag.
name: string,
// Code point the name maps to, parsed from the `dec` attribute of the enclosing `<character>` tag.
codepoint: rune,
// Value of the `<description>` child of the `<character>` tag, or "" if it has none.
description: string,
}
/*
	Generate `core/encoding/entity/generated.odin` from `tests/core/assets/XML/unicode.xml`.

	Steps:
	- Parse the XML file and locate its top-level `<charlist>`.
	- For every `<character>`, collect each `<entity id="...">` name together with the character's
	  `dec` code point and description. The first occurrence of a name wins.
	- Sort the names and emit `named_xml_entity_to_rune`, a two-level `switch`: first on the
	  initial byte of the name, then on the full name.
	- Print the generated source to stdout and write it to the output file.

	Exits the process with status 1 if the file can't be parsed or doesn't have the expected shape.
*/
generate_encoding_entity_table :: proc() {
	using fmt

	filename := path.join(ODIN_ROOT, "tests", "core", "assets", "XML", "unicode.xml")
	defer delete(filename)

	generated_filename := path.join(ODIN_ROOT, "core", "encoding", "entity", "generated.odin")
	defer delete(generated_filename)

	doc, err := xml.parse(filename, OPTIONS, Error_Handler)
	defer xml.destroy(doc)

	if err != .None {
		printf("Load/Parse error: %v\n", err)
		if err == .File_Error {
			printf("\"%v\" not found. Did you run \"tests\\download_assets.py\"?", filename)
		}
		os.exit(1)
	}

	printf("\"%v\" loaded and parsed.\n", filename)

	generated_buf: strings.Builder
	defer strings.destroy_builder(&generated_buf)
	w := strings.to_writer(&generated_buf)

	charlist, charlist_ok := xml.find_child_by_ident(doc.root, "charlist")
	if !charlist_ok {
		eprintln("Could not locate top-level `<charlist>` tag.")
		os.exit(1)
	}

	printf("Found `<charlist>` with %v children.\n", len(charlist.children))

	entity_map: map[string]Entity
	names:      [dynamic]string

	min_name_length := max(int)
	max_name_length := min(int)
	shortest_name:  string
	longest_name:   string

	count := 0
	for char in charlist.children {
		if char.ident != "character" {
			eprintf("Expected `<character>`, got `<%v>`\n", char.ident)
			os.exit(1)
		}

		if codepoint_string, ok := xml.find_attribute_val_by_key(char, "dec"); !ok {
			// The attribute we look up is `dec`, so that is the one to report as missing.
			eprintln("`<character dec=\"...\">` attribute not found.")
			os.exit(1)
		} else {
			codepoint := strconv.atoi(codepoint_string)

			desc, desc_ok := xml.find_child_by_ident(char, "description")
			description := desc.value if desc_ok else ""

			/*
				For us to be interested in this codepoint, it has to have at least one entity.
			*/
			nth := 0
			for {
				character_entity, entity_ok := xml.find_child_by_ident(char, "entity", nth)
				if !entity_ok { break }
				nth += 1

				if name, name_ok := xml.find_attribute_val_by_key(character_entity, "id"); name_ok {
					if len(name) == 0 {
						/*
							Invalid name. Skip.
						*/
						continue
					}

					if name == "\"\"" {
						printf("%#v\n", char)
						printf("%#v\n", character_entity)
					}

					if len(name) > max_name_length { longest_name  = name }
					if len(name) < min_name_length { shortest_name = name }

					min_name_length = min(min_name_length, len(name))
					max_name_length = max(max_name_length, len(name))

					e := Entity{
						name        = name,
						codepoint   = rune(codepoint),
						description = description,
					}

					// First mapping for a name wins.
					if _, seen := entity_map[name]; seen {
						continue
					}

					entity_map[name] = e
					append(&names, name)
					count += 1
				}
			}
		}
	}

	/*
		Sort by name.
	*/
	slice.sort(names[:])

	printf("Found %v unique `&name;` -> rune mappings.\n", count)
	printf("Shortest name: %v (%v)\n", shortest_name, min_name_length)
	printf("Longest name:  %v (%v)\n", longest_name,  max_name_length)

	// println(rune_to_string(1234))

	/*
		Generate table.
	*/
	wprintln(w, "package unicode_entity")
	wprintln(w, "")
	wprintln(w, GENERATED)
	wprintln(w, "")
	wprintf (w, TABLE_FILE_PROLOG)
	wprintln(w, "")

	wprintf (w, "// `&%v;`\n", shortest_name)
	wprintf (w, "XML_NAME_TO_RUNE_MIN_LENGTH :: %v\n", min_name_length)
	wprintf (w, "// `&%v;`\n", longest_name)
	wprintf (w, "XML_NAME_TO_RUNE_MAX_LENGTH :: %v\n", max_name_length)
	wprintln(w, "")

	/*
		The emitted guard uses `max(1, ...)`: even if the generated minimum length were zero or
		negative, an empty name is rejected before `name[0]` is evaluated.
	*/
	wprintln(w,
`
/*
	Input:
		name - a string, like "copy" that describes a user-encoded Unicode entity as used in XML.

	Output:
		"decoded" - The decoded rune if found by name, or -1 otherwise.
		"ok"      - true if found, false if not.

	IMPORTANT: XML processors (including browsers) treat these names as case-sensitive. So do we.
*/
named_xml_entity_to_rune :: proc(name: string) -> (decoded: rune, ok: bool) {
	/*
		Early out if the name is too short or too long.
		max as a precaution in case the generated table has a bogus value,
		so that an empty name never reaches the index below.
	*/
	if len(name) < max(1, XML_NAME_TO_RUNE_MIN_LENGTH) || len(name) > XML_NAME_TO_RUNE_MAX_LENGTH {
		return -1, false
	}

	switch rune(name[0]) {
`)

	// `names` is sorted, so entries that share a first byte are adjacent and form one outer `case`.
	prefix := '?'
	should_close := false

	for v in names {
		if rune(v[0]) != prefix {
			if should_close {
				wprintln(w, "\t\t}\n")
			}

			prefix = rune(v[0])
			wprintf (w, "\tcase '%v':\n", prefix)
			wprintln(w, "\t\tswitch name {")
		}

		e := entity_map[v]

		wprintf(w, "\t\t\tcase \"%v\": \n",    e.name)
		wprintf(w, "\t\t\t\t// %v\n",        e.description)
		wprintf(w, "\t\t\t\treturn %v, true\n", rune_to_string(e.codepoint))

		should_close = true
	}
	wprintln(w, "\t\t}")
	wprintln(w, "\t}")
	wprintln(w, "\treturn -1, false")
	wprintln(w, "}\n")
	wprintln(w, GENERATED)

	println()
	println(strings.to_string(generated_buf))
	println()

	written := os.write_entire_file(generated_filename, transmute([]byte)strings.to_string(generated_buf))

	if written {
		fmt.printf("Successfully written generated \"%v\".", generated_filename)
	} else {
		fmt.printf("Failed to write generated \"%v\".", generated_filename)
	}

	/*
		Only the containers are ours to release. The name strings come from the parsed document
		and are released by the deferred `xml.destroy(doc)`. They must not be freed individually,
		and `names` must not be touched after it has been deleted.
	*/
	delete(entity_map)
	delete(names)
}
// Banner written near the top and at the end of the generated file.
GENERATED :: `/*
------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
*/`
// Header comment written into the generated file: origin of the data, how to regenerate it, and the W3C copyright notice.
TABLE_FILE_PROLOG :: `/*
This file is generated from "https://www.w3.org/2003/entities/2007xml/unicode.xml".
UPDATE:
- Ensure the XML file was downloaded using "tests\core\download_assets.py".
- Run "core/unicode/tools/generate_entity_table.odin"
Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
Copyright © 2021 World Wide Web Consortium, (Massachusetts Institute of Technology,
European Research Consortium for Informatics and Mathematics, Keio University, Beihang).
All Rights Reserved.
This work is distributed under the W3C® Software License [1] in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
[1] http://www.w3.org/Consortium/Legal/copyright-software
See also: LICENSE_table.md
*/
`
/*
	Format `r` as Odin source text of the form `rune(0x<hex>)`.

	The code point is printed as 8 lowercase hex digits, after which leading "00" pairs are
	removed while more than two digits remain. The digit count is therefore always even and at
	least two, e.g. `rune(0xa9)` or `rune(0x01d53b)`.

	The result is produced by `fmt.tprintf`.
*/
rune_to_string :: proc(r: rune) -> (res: string) {
	hex := fmt.tprintf("%08x", int(r))

	// Index of the first digit to keep.
	first := 0
	for len(hex) - first > 2 && hex[first:first + 2] == "00" {
		first += 2
	}

	res = fmt.tprintf("rune(0x%v)", hex[first:])
	return
}
/*
	Report whether `name` contains at least one `.`.

	`.` is a single-byte ASCII character, so scanning bytes gives the same answer as scanning runes.
*/
is_dotted_name :: proc(name: string) -> (dotted: bool) {
	for i := 0; i < len(name); i += 1 {
		if name[i] == '.' {
			dotted = true
			break
		}
	}
	return
}
/*
	Entry point. Runs the generator under a tracking allocator, lists every allocation
	that is still live afterwards, and prints a closing message.
*/
main :: proc() {
	tracker: mem.Tracking_Allocator
	mem.tracking_allocator_init(&tracker, context.allocator)
	context.allocator = mem.tracking_allocator(&tracker)

	generate_encoding_entity_table()

	leak_count := len(tracker.allocation_map)
	if leak_count > 0 {
		fmt.println()
		for _, entry in tracker.allocation_map {
			fmt.printf("%v Leaked %v bytes.\n", entry.location, entry.size)
		}
	}
	fmt.println("Done and cleaned up!")
}