[xml] Speedup.

This commit is contained in:
Jeroen van Rijn
2022-04-28 15:29:00 +02:00
parent 6df21d6a9f
commit 80878264b6
6 changed files with 246 additions and 183 deletions

View File

@@ -1,8 +1,7 @@
package xml
/*
An XML 1.0 / 1.1 parser
Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
@@ -10,6 +9,8 @@ package xml
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
package xml
import "core:io"
import "core:fmt"
@@ -40,17 +41,16 @@ print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error
written += wprintf(writer, "[Pre-root comment] %v\n", comment)
}
if doc.root != nil {
if len(doc.elements) > 0 {
wprintln(writer, " --- ")
print_element(writer, doc, 0)
wprintln(writer, " --- ")
print_element(writer, doc.root)
wprintln(writer, " --- ")
}
return written, .None
}
print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (written: int, err: io.Error) {
if element == nil { return }
print_element :: proc(writer: io.Writer, doc: ^Document, element_id: Element_ID, indent := 0) -> (written: int, err: io.Error) {
using fmt
tab :: proc(writer: io.Writer, indent: int) {
@@ -61,6 +61,8 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri
tab(writer, indent)
element := doc.elements[element_id]
if element.kind == .Element {
wprintf(writer, "<%v>\n", element.ident)
if len(element.value) > 0 {
@@ -74,7 +76,7 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri
}
for child in element.children {
print_element(writer, child, indent + 1)
print_element(writer, doc, child, indent + 1)
}
} else if element.kind == .Comment {
wprintf(writer, "[COMMENT] %v\n", element.value)

View File

@@ -1,52 +1,85 @@
package xml_example
import "core:encoding/xml"
import "core:os"
import "core:mem"
import "core:fmt"
import "core:time"
import "core:strings"
import "core:hash"
N :: 1
example :: proc() {
using fmt
doc: ^xml.Document
err: xml.Error
docs: [N]^xml.Document
errs: [N]xml.Error
times: [N]time.Duration
defer for round in 0..<N {
xml.destroy(docs[round])
}
DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")
input := DOC
parse_duration: time.Duration
{
time.SCOPED_TICK_DURATION(&parse_duration)
doc, err = xml.parse(DOC, xml.Options{flags={.Ignore_Unsupported}})
for round in 0..<N {
start := time.tick_now()
docs[round], errs[round] = xml.parse(input, xml.Options{
flags={.Ignore_Unsupported},
expected_doctype = "",
})
end := time.tick_now()
times[round] = time.tick_diff(start, end)
}
defer xml.destroy(doc)
ms := time.duration_milliseconds(parse_duration)
speed := (f64(1000.0) / ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
fmt.printf("Parse time: %v bytes in %.2f ms (%.2f MiB/s).\n", len(DOC), ms, speed)
fastest := time.Duration(max(i64))
slowest := time.Duration(0)
total := time.Duration(0)
if err != .None {
printf("Load/Parse error: %v\n", err)
if err == .File_Error {
for round in 0..<N {
fastest = min(fastest, times[round])
slowest = max(slowest, times[round])
total += times[round]
}
fastest_ms := time.duration_milliseconds(fastest)
slowest_ms := time.duration_milliseconds(slowest)
average_ms := time.duration_milliseconds(time.Duration(f64(total) / f64(N)))
fastest_speed := (f64(1000.0) / fastest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
slowest_speed := (f64(1000.0) / slowest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
average_speed := (f64(1000.0) / average_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
fmt.printf("N = %v\n", N)
fmt.printf("[Fastest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), fastest_ms, fastest_speed)
fmt.printf("[Slowest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), slowest_ms, slowest_speed)
fmt.printf("[Average]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), average_ms, average_speed)
if errs[0] != .None {
printf("Load/Parse error: %v\n", errs[0])
if errs[0] == .File_Error {
println("\"unicode.xml\" not found. Did you run \"tests\\download_assets.py\"?")
}
os.exit(1)
return
}
println("\"unicode.xml\" loaded and parsed.")
charlist, charlist_ok := xml.find_child_by_ident(doc.root, "charlist")
charlist, charlist_ok := xml.find_child_by_ident(docs[0], 0, "charlist")
if !charlist_ok {
eprintln("Could not locate top-level `<charlist>` tag.")
os.exit(1)
eprintln("Could not locate top-level `<charlist>` tag.")
return
}
printf("Found `<charlist>` with %v children.\n", len(charlist.children))
printf("Found `<charlist>` with %v children, %v elements total\n", len(docs[0].elements[charlist].children), docs[0].element_count)
crc32 := doc_hash(doc)
crc32 := doc_hash(docs[0])
printf("[%v] CRC32: 0x%08x\n", "🎉" if crc32 == 0xcaa042b9 else "🤬", crc32)
for round in 0..<N {
defer xml.destroy(docs[round])
}
}
doc_hash :: proc(doc: ^xml.Document, print := false) -> (crc32: u32) {

View File

@@ -1,22 +1,20 @@
package xml
/*
An XML 1.0 / 1.1 parser
Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
This file contains helper functions.
*/
package xml
/*
Find `tag`'s nth child with a given ident.
*/
find_child_by_ident :: proc(tag: ^Element, ident: string, nth := 0) -> (res: ^Element, found: bool) {
if tag == nil { return nil, false }
// Find parent's nth child with a given ident.
find_child_by_ident :: proc(doc: ^Document, parent_id: Element_ID, ident: string, nth := 0) -> (res: Element_ID, found: bool) {
tag := doc.elements[parent_id]
count := 0
for child in tag.children {
for child_id in tag.children {
child := doc.elements[child_id]
/*
Skip commments. They have no name.
*/
@@ -26,18 +24,16 @@ find_child_by_ident :: proc(tag: ^Element, ident: string, nth := 0) -> (res: ^El
If the ident matches and it's the nth such child, return it.
*/
if child.ident == ident {
if count == nth { return child, true }
if count == nth { return child_id, true }
count += 1
}
}
return nil, false
return 0, false
}
/*
Find an attribute by key.
*/
find_attribute_val_by_key :: proc(tag: ^Element, key: string) -> (val: string, found: bool) {
if tag == nil { return "", false }
// Find an attribute by key.
find_attribute_val_by_key :: proc(doc: ^Document, parent_id: Element_ID, key: string) -> (val: string, found: bool) {
tag := doc.elements[parent_id]
for attr in tag.attribs {
/*

View File

@@ -1,3 +1,14 @@
/*
An XML 1.0 / 1.1 parser
Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
package xml
import "core:fmt"

View File

@@ -1,8 +1,7 @@
package xml
/*
An XML 1.0 / 1.1 parser
Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
@@ -25,12 +24,17 @@ package xml
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
package xml
// An XML 1.0 / 1.1 parser
import "core:bytes"
import "core:strings"
import "core:encoding/entity"
import "core:intrinsics"
import "core:mem"
import "core:os"
import "core:strings"
likely :: intrinsics.expect
DEFAULT_Options :: Options{
flags = {
@@ -88,7 +92,9 @@ Option_Flag :: enum {
Option_Flags :: bit_set[Option_Flag; u16]
Document :: struct {
root: ^Element,
elements: [dynamic]Element,
element_count: Element_ID,
prolog: Attributes,
encoding: Encoding,
@@ -129,8 +135,8 @@ Element :: struct {
Comment,
},
parent: ^Element,
children: [dynamic]^Element,
parent: Element_ID,
children: [dynamic]Element_ID,
}
Attr :: struct {
@@ -185,7 +191,7 @@ Error :: enum {
No_DocType,
Too_Many_DocTypes,
DocType_Must_Proceed_Elements,
DocType_Must_Preceed_Elements,
/*
If a DOCTYPE is present _or_ the caller
@@ -237,12 +243,16 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
doc.tokenizer = t
doc.input = data
doc.elements = make([dynamic]Element, 1024, 1024, allocator)
// strings.intern_init(&doc.intern, allocator, allocator)
err = .Unexpected_Token
element, parent: ^Element
element, parent: Element_ID
tag_is_open := false
tag_is_open := false
first_element := true
open: Token
/*
If a DOCTYPE is present, the root tag has to match.
@@ -252,6 +262,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
loop: for {
skip_whitespace(t)
// NOTE(Jeroen): This is faster as a switch.
switch t.ch {
case '<':
/*
@@ -259,35 +270,85 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
*/
advance_rune(t)
open := scan(t)
#partial switch open.kind {
case .Question:
open = scan(t)
// NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
if likely(open.kind, Token_Kind.Ident) == .Ident {
/*
<?xml
e.g. <odin - Start of new element.
*/
next := scan(t)
#partial switch next.kind {
case .Ident:
if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
parse_prolog(doc) or_return
} else if len(doc.prolog) > 0 {
/*
We've already seen a prolog.
*/
return doc, .Too_Many_Prologs
} else {
/*
Could be `<?xml-stylesheet`, etc. Ignore it.
*/
skip_element(t) or_return
element = new_element(doc)
tag_is_open = true
if first_element {
/*
First element.
*/
parent = element
first_element = false
} else {
append(&doc.elements[parent].children, element)
}
doc.elements[element].parent = parent
doc.elements[element].ident = open.text
parse_attributes(doc, &doc.elements[element].attribs) or_return
/*
If a DOCTYPE is present _or_ the caller
asked for a specific DOCTYPE and the DOCTYPE
and root tag don't match, we return .Invalid_Root_Tag.
*/
if element == 0 { // Root tag?
if len(expected_doctype) > 0 && expected_doctype != open.text {
error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
return doc, .Invalid_DocType
}
}
/*
One of these should follow:
- `>`, which means we've just opened this tag and expect a later element to close it.
- `/>`, which means this is an 'empty' or self-closing tag.
*/
end_token := scan(t)
#partial switch end_token.kind {
case .Gt:
/*
We're now the new parent.
*/
parent = element
case .Slash:
/*
Empty tag. Close it.
*/
expect(t, .Gt) or_return
parent = doc.elements[element].parent
element = parent
tag_is_open = false
case:
error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
return
}
case .Exclaim:
} else if open.kind == .Slash {
/*
Close tag.
*/
ident := expect(t, .Ident) or_return
_ = expect(t, .Gt) or_return
if doc.elements[element].ident != ident.text {
error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
return doc, .Mismatched_Closing_Tag
}
parent = doc.elements[element].parent
element = parent
tag_is_open = false
} else if open.kind == .Exclaim {
/*
<!
*/
@@ -299,8 +360,8 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
if len(doc.doctype.ident) > 0 {
return doc, .Too_Many_DocTypes
}
if doc.root != nil {
return doc, .DocType_Must_Proceed_Elements
if doc.element_count > 0 {
return doc, .DocType_Must_Preceed_Elements
}
parse_doctype(doc) or_return
@@ -327,14 +388,14 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
comment := scan_comment(t) or_return
if .Intern_Comments in opts.flags {
if doc.root == nil {
if len(doc.elements) == 0 {
append(&doc.comments, comment)
} else {
el := new(Element)
el.parent = element
el.kind = .Comment
el.value = comment
append(&element.children, el)
el := new_element(doc)
doc.elements[el].parent = element
doc.elements[el].kind = .Comment
doc.elements[el].value = comment
append(&doc.elements[element].children, el)
}
}
@@ -343,83 +404,32 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
return
}
case .Ident:
} else if open.kind == .Question {
/*
e.g. <odin - Start of new element.
<?xml
*/
element = new(Element)
tag_is_open = true
if doc.root == nil {
/*
First element.
*/
doc.root = element
parent = element
} else {
append(&parent.children, element)
}
element.parent = parent
element.ident = open.text
parse_attributes(doc, &element.attribs) or_return
/*
If a DOCTYPE is present _or_ the caller
asked for a specific DOCTYPE and the DOCTYPE
and root tag don't match, we return .Invalid_Root_Tag.
*/
if element == doc.root {
if len(expected_doctype) > 0 && expected_doctype != open.text {
error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
return doc, .Invalid_DocType
next := scan(t)
#partial switch next.kind {
case .Ident:
if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
parse_prolog(doc) or_return
} else if len(doc.prolog) > 0 {
/*
We've already seen a prolog.
*/
return doc, .Too_Many_Prologs
} else {
/*
Could be `<?xml-stylesheet`, etc. Ignore it.
*/
skip_element(t) or_return
}
}
/*
One of these should follow:
- `>`, which means we've just opened this tag and expect a later element to close it.
- `/>`, which means this is an 'empty' or self-closing tag.
*/
end_token := scan(t)
#partial switch end_token.kind {
case .Gt:
/*
We're now the new parent.
*/
parent = element
case .Slash:
/*
Empty tag. Close it.
*/
expect(t, .Gt) or_return
parent = element.parent
element = parent
tag_is_open = false
case:
error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
return
}
case .Slash:
/*
Close tag.
*/
ident := expect(t, .Ident) or_return
_ = expect(t, .Gt) or_return
if element.ident != ident.text {
error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text)
return doc, .Mismatched_Closing_Tag
}
parent = element.parent
element = parent
tag_is_open = false
case:
} else {
error(t, t.offset, "Invalid Token after <: %#v\n", open)
return
}
@@ -442,7 +452,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
needs_processing |= .Decode_SGML_Entities in opts.flags
if !needs_processing {
element.value = body_text
doc.elements[element].value = body_text
continue
}
@@ -464,10 +474,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
decoded, decode_err := entity.decode_xml(body_text, decode_opts)
if decode_err == .None {
element.value = decoded
doc.elements[element].value = decoded
append(&doc.strings_to_free, decoded)
} else {
element.value = body_text
doc.elements[element].value = body_text
}
}
}
@@ -480,6 +490,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
return doc, .No_DocType
}
resize(&doc.elements, int(doc.element_count))
return doc, .None
}
@@ -497,26 +508,14 @@ parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_hand
parse :: proc { parse_from_file, parse_from_slice }
free_element :: proc(element: ^Element) {
if element == nil { return }
for child in element.children {
/*
NOTE: Recursive.
Could be rewritten so it adds them to a list of pointers to free.
*/
free_element(child)
}
delete(element.attribs)
delete(element.children)
free(element)
}
destroy :: proc(doc: ^Document) {
if doc == nil { return }
free_element(doc.root)
for el in doc.elements {
delete(el.attribs)
delete(el.children)
}
delete(doc.elements)
delete(doc.prolog)
delete(doc.comments)
@@ -686,4 +685,25 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
*/
doc.doctype.rest = string(t.src[offset : t.offset - 1])
return .None
}
Element_ID :: u32
new_element :: proc(doc: ^Document) -> (id: Element_ID) {
element_space := len(doc.elements)
// Need to resize
if int(doc.element_count) + 1 > element_space {
if element_space < 65536 {
element_space *= 2
} else {
element_space += 65536
}
resize(&doc.elements, element_space)
}
cur := doc.element_count
doc.element_count += 1
return cur
}

View File

@@ -224,7 +224,7 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) {
written += wprintf(writer, "[DOCTYPE] %v\n", doc.doctype.ident)
if len(doc.doctype.rest) > 0 {
wprintf(writer, "\t%v\n", doc.doctype.rest)
wprintf(writer, "\t%v\n", doc.doctype.rest)
}
}
@@ -232,17 +232,16 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) {
written += wprintf(writer, "[Pre-root comment] %v\n", comment)
}
if doc.root != nil {
wprintln(writer, " --- ")
print_element(writer, doc.root)
wprintln(writer, " --- ")
if doc.element_count > 0 {
wprintln(writer, " --- ")
print_element(writer, doc, 0)
wprintln(writer, " --- ")
}
return written, .None
}
print_element :: proc(writer: io.Writer, element: ^xml.Element, indent := 0) -> (written: int, err: io.Error) {
if element == nil { return }
print_element :: proc(writer: io.Writer, doc: ^xml.Document, element_id: xml.Element_ID, indent := 0) -> (written: int, err: io.Error) {
using fmt
tab :: proc(writer: io.Writer, indent: int) {
@@ -253,6 +252,8 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) {
tab(writer, indent)
element := doc.elements[element_id]
if element.kind == .Element {
wprintf(writer, "<%v>\n", element.ident)
if len(element.value) > 0 {
@@ -266,7 +267,7 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) {
}
for child in element.children {
print_element(writer, child, indent + 1)
print_element(writer, doc, child, indent + 1)
}
} else if element.kind == .Comment {
wprintf(writer, "[COMMENT] %v\n", element.value)