[xml] Speedup.

2026-04-19 21:10:30 +00:00 · 2022-04-28 15:29:00 +02:00
parent 6df21d6a9f
commit 80878264b6
6 changed files with 246 additions and 183 deletions
--- a/core/encoding/xml/debug_print.odin
+++ b/core/encoding/xml/debug_print.odin
@@ -1,8 +1,7 @@
-package xml
 /*
 	An XML 1.0 / 1.1 parser

-	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.

 	A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
@@ -10,6 +9,8 @@ package xml
 	List of contributors:
 		Jeroen van Rijn: Initial implementation.
 */
+package xml
+
 import "core:io"
 import "core:fmt"

@@ -40,17 +41,16 @@ print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error
 		written += wprintf(writer, "[Pre-root comment]  %v\n", comment)
 	}

-	if doc.root != nil {
+	if len(doc.elements) > 0 {
+	 	wprintln(writer, " --- ")
+	 	print_element(writer, doc, 0)
 	 	wprintln(writer, " --- ")
-	 	print_element(writer, doc.root)
-	 	wprintln(writer, " --- ")		
 	 }

 	return written, .None
 }

-print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (written: int, err: io.Error) {
-	if element == nil { return }
+print_element :: proc(writer: io.Writer, doc: ^Document, element_id: Element_ID, indent := 0) -> (written: int, err: io.Error) {
 	using fmt

 	tab :: proc(writer: io.Writer, indent: int) {
@@ -61,6 +61,8 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri

 	tab(writer, indent)

+	element := doc.elements[element_id]
+
 	if element.kind == .Element {
 		wprintf(writer, "<%v>\n", element.ident)
 		if len(element.value) > 0 {
@@ -74,7 +76,7 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri
 		}

 		for child in element.children {
-			print_element(writer, child, indent + 1)
+			print_element(writer, doc, child, indent + 1)
 		}
 	} else if element.kind == .Comment {
 		wprintf(writer, "[COMMENT] %v\n", element.value)
--- a/core/encoding/xml/example/xml_example.odin
+++ b/core/encoding/xml/example/xml_example.odin
@@ -1,52 +1,85 @@
 package xml_example

 import "core:encoding/xml"
-import "core:os"
 import "core:mem"
 import "core:fmt"
 import "core:time"
 import "core:strings"
 import "core:hash"

+N :: 1
+
 example :: proc() {
 	using fmt

-	doc: ^xml.Document
-	err: xml.Error
+	docs:  [N]^xml.Document
+	errs:  [N]xml.Error
+	times: [N]time.Duration
+
+	defer for round in 0..<N {
+		xml.destroy(docs[round])
+	}

 	DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")
+	input := DOC

-	parse_duration: time.Duration
-	{
-		time.SCOPED_TICK_DURATION(&parse_duration)
-		doc, err = xml.parse(DOC, xml.Options{flags={.Ignore_Unsupported}})
+	for round in 0..<N {
+		start := time.tick_now()
+
+		docs[round], errs[round] = xml.parse(input, xml.Options{
+			flags={.Ignore_Unsupported},
+			expected_doctype = "",
+		})
+
+		end   := time.tick_now()
+		times[round] = time.tick_diff(start, end)
 	}
-	defer xml.destroy(doc)

-	ms := time.duration_milliseconds(parse_duration)
-	speed := (f64(1000.0) / ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
-	fmt.printf("Parse time: %v bytes in %.2f ms (%.2f MiB/s).\n", len(DOC), ms, speed)
+	fastest := time.Duration(max(i64))
+	slowest := time.Duration(0)
+	total   := time.Duration(0)

-	if err != .None {
-		printf("Load/Parse error: %v\n", err)
-		if err == .File_Error {
+	for round in 0..<N {
+		fastest = min(fastest, times[round])
+		slowest = max(slowest, times[round])
+		total  += times[round]
+	}
+
+	fastest_ms := time.duration_milliseconds(fastest)
+	slowest_ms := time.duration_milliseconds(slowest)
+	average_ms := time.duration_milliseconds(time.Duration(f64(total) / f64(N)))
+
+	fastest_speed := (f64(1000.0) / fastest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+	slowest_speed := (f64(1000.0) / slowest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+	average_speed := (f64(1000.0) / average_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+
+	fmt.printf("N = %v\n", N)
+	fmt.printf("[Fastest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), fastest_ms, fastest_speed)
+	fmt.printf("[Slowest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), slowest_ms, slowest_speed)
+	fmt.printf("[Average]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), average_ms, average_speed)
+
+	if errs[0] != .None {
+		printf("Load/Parse error: %v\n", errs[0])
+		if errs[0] == .File_Error {
 			println("\"unicode.xml\" not found. Did you run \"tests\\download_assets.py\"?")
 		}
-		os.exit(1)
+		return
 	}

-	println("\"unicode.xml\" loaded and parsed.")
-
-	charlist, charlist_ok := xml.find_child_by_ident(doc.root, "charlist")
+	charlist, charlist_ok := xml.find_child_by_ident(docs[0], 0, "charlist")
 	if !charlist_ok {
-		eprintln("Could not locate top-level `<charlist>` tag.")
-		os.exit(1)
+	 	eprintln("Could not locate top-level `<charlist>` tag.")
+	 	return
 	}

-	printf("Found `<charlist>` with %v children.\n", len(charlist.children))
+	printf("Found `<charlist>` with %v children, %v elements total\n", len(docs[0].elements[charlist].children), docs[0].element_count)

-	crc32 := doc_hash(doc)
+	crc32 := doc_hash(docs[0])
 	printf("[%v] CRC32: 0x%08x\n", "🎉" if crc32 == 0xcaa042b9 else "🤬", crc32)
+
+	for round in 0..<N {
+		defer xml.destroy(docs[round])
+	}
 }

 doc_hash :: proc(doc: ^xml.Document, print := false) -> (crc32: u32) {
--- a/core/encoding/xml/helpers.odin
+++ b/core/encoding/xml/helpers.odin
@@ -1,22 +1,20 @@
-package xml
 /*
 	An XML 1.0 / 1.1 parser

-	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.

 	This file contains helper functions.
 */
+package xml

-
-/*
-	Find `tag`'s nth child with a given ident.
-*/
-find_child_by_ident :: proc(tag: ^Element, ident: string, nth := 0) -> (res: ^Element, found: bool) {
-	if tag == nil                                 { return nil, false }
+// Find parent's nth child with a given ident.
+find_child_by_ident :: proc(doc: ^Document, parent_id: Element_ID, ident: string, nth := 0) -> (res: Element_ID, found: bool) {
+	tag := doc.elements[parent_id]

 	count := 0
-	for child in tag.children {
+	for child_id in tag.children {
+		child := doc.elements[child_id]
 		/*
 			Skip commments. They have no name.
 		*/
@@ -26,18 +24,16 @@ find_child_by_ident :: proc(tag: ^Element, ident: string, nth := 0) -> (res: ^El
 			If the ident matches and it's the nth such child, return it.
 		*/
 		if child.ident == ident {
-			if count == nth                       { return child, true }
+			if count == nth                       { return child_id, true }
 			count += 1
 		}
 	}
-	return nil, false
+	return 0, false
 }

-/*
-	Find an attribute by key.
-*/
-find_attribute_val_by_key :: proc(tag: ^Element, key: string) -> (val: string, found: bool) {
-	if tag == nil            { return "", false }
+// Find an attribute by key.
+find_attribute_val_by_key :: proc(doc: ^Document, parent_id: Element_ID, key: string) -> (val: string, found: bool) {
+	tag := doc.elements[parent_id]

 	for attr in tag.attribs {
 		/*
--- a/core/encoding/xml/tokenizer.odin
+++ b/core/encoding/xml/tokenizer.odin
@@ -1,3 +1,14 @@
+/*
+	An XML 1.0 / 1.1 parser
+
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's BSD-3 license.
+
+	A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
+
+	List of contributors:
+		Jeroen van Rijn: Initial implementation.
+*/
 package xml

 import "core:fmt"
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -1,8 +1,7 @@
-package xml
 /*
 	An XML 1.0 / 1.1 parser

-	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.

 	A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
@@ -25,12 +24,17 @@ package xml
 	List of contributors:
 		Jeroen van Rijn: Initial implementation.
 */
+package xml
+// An XML 1.0 / 1.1 parser

 import "core:bytes"
-import "core:strings"
 import "core:encoding/entity"
+import "core:intrinsics"
 import "core:mem"
 import "core:os"
+import "core:strings"
+
+likely :: intrinsics.expect

 DEFAULT_Options :: Options{
 	flags            = {
@@ -88,7 +92,9 @@ Option_Flag :: enum {
 Option_Flags :: bit_set[Option_Flag; u16]

 Document :: struct {
-	root:     ^Element,
+	elements:      [dynamic]Element,
+	element_count: Element_ID,
+
 	prolog:   Attributes,
 	encoding: Encoding,

@@ -129,8 +135,8 @@ Element :: struct {
 		Comment,
 	},

-	parent:   ^Element,
-	children: [dynamic]^Element,
+	parent:   Element_ID,
+	children: [dynamic]Element_ID,
 }

 Attr :: struct {
@@ -185,7 +191,7 @@ Error :: enum {

 	No_DocType,
 	Too_Many_DocTypes,
-	DocType_Must_Proceed_Elements,
+	DocType_Must_Preceed_Elements,

 	/*
 		If a DOCTYPE is present _or_ the caller
@@ -237,12 +243,16 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 	doc.tokenizer = t
 	doc.input     = data

+	doc.elements = make([dynamic]Element, 1024, 1024, allocator)
+
 	// strings.intern_init(&doc.intern, allocator, allocator)

 	err =            .Unexpected_Token
-	element, parent: ^Element
+	element, parent: Element_ID

-	tag_is_open := false
+	tag_is_open   := false
+	first_element := true
+	open: Token

 	/*
 		If a DOCTYPE is present, the root tag has to match.
@@ -252,6 +262,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err

 	loop: for {
 		skip_whitespace(t)
+		// NOTE(Jeroen): This is faster as a switch.
 		switch t.ch {
 		case '<':
 			/*
@@ -259,35 +270,85 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 			*/
 			advance_rune(t)

-			open := scan(t)
-			#partial switch open.kind {
-
-			case .Question:
+			open = scan(t)
+			// NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
+			if likely(open.kind, Token_Kind.Ident) == .Ident {
 				/*
-					<?xml
+					e.g. <odin - Start of new element.
 				*/
-				next := scan(t)
-				#partial switch next.kind {
-				case .Ident:
-					if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
-						parse_prolog(doc) or_return
-					} else if len(doc.prolog) > 0 {
-						/*
-							We've already seen a prolog.
-						*/
-						return doc, .Too_Many_Prologs
-					} else {
-						/*
-							Could be `<?xml-stylesheet`, etc. Ignore it.
-						*/
-						skip_element(t) or_return
+				element = new_element(doc)
+				tag_is_open = true
+
+				if first_element {
+					/*
+						First element.
+					*/
+					parent   = element
+					first_element = false
+				} else {
+					append(&doc.elements[parent].children, element)
+				}
+
+				doc.elements[element].parent = parent
+				doc.elements[element].ident  = open.text
+
+				parse_attributes(doc, &doc.elements[element].attribs) or_return
+
+				/*
+					If a DOCTYPE is present _or_ the caller
+					asked for a specific DOCTYPE and the DOCTYPE
+					and root tag don't match, we return .Invalid_Root_Tag.
+				*/
+				if element == 0 { // Root tag?
+					if len(expected_doctype) > 0 && expected_doctype != open.text {
+						error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
+						return doc, .Invalid_DocType
 					}
+				}
+
+				/*
+					One of these should follow:
+					- `>`,  which means we've just opened this tag and expect a later element to close it.
+					- `/>`, which means this is an 'empty' or self-closing tag.
+				*/
+				end_token := scan(t)
+				#partial switch end_token.kind {
+				case .Gt:
+					/*
+						We're now the new parent.
+					*/
+					parent = element
+
+				case .Slash:
+					/*
+						Empty tag. Close it.
+					*/
+					expect(t, .Gt) or_return
+					parent      = doc.elements[element].parent
+					element     = parent
+					tag_is_open = false
+
 				case:
-					error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
+					error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
 					return
 				}

-			case .Exclaim:
+			} else if open.kind == .Slash {
+				/*
+					Close tag.
+				*/
+				ident := expect(t, .Ident) or_return
+				_      = expect(t, .Gt)    or_return
+
+				if doc.elements[element].ident != ident.text {
+					error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
+					return doc, .Mismatched_Closing_Tag
+				}
+				parent      = doc.elements[element].parent
+				element     = parent
+				tag_is_open = false
+
+			} else if open.kind == .Exclaim {
 				/*
 					<!
 				*/
@@ -299,8 +360,8 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 						if len(doc.doctype.ident) > 0 {
 							return doc, .Too_Many_DocTypes
 						}
-						if doc.root != nil {
-							return doc, .DocType_Must_Proceed_Elements
+						if doc.element_count > 0 {
+							return doc, .DocType_Must_Preceed_Elements
 						}
 						parse_doctype(doc) or_return

@@ -327,14 +388,14 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 					comment := scan_comment(t) or_return

 					if .Intern_Comments in opts.flags {
-						if doc.root == nil {
+						if len(doc.elements) == 0 {
 							append(&doc.comments, comment)
 						} else {
-							el := new(Element)
-							el.parent = element
-							el.kind   = .Comment
-							el.value  = comment
-							append(&element.children, el)
+							el := new_element(doc)
+							doc.elements[el].parent = element
+							doc.elements[el].kind   = .Comment
+							doc.elements[el].value  = comment
+							append(&doc.elements[element].children, el)
 						}
 					}

@@ -343,83 +404,32 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 					return
 				}

-			case .Ident:
+			} else if open.kind == .Question {
 				/*
-					e.g. <odin - Start of new element.
+					<?xml
 				*/
-				element = new(Element)
-				tag_is_open = true
-
-				if doc.root == nil {
-					/*
-						First element.
-					*/
-					doc.root = element
-					parent   = element
-				} else {
-					append(&parent.children, element)
-				}
-
-				element.parent = parent
-				element.ident  = open.text
-
-				parse_attributes(doc, &element.attribs) or_return
-
-				/*
-					If a DOCTYPE is present _or_ the caller
-					asked for a specific DOCTYPE and the DOCTYPE
-					and root tag don't match, we return .Invalid_Root_Tag.
-				*/
-				if element == doc.root {
-					if len(expected_doctype) > 0 && expected_doctype != open.text {
-						error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
-						return doc, .Invalid_DocType
+				next := scan(t)
+				#partial switch next.kind {
+				case .Ident:
+					if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
+						parse_prolog(doc) or_return
+					} else if len(doc.prolog) > 0 {
+						/*
+							We've already seen a prolog.
+						*/
+						return doc, .Too_Many_Prologs
+					} else {
+						/*
+							Could be `<?xml-stylesheet`, etc. Ignore it.
+						*/
+						skip_element(t) or_return
 					}
-				}
-
-				/*
-					One of these should follow:
-					- `>`,  which means we've just opened this tag and expect a later element to close it.
-					- `/>`, which means this is an 'empty' or self-closing tag.
-				*/
-				end_token := scan(t)
-				#partial switch end_token.kind {
-				case .Gt:
-					/*
-						We're now the new parent.
-					*/
-					parent = element
-
-				case .Slash:
-					/*
-						Empty tag. Close it.
-					*/
-					expect(t, .Gt) or_return
-					parent      = element.parent
-					element     = parent
-					tag_is_open = false
-
 				case:
-					error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
+					error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
 					return
 				}

-			case .Slash:
-				/*
-					Close tag.
-				*/
-				ident := expect(t, .Ident) or_return
-				_      = expect(t, .Gt)    or_return
-
-				if element.ident != ident.text {
-					error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text)
-					return doc, .Mismatched_Closing_Tag
-				}
-				parent      = element.parent
-				element     = parent
-				tag_is_open = false
-
-			case:
+			} else {
 				error(t, t.offset, "Invalid Token after <: %#v\n", open)
 				return
 			}
@@ -442,7 +452,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 			needs_processing |= .Decode_SGML_Entities in opts.flags

 			if !needs_processing {
-				element.value = body_text
+				doc.elements[element].value = body_text
 				continue
 			}

@@ -464,10 +474,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err

 			decoded, decode_err := entity.decode_xml(body_text, decode_opts)
 			if decode_err == .None {
-				element.value = decoded
+				doc.elements[element].value = decoded
 				append(&doc.strings_to_free, decoded)
 			} else {
-				element.value = body_text
+				doc.elements[element].value = body_text
 			}
 		}
 	}
@@ -480,6 +490,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 		return doc, .No_DocType
 	}

+	resize(&doc.elements, int(doc.element_count))
 	return doc, .None
 }

@@ -497,26 +508,14 @@ parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_hand

 parse :: proc { parse_from_file, parse_from_slice }

-free_element :: proc(element: ^Element) {
-	if element == nil { return }
-
-	for child in element.children {
-		/*
-			NOTE: Recursive.
-
-			Could be rewritten so it adds them to a list of pointers to free.
-		*/
-		free_element(child)
-	}
-	delete(element.attribs)
-	delete(element.children)
-	free(element)
-}
-
 destroy :: proc(doc: ^Document) {
 	if doc == nil { return }

-	free_element(doc.root)
+	for el in doc.elements {
+		delete(el.attribs)
+		delete(el.children)
+	}
+	delete(doc.elements)

 	delete(doc.prolog)
 	delete(doc.comments)
@@ -686,4 +685,25 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
 	*/
 	doc.doctype.rest = string(t.src[offset : t.offset - 1])
 	return .None
+}
+
+Element_ID :: u32
+
+new_element :: proc(doc: ^Document) -> (id: Element_ID) {
+	element_space := len(doc.elements)
+
+	// Need to resize
+	if int(doc.element_count) + 1 > element_space {
+		if element_space < 65536 {
+			element_space *= 2
+		} else {
+			element_space += 65536
+		}
+		resize(&doc.elements, element_space)
+	}
+
+	cur := doc.element_count
+	doc.element_count += 1
+
+	return cur
 }
--- a/tests/core/encoding/xml/test_core_xml.odin
+++ b/tests/core/encoding/xml/test_core_xml.odin
@@ -224,7 +224,7 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) {
 			written += wprintf(writer, "[DOCTYPE]  %v\n", doc.doctype.ident)

 			if len(doc.doctype.rest) > 0 {
-				wprintf(writer, "\t%v\n", doc.doctype.rest)
+			 	wprintf(writer, "\t%v\n", doc.doctype.rest)
 			}
 		}

@@ -232,17 +232,16 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) {
 			written += wprintf(writer, "[Pre-root comment]  %v\n", comment)
 		}

-		if doc.root != nil {
-			wprintln(writer, " --- ")
-			print_element(writer, doc.root)
-			wprintln(writer, " --- ")		
+		if doc.element_count > 0 {
+		 	wprintln(writer, " --- ")
+		 	print_element(writer, doc, 0)
+		 	wprintln(writer, " --- ")
 		 }

 		return written, .None
 	}

-	print_element :: proc(writer: io.Writer, element: ^xml.Element, indent := 0) -> (written: int, err: io.Error) {
-		if element == nil { return }
+	print_element :: proc(writer: io.Writer, doc: ^xml.Document, element_id: xml.Element_ID, indent := 0) -> (written: int, err: io.Error) {
 		using fmt

 		tab :: proc(writer: io.Writer, indent: int) {
@@ -253,6 +252,8 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) {

 		tab(writer, indent)

+		element := doc.elements[element_id]
+
 		if element.kind == .Element {
 			wprintf(writer, "<%v>\n", element.ident)
 			if len(element.value) > 0 {
@@ -266,7 +267,7 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) {
 			}

 			for child in element.children {
-				print_element(writer, child, indent + 1)
+				print_element(writer, doc, child, indent + 1)
 			}
 		} else if element.kind == .Comment {
 			wprintf(writer, "[COMMENT] %v\n", element.value)