Merge pull request #3107 from ktsiligkiris/documentation/fix_xml_docs

Fix comments for proper rendering in documentation in encoding/xml
This commit is contained in:
Jeroen van Rijn
2024-01-17 21:10:49 +01:00
committed by GitHub
5 changed files with 85 additions and 156 deletions

View File

@@ -1,3 +1,5 @@
package xml
/*
An XML 1.0 / 1.1 parser
@@ -9,7 +11,7 @@
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
package xml
import "core:io"
import "core:fmt"
@@ -81,4 +83,4 @@ print_element :: proc(writer: io.Writer, doc: ^Document, element_id: Element_ID,
}
return written, .None
}
}

View File

@@ -20,7 +20,7 @@ example :: proc() {
xml.destroy(docs[round])
}
DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")
DOC :: #load("../../../../tests/core/assets/XML/utf8.xml")
input := DOC
for round in 0..<N {
@@ -109,4 +109,4 @@ main :: proc() {
}
}
println("Done and cleaned up!")
}
}

View File

@@ -1,3 +1,5 @@
package xml
/*
An XML 1.0 / 1.1 parser
@@ -6,7 +8,7 @@
This file contains helper functions.
*/
package xml
// Find parent's nth child with a given ident.
find_child_by_ident :: proc(doc: ^Document, parent_id: Element_ID, ident: string, nth := 0) -> (res: Element_ID, found: bool) {
@@ -47,4 +49,4 @@ find_attribute_val_by_key :: proc(doc: ^Document, parent_id: Element_ID, key: st
if attr.key == key { return attr.val, true }
}
return "", false
}
}

View File

@@ -1,3 +1,5 @@
package xml
/*
An XML 1.0 / 1.1 parser
@@ -9,7 +11,7 @@
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
package xml
import "core:fmt"
import "core:unicode"
@@ -433,4 +435,4 @@ scan :: proc(t: ^Tokenizer) -> Token {
lit = string(t.src[offset : t.offset])
}
return Token{kind, lit, pos}
}
}

View File

@@ -1,28 +1,28 @@
/*
An XML 1.0 / 1.1 parser
XML 1.0 / 1.1 parser
Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
Features:
- Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
- Simple to understand and use. Small.
Features:
- Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
- Simple to understand and use. Small.
Caveats:
- We do NOT support HTML in this package, as that may or may not be valid XML.
If it works, great. If it doesn't, that's not considered a bug.
Caveats:
- We do NOT support HTML in this package, as that may or may not be valid XML.
If it works, great. If it doesn't, that's not considered a bug.
- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
- <!ELEMENT and <!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
- <!ELEMENT and <!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
MAYBE:
- XML writer?
- Serialize/deserialize Odin types?
MAYBE:
- XML writer?
- Serialize/deserialize Odin types?
List of contributors:
Jeroen van Rijn: Initial implementation.
List of contributors:
- Jeroen van Rijn: Initial implementation.
*/
package xml
// An XML 1.0 / 1.1 parser
@@ -43,48 +43,32 @@ DEFAULT_OPTIONS :: Options{
}
Option_Flag :: enum {
/*
If the caller says that input may be modified, we can perform in-situ parsing.
If this flag isn't provided, the XML parser first duplicates the input so that it can.
*/
// If the caller says that input may be modified, we can perform in-situ parsing.
// If this flag isn't provided, the XML parser first duplicates the input so that it can.
Input_May_Be_Modified,
/*
Document MUST start with `<?xml` prologue.
*/
// Document MUST start with `<?xml` prologue.
Must_Have_Prolog,
/*
Document MUST have a `<!DOCTYPE`.
*/
// Document MUST have a `<!DOCTYPE`.
Must_Have_DocType,
/*
By default we skip comments. Use this option to intern a comment on a parented Element.
*/
// By default we skip comments. Use this option to intern a comment on a parented Element.
Intern_Comments,
/*
How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
*/
// How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
Error_on_Unsupported,
Ignore_Unsupported,
/*
By default CDATA tags are passed-through as-is.
This option unwraps them when encountered.
*/
// By default CDATA tags are passed-through as-is.
// This option unwraps them when encountered.
Unbox_CDATA,
/*
By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
This option decodes them when encountered.
*/
// By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
// This option decodes them when encountered.
Decode_SGML_Entities,
/*
If a tag body has a comment, it will be stripped unless this option is given.
*/
// If a tag body has a comment, it will be stripped unless this option is given.
Keep_Tag_Body_Comments,
}
Option_Flags :: bit_set[Option_Flag; u16]
@@ -97,28 +81,20 @@ Document :: struct {
encoding: Encoding,
doctype: struct {
/*
We only scan the <!DOCTYPE IDENT part and skip the rest.
*/
// We only scan the <!DOCTYPE IDENT part and skip the rest.
ident: string,
rest: string,
},
/*
If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
Otherwise they'll be in the element tree.
*/
// If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
// Otherwise they'll be in the element tree.
comments: [dynamic]string,
/*
Internal
*/
// Internal
tokenizer: ^Tokenizer,
allocator: mem.Allocator,
/*
Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
*/
// Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
input: []u8,
strings_to_free: [dynamic]string,
}
@@ -158,34 +134,24 @@ Encoding :: enum {
UTF_8,
ISO_8859_1,
/*
Aliases
*/
// Aliases
LATIN_1 = ISO_8859_1,
}
Error :: enum {
/*
General return values.
*/
// General return values.
None = 0,
General_Error,
Unexpected_Token,
Invalid_Token,
/*
Couldn't find, open or read file.
*/
// Couldn't find, open or read file.
File_Error,
/*
File too short.
*/
// File too short.
Premature_EOF,
/*
XML-specific errors.
*/
// XML-specific errors.
No_Prolog,
Invalid_Prolog,
Too_Many_Prologs,
@@ -194,11 +160,9 @@ Error :: enum {
Too_Many_DocTypes,
DocType_Must_Preceed_Elements,
/*
If a DOCTYPE is present _or_ the caller
asked for a specific DOCTYPE and the DOCTYPE
and root tag don't match, we return `.Invalid_DocType`.
*/
// If a DOCTYPE is present _or_ the caller
// asked for a specific DOCTYPE and the DOCTYPE
// and root tag don't match, we return `.Invalid_DocType`.
Invalid_DocType,
Invalid_Tag_Value,
@@ -211,27 +175,20 @@ Error :: enum {
Unsupported_Version,
Unsupported_Encoding,
/*
<!FOO are usually skipped.
*/
// <!FOO are usually skipped.
Unhandled_Bang,
Duplicate_Attribute,
Conflicting_Options,
}
/*
Implementation starts here.
*/
parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
data := data
context.allocator = allocator
opts := validate_options(options) or_return
/*
If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
*/
// If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
if .Input_May_Be_Modified not_in opts.flags {
data = bytes.clone(data)
}
@@ -252,10 +209,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
element, parent: Element_ID
open: Token
/*
If a DOCTYPE is present, the root tag has to match.
If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
*/
// If a DOCTYPE is present, the root tag has to match.
// If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
expected_doctype := options.expected_doctype
loop: for {
@@ -263,17 +218,13 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
// NOTE(Jeroen): This is faster as a switch.
switch t.ch {
case '<':
/*
Consume peeked `<`
*/
// Consume peeked `<`
advance_rune(t)
open = scan(t)
// NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
if likely(open.kind, Token_Kind.Ident) == .Ident {
/*
e.g. <odin - Start of new element.
*/
// e.g. <odin - Start of new element.
element = new_element(doc)
if element == 0 { // First Element
parent = element
@@ -286,11 +237,9 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
parse_attributes(doc, &doc.elements[element].attribs) or_return
/*
If a DOCTYPE is present _or_ the caller
asked for a specific DOCTYPE and the DOCTYPE
and root tag don't match, we return .Invalid_Root_Tag.
*/
// If a DOCTYPE is present _or_ the caller
// asked for a specific DOCTYPE and the DOCTYPE
// and root tag don't match, we return .Invalid_Root_Tag.
if element == 0 { // Root tag?
if len(expected_doctype) > 0 && expected_doctype != open.text {
error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
@@ -298,23 +247,17 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
}
}
/*
One of these should follow:
- `>`, which means we've just opened this tag and expect a later element to close it.
- `/>`, which means this is an 'empty' or self-closing tag.
*/
// One of these should follow:
// - `>`, which means we've just opened this tag and expect a later element to close it.
// - `/>`, which means this is an 'empty' or self-closing tag.
end_token := scan(t)
#partial switch end_token.kind {
case .Gt:
/*
We're now the new parent.
*/
// We're now the new parent.
parent = element
case .Slash:
/*
Empty tag. Close it.
*/
// Empty tag. Close it.
expect(t, .Gt) or_return
parent = doc.elements[element].parent
element = parent
@@ -325,9 +268,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
}
} else if open.kind == .Slash {
/*
Close tag.
*/
// Close tag.
ident := expect(t, .Ident) or_return
_ = expect(t, .Gt) or_return
@@ -339,9 +280,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
element = parent
} else if open.kind == .Exclaim {
/*
<!
*/
// <!
next := scan(t)
#partial switch next.kind {
case .Ident:
@@ -370,10 +309,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
}
case .Dash:
/*
Comment: <!-- -->.
The grammar does not allow a comment to end in --->
*/
// Comment: <!-- -->.
// The grammar does not allow a comment to end in --->
expect(t, .Dash)
comment := scan_comment(t) or_return
@@ -395,23 +332,17 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
}
} else if open.kind == .Question {
/*
<?xml
*/
// <?xml
next := scan(t)
#partial switch next.kind {
case .Ident:
if len(next.text) == 3 && strings.equal_fold(next.text, "xml") {
parse_prologue(doc) or_return
} else if len(doc.prologue) > 0 {
/*
We've already seen a prologue.
*/
// We've already seen a prologue.
return doc, .Too_Many_Prologs
} else {
/*
Could be `<?xml-stylesheet`, etc. Ignore it.
*/
// Could be `<?xml-stylesheet`, etc. Ignore it.
skip_element(t) or_return
}
case:
@@ -425,15 +356,11 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
}
case -1:
/*
End of file.
*/
// End of file.
break loop
case:
/*
This should be a tag's body text.
*/
// This should be a tag's body text.
body_text := scan_string(t, t.offset) or_return
needs_processing := .Unbox_CDATA in opts.flags
needs_processing |= .Decode_SGML_Entities in opts.flags
@@ -613,9 +540,7 @@ parse_prologue :: proc(doc: ^Document) -> (err: Error) {
doc.encoding = .LATIN_1
case:
/*
Unrecognized encoding, assume UTF-8.
*/
// Unrecognized encoding, assume UTF-8.
error(t, offset, "[parse_prologue] Warning: Unrecognized encoding: %v\n", attr.val)
}
@@ -658,11 +583,11 @@ skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
parse_doctype :: proc(doc: ^Document) -> (err: Error) {
/*
<!DOCTYPE greeting SYSTEM "hello.dtd">
<!DOCTYPE greeting SYSTEM "hello.dtd">
<!DOCTYPE greeting [
<!ELEMENT greeting (#PCDATA)>
]>
<!DOCTYPE greeting [
<!ELEMENT greeting (#PCDATA)>
]>
*/
assert(doc != nil)
context.allocator = doc.allocator
@@ -675,9 +600,7 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
offset := t.offset
skip_element(t) or_return
/*
-1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
*/
// -1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
doc.doctype.rest = string(t.src[offset : t.offset - 1])
return .None
}
@@ -700,4 +623,4 @@ new_element :: proc(doc: ^Document) -> (id: Element_ID) {
cur := doc.element_count
doc.element_count += 1
return cur
}
}