mirror of
https://github.com/odin-lang/Odin.git
synced 2026-04-19 21:10:30 +00:00
[xml] Implement optional unboxing of CDATA and decoding of tag values.
This commit is contained in:
@@ -60,16 +60,22 @@ COMMENT_END :: "-->"
|
||||
Default: CDATA and comments are passed through unchanged.
|
||||
*/
|
||||
XML_Decode_Option :: enum u8 {
|
||||
/*
|
||||
Do not decode `&`-prefixed entities (e.g. `&amp;`). They are decoded by default.
|
||||
If given, overrides `Decode_CDATA`.
|
||||
*/
|
||||
No_Entity_Decode,
|
||||
|
||||
/*
|
||||
CDATA is unboxed.
|
||||
*/
|
||||
CDATA_Unbox,
|
||||
Unbox_CDATA,
|
||||
|
||||
/*
|
||||
Unboxed CDATA is decoded as well.
|
||||
Ignored if `.CDATA_Unbox` is not given.
|
||||
Ignored if `.Unbox_CDATA` is not given.
|
||||
*/
|
||||
CDATA_Decode,
|
||||
Decode_CDATA,
|
||||
|
||||
/*
|
||||
Comments are stripped.
|
||||
@@ -129,7 +135,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
|
||||
}
|
||||
|
||||
case:
|
||||
if in_data && .CDATA_Decode not_in options {
|
||||
if in_data && .Decode_CDATA not_in options {
|
||||
/*
|
||||
Unboxed, but undecoded.
|
||||
*/
|
||||
@@ -145,17 +151,20 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
|
||||
*/
|
||||
write_string(&builder, entity)
|
||||
} else {
|
||||
if decoded, ok := xml_decode_entity(entity); ok {
|
||||
write_rune(&builder, decoded)
|
||||
} else {
|
||||
/*
|
||||
Decode failed. Pass through original.
|
||||
*/
|
||||
write_string(&builder, "&")
|
||||
write_string(&builder, entity)
|
||||
write_string(&builder, ";")
|
||||
|
||||
if .No_Entity_Decode not_in options {
|
||||
if decoded, ok := xml_decode_entity(entity); ok {
|
||||
write_rune(&builder, decoded)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Literal passthrough because the decode failed or we want entities not decoded.
|
||||
*/
|
||||
write_string(&builder, "&")
|
||||
write_string(&builder, entity)
|
||||
write_string(&builder, ";")
|
||||
}
|
||||
} else {
|
||||
write_rune(&builder, t.r)
|
||||
@@ -290,7 +299,7 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
|
||||
if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
|
||||
t.read_offset += len(CDATA_START) - 1
|
||||
|
||||
if .CDATA_Unbox in options && .CDATA_Decode in options {
|
||||
if .Unbox_CDATA in options && .Decode_CDATA in options {
|
||||
/*
|
||||
We're unboxing _and_ decoding CDATA
|
||||
*/
|
||||
@@ -315,7 +324,7 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
|
||||
|
||||
cdata := string(t.src[offset : t.read_offset])
|
||||
|
||||
if .CDATA_Unbox in options {
|
||||
if .Unbox_CDATA in options {
|
||||
cdata = cdata[len(CDATA_START):]
|
||||
cdata = cdata[:len(cdata) - len(CDATA_END)]
|
||||
}
|
||||
|
||||
@@ -1,19 +1,11 @@
|
||||
package unicode_entity_example
|
||||
|
||||
import "core:encoding/xml"
|
||||
import "core:encoding/entity"
|
||||
import "core:strings"
|
||||
import "core:mem"
|
||||
import "core:fmt"
|
||||
import "core:time"
|
||||
|
||||
OPTIONS :: xml.Options{
|
||||
flags = {
|
||||
.Ignore_Unsupported, .Intern_Comments,
|
||||
},
|
||||
expected_doctype = "",
|
||||
}
|
||||
|
||||
doc_print :: proc(doc: ^xml.Document) {
|
||||
buf: strings.Builder
|
||||
defer strings.destroy_builder(&buf)
|
||||
@@ -29,6 +21,13 @@ _entities :: proc() {
|
||||
|
||||
DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")
|
||||
|
||||
OPTIONS :: xml.Options{
|
||||
flags = {
|
||||
.Ignore_Unsupported, .Intern_Comments,
|
||||
},
|
||||
expected_doctype = "",
|
||||
}
|
||||
|
||||
parse_duration: time.Duration
|
||||
|
||||
{
|
||||
@@ -50,57 +49,11 @@ _entities :: proc() {
|
||||
_main :: proc() {
|
||||
using fmt
|
||||
|
||||
doc, err := xml.parse(#load("test.html"))
|
||||
options := xml.Options{ flags = { .Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities }}
|
||||
doc, _ := xml.parse(#load("test.html"), options)
|
||||
|
||||
defer xml.destroy(doc)
|
||||
doc_print(doc)
|
||||
|
||||
if false {
|
||||
val := doc.root.children[1].children[2].value
|
||||
|
||||
println()
|
||||
replaced, ok := entity.decode_xml(val)
|
||||
defer delete(replaced)
|
||||
|
||||
printf("Before: '%v', Err: %v\n", val, err)
|
||||
printf("Passthrough: '%v'\nOK: %v\n", replaced, ok)
|
||||
println()
|
||||
}
|
||||
|
||||
if false {
|
||||
val := doc.root.children[1].children[2].value
|
||||
|
||||
println()
|
||||
replaced, ok := entity.decode_xml(val, { .CDATA_Unbox })
|
||||
defer delete(replaced)
|
||||
|
||||
printf("Before: '%v', Err: %v\n", val, err)
|
||||
printf("CDATA_Unbox: '%v'\nOK: %v\n", replaced, ok)
|
||||
println()
|
||||
}
|
||||
|
||||
if true {
|
||||
val := doc.root.children[1].children[2].value
|
||||
|
||||
println()
|
||||
replaced, ok := entity.decode_xml(val, { .CDATA_Unbox, .CDATA_Decode })
|
||||
defer delete(replaced)
|
||||
|
||||
printf("Before: '%v', Err: %v\n", val, err)
|
||||
printf("CDATA_Decode: '%v'\nOK: %v\n", replaced, ok)
|
||||
println()
|
||||
}
|
||||
|
||||
if true {
|
||||
val := doc.root.children[1].children[1].value
|
||||
|
||||
println()
|
||||
replaced, ok := entity.decode_xml(val, { .Comment_Strip })
|
||||
defer delete(replaced)
|
||||
|
||||
printf("Before: '%v', Err: %v\n", val, err)
|
||||
printf("Comment_Strip: '%v'\nOK: %v\n", replaced, ok)
|
||||
println()
|
||||
}
|
||||
}
|
||||
|
||||
main :: proc() {
|
||||
|
||||
@@ -16,9 +16,11 @@
|
||||
<div id="test_cdata_in_comment" foo="">
|
||||
Foozle]! © <!-- <![CDATA[ ® ]]> -->42&;1234&
|
||||
</div>
|
||||
<!-- EXPECTED: Foozle]! © 42&;1234& -->
|
||||
<div id="test_cdata_unwrap_and_passthrough">
|
||||
Foozle]! © <![CDATA[BOX ® /BOX]]>42&;1234&
|
||||
</div>
|
||||
<!-- EXPECTED: Foozle]! © BOX ® /BOX42&;1234& -->
|
||||
<div>
|
||||
| | | fj ` \ ® ϱ ∳
|
||||
</div>
|
||||
|
||||
@@ -18,10 +18,6 @@ package xml
|
||||
- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
|
||||
- <!ELEMENT and <!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
|
||||
|
||||
TODO:
|
||||
- Optional CDATA unboxing.
|
||||
- Optional `&gt;`, `&#32;`, `&#x20;` and other escape substitution in tag bodies.
|
||||
|
||||
MAYBE:
|
||||
- XML writer?
|
||||
- Serialize/deserialize Odin types?
|
||||
@@ -31,6 +27,7 @@ package xml
|
||||
*/
|
||||
|
||||
import "core:strings"
|
||||
import "core:encoding/entity"
|
||||
import "core:mem"
|
||||
import "core:os"
|
||||
|
||||
@@ -196,12 +193,6 @@ Error :: enum {
|
||||
|
||||
Duplicate_Attribute,
|
||||
Conflicting_Options,
|
||||
|
||||
/*
|
||||
Unhandled TODO:
|
||||
*/
|
||||
Unhandled_CDATA_Unboxing,
|
||||
Unhandled_SGML_Entity_Decoding,
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -422,8 +413,25 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
|
||||
/*
|
||||
This should be a tag's body text.
|
||||
*/
|
||||
body_text := scan_string(t, t.offset) or_return
|
||||
element.value = strings.intern_get(&doc.intern, body_text)
|
||||
body_text := scan_string(t, t.offset) or_return
|
||||
|
||||
decode_opts := entity.XML_Decode_Options{ .Comment_Strip }
|
||||
|
||||
if .Decode_SGML_Entities not_in opts.flags {
|
||||
decode_opts += { .No_Entity_Decode }
|
||||
}
|
||||
if .Unbox_CDATA in opts.flags {
|
||||
decode_opts += { .Unbox_CDATA, .Decode_CDATA }
|
||||
}
|
||||
|
||||
decoded, decode_err := entity.decode_xml(body_text, decode_opts)
|
||||
defer delete(decoded)
|
||||
|
||||
if decode_err == .None {
|
||||
element.value = strings.intern_get(&doc.intern, decoded)
|
||||
} else {
|
||||
element.value = strings.intern_get(&doc.intern, body_text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -488,15 +496,6 @@ validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
|
||||
if .Error_on_Unsupported in validated.flags && .Ignore_Unsupported in validated.flags {
|
||||
return options, .Conflicting_Options
|
||||
}
|
||||
|
||||
if .Unbox_CDATA in validated.flags {
|
||||
return options, .Unhandled_CDATA_Unboxing
|
||||
}
|
||||
|
||||
if .Decode_SGML_Entities in validated.flags {
|
||||
return options, .Unhandled_SGML_Entity_Decoding
|
||||
}
|
||||
|
||||
return validated, .None
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user