Update XML reader to normalize whitespace, part 1.

Line ends (`\r\n`, lone `\r`, NEL, and LS) are folded to a single `\n` per XML 1.1's end-of-line handling rules, a `Normalize_Whitespace` decode option is added, and attribute values are normalized during parsing. Test checksums are updated to match, and block comments are converted to line comments along the way.

Jeroen van Rijn
2024-06-12 12:52:48 +02:00
parent e87c5bca58
commit ebadff555d
4 changed files with 70 additions and 116 deletions

View File

@@ -56,38 +56,27 @@ CDATA_END :: "]]>"
COMMENT_START :: "<!--"
COMMENT_END :: "-->"
/*
Default: CDATA and comments are passed through unchanged.
*/
// Default: CDATA and comments are passed through unchanged.
XML_Decode_Option :: enum u8 {
/*
Do not decode & entities. It decodes by default.
If given, overrides `Decode_CDATA`.
*/
// Do not decode `&` entities; they are decoded by default. If given, overrides `Decode_CDATA`.
No_Entity_Decode,
/*
CDATA is unboxed.
*/
// CDATA is unboxed.
Unbox_CDATA,
/*
Unboxed CDATA is decoded as well.
Ignored if `.Unbox_CDATA` is not given.
*/
// Unboxed CDATA is decoded as well. Ignored if `.Unbox_CDATA` is not given.
Decode_CDATA,
/*
Comments are stripped.
*/
// Comments are stripped.
Comment_Strip,
// Normalize whitespace.
Normalize_Whitespace,
}
XML_Decode_Options :: bit_set[XML_Decode_Option; u8]
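// Options combine as a bit_set, e.g. `{.Unbox_CDATA, .Decode_CDATA}` to unbox and decode CDATA.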
/*
Decode a string that may include SGML/XML/HTML entities.
The caller has to free the result.
*/
// Decode a string that may include SGML/XML/HTML entities.
// The caller has to free the result.
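// Example usage (a sketch, not part of this commit):
//
//	decoded, _ := decode_xml("Tom &amp; Jerry\r\n", {.Normalize_Whitespace})
//	defer delete(decoded)
//	// `decoded` is now "Tom & Jerry\n": the entity is decoded and the line end folded.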
decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) {
context.allocator = allocator
@@ -100,14 +89,14 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
t := Tokenizer{src=input}
in_data := false
prev: rune
loop: for {
advance(&t) or_return
if t.r < 0 { break loop }
/*
Below here we're never inside a CDATA tag.
At most we'll see the start of one, but that doesn't affect the logic.
*/
// Below here we're never inside a CDATA tag. At most we'll see the start of one,
// but that doesn't affect the logic.
switch t.r {
case '<':
/*
@@ -126,9 +115,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
in_data = _handle_xml_special(&t, &builder, options) or_return
case ']':
/*
If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
*/
// If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
if in_data {
if t.read_offset + len(CDATA_END) < len(t.src) {
if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
@@ -143,22 +130,16 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
case:
if in_data && .Decode_CDATA not_in options {
/*
Unboxed, but undecoded.
*/
// Unboxed, but undecoded.
write_rune(&builder, t.r)
continue
}
if t.r == '&' {
if entity, entity_err := _extract_xml_entity(&t); entity_err != .None {
/*
We read to the end of the string without closing the entity.
Pass through as-is.
*/
// We read to the end of the string without closing the entity. Pass through as-is.
write_string(&builder, entity)
} else {
if .No_Entity_Decode not_in options {
if decoded, ok := xml_decode_entity(entity); ok {
write_rune(&builder, decoded)
@@ -166,19 +147,27 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
}
}
/*
Literal passthrough because the decode failed or we want entities not decoded.
*/
// Literal passthrough: either the decode failed, or entity decoding is disabled.
write_string(&builder, "&")
write_string(&builder, entity)
write_string(&builder, ";")
}
} else {
write_rune(&builder, t.r)
// https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-line-ends
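// Per that section, "\r\n" and "\r" followed by NEL collapse to a single '\n',
// while a lone '\r', NEL (U+0085), or LS (U+2028) each become '\n'.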
switch t.r {
case '\n', 0x85, 0x2028:
write_rune(&builder, '\n')
case '\r': // Do nothing until next character
case:
if prev == '\r' { // Turn a single carriage return into a \n
write_rune(&builder, '\n')
}
write_rune(&builder, t.r)
}
prev = t.r
}
}
}
return strings.clone(strings.to_string(builder), allocator), err
}
@@ -253,24 +242,18 @@ xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
return rune(val), true
case:
/*
Named entity.
*/
// Named entity.
return named_xml_entity_to_rune(entity)
}
}
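// Example results (a sketch; the argument is the entity body without the '&' and ';' delimiters):
//
//	xml_decode_entity("#65")   -> 'A', true  (decimal character reference)
//	xml_decode_entity("#x414") -> 'Д', true  (hexadecimal character reference)
//	xml_decode_entity("amp")   -> '&', true  (named entity)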
/*
Private XML helper to extract `&<stuff>;` entity.
*/
// Private XML helper to extract `&<stuff>;` entity.
@(private="file")
_extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
assert(t != nil && t.r == '&')
/*
All of these would be in the ASCII range.
Even if one is not, it doesn't matter. All characters we need to compare to extract are.
*/
// The characters we compare against while extracting are all in the ASCII range,
// so byte-wise scanning is safe even if the input contains non-ASCII runes.
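// e.g. with the tokenizer positioned on the '&' of `&amp;`, this returns "amp" on success.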
length := len(t.src)
found := false
@@ -292,9 +275,7 @@ _extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
return string(t.src[t.offset : t.read_offset]), .Invalid_Entity_Encoding
}
/*
Private XML helper for CDATA and comments.
*/
// Private XML helper for CDATA and comments.
@(private="file")
_handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) {
assert(t != nil && t.r == '<')
@@ -304,20 +285,14 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
t.read_offset += len(CDATA_START) - 1
if .Unbox_CDATA in options && .Decode_CDATA in options {
/*
We're unboxing _and_ decoding CDATA
*/
// We're unboxing _and_ decoding CDATA.
return true, .None
}
/*
CDATA is passed through.
*/
// CDATA is passed through.
offset := t.offset
/*
Scan until end of CDATA.
*/
// Scan until end of CDATA.
for {
advance(t) or_return
if t.r < 0 { return true, .CDATA_Not_Terminated }
@@ -341,14 +316,10 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
} else if string(t.src[t.offset:][:len(COMMENT_START)]) == COMMENT_START {
t.read_offset += len(COMMENT_START)
/*
Comment is passed through by default.
*/
// Comment is passed through by default.
offset := t.offset
/*
Scan until end of Comment.
*/
// Scan until end of Comment.
for {
advance(t) or_return
if t.r < 0 { return true, .Comment_Not_Terminated }

View File

@@ -218,9 +218,7 @@ scan_identifier :: proc(t: ^Tokenizer) -> string {
for is_valid_identifier_rune(t.ch) {
advance_rune(t)
if t.ch == ':' {
/*
A namespaced attr can have at most two parts, `namespace:ident`.
*/
// A namespaced attr can have at most two parts, `namespace:ident`.
if namespaced {
break
}
@@ -268,14 +266,10 @@ scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
return string(t.src[offset : t.offset - 1]), .None
}
/*
Skip CDATA
*/
// Skip a CDATA section if the tokenizer is positioned at the start of one.
skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
if t.read_offset + len(CDATA_START) >= len(t.src) {
/*
Can't be the start of a CDATA tag.
*/
// Can't be the start of a CDATA tag.
return .None
}
@@ -290,9 +284,7 @@ skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
return .Premature_EOF
}
/*
Scan until the end of a CDATA tag.
*/
// Scan until the end of a CDATA tag.
if t.read_offset + len(CDATA_END) < len(t.src) {
if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
t.read_offset += len(CDATA_END)
@@ -319,14 +311,10 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
case '<':
if peek_byte(t) == '!' {
if peek_byte(t, 1) == '[' {
/*
Might be the start of a CDATA tag.
*/
// Might be the start of a CDATA tag.
skip_cdata(t) or_return
} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
/*
Comment start. Eat comment.
*/
// Comment start. Eat comment.
t.read_offset += 3
_ = scan_comment(t) or_return
}
@@ -342,17 +330,13 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
}
if t.ch == close {
/*
If it's not a CDATA or comment, it's the end of this body.
*/
// If it's not a CDATA or comment, it's the end of this body.
break loop
}
advance_rune(t)
}
/*
Strip trailing whitespace.
*/
// Strip trailing whitespace.
lit := string(t.src[offset : t.offset])
end := len(lit)
@@ -369,11 +353,6 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
if consume_close {
advance_rune(t)
}
/*
TODO: Handle decoding escape characters and unboxing CDATA.
*/
return lit, err
}
@@ -384,7 +363,7 @@ peek :: proc(t: ^Tokenizer) -> (token: Token) {
return token
}
scan :: proc(t: ^Tokenizer) -> Token {
scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
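// `multiline_string` is forwarded to `scan_string`; `parse_attribute` sets it so that
// quoted attribute values may span line ends.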
skip_whitespace(t)
offset := t.offset
@@ -418,7 +397,7 @@ scan :: proc(t: ^Tokenizer) -> Token {
case '"', '\'':
kind = .Invalid
lit, err = scan_string(t, t.offset, ch, true, false)
lit, err = scan_string(t, t.offset, ch, true, multiline_string)
if err == .None {
kind = .String
}
@@ -435,4 +414,4 @@ scan :: proc(t: ^Tokenizer) -> Token {
lit = string(t.src[offset : t.offset])
}
return Token{kind, lit, pos}
}

View File

@@ -203,9 +203,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
doc.elements = make([dynamic]Element, 1024, 1024, allocator)
// strings.intern_init(&doc.intern, allocator, allocator)
err = .Unexpected_Token
element, parent: Element_ID
open: Token
@@ -259,8 +257,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
case .Slash:
// Empty tag. Close it.
expect(t, .Gt) or_return
parent = doc.elements[element].parent
element = parent
case:
error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
@@ -276,8 +274,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
return doc, .Mismatched_Closing_Tag
}
parent = doc.elements[element].parent
element = parent
} else if open.kind == .Exclaim {
// <!
@@ -463,8 +461,8 @@ validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
return validated, .None
}
expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
tok = scan(t)
expect :: proc(t: ^Tokenizer, kind: Token_Kind, multiline_string := false) -> (tok: Token, err: Error) {
tok = scan(t, multiline_string=multiline_string)
if tok.kind == kind { return tok, .None }
error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
@@ -480,7 +478,13 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: E
offset = t.offset - len(key.text)
_ = expect(t, .Eq) or_return
value := expect(t, .String) or_return
value := expect(t, .String, multiline_string=true) or_return
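// Normalize line ends in the attribute value per XML 1.1; the normalized copy is
// appended to `strings_to_free` so it is released together with the document.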
normalized, normalize_err := entity.decode_xml(value.text, {.Normalize_Whitespace}, doc.allocator)
if normalize_err == .None {
append(&doc.strings_to_free, normalized)
value.text = normalized
}
attr.key = key.text
attr.val = value.text

View File

@@ -36,7 +36,7 @@ xml_test_utf8_normal :: proc(t: ^testing.T) {
},
expected_doctype = "恥ずべきフクロウ",
},
crc32 = 0xe9b62f03,
crc32 = 0xefa55f27,
})
}
@@ -52,7 +52,7 @@ xml_test_utf8_unbox_cdata :: proc(t: ^testing.T) {
},
expected_doctype = "恥ずべきフクロウ",
},
crc32 = 0x9c2643ed,
crc32 = 0x2dd27770,
})
}
@@ -128,7 +128,7 @@ xml_test_entities_unbox :: proc(t: ^testing.T) {
},
expected_doctype = "html",
},
crc32 = 0x3b6d4a90,
crc32 = 0x350ca83e,
})
}
@@ -142,7 +142,7 @@ xml_test_entities_unbox_decode :: proc(t: ^testing.T) {
},
expected_doctype = "html",
},
crc32 = 0x5be2ffdc,
crc32 = 0x7f58db7d,
})
}
@@ -172,7 +172,7 @@ xml_test_unicode :: proc(t: ^testing.T) {
expected_doctype = "",
},
err = .None,
crc32 = 0x0b6100ab,
crc32 = 0x73070b55,
})
}