diff --git a/core/encoding/entity/entity.odin b/core/encoding/entity/entity.odin
index cee6230ef..280be9377 100644
--- a/core/encoding/entity/entity.odin
+++ b/core/encoding/entity/entity.odin
@@ -56,38 +56,27 @@
 CDATA_END     :: "]]>"
 
 COMMENT_START :: "<!--"
 COMMENT_END   :: "-->"
 
-/*
-	Default: CDATA and comments are passed through unchanged.
-*/
+// Default: CDATA and comments are passed through unchanged.
 XML_Decode_Option :: enum u8 {
-	/*
-		Do not decode & entities. It decodes by default.
-		If given, overrides `Decode_CDATA`.
-	*/
+	// Do not decode & entities. It decodes by default. If given, overrides `Decode_CDATA`.
 	No_Entity_Decode,
 
-	/*
-		CDATA is unboxed.
-	*/
+	// CDATA is unboxed.
 	Unbox_CDATA,
 
-	/*
-		Unboxed CDATA is decoded as well.
-		Ignored if `.Unbox_CDATA` is not given.
-	*/
+	// Unboxed CDATA is decoded as well. Ignored if `.Unbox_CDATA` is not given.
 	Decode_CDATA,
 
-	/*
-		Comments are stripped.
-	*/
+	// Comments are stripped.
 	Comment_Strip,
+
+	// Normalize whitespace.
+	Normalize_Whitespace,
 }
 XML_Decode_Options :: bit_set[XML_Decode_Option; u8]
 
-/*
-	Decode a string that may include SGML/XML/HTML entities.
-	The caller has to free the result.
-*/
+// Decode a string that may include SGML/XML/HTML entities.
+// The caller has to free the result.
 decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) {
 	context.allocator = allocator
@@ -100,14 +89,14 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 	t := Tokenizer{src=input}
 
 	in_data := false
+	prev: rune
+
 	loop: for {
 		advance(&t) or_return
 		if t.r < 0 { break loop }
 
-		/*
-			Below here we're never inside a CDATA tag.
-			At most we'll see the start of one, but that doesn't affect the logic.
-		*/
+		// Below here we're never inside a CDATA tag. At most we'll see the start of one,
+		// but that doesn't affect the logic.
 		switch t.r {
 		case '<':
 			/*
@@ -126,9 +115,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 			in_data = _handle_xml_special(&t, &builder, options) or_return
 
 		case ']':
-			/*
-				If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
-			*/
+			// If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
 			if in_data {
 				if t.read_offset + len(CDATA_END) < len(t.src) {
 					if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
@@ -143,22 +130,16 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 		case:
 			if in_data && .Decode_CDATA not_in options {
-				/*
-					Unboxed, but undecoded.
-				*/
+				// Unboxed, but undecoded.
 				write_rune(&builder, t.r)
 				continue
 			}
 
 			if t.r == '&' {
 				if entity, entity_err := _extract_xml_entity(&t); entity_err != .None {
-					/*
-						We read to the end of the string without closing the entity.
-						Pass through as-is.
-					*/
+					// We read to the end of the string without closing the entity. Pass through as-is.
 					write_string(&builder, entity)
 				} else {
-
 					if .No_Entity_Decode not_in options {
 						if decoded, ok := xml_decode_entity(entity); ok {
 							write_rune(&builder, decoded)
@@ -166,19 +147,27 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 					}
 				}
 
-				/*
-					Literal passthrough because the decode failed or we want entities not decoded.
-				*/
+				// Literal passthrough because the decode failed or we want entities not decoded.
 				write_string(&builder, "&")
 				write_string(&builder, entity)
 				write_string(&builder, ";")
 			}
 		} else {
-			write_rune(&builder, t.r)
+			// https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-line-ends
+			switch t.r {
+			case '\n', 0x85, 0x2028:
+				write_rune(&builder, '\n')
+			case '\r': // Do nothing until the next character.
+			case:
+				if prev == '\r' { // Turn a single carriage return into a \n.
+					write_rune(&builder, '\n')
+				}
+				write_rune(&builder, t.r)
+			}
+			prev = t.r
 		}
 		}
 	}
-
 	return strings.clone(strings.to_string(builder), allocator), err
 }
@@ -253,24 +242,18 @@ xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
 		return rune(val), true
 
 	case:
-		/*
-			Named entity.
-		*/
+		// Named entity.
 		return named_xml_entity_to_rune(entity)
 	}
 }
 
-/*
-	Private XML helper to extract `&…;` entity.
-*/
+// Private XML helper to extract `&…;` entity.
 @(private="file")
 _extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
 	assert(t != nil && t.r == '&')
 
-	/*
-		All of these would be in the ASCII range.
-		Even if one is not, it doesn't matter. All characters we need to compare to extract are.
-	*/
+	// All of these would be in the ASCII range.
+	// Even if one is not, it doesn't matter. All characters we need to compare to extract are.
 	length := len(t.src)
 
 	found := false
@@ -292,9 +275,7 @@ _extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
 	return string(t.src[t.offset : t.read_offset]), .Invalid_Entity_Encoding
 }
 
-/*
-	Private XML helper for CDATA and comments.
-*/
+// Private XML helper for CDATA and comments.
 @(private="file")
 _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) {
 	assert(t != nil && t.r == '<')
@@ -304,20 +285,14 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
 		t.read_offset += len(CDATA_START) - 1
 
 		if .Unbox_CDATA in options && .Decode_CDATA in options {
-			/*
-				We're unboxing _and_ decoding CDATA.
-			*/
+			// We're unboxing _and_ decoding CDATA.
 			return true, .None
 		}
 
-		/*
-			CDATA is passed through.
-		*/
+		// CDATA is passed through.
 		offset := t.offset
 
-		/*
-			Scan until end of CDATA.
-		*/
+		// Scan until end of CDATA.
 		for {
 			advance(t) or_return
 			if t.r < 0 { return true, .CDATA_Not_Terminated }
@@ -341,14 +316,10 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
 	} else if string(t.src[t.offset:][:len(COMMENT_START)]) == COMMENT_START {
 		t.read_offset += len(COMMENT_START)
 
-		/*
-			Comment is passed through by default.
-		*/
+		// Comment is passed through by default.
 		offset := t.offset
 
-		/*
-			Scan until end of comment.
-		*/
+		// Scan until end of comment.
 		for {
 			advance(t) or_return
 			if t.r < 0 { return true, .Comment_Not_Terminated }
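
Annotation, not part of the patch: the new `switch` above implements XML 1.1 §2.11 line-end normalization, folding CR, CRLF, NEL (U+0085), and LS (U+2028) into a single LF. A minimal Odin sketch of the observable behavior, assuming the normalization path runs for plain character data exactly as shown:

package example

import "core:encoding/entity"
import "core:fmt"

main :: proc() {
	// CR, CRLF, NEL (U+0085), and LS (U+2028) should each come out as one '\n'.
	input := "a\r\nb\rc\u0085d\u2028e"
	decoded, err := entity.decode_xml(input, {.Normalize_Whitespace})
	defer delete(decoded)
	fmt.println(err == .None && decoded == "a\nb\nc\nd\ne") // Expected: true
}
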
diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin
index 0f87c366b..2d06038b7 100644
--- a/core/encoding/xml/tokenizer.odin
+++ b/core/encoding/xml/tokenizer.odin
@@ -218,9 +218,7 @@ scan_identifier :: proc(t: ^Tokenizer) -> string {
 	for is_valid_identifier_rune(t.ch) {
 		advance_rune(t)
 		if t.ch == ':' {
-			/*
-				A namespaced attr can have at most two parts, `namespace:ident`.
-			*/
+			// A namespaced attr can have at most two parts, `namespace:ident`.
 			if namespaced {
 				break
 			}
@@ -268,14 +266,10 @@ scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
 	return string(t.src[offset : t.offset - 1]), .None
 }
 
-/*
-	Skip CDATA
-*/
+// Skip CDATA.
 skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
 	if t.read_offset + len(CDATA_START) >= len(t.src) {
-		/*
-			Can't be the start of a CDATA tag.
-		*/
+		// Can't be the start of a CDATA tag.
 		return .None
 	}
@@ -290,9 +284,7 @@ skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
 		return .Premature_EOF
 	}
 
-	/*
-		Scan until the end of a CDATA tag.
-	*/
+	// Scan until the end of a CDATA tag.
 	if t.read_offset + len(CDATA_END) < len(t.src) {
 		if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
 			t.read_offset += len(CDATA_END)
@@ -319,14 +311,10 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
 		case '<':
 			if peek_byte(t) == '!' {
 				if peek_byte(t, 1) == '[' {
-					/*
-						Might be the start of a CDATA tag.
-					*/
+					// Might be the start of a CDATA tag.
 					skip_cdata(t) or_return
 				} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
-					/*
-						Comment start. Eat comment.
-					*/
+					// Comment start. Eat comment.
 					t.read_offset += 3
 					_ = scan_comment(t) or_return
 				}
@@ -342,17 +330,13 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
 		}
 
 		if t.ch == close {
-			/*
-				If it's not a CDATA or comment, it's the end of this body.
-			*/
+			// If it's not a CDATA or comment, it's the end of this body.
 			break loop
 		}
 		advance_rune(t)
 	}
 
-	/*
-		Strip trailing whitespace.
-	*/
+	// Strip trailing whitespace.
 	lit := string(t.src[offset : t.offset])
 
 	end := len(lit)
@@ -369,11 +353,6 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
 	if consume_close {
 		advance_rune(t)
 	}
-
-	/*
-		TODO: Handle decoding escape characters and unboxing CDATA.
-	*/
-
 	return lit, err
 }
 
@@ -384,7 +363,7 @@ peek :: proc(t: ^Tokenizer) -> (token: Token) {
 	return token
 }
 
-scan :: proc(t: ^Tokenizer) -> Token {
+scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
 	skip_whitespace(t)
 
 	offset := t.offset
@@ -418,7 +397,7 @@ scan :: proc(t: ^Tokenizer) -> Token {
 	case '"', '\'':
 		kind = .Invalid
 
-		lit, err = scan_string(t, t.offset, ch, true, false)
+		lit, err = scan_string(t, t.offset, ch, true, multiline_string)
 		if err == .None {
 			kind = .String
 		}
@@ -435,4 +414,4 @@ scan :: proc(t: ^Tokenizer) -> Token {
 		lit = string(t.src[offset : t.offset])
 	}
 	return Token{kind, lit, pos}
-}
+}
\ No newline at end of file
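
Annotation, not part of the patch: with `multiline_string` threaded from `scan` into `scan_string`, a quoted attribute value may now contain a line break without the scan stopping short. A rough sketch at the package level, using the public `xml.parse_bytes` and `xml.destroy`:

package example

import "core:encoding/xml"
import "core:fmt"

main :: proc() {
	// The note attribute's value spans two lines.
	data := "<root note=\"line one\nline two\"/>"
	doc, err := xml.parse_bytes(transmute([]u8)data)
	if err == .None {
		defer xml.destroy(doc)
		fmt.println("parsed OK") // Expected, now that multi-line strings are scanned whole.
	}
}
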
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin
index 5b4b12948..b9656900f 100644
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -203,9 +203,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 	doc.elements = make([dynamic]Element, 1024, 1024, allocator)
 
-	// strings.intern_init(&doc.intern, allocator, allocator)
-
-	err      = .Unexpected_Token
+	err = .Unexpected_Token
 
 	element, parent: Element_ID
 	open: Token
@@ -259,8 +257,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 			case .Slash:
 				// Empty tag. Close it.
 				expect(t, .Gt) or_return
-				parent = doc.elements[element].parent
-				element = parent
+				parent  = doc.elements[element].parent
+				element = parent
 
 			case:
 				error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
@@ -276,8 +274,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 					error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
 					return doc, .Mismatched_Closing_Tag
 				}
-				parent = doc.elements[element].parent
-				element = parent
+				parent  = doc.elements[element].parent
+				element = parent
 			} else if open.kind == .Exclaim {
 				// <!
@@ … @@ … -> (validated: Options, err: Error) {
 	return validated, .None
 }
 
-expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
-	tok = scan(t)
+expect :: proc(t: ^Tokenizer, kind: Token_Kind, multiline_string := false) -> (tok: Token, err: Error) {
+	tok = scan(t, multiline_string=multiline_string)
 	if tok.kind == kind { return tok, .None }
 
 	error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
@@ -480,7 +478,13 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: E
 	offset = t.offset - len(key.text)
 	_ = expect(t, .Eq) or_return
 
-	value := expect(t, .String) or_return
+	value := expect(t, .String, multiline_string=true) or_return
+
+	normalized, normalize_err := entity.decode_xml(value.text, {.Normalize_Whitespace}, doc.allocator)
+	if normalize_err == .None {
+		append(&doc.strings_to_free, normalized)
+		value.text = normalized
+	}
 
 	attr.key = key.text
 	attr.val = value.text
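
Annotation, not part of the patch: end to end, `parse_attribute` now scans a multi-line value and stores a normalized copy, so a CRLF inside an attribute arrives as a single '\n'. A sketch, assuming the package's `find_attribute_val_by_key` helper and that element 0 is the document root:

package example

import "core:encoding/xml"
import "core:fmt"

main :: proc() {
	data := "<cfg path=\"a\r\nb\"/>"
	doc, err := xml.parse_bytes(transmute([]u8)data)
	if err != .None { return }
	defer xml.destroy(doc)

	if val, found := xml.find_attribute_val_by_key(doc, 0, "path"); found {
		fmt.println(val == "a\nb") // Expected: true; the CRLF was normalized.
	}
}
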
diff --git a/tests/core/encoding/xml/test_core_xml.odin b/tests/core/encoding/xml/test_core_xml.odin
index 22852d1f3..09d1a4611 100644
--- a/tests/core/encoding/xml/test_core_xml.odin
+++ b/tests/core/encoding/xml/test_core_xml.odin
@@ -36,7 +36,7 @@ xml_test_utf8_normal :: proc(t: ^testing.T) {
 		},
 		expected_doctype = "恥ずべきフクロウ",
 	},
-	crc32 = 0xe9b62f03,
+	crc32 = 0xefa55f27,
 })
 }
 
@@ -52,7 +52,7 @@ xml_test_utf8_unbox_cdata :: proc(t: ^testing.T) {
 		},
 		expected_doctype = "恥ずべきフクロウ",
 	},
-	crc32 = 0x9c2643ed,
+	crc32 = 0x2dd27770,
 })
 }
 
@@ -128,7 +128,7 @@ xml_test_entities_unbox :: proc(t: ^testing.T) {
 		},
 		expected_doctype = "html",
 	},
-	crc32 = 0x3b6d4a90,
+	crc32 = 0x350ca83e,
 })
 }
 
@@ -142,7 +142,7 @@ xml_test_entities_unbox_decode :: proc(t: ^testing.T) {
 		},
 		expected_doctype = "html",
 	},
-	crc32 = 0x5be2ffdc,
+	crc32 = 0x7f58db7d,
 })
 }
 
@@ -172,7 +172,7 @@ xml_test_unicode :: proc(t: ^testing.T) {
 		expected_doctype = "",
 	},
 	err = .None,
-	crc32 = 0x0b6100ab,
+	crc32 = 0x73070b55,
 })
 }
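
Annotation, not part of the patch: the expected CRC32 values change because decoded output now carries normalized LF line ends. If a test needs a new expected value, it can be recomputed over whatever string the harness hashes; `serialized` below is a hypothetical stand-in for that string:

package example

import "core:fmt"
import "core:hash"

main :: proc() {
	serialized := "<the string the test harness hashes, elided>"
	fmt.printf("crc32 = 0x%08x,\n", hash.crc32(transmute([]u8)serialized))
}
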