/*
	Encode and decode `rune`s to/from a Unicode `&entity;`.

	This package has several procedures to map Unicode runes to/from different textual encodings:
	- SGML/XML/HTML entities:
		- `&#<decimal>;`
		- `&#x<hexadecimal>;`
		- `&<entity name>;` (if the lookup tables are compiled in).
		Reference: [[ https://www.w3.org/2003/entities/2007xml/unicode.xml ]]
	- URL `%hex` encoding / decoding.
		Reference: [[ https://datatracker.ietf.org/doc/html/rfc3986/#section-2.1 ]]
*/
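/*
	For example, the rune '&' (U+0026) appears as follows in these encodings (illustrative):
	- `&#38;`  decimal entity
	- `&#x26;` hexadecimal entity
	- `&amp;`  named entity (requires the lookup tables)
	- `%26`    URL percent-encoding
*/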
package encoding_unicode_entity
/*
	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
	Made available under Odin's license.

	List of contributors:
		Jeroen van Rijn: Initial implementation.
*/
import "core:unicode/utf8"
import "core:unicode"
import "core:strings"
MAX_RUNE_CODEPOINT :: int(unicode.MAX_RUNE)
write_rune :: strings.write_rune
write_string :: strings.write_string
Error :: enum u8 {
	None = 0,
	Tokenizer_Is_Nil,
	Illegal_NUL_Character,
	Illegal_UTF_Encoding,
	Illegal_BOM,
	CDATA_Not_Terminated,
	Comment_Not_Terminated,
	Invalid_Entity_Encoding,
}
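// Tokenizer state for scanning `src`: `r` is the current rune and `w` its width in bytes,
// `offset` is the byte offset of `r` within `src`, and `read_offset` is the byte offset
// of the next rune to be read.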
Tokenizer :: struct {
	r:           rune,
	w:           int,
	src:         string,
	offset:      int,
	read_offset: int,
}
CDATA_START   :: "<![CDATA["
CDATA_END     :: "]]>"
COMMENT_START :: "<!--"
COMMENT_END   :: "-->"
// Default: CDATA and comments are passed through unchanged.
XML_Decode_Option :: enum u8 {
	// Do not decode `&entity;` references (decoding is the default). If given, this overrides `Decode_CDATA`.
	No_Entity_Decode,
	// CDATA tags are unboxed, i.e. their `<![CDATA[` and `]]>` wrappers are removed.
	Unbox_CDATA,
	// Unboxed CDATA is decoded as well. Ignored if `.Unbox_CDATA` is not given.
	Decode_CDATA,
	// Comments are stripped.
	Comment_Strip,
	// Runs of whitespace are collapsed into a single space (XML attribute-value normalization).
	Normalize_Whitespace,
}
XML_Decode_Options :: bit_set[XML_Decode_Option; u8]
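/*
	A minimal sketch of composing decode options (illustrative):

		options := XML_Decode_Options{.Unbox_CDATA, .Decode_CDATA, .Comment_Strip}

	Here `.Decode_CDATA` only takes effect because `.Unbox_CDATA` is also present,
	and adding `.No_Entity_Decode` would override `.Decode_CDATA` again.
*/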
// Decode a string that may include SGML/XML/HTML entities.
// The result is allocated using `allocator`; the caller has to free it.
decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) {
	context.allocator = allocator

	l := len(input)
	if l == 0 { return "", .None }

	builder := strings.builder_make()
	defer strings.builder_destroy(&builder)

	t := Tokenizer{src = input}

	in_data := false
	prev: rune = ' '

	loop: for {
		advance(&t) or_return
		if t.r < 0 { break loop }

		// Below here we're never inside a CDATA tag. At most we'll see the start of one,
		// but that doesn't affect the logic.
		switch t.r {
		case '<':
			/*
				Might be the start of a CDATA tag or comment.

				We don't need to check if we need to write a `<`, because if it isn't CDATA or a comment,
				it couldn't have been part of an XML tag body to be decoded here.

				Keep in mind that we could already *be* inside a CDATA tag.
				If so, write `<` as a literal and continue.
			*/
			if in_data {
				write_rune(&builder, '<')
				continue
			}
			in_data = _handle_xml_special(&t, &builder, options) or_return

		case ']':
			// If we're unboxing _and_ decoding CDATA, we have to check for the end tag.
			if in_data {
				if strings.has_prefix(t.src[t.offset:], CDATA_END) {
					in_data = false
					t.read_offset += len(CDATA_END) - 1
				}
				continue
			} else {
				write_rune(&builder, ']')
			}

		case:
			if in_data && .Decode_CDATA not_in options {
				// Unboxed, but undecoded.
				write_rune(&builder, t.r)
				continue
			}

			if t.r == '&' {
				if entity, entity_err := _extract_xml_entity(&t); entity_err != .None {
					// We read to the end of the string without closing the entity. Pass it through as-is.
					write_string(&builder, entity)
				} else {
					if .No_Entity_Decode not_in options {
						if decoded, ok := xml_decode_entity(entity); ok {
							write_rune(&builder, decoded)
							continue
						}
					}
					// Literal passthrough, because the decode failed or entity decoding is disabled.
					write_string(&builder, "&")
					write_string(&builder, entity)
					write_string(&builder, ";")
				}
			} else {
				// Handle attribute-value normalization: https://www.w3.org/TR/2006/REC-xml11-20060816/#AVNormalize
				if .Normalize_Whitespace in options {
					switch t.r {
					case ' ', '\r', '\n', '\t':
						if prev != ' ' {
							write_rune(&builder, ' ')
							prev = ' '
						}
					case:
						write_rune(&builder, t.r)
						prev = t.r
					}
				} else {
					// Line-end handling: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-line-ends
					switch t.r {
					case '\n', 0x85, 0x2028:
						write_rune(&builder, '\n')
					case '\r':
						// Do nothing until the next character.
					case:
						if prev == '\r' {
							// Turn a bare carriage return into a `\n`.
							write_rune(&builder, '\n')
						}
						write_rune(&builder, t.r)
					}
					prev = t.r
				}
			}
		}
	}
	return strings.clone(strings.to_string(builder), allocator), err
}
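/*
	A minimal usage sketch of `decode_xml` (illustrative; the input string is hypothetical):

		decoded, err := decode_xml("Fish &#38; Chips &#x1F41F;")
		defer delete(decoded)
		// On success, err == .None and decoded == "Fish & Chips 🐟".
*/

// Advance the tokenizer by one rune, validating the input as we go:
// NUL bytes, malformed UTF-8 and a non-leading BOM are reported as errors.
// At the end of the input, `t.r` is set to -1.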
advance :: proc(t: ^Tokenizer) -> (err: Error) {
	if t == nil { return .Tokenizer_Is_Nil }

	#no_bounds_check {
		if t.read_offset < len(t.src) {
			t.offset = t.read_offset
			t.r, t.w = rune(t.src[t.read_offset]), 1

			switch {
			case t.r == 0:
				return .Illegal_NUL_Character
			case t.r >= utf8.RUNE_SELF:
				t.r, t.w = utf8.decode_rune_in_string(t.src[t.read_offset:])
				if t.r == utf8.RUNE_ERROR && t.w == 1 {
					return .Illegal_UTF_Encoding
				} else if t.r == utf8.RUNE_BOM && t.offset > 0 {
					return .Illegal_BOM
				}
			}
			t.read_offset += t.w
			return .None
		} else {
			t.offset = len(t.src)
			t.r = -1
			return
		}
	}
}
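// Decode the body of a single entity, i.e. the part between `&` and `;`, into a rune.
// For example (illustrative), `xml_decode_entity("#65")` and `xml_decode_entity("#x41")`
// both return ('A', true); malformed numeric entities return (-1, false), and anything
// else is looked up as a named entity.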
xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
	entity := entity
	if len(entity) == 0 { return -1, false }

	switch entity[0] {
	case '#':
		base := 10
		val  := 0

		entity = entity[1:]
		if len(entity) == 0 { return -1, false }

		if entity[0] == 'x' || entity[0] == 'X' {
			base = 16
			entity = entity[1:]
		}

		for len(entity) > 0 {
			r := entity[0]
			switch r {
			case '0'..='9':
				val *= base
				val += int(r - '0')
			case 'a'..='f':
				if base == 10 { return -1, false }
				val *= base
				val += int(r - 'a' + 10)
			case 'A'..='F':
				if base == 10 { return -1, false }
				val *= base
				val += int(r - 'A' + 10)
			case:
				return -1, false
			}
			if val > MAX_RUNE_CODEPOINT { return -1, false }
			entity = entity[1:]
		}
		return rune(val), true

	case:
		// Named entity.
		return named_xml_entity_to_rune(entity)
	}
}
// Private XML helper to extract `&<stuff>;` entity.
@(private="file")
_extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
	assert(t != nil && t.r == '&')

	// The delimiters we scan for are in the ASCII range, so even if the entity body
	// contains multi-byte runes, a plain byte-wise scan for `;` is safe here.
	length := len(t.src)
	found := false

	#no_bounds_check {
		for t.read_offset < length {
			if t.src[t.read_offset] == ';' {
				t.read_offset += 1
				found = true
				break
			}
			t.read_offset += 1
		}
	}

	if found {
		return string(t.src[t.offset + 1 : t.read_offset - 1]), .None
	}
	return string(t.src[t.offset : t.read_offset]), .Invalid_Entity_Encoding
}
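// For example (illustrative): with `t.src == "&amp; more"` and the tokenizer positioned
// on the `&`, this returns ("amp", .None) and leaves `t.read_offset` just past the `;`.
// Without a closing `;`, the remainder is returned with `.Invalid_Entity_Encoding`.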
// Private XML helper for CDATA and comments.
@(private="file")
_handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) {
	assert(t != nil && t.r == '<')
	if t.read_offset + len(CDATA_START) >= len(t.src) { return false, .None }

	s := string(t.src[t.offset:])
	if strings.has_prefix(s, CDATA_START) {
		if .Unbox_CDATA in options && .Decode_CDATA in options {
			// We're unboxing _and_ decoding CDATA.
			t.read_offset += len(CDATA_START) - 1
			return true, .None
		}

		// CDATA is passed through. Scan until the end of the CDATA tag.
		start_offset := t.offset
		t.read_offset += len(CDATA_START)

		for {
			advance(t) or_return
			if t.r < 0 {
				return true, .CDATA_Not_Terminated
			}

			// Scan until the end of a CDATA tag.
			if s = string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) {
				t.read_offset += len(CDATA_END)
				cdata := string(t.src[start_offset:t.read_offset])

				if .Unbox_CDATA in options {
					cdata = cdata[len(CDATA_START):]
					cdata = cdata[:len(cdata) - len(CDATA_END)]
				}
				write_string(builder, cdata)
				return false, .None
			}
		}
	} else if strings.has_prefix(s, COMMENT_START) {
		t.read_offset += len(COMMENT_START)

		// The comment is passed through by default.
		offset := t.offset

		// Scan until the end of the comment.
		for {
			advance(t) or_return
			if t.r < 0 { return true, .Comment_Not_Terminated }

			if t.offset + len(COMMENT_END) <= len(t.src) {
				if string(t.src[t.offset:][:len(COMMENT_END)]) == COMMENT_END {
					t.read_offset += len(COMMENT_END) - 1
					if .Comment_Strip not_in options {
						comment := string(t.src[offset : t.read_offset])
						write_string(builder, comment)
					}
					return false, .None
				}
			}
		}
	}
	return false, .None
}
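/*
	A minimal sketch of how `_handle_xml_special` interacts with the options (illustrative):

		input := "<![CDATA[1 &lt; 2]]>"

	- default options:               passed through unchanged, wrappers included.
	- {.Unbox_CDATA}:                yields "1 &lt; 2" (wrappers removed, entities kept).
	- {.Unbox_CDATA, .Decode_CDATA}: yields "1 < 2", assuming the named-entity tables are compiled in.
*/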