mirror of
https://github.com/odin-lang/Odin.git
synced 2026-06-12 05:18:09 +00:00
[core:encoding/entity] Add new package to decode &<entity>; entities.
Includes a generator that builds the lookup table for named entities.
This commit is contained in:
358
core/encoding/entity/entity.odin
Normal file
358
core/encoding/entity/entity.odin
Normal file
@@ -0,0 +1,358 @@
|
||||
package unicode_entity
|
||||
/*
|
||||
A unicode entity encoder/decoder
|
||||
|
||||
Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
|
||||
Made available under Odin's BSD-3 license.
|
||||
|
||||
This code has several procedures to map unicode runes to/from different textual encodings.
|
||||
- SGML/XML/HTML entity
|
||||
-- &#<decimal>;
|
||||
-- &#x<hexadecimal>;
|
||||
-- &<entity name>; (If the lookup tables are compiled in).
|
||||
Reference: https://www.w3.org/2003/entities/2007xml/unicode.xml
|
||||
|
||||
- URL encode / decode %hex entity
|
||||
Reference: https://datatracker.ietf.org/doc/html/rfc3986/#section-2.1
|
||||
|
||||
List of contributors:
|
||||
Jeroen van Rijn: Initial implementation.
|
||||
*/
|
||||
|
||||
import "core:unicode/utf8"
|
||||
import "core:unicode"
|
||||
import "core:strings"
|
||||
|
||||
/*
	`unicode.MAX_RUNE` as an `int`.
	Used by `xml_decode_entity` as the upper bound for numeric character references.
*/
MAX_RUNE_CODEPOINT :: int(unicode.MAX_RUNE)
|
||||
|
||||
/*
	Package-local shorthand for `strings.write_rune_builder`.
*/
write_rune :: strings.write_rune_builder
|
||||
/*
	Package-local shorthand for `strings.write_string_builder`.
*/
write_string :: strings.write_string_builder
|
||||
|
||||
/*
	Errors reported by the procedures in this file.
*/
Error :: enum u8 {
	// Success.
	None = 0,
	// `advance` was handed a nil tokenizer.
	Tokenizer_Is_Nil,

	// `advance` read a NUL byte from the input.
	Illegal_NUL_Character,
	// `advance` could not decode a multi-byte UTF-8 sequence.
	Illegal_UTF_Encoding,
	// `advance` found a byte order mark anywhere other than offset 0.
	Illegal_BOM,

	// The input ended before the `]]>` closing a CDATA section was found.
	CDATA_Not_Terminated,
	// The input ended before the `-->` closing a comment was found.
	Comment_Not_Terminated,
	// The input ended before the `;` closing an `&...` entity was found.
	Invalid_Entity_Encoding,
}
|
||||
|
||||
/*
	Scanner state used to walk `src` one rune at a time (see `advance`).
*/
Tokenizer :: struct {
	r:           rune,   // Most recently read rune; `advance` sets it to -1 at end of input.
	w:           int,    // Width of `r` in bytes.

	src:         string, // The text being scanned.
	offset:      int,    // Byte offset in `src` at which `r` starts.
	read_offset: int,    // Byte offset in `src` at which the next read begins.
}
|
||||
|
||||
/*
	Delimiter that opens a CDATA section.
*/
CDATA_START :: "<![CDATA["
|
||||
/*
	Delimiter that closes a CDATA section.
*/
CDATA_END :: "]]>"
|
||||
|
||||
/*
	Delimiter that opens a comment.
*/
COMMENT_START :: "<!--"
|
||||
/*
	Delimiter that closes a comment.
*/
COMMENT_END :: "-->"
|
||||
|
||||
/*
	Switches that change how `decode_xml` treats CDATA sections and comments.

	With none of them set, CDATA sections and comments are copied to the output unchanged.
*/
XML_Decode_Option :: enum u8 {
	// Emit only the contents of a CDATA section, without its `<![CDATA[` and `]]>` delimiters.
	CDATA_Unbox,

	// Additionally decode entities found inside unboxed CDATA.
	// Has no effect unless `.CDATA_Unbox` is set as well.
	CDATA_Decode,

	// Leave comments out of the output.
	Comment_Strip,
}
|
||||
/*
	Set of `XML_Decode_Option` flags, stored in a `u8`. The empty set is the default for `decode_xml`.
*/
XML_Decode_Options :: bit_set[XML_Decode_Option; u8]
|
||||
|
||||
/*
	Decode a string that may include SGML/XML/HTML entities.

	`&#<decimal>;`, `&#x<hexadecimal>;` and `&<name>;` references are replaced by the rune they denote.
	A reference that cannot be decoded is copied through unchanged, as is an `&` that is never closed by a `;`.

	CDATA sections and comments are treated according to `options`; by default both are copied through verbatim.
	Outside of CDATA, a `<` that starts neither a CDATA section nor a comment is not written to the output,
	because the input is expected to be the body of a tag.

	Inputs:
	- input:     The text to decode.
	- options:   See `XML_Decode_Option`.
	- allocator: Used for the temporary builder and for the returned string.

	Returns:
	- decoded: The decoded text. The caller has to free it. An empty `input` returns "" without allocating.
	- err:     The first error reported by the tokenizer or by the CDATA/comment scanner, `.None` otherwise.
*/
decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) {
	context.allocator = allocator

	if len(input) == 0 { return "", .None }

	builder := strings.make_builder()
	defer strings.destroy_builder(&builder)

	t := Tokenizer{src=input}

	/*
		`in_data` is true only while we are inside a CDATA section that is being unboxed _and_ decoded.
		Every other CDATA section and every comment is consumed in full by `_handle_xml_special`.
	*/
	in_data := false

	for {
		advance(&t) or_return
		if t.r < 0 { break }

		switch {
		case t.r == '<' && !in_data:
			/*
				Might be the start of a CDATA section or a comment.
				Inside CDATA a `<` is ordinary text, which is why this case is skipped when `in_data` is set.
			*/
			in_data = _handle_xml_special(&t, &builder, options) or_return

		case t.r == ']' && in_data && strings.has_prefix(t.src[t.offset:], CDATA_END):
			/*
				End of the CDATA section we were decoding. Skip the delimiter; it is not part of the output.
				`has_prefix` also matches when `]]>` are the last bytes of the input.
			*/
			in_data = false
			t.read_offset = t.offset + len(CDATA_END)

		case t.r == '&' && (!in_data || .CDATA_Decode in options):
			entity, entity_err := _extract_xml_entity(&t)

			if entity_err != .None {
				/*
					We read to the end of the string without closing the entity.
					`entity` holds everything from the `&` onward; pass it through as-is.
				*/
				write_string(&builder, entity)
			} else if decoded_rune, ok := xml_decode_entity(entity); ok {
				write_rune(&builder, decoded_rune)
			} else {
				/*
					Decode failed. Pass through the original text, restoring the delimiters
					that `_extract_xml_entity` stripped.
				*/
				write_string(&builder, "&")
				write_string(&builder, entity)
				write_string(&builder, ";")
			}

		case:
			/*
				Ordinary text, including a `]` that does not start `]]>` and a `<` inside CDATA.
			*/
			write_rune(&builder, t.r)
		}
	}

	return strings.clone(strings.to_string(builder), allocator), .None
}
|
||||
|
||||
/*
	Read the next rune from `t.src` into `t.r` and move the tokenizer past it.

	On success `t.offset` is the byte offset of the rune, `t.w` its width in bytes,
	and `t.read_offset` the offset just past it.
	At end of input `t.r` is set to -1, `t.offset` to `len(t.src)`, and `.None` is returned.

	Returns:
	- .Tokenizer_Is_Nil      if `t` is nil.
	- .Illegal_NUL_Character if the next byte is 0.
	- .Illegal_UTF_Encoding  if a multi-byte sequence fails to decode.
	- .Illegal_BOM           if a byte order mark is found past offset 0.

	When an error is returned, `t.read_offset` has not been moved.
*/
advance :: proc(t: ^Tokenizer) -> (err: Error) {
	if t == nil { return .Tokenizer_Is_Nil }

	// Nothing left to read.
	if t.read_offset >= len(t.src) {
		t.offset = len(t.src)
		t.r      = -1
		return .None
	}

	#no_bounds_check {
		t.offset = t.read_offset

		// Assume a single-byte rune; anything at or above RUNE_SELF is decoded properly below.
		t.r, t.w = rune(t.src[t.read_offset]), 1

		if t.r == 0 {
			return .Illegal_NUL_Character
		}

		if t.r >= utf8.RUNE_SELF {
			t.r, t.w = utf8.decode_rune_in_string(t.src[t.read_offset:])

			// A one-byte-wide RUNE_ERROR is how the decoder signals a malformed sequence.
			if t.r == utf8.RUNE_ERROR && t.w == 1 {
				return .Illegal_UTF_Encoding
			}
			if t.r == utf8.RUNE_BOM && t.offset > 0 {
				return .Illegal_BOM
			}
		}

		t.read_offset += t.w
	}
	return .None
}
|
||||
|
||||
/*
	Decode the body of a single entity, i.e. the text between `&` and `;`.

	- `#<decimal>` and `#x<hexadecimal>` (or `#X...`) are parsed as numeric character references.
	  At least one digit is required, and the value may not exceed `MAX_RUNE_CODEPOINT`.
	- Anything else is looked up as a named entity through `named_xml_entity_to_rune`.

	Returns:
	- decoded: The rune the entity stands for.
	- ok:      false if the entity is empty or malformed. For numeric references `decoded` is then -1.
*/
xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
	entity := entity
	if len(entity) == 0 { return -1, false }

	if entity[0] != '#' {
		/*
			Named entity.
		*/
		return named_xml_entity_to_rune(entity)
	}

	/*
		Numeric character reference. Drop the `#` and an optional hexadecimal marker.
	*/
	entity = entity[1:]

	base := 10
	if len(entity) > 0 && (entity[0] == 'x' || entity[0] == 'X') {
		base   = 16
		entity = entity[1:]
	}

	/*
		`&#;` and `&#x;` carry no digits and must not decode to rune 0.
	*/
	if len(entity) == 0 { return -1, false }

	val := 0
	for i in 0..<len(entity) {
		c     := entity[i]
		digit := 0

		switch {
		case c >= '0' && c <= '9': digit = int(c - '0')
		case c >= 'a' && c <= 'f': digit = int(c - 'a') + 10
		case c >= 'A' && c <= 'F': digit = int(c - 'A') + 10
		case:
			return -1, false
		}

		/*
			Rejects hexadecimal letters in a decimal reference.
		*/
		if digit >= base { return -1, false }

		/*
			Checked on every digit so `val` cannot overflow on long digit strings.
		*/
		val = val * base + digit
		if val > MAX_RUNE_CODEPOINT { return -1, false }
	}
	return rune(val), true
}
|
||||
|
||||
/*
	Private XML helper that pulls one `&<stuff>;` entity out of the input.

	Expects the tokenizer to be positioned on the `&`. Scans forward byte by byte for the closing `;`;
	that is safe on UTF-8 input because the only byte we compare against is ASCII.

	Returns:
	- The text between `&` and `;` and `.None` when a `;` was found.
	  `t.read_offset` then points just past the `;`.
	- Everything from the `&` to the end of the input and `.Invalid_Entity_Encoding` otherwise.
	  `t.read_offset` then equals `len(t.src)`.
*/
@(private="file")
_extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
	assert(t != nil && t.r == '&')

	semicolon := -1

	#no_bounds_check {
		for i := t.read_offset; i < len(t.src); i += 1 {
			if t.src[i] == ';' {
				semicolon = i
				break
			}
		}
	}

	if semicolon < 0 {
		// Never closed: consume the rest of the input and hand it back including the `&`.
		t.read_offset = len(t.src)
		return string(t.src[t.offset : t.read_offset]), .Invalid_Entity_Encoding
	}

	t.read_offset = semicolon + 1
	return string(t.src[t.offset + 1 : semicolon]), .None
}
|
||||
|
||||
/*
	Private XML helper for CDATA and comments.

	Expects the tokenizer to be positioned on a `<`.

	- CDATA section, with both `.CDATA_Unbox` and `.CDATA_Decode` set:
	  only the opening delimiter is consumed and `in_data = true` is returned,
	  so the caller decodes the contents and looks for the closing `]]>` itself.
	- CDATA section otherwise: the whole section is consumed and written to `builder`,
	  without its delimiters if `.CDATA_Unbox` is set.
	- Comment: the whole comment is consumed, and written to `builder` unless `.Comment_Strip` is set.
	- Anything else: nothing is consumed beyond the `<` and nothing is written.

	Delimiters are matched with `strings.has_prefix`, so sections that start or end
	right at the end of the input are recognised as well.

	Returns:
	- in_data: true if the caller is now inside a CDATA section it has to decode,
	           or if the input ended inside a section.
	- err:     `.CDATA_Not_Terminated` / `.Comment_Not_Terminated` if the closing delimiter is missing,
	           or whatever error `advance` reported.
*/
@(private="file")
_handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) {
	assert(t != nil && t.r == '<')

	start := t.offset
	rest  := t.src[start:]

	switch {
	case strings.has_prefix(rest, CDATA_START):
		t.read_offset = start + len(CDATA_START)

		if .CDATA_Unbox in options && .CDATA_Decode in options {
			/*
				We're unboxing _and_ decoding CDATA. The caller takes it from here.
			*/
			return true, .None
		}

		/*
			CDATA is passed through. Scan until the end of the section.
		*/
		for {
			advance(t) or_return
			if t.r < 0 { return true, .CDATA_Not_Terminated }

			if t.r == ']' && strings.has_prefix(t.src[t.offset:], CDATA_END) {
				t.read_offset = t.offset + len(CDATA_END)

				cdata := string(t.src[start : t.read_offset])
				if .CDATA_Unbox in options {
					cdata = cdata[len(CDATA_START) : len(cdata) - len(CDATA_END)]
				}

				write_string(builder, cdata)
				return false, .None
			}
		}

	case strings.has_prefix(rest, COMMENT_START):
		/*
			Resume right after `<!--`, so that the empty comment `<!---->` terminates correctly.
		*/
		t.read_offset = start + len(COMMENT_START)

		/*
			Comment is passed through by default. Scan until the end of the comment.
		*/
		for {
			advance(t) or_return
			if t.r < 0 { return true, .Comment_Not_Terminated }

			if t.r == '-' && strings.has_prefix(t.src[t.offset:], COMMENT_END) {
				t.read_offset = t.offset + len(COMMENT_END)

				if .Comment_Strip not_in options {
					write_string(builder, string(t.src[start : t.read_offset]))
				}
				return false, .None
			}
		}
	}

	/*
		Neither CDATA nor a comment.
	*/
	return false, .None
}
|
||||
Reference in New Issue
Block a user