package encoding_xml

/*
	An XML 1.0 / 1.1 parser

	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
	Made available under Odin's BSD-3 license.

	A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).

	List of contributors:
		Jeroen van Rijn: Initial implementation.
*/

import "core:fmt"
import "core:unicode"
import "core:unicode/utf8"

Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
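
// A custom Error_Handler can redirect diagnostics; a minimal sketch (the name
// `log_handler` is illustrative, not part of this package):
//
//	log_handler :: proc(pos: Pos, msg: string, args: ..any) {
//		fmt.eprintf("[xml] %s:%d:%d: ", pos.file, pos.line, pos.column)
//		fmt.eprintf(msg, ..args)
//		fmt.eprintln()
//	}
//
// Pass it to `init` in place of `default_error_handler`.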

Token :: struct {
	kind: Token_Kind,
	text: string,
	pos:  Pos,
}

Pos :: struct {
	file:   string,
	offset: int, // starting at 0
	line:   int, // starting at 1
	column: int, // starting at 1
}

Token_Kind :: enum {
	Invalid,

	Ident,
	Literal,
	Rune,
	String,

	Double_Quote, // "
	Single_Quote, // '
	Colon,        // :

	Eq,       // =
	Lt,       // <
	Gt,       // >
	Exclaim,  // !
	Question, // ?
	Hash,     // #
	Slash,    // /
	Dash,     // -

	Open_Bracket,  // [
	Close_Bracket, // ]

	EOF,
}

CDATA_START :: "<![CDATA["
CDATA_END   :: "]]>"

COMMENT_START :: "<!--"
COMMENT_END   :: "-->"

Tokenizer :: struct {
	// Immutable data
	path: string,
	src:  string,
	err:  Error_Handler,

	// Tokenizing state
	ch:          rune,
	offset:      int,
	read_offset: int,
	line_offset: int,
	line_count:  int,

	// Mutable data
	error_count: int,
}

init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
	t.src         = src
	t.err         = err
	t.ch          = ' '
	t.offset      = 0
	t.read_offset = 0
	t.line_offset = 0
	t.line_count  = len(src) > 0 ? 1 : 0
	t.error_count = 0
	t.path        = path

	advance_rune(t)
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t)
	}
}
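
// Typical use, as a sketch (the XML literal and loop are illustrative only):
//
//	t: Tokenizer
//	init(&t, "<menu><item/></menu>", "example.xml")
//
//	for tok := scan(&t); tok.kind != .EOF; tok = scan(&t) {
//		fmt.printf("%v %q at line %d, column %d\n", tok.kind, tok.text, tok.pos.line, tok.pos.column)
//	}
//
// `init` resets all tokenizer state, so one Tokenizer can be reused across documents.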

@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	line   := t.line_count
	column := offset - t.line_offset + 1

	return Pos {
		file   = t.path,
		offset = offset,
		line   = line,
		column = column,
	}
}

default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}

error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.err != nil {
		t.err(pos, msg, ..args)
	}
	t.error_count += 1
}

@(optimization_mode="favor_size")
advance_rune :: proc(t: ^Tokenizer) {
	#no_bounds_check {
		/*
			Already bounds-checked here.
		*/
		if t.read_offset < len(t.src) {
			t.offset = t.read_offset
			if t.ch == '\n' {
				t.line_offset = t.offset
				t.line_count += 1
			}
			r, w := rune(t.src[t.read_offset]), 1
			switch {
			case r == 0:
				error(t, t.offset, "illegal character NUL")
			case r >= utf8.RUNE_SELF:
				r, w = #force_inline utf8.decode_rune_in_string(t.src[t.read_offset:])
				if r == utf8.RUNE_ERROR && w == 1 {
					error(t, t.offset, "illegal UTF-8 encoding")
				} else if r == utf8.RUNE_BOM && t.offset > 0 {
					error(t, t.offset, "illegal byte order mark")
				}
			}
			t.read_offset += w
			t.ch = r
		} else {
			t.offset = len(t.src)
			if t.ch == '\n' {
				t.line_offset = t.offset
				t.line_count += 1
			}
			t.ch = -1
		}
	}
}

peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset + offset < len(t.src) {
		#no_bounds_check return t.src[t.read_offset + offset]
	}
	return 0
}

@(optimization_mode="favor_size")
skip_whitespace :: proc(t: ^Tokenizer) {
	for {
		switch t.ch {
		case ' ', '\t', '\r', '\n':
			advance_rune(t)
		case:
			return
		}
	}
}

@(optimization_mode="favor_size")
is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true
		case 'A'..='Z', 'a'..='z':
			return true
		}
	}
	return unicode.is_letter(r)
}

is_valid_identifier_rune :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_', '-', ':':        return true
		case 'A'..='Z', 'a'..='z': return true
		case '0'..='9':            return true
		case -1:                   return false
		}
	}
	return unicode.is_letter(r) || unicode.is_digit(r)
}
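
// Note that `-` and `:` are accepted, so names like `xs:element` and `data-id`
// each scan as a single .Ident token; see scan_identifier below for how the
// namespace separator is limited to one occurrence.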

scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset
	namespaced := false

	for is_valid_identifier_rune(t.ch) {
		advance_rune(t)
		if t.ch == ':' {
			// A namespaced identifier (tag or attribute name) can have at most
			// two parts, `namespace:ident`.
			if namespaced {
				break
			}
			namespaced = true
		}
	}
	return string(t.src[offset : t.offset])
}

/*
	A comment ends when we see `-->`, preceded by a character that's not a dash.
	"For compatibility, the string "--" (double-hyphen) must not occur within comments."

	See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment

	Thanks to the length (4) of the comment opener, we always have enough lookback
	for the `--` check below, and the peek at the next byte ensures that there's at
	least one more character, which must be a `>`.
*/
scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
	offset := t.offset

	for {
		advance_rune(t)
		ch := t.ch

		if ch < 0 {
			error(t, offset, "[parse] Comment was not terminated\n")
			return "", .Unclosed_Comment
		}

		if string(t.src[t.offset - 1:][:2]) == "--" {
			if peek_byte(t) == '>' {
				break
			} else {
				error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
				return "", .Invalid_Sequence_In_Comment
			}
		}
	}

	expect(t, .Dash)
	expect(t, .Gt)

	return string(t.src[offset : t.offset - 1]), .None
}
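
// For example, `<!-- a - b -->` scans cleanly, while `<!-- a -- b -->` fails
// with .Invalid_Sequence_In_Comment, per the spec rule quoted above.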

// Skip a CDATA section if one starts at the current offset.
skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
	if t.read_offset + len(CDATA_START) >= len(t.src) {
		// Too little input left; this can't be the start of a CDATA tag.
		return .None
	}

	if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
		t.read_offset += len(CDATA_START)
		offset := t.offset

		cdata_scan: for {
			advance_rune(t)
			if t.ch < 0 {
				error(t, offset, "[skip_cdata] CDATA was not terminated\n")
				return .Premature_EOF
			}

			// Scan until the end of the CDATA tag.
			if t.read_offset + len(CDATA_END) < len(t.src) {
				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
					t.read_offset += len(CDATA_END)
					break cdata_scan
				}
			}
		}
	}
	return
}
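
// In body text such as `a <![CDATA[ <not-a-tag> ]]> b`, scan_string (below)
// calls skip_cdata so the `<` inside the CDATA section doesn't terminate the body.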

@(optimization_mode="favor_size")
scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
	err = .None

	loop: for {
		ch := t.ch

		switch ch {
		case -1:
			error(t, t.offset, "[scan_string] Premature end of file.\n")
			return "", .Premature_EOF

		case '<':
			if peek_byte(t) == '!' {
				if peek_byte(t, 1) == '[' {
					// Might be the start of a CDATA tag.
					skip_cdata(t) or_return
				} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
					// Comment start. Eat comment.
					t.read_offset += 3
					_ = scan_comment(t) or_return
				}
			}

		case '\n':
			if !multiline {
				error(t, offset, string(t.src[offset : t.offset]))
				error(t, offset, "[scan_string] Not terminated\n")
				err = .Invalid_Tag_Value
				break loop
			}
		}

		if t.ch == close {
			// If it's not a CDATA or comment, it's the end of this body.
			break loop
		}
		advance_rune(t)
	}

	// Strip trailing whitespace.
	lit := string(t.src[offset : t.offset])

	end := len(lit)
	eat: for ; end > 0; end -= 1 {
		ch := lit[end - 1]
		switch ch {
		case ' ', '\t', '\r', '\n':
		case:
			break eat
		}
	}
	lit = lit[:end]

	if consume_close {
		advance_rune(t)
	}
	return lit, err
}
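
// This is how `scan` (below) reads quoted attribute values, shown here as a
// sketch: with `close` set to the opening quote and `consume_close` true, the
// closing quote is eaten as well.
//
//	quote := t.ch       // '"' or '\''
//	advance_rune(t)     // step past the opening quote
//	value, err := scan_string(t, t.offset, quote, true, false)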

peek :: proc(t: ^Tokenizer) -> (token: Token) {
	old := t^
	token = scan(t)
	t^ = old
	return token
}
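
// `peek` copies the entire Tokenizer, scans one token, then restores the copy,
// so lookahead never perturbs line/column bookkeeping. A sketch:
//
//	if peek(&t).kind == .Slash {
//		// e.g. about to read a closing tag such as `</item>`
//	}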

scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
	skip_whitespace(t)

	offset := t.offset

	kind: Token_Kind
	err:  Error
	lit:  string
	pos := offset_to_pos(t, offset)

	switch ch := t.ch; true {
	case is_letter(ch):
		lit  = scan_identifier(t)
		kind = .Ident

	case:
		advance_rune(t)
		switch ch {
		case -1:
			kind = .EOF

		case '<': kind = .Lt
		case '>': kind = .Gt
		case '!': kind = .Exclaim
		case '?': kind = .Question
		case '=': kind = .Eq
		case '#': kind = .Hash
		case '/': kind = .Slash
		case '-': kind = .Dash
		case ':': kind = .Colon

		case '"', '\'':
			kind = .Invalid

			lit, err = scan_string(t, t.offset, ch, true, multiline_string)
			if err == .None {
				kind = .String
			}

		case '\n':
			lit = "\n"

		case:
			kind = .Invalid
		}
	}

	if kind != .String && lit == "" {
		lit = string(t.src[offset : t.offset])
	}
	return Token{kind, lit, pos}
}