package encoding_xml
/*
An XML 1.0 / 1.1 parser
Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's license.
A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
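
/*
	A minimal usage sketch (hypothetical input and file name; the `Error` type
	and the document-level parser live elsewhere in this package):

		t: Tokenizer
		init(&t, `<greeting kind="salutation">Hello</greeting>`, "example.xml")
		for {
			tok := scan(&t)
			if tok.kind == .EOF { break }
			fmt.printf("%v %q at %d:%d\n", tok.kind, tok.text, tok.pos.line, tok.pos.column)
		}
*/
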
import "core:fmt"
import "core:unicode"
import "core:unicode/utf8"
import "core:strings"
Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
Token :: struct {
	kind: Token_Kind,
	text: string,
	pos:  Pos,
}

Pos :: struct {
	file:   string,
	offset: int, // starting at 0
	line:   int, // starting at 1
	column: int, // starting at 1
}

Token_Kind :: enum {
	Invalid,

	Ident,
	Literal,
	Rune,
	String,

	Double_Quote,  // "
	Single_Quote,  // '
	Colon,         // :
	Eq,            // =
	Lt,            // <
	Gt,            // >
	Exclaim,       // !
	Question,      // ?
	Hash,          // #
	Slash,         // /
	Dash,          // -
	Open_Bracket,  // [
	Close_Bracket, // ]

	EOF,
}

CDATA_START :: "<![CDATA["
CDATA_END :: "]]>"
COMMENT_START :: "<!--"
COMMENT_END :: "-->"
Tokenizer :: struct {
	// Immutable data
	path: string,
	src:  string,
	err:  Error_Handler,

	// Tokenizing state
	ch:          rune,
	offset:      int,
	read_offset: int,
	line_offset: int,
	line_count:  int,

	// Mutable data
	error_count: int,
}

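// init prepares `t` to scan `src`, reporting any errors against `path` through `err`.
// It reads the first rune and skips a leading byte order mark if present.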
init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
	t.src = src
	t.err = err
	t.ch = ' '
	t.offset = 0
	t.read_offset = 0
	t.line_offset = 0
	t.line_count = len(src) > 0 ? 1 : 0
	t.error_count = 0
	t.path = path

	advance_rune(t)
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t)
	}
}

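// offset_to_pos converts a byte offset into a file/line/column position.
// Note: `column` is computed from the start of the line currently being scanned,
// so positions for offsets on earlier lines are approximate.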
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	line := t.line_count
	column := offset - t.line_offset + 1

	return Pos {
		file   = t.path,
		offset = offset,
		line   = line,
		column = column,
	}
}

default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}

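// error reports a tokenizer error at `offset` through the installed Error_Handler,
// if any, and bumps the error count.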
error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.err != nil {
		t.err(pos=pos, fmt=msg, args=args)
	}
	t.error_count += 1
}

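// advance_rune decodes the next UTF-8 rune into `t.ch`, updating offsets and
// line bookkeeping. At the end of the input, `t.ch` is set to -1.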
@(optimization_mode="favor_size")
advance_rune :: proc(t: ^Tokenizer) {
	#no_bounds_check {
		// Already bounds-checked here.
		if t.read_offset < len(t.src) {
			t.offset = t.read_offset
			if t.ch == '\n' {
				t.line_offset = t.offset
				t.line_count += 1
			}
			r, w := rune(t.src[t.read_offset]), 1
			switch {
			case r == 0:
				error(t, t.offset, "illegal character NUL")
			case r >= utf8.RUNE_SELF:
				r, w = #force_inline utf8.decode_rune_in_string(t.src[t.read_offset:])
				if r == utf8.RUNE_ERROR && w == 1 {
					error(t, t.offset, "illegal UTF-8 encoding")
				} else if r == utf8.RUNE_BOM && t.offset > 0 {
					error(t, t.offset, "illegal byte order mark")
				}
			}
			t.read_offset += w
			t.ch = r
		} else {
			t.offset = len(t.src)
			if t.ch == '\n' {
				t.line_offset = t.offset
				t.line_count += 1
			}
			t.ch = -1
		}
	}
}

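// peek_byte returns the byte `offset` bytes past the current read position
// without consuming it, or 0 when that position is past the end of the input.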
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset + offset < len(t.src) {
		#no_bounds_check return t.src[t.read_offset + offset]
	}
	return 0
}

@(optimization_mode="favor_size")
skip_whitespace :: proc(t: ^Tokenizer) {
	for {
		switch t.ch {
		case ' ', '\t', '\r', '\n':
			advance_rune(t)
		case:
			return
		}
	}
}

@(optimization_mode="favor_size")
is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true
		case 'A'..='Z', 'a'..='z':
			return true
		}
	}
	return unicode.is_letter(r)
}

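// is_valid_identifier_rune reports whether `r` may appear in an identifier,
// additionally allowing `-` and the `:` namespace separator.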
is_valid_identifier_rune :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_', '-', ':':        return true
		case 'A'..='Z', 'a'..='z': return true
		case '0'..='9':            return true
		case -1:                   return false
		}
	}
	if unicode.is_letter(r) || unicode.is_digit(r) {
		return true
	}
	return false
}

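// scan_identifier consumes an identifier, allowing at most one `:` namespace
// separator, and returns the matching slice of the source.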
scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset
	namespaced := false

	for is_valid_identifier_rune(t.ch) {
		advance_rune(t)
		if t.ch == ':' {
			// A namespaced attr can have at most two parts, `namespace:ident`.
			if namespaced {
				break
			}
			namespaced = true
		}
	}
	return string(t.src[offset : t.offset])
}

/*
	A comment ends when we see `-->`, preceded by a character that's not a dash.

	"For compatibility, the string `--` (double-hyphen) must not occur within comments."
	See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment

	Because the comment opener `<!--` is four bytes long, the two-byte look-back
	below can never read before the start of the source, and the peek at the next
	byte verifies that a terminating `--` is in fact followed by a `>`.
*/
scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
	offset := t.offset

	for {
		advance_rune(t)
		ch := t.ch
		if ch < 0 {
			error(t, offset, "[scan_comment] Comment was not terminated")
			return "", .Unclosed_Comment
		}

		if string(t.src[t.offset - 1:][:2]) == "--" {
			if peek_byte(t) == '>' {
				break
			} else {
				error(t, t.offset - 1, "[scan_comment] Invalid -- sequence in comment.")
				return "", .Invalid_Sequence_In_Comment
			}
		}
	}
	expect(t, .Dash)
	expect(t, .Gt)

	return string(t.src[offset : t.offset - 1]), .None
}

// skip_cdata skips a `<![CDATA[ ... ]]>` section if one starts at the current offset.
skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
	if s := string(t.src[t.offset:]); !strings.has_prefix(s, CDATA_START) {
		return .None
	}
	t.read_offset += len(CDATA_START)
	offset := t.offset

	cdata_scan: for {
		advance_rune(t)
		if t.ch < 0 {
			error(t, offset, "[skip_cdata] CDATA was not terminated")
			return .Premature_EOF
		}

		// Scan until the end of the CDATA tag.
		if s := string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) {
			t.read_offset += len(CDATA_END)
			break cdata_scan
		}
	}
	return .None
}

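// scan_string scans text starting at `offset` until the `close` rune, skipping
// CDATA sections and comments along the way. Trailing whitespace is stripped
// from the result. If `consume_close` is set, the closing rune is consumed as
// well; a newline inside the value is an error unless `multiline` is true.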
@(optimization_mode="favor_size")
scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
	err = .None

	loop: for {
		ch := t.ch
		switch ch {
		case -1:
			error(t, t.offset, "[scan_string] Premature end of file.")
			return "", .Premature_EOF

		case '<':
			if peek_byte(t) == '!' {
				if peek_byte(t, 1) == '[' {
					// Might be the start of a CDATA tag.
					skip_cdata(t) or_return
				} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
					// Comment start. Eat comment.
					t.read_offset += 3
					_ = scan_comment(t) or_return
				}
			}

		case '\n':
			if !multiline {
				// Print the scanned value verbatim; don't treat it as a format string.
				error(t, offset, "%s", string(t.src[offset : t.offset]))
				error(t, offset, "[scan_string] Not terminated")
				err = .Invalid_Tag_Value
				break loop
			}
		}

		if t.ch == close {
			// If it's not a CDATA or comment, it's the end of this body.
			break loop
		}
		advance_rune(t)
	}

	// Strip trailing whitespace.
	lit := string(t.src[offset : t.offset])
	end := len(lit)

	eat: for ; end > 0; end -= 1 {
		ch := lit[end - 1]
		switch ch {
		case ' ', '\t', '\r', '\n':
		case:
			break eat
		}
	}
	lit = lit[:end]

	if consume_close {
		advance_rune(t)
	}
	return lit, err
}

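// peek returns the next token without consuming it, by snapshotting and
// restoring the tokenizer's state around a call to `scan`.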
peek :: proc(t: ^Tokenizer) -> (token: Token) {
	old := t^
	token = scan(t)
	t^ = old
	return token
}

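// scan skips leading whitespace and returns the next token in the input,
// scanning quoted values as .String tokens.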
scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
	skip_whitespace(t)

	offset := t.offset
	kind: Token_Kind
	err: Error
	lit: string
	pos := offset_to_pos(t, offset)

	switch ch := t.ch; true {
	case is_letter(ch):
		lit = scan_identifier(t)
		kind = .Ident

	case:
		advance_rune(t)
		switch ch {
		case -1:
			kind = .EOF
		case '<': kind = .Lt
		case '>': kind = .Gt
		case '!': kind = .Exclaim
		case '?': kind = .Question
		case '=': kind = .Eq
		case '#': kind = .Hash
		case '/': kind = .Slash
		case '-': kind = .Dash
		case ':': kind = .Colon
		case '[': kind = .Open_Bracket
		case ']': kind = .Close_Bracket

		case '"', '\'':
			kind = .Invalid
			lit, err = scan_string(t, t.offset, ch, true, multiline_string)
			if err == .None {
				kind = .String
			}

		case '\n':
			lit = "\n"

		case:
			kind = .Invalid
		}
	}

	if kind != .String && lit == "" {
		lit = string(t.src[offset : t.offset])
	}
	return Token{kind, lit, pos}
}