Odin/core/encoding/json/parser.odin

package json

import "core:mem"
import "core:unicode/utf8"
import "core:strconv"

// Parser carries the state of one JSON parse: the tokenizer it pulls from,
// a two-token window (previous/current) and the options chosen by the caller.
Parser :: struct {
	// Token source; initialised by make_parser via make_tokenizer.
	tok:            Tokenizer,
	// The token that was current before the most recent advance_token call.
	prev_token:     Token,
	// The token the parse procedures are currently looking at.
	curr_token:     Token,
	// Dialect being parsed; the procedures here branch on Specification.JSON5.
	spec:           Specification,
	// Allocator used for unquoted strings, object keys, arrays and objects.
	allocator:      mem.Allocator,
	// Not referenced by any procedure in this section of the file.
	// NOTE(review): presumably used by unmarshalling code elsewhere - confirm.
	unmarshal_data: any,
	// Named after the `parse_integers` option of make_parser/parse; no
	// procedure in this section reads it.
	parse_integers: bool,
}

// make_parser builds a Parser over `data` and primes it with the first token.
//
// Inputs:
// - data:           the raw JSON text to tokenize
// - spec:           dialect to parse (defaults to strict JSON)
// - parse_integers: forwarded to the tokenizer and recorded on the parser
// - allocator:      allocator for everything the parser produces; must have
//                   a non-nil procedure (asserted)
//
// Returns the initialised Parser by value. Any error produced while reading
// the first token is not reported here; the first token is simply whatever
// the tokenizer returned and the parse procedures will reject it if needed.
make_parser :: proc(data: []byte, spec := Specification.JSON, parse_integers := false, allocator := context.allocator) -> Parser {
	p: Parser
	p.tok = make_tokenizer(data, spec, parse_integers)
	p.spec = spec
	p.allocator = allocator
	// Keep the parser's own copy of the option in sync with the tokenizer's;
	// previously this field was left at its zero value regardless of the argument.
	p.parse_integers = parse_integers
	assert(p.allocator.procedure != nil)
	advance_token(&p)
	return p
}

// parse is the one-shot entry point: it builds a parser over `data` and
// parses a single document with it.
//
// With Specification.JSON5 the document may be any value; for every other
// specification the document must be an object. `allocator` is also
// installed as `context.allocator` for the duration of the call.
parse :: proc(data: []byte, spec := Specification.JSON, parse_integers := false, allocator := context.allocator) -> (Value, Error) {
	context.allocator = allocator
	parser := make_parser(data, spec, parse_integers, allocator)

	// Anything that is not JSON5 is parsed as a top-level object.
	if parser.spec != Specification.JSON5 {
		return parse_object(&parser)
	}
	return parse_value(&parser)
}

// token_end_pos returns the position just past the last byte of `tok`.
// Only the byte offset is moved forward (by the length of the token text);
// every other field of the position is copied from the token's start.
token_end_pos :: proc(tok: Token) -> Pos {
	pos := tok.pos
	pos.offset = pos.offset + len(tok.text)
	return pos
}

// advance_token shifts the token window forward by one: the current token
// becomes the previous one and a fresh token is read from the tokenizer.
//
// Returns the token that was current before the call, together with any
// error reported by the tokenizer while reading the new one.
advance_token :: proc(p: ^Parser) -> (Token, Error) {
	p.prev_token = p.curr_token
	next, err := get_token(&p.tok)
	p.curr_token = next
	return p.prev_token, err
}


// allow_token consumes the current token only if it has the given kind.
// Returns true when a token was consumed; otherwise the parser is untouched
// and false is returned.
allow_token :: proc(p: ^Parser, kind: Token_Kind) -> bool {
	if p.curr_token.kind != kind {
		return false
	}
	advance_token(p)
	return true
}

// expect_token always consumes the current token and reports whether it was
// of the expected kind: .None on a match, .Unexpected_Token otherwise.
// Note that the token is consumed even when it does not match.
expect_token :: proc(p: ^Parser, kind: Token_Kind) -> Error {
	// Record the verdict before advancing overwrites curr_token.
	matched := p.curr_token.kind == kind
	advance_token(p)
	if !matched {
		return .Unexpected_Token
	}
	return .None
}


// parse_value parses one JSON value starting at the current token.
//
// Objects and arrays are delegated to parse_object / parse_array. Every
// other token is consumed here: scalars are converted to the matching Value
// variant, and with Specification.JSON5 the Infinity and NaN tokens are
// accepted as floats (a leading '-' in the token text selects the negative
// bit pattern). Any other token is consumed and reported as
// .Unexpected_Token with a zero Value.
//
// Conversion failures from strconv are ignored; whatever value strconv
// returned is used as-is.
parse_value :: proc(p: ^Parser) -> (value: Value, err: Error) {
	token := p.curr_token

	// Containers consume their own tokens, so hand over immediately.
	if token.kind == .Open_Brace {
		return parse_object(p)
	}
	if token.kind == .Open_Bracket {
		return parse_array(p)
	}

	is_json5 := p.spec == Specification.JSON5
	recognised := true

	#partial switch token.kind {
	case .Null:
		value = Null{}
	case .False:
		value = Boolean(false)
	case .True:
		value = Boolean(true)

	case .Integer:
		n, _ := strconv.parse_i64(token.text)
		value = Integer(n)
	case .Float:
		x, _ := strconv.parse_f64(token.text)
		value = Float(x)
	case .String:
		value = String(unquote_string(token, p.spec, p.allocator))

	case .Infinity:
		if is_json5 {
			bits: u64 = 0x7ff0000000000000
			if token.text[0] == '-' {
				bits = 0xfff0000000000000
			}
			value = transmute(f64)bits
		} else {
			recognised = false
		}
	case .NaN:
		if is_json5 {
			bits: u64 = 0x7ff7ffffffffffff
			if token.text[0] == '-' {
				bits = 0xfff7ffffffffffff
			}
			value = transmute(f64)bits
		} else {
			recognised = false
		}

	case:
		recognised = false
	}

	if !recognised {
		err = .Unexpected_Token
	}
	// Every non-container path consumes exactly one token, valid or not.
	advance_token(p)
	return
}

// parse_array parses `[ value, value, ... ]` starting at the '[' token and
// returns it as an Array allocated with the parser's allocator.
//
// On any error the elements parsed so far are destroyed and the backing
// storage is released before returning.
//
// NOTE: after a comma the loop re-checks for ']' before parsing another
// element, so a single trailing comma such as `[1,]` is accepted.
parse_array :: proc(p: ^Parser) -> (value: Value, err: Error) {
	expect_token(p, .Open_Bracket) or_return

	items: Array
	items.allocator = p.allocator
	// Runs on every exit; only cleans up when the named result `err` is set.
	defer if err != .None {
		for item in items {
			destroy_value(item)
		}
		delete(items)
	}

	for {
		if p.curr_token.kind == .Close_Bracket {
			break
		}

		item := parse_value(p) or_return
		append(&items, item)

		// No comma means the element list is over.
		if !allow_token(p, .Comma) {
			break
		}
	}

	expect_token(p, .Close_Bracket) or_return
	value = items
	return
}

// clone_string copies `s` into memory obtained from `allocator`.
// One extra byte is allocated and set to zero directly after the text; the
// returned string covers only the text, not the trailing zero byte.
clone_string :: proc(s: string, allocator: mem.Allocator) -> string {
	length := len(s)
	buf := make([]byte, length+1, allocator)
	copy(buf[:length], s)
	buf[length] = 0
	return string(buf[:length])
}

// parse_object_key consumes the current token and returns it as an object
// key allocated with the parser's allocator.
//
// A string token is always accepted and unquoted. With Specification.JSON5
// a bare identifier is accepted as well and its text is copied verbatim.
// Anything else is consumed and reported as .Expected_String_For_Object_Key
// with an empty key.
parse_object_key :: proc(p: ^Parser) -> (key: string, err: Error) {
	tok := p.curr_token

	// JSON5 only: unquoted identifier keys.
	if p.spec == Specification.JSON5 && tok.kind == .Ident {
		advance_token(p)
		key = clone_string(tok.text, p.allocator)
		return
	}

	// Every remaining case requires a quoted string.
	if expect_token(p, .String) != .None {
		err = .Expected_String_For_Object_Key
		return
	}
	key = unquote_string(tok, p.spec, p.allocator)
	return
}

// parse_object parses `{ key: value, ... }` starting at the '{' token and
// returns it as an Object allocated with the parser's allocator.
//
// Errors:
// - .Unexpected_Token when the opening or closing brace is missing
// - .Expected_String_For_Object_Key (from parse_object_key)
// - .Expected_Colon_After_Key when a key is not followed by ':'
// - .Duplicate_Object_Key when a key occurs twice in the same object
// - any error returned by parse_value for a member value
//
// On every error path nothing allocated by this call survives: members
// already stored in the object are released by the deferred block, and the
// key/value currently being parsed (which is not yet in the object) is
// released explicitly at the point of failure.
//
// Comma handling: with Specification.JSON5 the comma between members is
// optional; otherwise a missing comma ends the member list. In both modes a
// comma directly followed by '}' is accepted, because the loop condition is
// re-checked after the comma is consumed.
parse_object :: proc(p: ^Parser) -> (value: Value, err: Error) {
	expect_token(p, .Open_Brace) or_return

	obj: Object
	obj.allocator = p.allocator
	// Runs on every exit; only cleans up when the named result `err` is set.
	defer if err != .None {
		for key, elem in obj {
			delete(key, p.allocator)
			destroy_value(elem)
		}
		delete(obj)
	}

	for p.curr_token.kind != .Close_Brace {
		key: string
		key, err = parse_object_key(p)
		if err != .None {
			delete(key, p.allocator)
			return
		}

		if colon_err := expect_token(p, .Colon); colon_err != .None {
			// The key is not owned by `obj` yet, so the deferred cleanup
			// would miss it.
			delete(key, p.allocator)
			err = .Expected_Colon_After_Key
			return
		}

		elem: Value
		elem, err = parse_value(p)
		if err != .None {
			// Same as above: release the pending key before bailing out.
			delete(key, p.allocator)
			return
		}

		if key in obj {
			// Neither the duplicate key nor its value was inserted; free both.
			delete(key, p.allocator)
			destroy_value(elem)
			err = .Duplicate_Object_Key
			return
		}

		obj[key] = elem

		if p.spec == Specification.JSON5 {
			// Comma is optional between members
			if allow_token(p, .Comma) {
				continue
			}
		} else {
			// Without a comma the member list is finished
			if allow_token(p, .Comma) {
				continue
			} else {
				break
			}
		}
	}

	expect_token(p, .Close_Brace) or_return
	value = obj
	return
}


// unquote_string converts the text of a `.String` token (including its
// surrounding quotes) into the string it denotes, allocated with `allocator`.
//
// Inputs:
// - token:     the token to decode; anything other than `.String` yields ""
// - spec:      with Specification.JSON5 the escapes `\0`, `\v` and `\xHH`
//              are accepted in addition to the standard JSON escapes
// - allocator: allocator for the returned string
//
// Returns "" (without allocating) when the token is not a string, is two
// bytes or shorter, or does not start and end with the same quote byte.
// If decoding hits an unknown or malformed escape, an unescaped quote or a
// control character (< ' '), decoding stops there and the text decoded so
// far is returned.
//
// A `\uXXXX` high surrogate directly followed by a `\uXXXX` low surrogate is
// decoded as the single code point the pair encodes. Bytes that are not
// valid UTF-8 are copied to the output unchanged.
//
// IMPORTANT NOTE(bill): unquote_string assumes a mostly valid string
unquote_string :: proc(token: Token, spec: Specification, allocator := context.allocator) -> string {
	// Reads a `\xHH` escape at the start of `s`; returns -1 if it is malformed.
	get_u2_rune :: proc(s: string) -> rune {
		if len(s) < 4 || s[0] != '\\' || s[1] != 'x' {
			return -1
		}

		r: rune
		for c in s[2:4] {
			x: rune
			switch c {
			case '0'..='9': x = c - '0'
			case 'a'..='f': x = c - 'a' + 10
			case 'A'..='F': x = c - 'A' + 10
			case: return -1
			}
			r = r*16 + x
		}
		return r
	}
	// Reads a `\uHHHH` escape at the start of `s`; returns -1 if it is malformed.
	get_u4_rune :: proc(s: string) -> rune {
		if len(s) < 6 || s[0] != '\\' || s[1] != 'u' {
			return -1
		}

		r: rune
		for c in s[2:6] {
			x: rune
			switch c {
			case '0'..='9': x = c - '0'
			case 'a'..='f': x = c - 'a' + 10
			case 'A'..='F': x = c - 'A' + 10
			case: return -1
			}
			r = r*16 + x
		}
		return r
	}

	if token.kind != .String {
		return ""
	}
	s := token.text
	if len(s) <= 2 {
		return ""
	}
	quote := s[0]
	if s[0] != s[len(s)-1] {
		// Invalid string
		return ""
	}
	s = s[1:len(s)-1]

	// Fast path: find the first byte that needs any processing. If there is
	// none, the contents can be cloned as-is.
	i := 0
	for i < len(s) {
		c := s[i]
		if c == '\\' || c == quote || c < ' ' {
			break
		}
		if c < utf8.RUNE_SELF {
			i += 1
			continue
		}
		// Decode at the current position. Decoding from the start of `s`
		// would advance by the width of the first rune every time, which can
		// step over a backslash or run past the end of the string.
		r, width := utf8.decode_rune_in_string(s[i:])
		if r == utf8.RUNE_ERROR && width == 1 {
			break
		}
		i += width
	}
	if i == len(s) {
		return clone_string(s, allocator)
	}

	// Slow path: no construct handled below produces more output bytes than
	// it consumes, so this buffer is always large enough.
	b := make([]byte, len(s) + 2*utf8.UTF_MAX, allocator)
	w := copy(b, s[0:i])
	loop: for i < len(s) {
		c := s[i]
		switch {
		case c == '\\':
			i += 1
			if i >= len(s) {
				break loop
			}
			switch s[i] {
			case: break loop
			case '"',  '\'', '\\', '/':
				b[w] = s[i]
				i += 1
				w += 1

			case 'b':
				b[w] = '\b'
				i += 1
				w += 1
			case 'f':
				b[w] = '\f'
				i += 1
				w += 1
			case 'r':
				b[w] = '\r'
				i += 1
				w += 1
			case 't':
				b[w] = '\t'
				i += 1
				w += 1
			case 'n':
				b[w] = '\n'
				i += 1
				w += 1
			case 'u':
				i -= 1 // Include the \u in the check for sanity sake
				r := get_u4_rune(s[i:])
				if r < 0 {
					break loop
				}
				i += 6

				// A high surrogate followed by an escaped low surrogate
				// encodes one code point above U+FFFF; combine the two.
				if 0xd800 <= r && r <= 0xdbff {
					r2 := get_u4_rune(s[i:])
					if 0xdc00 <= r2 && r2 <= 0xdfff {
						i += 6
						r = (((r - 0xd800) << 10) | (r2 - 0xdc00)) + 0x10000
					}
				}

				buf, buf_width := utf8.encode_rune(r)
				copy(b[w:], buf[:buf_width])
				w += buf_width


			case '0':
				if spec == Specification.JSON5 {
					b[w] = '\x00'
					i += 1
					w += 1
				} else {
					break loop
				}
			case 'v':
				if spec == Specification.JSON5 {
					b[w] = '\v'
					i += 1
					w += 1
				} else {
					break loop
				}

			case 'x':
				if spec == Specification.JSON5 {
					i -= 1 // Include the \x in the check for sanity sake
					r := get_u2_rune(s[i:])
					if r < 0 {
						break loop
					}
					i += 4

					buf, buf_width := utf8.encode_rune(r)
					copy(b[w:], buf[:buf_width])
					w += buf_width
				} else {
					break loop
				}
			}

		case c == quote, c < ' ':
			break loop

		case c < utf8.RUNE_SELF:
			b[w] = c
			i += 1
			w += 1

		case:
			// Non-ASCII: copy the bytes of the sequence through unchanged.
			// For valid UTF-8 this equals decoding and re-encoding; for an
			// invalid byte (reported with width 1) it avoids writing a
			// three-byte replacement for one input byte.
			_, width := utf8.decode_rune_in_string(s[i:])
			copy(b[w:], s[i:i+width])
			i += width
			w += width
		}
	}

	return string(b[:w])
}