Add package text/scanner

2026-07-21 23:21:08 +00:00 · 2020-11-18 23:33:16 +00:00
parent 9408eb9580
commit a39921aa6a
1 changed files with 583 additions and 0 deletions
--- a/core/text/scanner/scanner.odin
+++ b/core/text/scanner/scanner.odin
@@ -0,0 +1,583 @@
+package text_scanner
+
+import "core:fmt"
+import "core:strings"
+import "core:unicode"
+import "core:unicode/utf8"
+
+Position :: struct {
+	filename: string, // filename, if present
+	offset:   int,    // byte offset, starting @ 0
+	line:     int,    // line number, starting @ 1
+	column:   int,    // column number, starting @ 1 (character count per line)
+}
+
+position_is_valid :: proc(pos: Position) -> bool {
+	return pos.line > 0;
+}
+
+position_to_string :: proc(pos: Position, allocator := context.temp_allocator) -> string {
+	s := pos.filename;
+	if s == "" {
+		s = "<input>";
+	}
+
+	context.allocator = allocator;
+	if position_is_valid(pos) {
+		return fmt.aprintf("%s(%d:%d)", s, pos.line, pos.column);
+	} else {
+		return strings.clone(s);
+	}
+}
+
+EOF        :: -1;
+Ident      :: -2;
+Int        :: -3;
+Float      :: -4;
+Char       :: -5;
+String     :: -6;
+Raw_String :: -7;
+Comment    :: -8;
+
+Scan_Flag :: enum u32 {
+	Scan_Idents,
+	Scan_Ints,
+	Scan_C_Int_Prefixes,
+	Scan_Floats,
+	Scan_Chars,
+	Scan_Strings,
+	Scan_Raw_Strings,
+	Scan_Comments,
+	Skip_Comments,
+}
+Scan_Flags :: bit_set[Scan_Flag; u32];
+
+Odin_Like_Tokens :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments};
+C_Like_Tokens    :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_C_Int_Prefixes, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments};
+
+Odin_Whitespace :: 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' ';
+C_Whitespace    :: 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<'\v' | 1<<'\f' | 1<<' ';
+
+
+Scanner :: struct {
+	src: string,
+
+	src_pos: int,
+	src_end: int,
+
+	tok_pos: int,
+	tok_end: int,
+
+	ch: rune,
+
+	line:   int,
+	column: int,
+	prev_line_len: int,
+	prev_char_len: int,
+
+	error: proc(s: ^Scanner, msg: string),
+	error_count: int,
+
+	flags: Scan_Flags,
+	whitespace: u64,
+
+	is_ident_rune: proc(ch: rune, i: int) -> bool,
+
+	pos: Position,
+}
+
+init :: proc(s: ^Scanner, src: string, filename := "") -> ^Scanner {
+	s^ = {};
+
+	s.src = src;
+	s.pos.filename = filename;
+
+	s.tok_pos = -1;
+
+	s.ch = -2; // no char read yet, not an EOF
+
+	s.line = 1;
+
+	s.flags = Odin_Like_Tokens;
+	s.whitespace = Odin_Whitespace;
+
+	return s;
+}
+
+
+@(private)
+advance :: proc(s: ^Scanner) -> rune {
+	if s.src_pos >= len(s.src) {
+		s.prev_char_len = 0;
+		return EOF;
+	}
+	ch, width := rune(s.src[s.src_pos]), 1;
+
+	if ch >= utf8.RUNE_SELF {
+		ch, width = utf8.decode_rune_in_string(s.src[s.src_pos:]);
+		if ch == utf8.RUNE_ERROR && width == 1 {
+			s.src_pos += width;
+			s.prev_char_len = width;
+			s.column += 1;
+			error(s, "invalid UTF-8 encoding");
+			return ch;
+		}
+	}
+
+	s.src_pos += width;
+	s.prev_char_len = width;
+	s.column += 1;
+
+	switch ch {
+	case 0:
+		error(s, "invalid character NUL");
+	case '\n':
+		s.line += 1;
+		s.prev_line_len = s.column;
+		s.column = 0;
+	}
+
+	return ch;
+}
+
+next :: proc(s: ^Scanner) -> rune {
+	s.tok_pos = -1;
+	s.pos.line = 0;
+	ch := peek(s);
+	if ch != EOF {
+		s.ch = advance(s);
+	}
+	return ch;
+}
+
+peek :: proc(s: ^Scanner) -> rune {
+	if s.ch == -2 {
+		s.ch = advance(s);
+		if s.ch == '\ufeff' { // Ignore BOM
+			s.ch = advance(s);
+		}
+	}
+	return s.ch;
+}
+
+
+error :: proc(s: ^Scanner, msg: string) {
+	s.error_count += 1;
+	if s.error != nil {
+		s.error(s, msg);
+		return;
+	}
+	p := s.pos;
+	if !position_is_valid(p) {
+		p = position(s);
+	}
+
+	s := p.filename;
+	if s == "" {
+		s = "<input>";
+	}
+
+	if position_is_valid(p) {
+		fmt.eprintf("%s(%d:%d): %s\n", s, p.line, p.column, msg);
+	} else {
+		fmt.eprintf("%s: %s\n", s, msg);
+	}
+}
+
+errorf :: proc(s: ^Scanner, format: string, args: ..any) {
+	error(s, fmt.tprintf(format, ..args));
+}
+
+@(private)
+is_ident_rune :: proc(s: ^Scanner, ch: rune, i: int) -> bool {
+	if s.is_ident_rune != nil {
+		return s.is_ident_rune(ch, i);
+	}
+	return ch == '_' || unicode.is_letter(ch) || unicode.is_digit(ch) && i > 0;
+}
+
+@(private)
+scan_identifier :: proc(s: ^Scanner) -> rune {
+	ch := advance(s);
+	for i := 1; is_ident_rune(s, ch, i); i += 1 {
+		ch = advance(s);
+	}
+	return ch;
+}
+
+@(private) lower      :: proc(ch: rune) -> rune { return ('a' - 'A') | ch; }
+@(private) is_decimal :: proc(ch: rune) -> bool { return '0' <= ch && ch <= '9'; }
+@(private) is_hex     :: proc(ch: rune) -> bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f'; }
+
+
+
+@(private)
+scan_number :: proc(s: ^Scanner, ch: rune, seen_dot: bool) -> (rune, rune) {
+	lit_name :: proc(prefix: rune) -> string {
+		switch prefix {
+		case 'b': return "binary literal";
+		case 'o': return "octal literal";
+		case 'z': return "dozenal literal";
+		case 'x': return "hexadecimal literal";
+		}
+		return "decimal literal";
+	}
+
+	digits :: proc(s: ^Scanner, ch0: rune, base: int, invalid: ^rune) -> (ch: rune, digsep: int) {
+		ch = ch0;
+		if base <= 10 {
+			max := rune('0' + base);
+			for is_decimal(ch) || ch == '_' {
+				ds := 1;
+				if ch == '_' {
+					ds = 2;
+				} else if ch >= max && invalid^ == 0 {
+					invalid^ = ch;
+				}
+				digsep |= ds;
+				ch = advance(s);
+			}
+		} else {
+			for is_hex(ch) || ch == '_' {
+				ds := 1;
+				if ch == '_' {
+					ds = 2;
+				}
+				digsep |= ds;
+				ch = advance(s);
+			}
+		}
+		return;
+	}
+
+	ch, seen_dot := ch, seen_dot;
+
+	base := 10;
+	prefix := rune(0);
+	digsep := 0;
+	invalid := rune(0);
+
+	tok: rune;
+	ds: int;
+
+	if !seen_dot {
+		tok = Int;
+		ch0 := ch;
+		if ch == '0' {
+			ch = advance(s);
+
+			p := lower(ch);
+			if .Scan_C_Int_Prefixes in s.flags {
+				switch lower(ch) {
+				case 'b':
+					ch = advance(s);
+					base, prefix = 2, 'b';
+				case 'x':
+					ch = advance(s);
+					base, prefix = 16, 'x';
+				case:
+					base, prefix = 8, 'o';
+					digsep = 1; // Leading zero
+				}
+			} else {
+				switch lower(ch) {
+				case 'b':
+					ch = advance(s);
+					base, prefix = 2, 'b';
+				case 'o':
+					ch = advance(s);
+					base, prefix = 8, 'o';
+				case 'd':
+					ch = advance(s);
+					base, prefix = 10, 'd';
+				case 'z':
+					ch = advance(s);
+					base, prefix = 12, 'z';
+				case 'x':
+					ch = advance(s);
+					base, prefix = 16, 'x';
+				case:
+					digsep = 1; // Leading zero
+				}
+			}
+		}
+
+		ch, ds = digits(s, ch, base, &invalid);
+		digsep |= ds;
+		if ch == '.' && .Scan_Floats in s.flags {
+			ch = advance(s);
+			seen_dot = true;
+		}
+	}
+
+	if seen_dot {
+		tok = Float;
+		if prefix != 0 && prefix != 'x' {
+			errorf(s, "invalid radix point in %s", lit_name(prefix));
+		}
+		ch, ds = digits(s, ch, base, &invalid);
+		digsep |= ds;
+	}
+
+	if digsep&1 == 0 {
+		errorf(s, "%s has no digits", lit_name(prefix));
+	}
+
+	if e := lower(ch); (e == 'e' || e == 'p') && .Scan_Floats in s.flags {
+		switch {
+		case e == 'e' && prefix != 0:
+			errorf(s, "%q exponent requires decimal mantissa", ch);
+		case e == 'p' && prefix != 'x':
+			errorf(s, "%q exponent requires hexadecimal mantissa", ch);
+		}
+		ch = advance(s);
+		tok = Float;
+		if ch == '+' || ch == '-' {
+			ch = advance(s);
+		}
+		ch, ds = digits(s, ch, 10, nil);
+		digsep |= ds;
+		if ds&1 == 0 {
+			error(s, "exponent has no digits");
+		}
+	} else if prefix == 'x' && tok == Float {
+		error(s, "hexadecimal mantissa requires a 'p' exponent");
+	}
+
+	if tok == Int && invalid != 0 {
+		errorf(s, "invalid digit %q in %s", invalid, lit_name(prefix));
+	}
+
+	if digsep&2 != 0 {
+		s.tok_end = s.src_pos - s.prev_char_len;
+	}
+	return tok, ch;
+}
+
+@(private)
+scan_string :: proc(s: ^Scanner, quote: rune) -> (n: int) {
+	digit_val :: proc(ch: rune) -> int {
+		switch v := lower(ch); v {
+		case '0'..'9': return int(v - '0');
+		case 'a'..'z': return int(v - 'a');
+		}
+		return 16;
+	}
+
+	scan_digits :: proc(s: ^Scanner, ch: rune, base, n: int) -> rune {
+		ch, n := ch, n;
+		for n > 0 && digit_val(ch) < base {
+			ch = advance(s);
+			n -= 1;
+		}
+		if n > 0 {
+			error(s, "invalid char escape");
+		}
+		return ch;
+	}
+
+	ch := advance(s);
+	for ch != quote {
+		if ch == '\n' || ch < 0 {
+			error(s, "literal no terminated");
+			return;
+		}
+		if ch == '\\' {
+			ch = advance(s);
+			switch ch {
+			case quote, 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v', '\\':
+				ch = advance(s);
+			case '0'..'7': ch = scan_digits(s, ch, 8, 3);
+			case 'x':      ch = scan_digits(s, ch, 16, 2);
+			case 'u':      ch = scan_digits(s, ch, 16, 4);
+			case 'U':      ch = scan_digits(s, ch, 16, 8);
+			case:
+				error(s, "invalid char escape");
+			}
+		} else {
+			ch = advance(s);
+		}
+		n += 1;
+	}
+	return;
+}
+
+@(private)
+scan_raw_string :: proc(s: ^Scanner) {
+	ch := advance(s);
+	for ch != '`' {
+		if ch < 0 {
+			error(s, "literal not terminated");
+			return;
+		}
+		ch = advance(s);
+	}
+}
+
+@(private)
+scan_char :: proc(s: ^Scanner) {
+	if scan_string(s, '\'') != 1 {
+		error(s, "invalid char literal");
+	}
+}
+
+@(private)
+scan_comment :: proc(s: ^Scanner, ch: rune) -> rune {
+	ch := ch;
+	if ch == '/' { // line comment
+		ch = advance(s);
+		for ch != '\n' && ch >= 0 {
+			ch = advance(s);
+		}
+		return ch;
+	}
+
+	// block /**/ comment
+	ch = advance(s);
+	for {
+		if ch < 0 {
+			error(s, "comment not terminated");
+			break;
+		}
+		ch0 := ch;
+		ch = advance(s);
+		if ch0 == '*' && ch == '/' {
+			return advance(s);
+		}
+	}
+	return ch;
+}
+
+scan :: proc(s: ^Scanner) -> (tok: rune) {
+	ch := peek(s);
+	if ch == EOF {
+		return ch;
+	}
+
+	// reset position
+	s.tok_pos = -1;
+	s.pos.line = 0;
+
+	redo: for {
+		for s.whitespace & (1<<uint(ch)) != 0 {
+			ch = advance(s);
+		}
+
+		s.tok_pos = s.src_pos - s.prev_char_len;
+		s.pos.offset = s.tok_pos;
+
+		if s.column > 0 {
+			s.pos.line = s.line;
+			s.pos.column = s.column;
+		} else {
+			// previous character was newline
+			s.pos.line = s.line - 1;
+			s.pos.column = s.prev_line_len;
+		}
+
+		tok = ch;
+		if is_ident_rune(s, ch, 0) {
+			if .Scan_Idents in s.flags {
+				tok = Ident;
+				ch = scan_identifier(s);
+			} else {
+				ch = advance(s);
+			}
+
+		} else if is_decimal(ch) {
+			if s.flags >= {.Scan_Ints, .Scan_Floats} {
+				tok, ch = scan_number(s, ch, false);
+			} else {
+				ch = advance(s);
+			}
+		} else {
+			switch ch {
+			case EOF:
+				break;
+			case '"':
+				if .Scan_Strings in s.flags {
+					scan_string(s, '"');
+					tok = String;
+				}
+				ch = advance(s);
+			case '\'':
+				if .Scan_Chars in s.flags {
+					scan_string(s, '\'');
+					tok = Char;
+				}
+				ch = advance(s);
+			case '`':
+				if .Scan_Raw_Strings in s.flags {
+					scan_raw_string(s);
+					tok = Raw_String;
+				}
+				ch = advance(s);
+			case '.':
+				ch = advance(s);
+				if is_decimal(ch) && .Scan_Floats in s.flags {
+					tok, ch = scan_number(s, ch, true);
+				}
+			case '/':
+				ch = advance(s);
+				if (ch == '/' || ch == '*') && .Scan_Comments in s.flags {
+					if .Skip_Comments in s.flags {
+						s.tok_pos = -1;
+						ch = scan_comment(s, ch);
+						continue redo;
+					}
+					ch = scan_comment(s, ch);
+					tok = Comment;
+				}
+			case:
+				ch = advance(s);
+			}
+		}
+
+		break redo;
+	}
+
+	s.tok_end = s.src_pos - s.prev_char_len;
+
+	s.ch = ch;
+	return tok;
+}
+
+position :: proc(s: ^Scanner) -> Position {
+	pos: Position;
+	pos.filename = s.pos.filename;
+	pos.offset = s.src_pos - s.prev_char_len;
+	switch {
+	case s.column > 0:
+		pos.line = s.line;
+		pos.column = s.column;
+	case s.prev_line_len > 0:
+		pos.line = s.line-1;
+		pos.column = s.prev_line_len;
+	case:
+		pos.line = 1;
+		pos.column = 1;
+	}
+	return pos;
+}
+
+token_text :: proc(s: ^Scanner) -> string {
+	if s.tok_pos < 0 {
+		return "";
+	}
+	return string(s.src[s.tok_pos:s.tok_end]);
+}
+
+token_string :: proc(tok: rune, allocator := context.temp_allocator) -> string {
+	context.allocator = allocator;
+	switch tok {
+	case EOF:        return strings.clone("EOF");
+	case Ident:      return strings.clone("Ident");
+	case Int:        return strings.clone("Int");
+	case Float:      return strings.clone("Float");
+	case Char:       return strings.clone("Char");
+	case String:     return strings.clone("String");
+	case Raw_String: return strings.clone("Raw_String");
+	case Comment:    return strings.clone("Comment");
+	}
+	return fmt.aprintf("%q", tok);
+}