Files
Odin/core/c/frontend/tokenizer/tokenizer.odin

668 lines
12 KiB
Odin

package c_frontend_tokenizer
import "core:fmt"
import "core:os"
import "core:strings"
import "core:unicode/utf8"
// Error_Handler is the callback signature used to report a diagnostic at a
// source position. `fmt` is a printf-style format string and `args` are the
// values it formats.
Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any);
// Tokenizer holds the scanning state for one source buffer together with the
// diagnostic callbacks and counters.
Tokenizer :: struct {
// Immutable data
// path is copied into every reported position; tokenize sets it from the file name.
path: string,
// src is the byte buffer being tokenized.
src: []byte,
// Tokenizing state
// ch is the current rune; advance_rune sets it to -1 once the input is exhausted.
ch: rune,
// offset is the byte offset of `ch` within `src`.
offset: int,
// read_offset is the byte offset of the next rune to decode.
read_offset: int,
// line_offset is the byte offset at which the current line starts.
line_offset: int,
// line_count is the current line number (tokenize starts it at 1 for non-empty input).
line_count: int,
// Extra information for tokens
// at_bol is set when a newline has been passed; scan copies it to the token and clears it.
at_bol: bool,
// has_space is set when whitespace or a comment was skipped; scan copies it to the token and clears it.
has_space: bool,
// Mutable data
// err and warn receive diagnostics; a nil handler is skipped but the counter still advances.
err: Error_Handler,
warn: Error_Handler,
error_count: int,
warning_count: int,
}
// init_defaults installs the diagnostic callbacks on `t`. When the arguments
// are omitted the stderr-printing default handlers are used.
init_defaults :: proc(t: ^Tokenizer, err: Error_Handler = default_error_handler, warn: Error_Handler = default_warn_handler) {
	t.err, t.warn = err, warn;
}
// offset_to_pos builds a Pos for the byte `offset`.
// The line number and the column base are taken from the tokenizer's current
// line state, not recomputed from `offset`.
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> (pos: Pos) {
	pos = Pos{
		file   = t.path,
		offset = offset,
		line   = t.line_count,
		column = offset - t.line_offset + 1,
	};
	return;
}
// default_error_handler prints "file(line:column) <formatted message>" and a
// trailing newline to stderr.
default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	file, line, column := pos.file, pos.line, pos.column;
	fmt.eprintf("%s(%d:%d) ", file, line, column);
	fmt.eprintf(msg, ..args);
	fmt.eprintf("\n");
}
// default_warn_handler prints "file(line:column) warning: <formatted message>"
// and a trailing newline to stderr.
default_warn_handler :: proc(pos: Pos, msg: string, args: ..any) {
	file, line, column := pos.file, pos.line, pos.column;
	fmt.eprintf("%s(%d:%d) warning: ", file, line, column);
	fmt.eprintf(msg, ..args);
	fmt.eprintf("\n");
}
// error_offset reports an error at byte `offset` through the tokenizer's error
// handler (if one is installed) and then bumps the error counter.
error_offset :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	// The counter is advanced after the handler has run, whether or not one is set.
	defer t.error_count += 1;

	at := offset_to_pos(t, offset);
	if handler := t.err; handler != nil {
		handler(at, msg, ..args);
	}
}
// warn_offset reports a warning at byte `offset` through the tokenizer's
// warning handler (if one is installed) and then bumps the warning counter.
warn_offset :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	// The counter is advanced after the handler has run, whether or not one is set.
	defer t.warning_count += 1;

	at := offset_to_pos(t, offset);
	if handler := t.warn; handler != nil {
		handler(at, msg, ..args);
	}
}
// error reports an error at the position recorded in `tok` through the
// tokenizer's error handler (if one is installed) and bumps the error counter.
error :: proc(t: ^Tokenizer, tok: ^Token, msg: string, args: ..any) {
	// The counter is advanced after the handler has run, whether or not one is set.
	defer t.error_count += 1;

	at := tok.pos;
	if handler := t.err; handler != nil {
		handler(at, msg, ..args);
	}
}
// warn reports a warning at the position recorded in `tok` through the
// tokenizer's warning handler (if one is installed) and bumps the warning
// counter.
warn :: proc(t: ^Tokenizer, tok: ^Token, msg: string, args: ..any) {
	// The counter is advanced after the handler has run, whether or not one is set.
	defer t.warning_count += 1;

	at := tok.pos;
	if handler := t.warn; handler != nil {
		handler(at, msg, ..args);
	}
}
// advance_rune moves the tokenizer forward by one rune.
// After the call `t.ch` holds the newly read rune (or -1 at end of input) and
// `t.offset` holds its byte offset. Leaving a '\n' behind starts a new line:
// `at_bol` is set, `line_offset` moves to the new offset and `line_count` grows.
// NUL bytes, invalid UTF-8 and a byte order mark past offset 0 are reported as
// errors but still consumed.
advance_rune :: proc(t: ^Tokenizer) {
	at_end := t.read_offset >= len(t.src);
	t.offset = at_end ? len(t.src) : t.read_offset;

	// `t.ch` is still the rune being left behind at this point.
	if t.ch == '\n' {
		t.at_bol = true;
		t.line_offset = t.offset;
		t.line_count += 1;
	}

	if at_end {
		t.ch = -1;
		return;
	}

	// Fast path: assume a single-byte rune and only decode when necessary.
	decoded, width := rune(t.src[t.read_offset]), 1;
	if decoded == 0 {
		error_offset(t, t.offset, "illegal character NUL");
	} else if decoded >= utf8.RUNE_SELF {
		decoded, width = utf8.decode_rune(t.src[t.read_offset:]);
		if decoded == utf8.RUNE_ERROR && width == 1 {
			error_offset(t, t.offset, "illegal UTF-8 encoding");
		} else if decoded == utf8.RUNE_BOM && t.offset > 0 {
			error_offset(t, t.offset, "illegal byte order mark");
		}
	}

	t.read_offset += width;
	t.ch = decoded;
}
// advance_rune_n calls advance_rune `n` times; a non-positive `n` does nothing.
advance_rune_n :: proc(t: ^Tokenizer, n: int) {
	for remaining := n; remaining > 0; remaining -= 1 {
		advance_rune(t);
	}
}
// is_digit reports whether `r` is an ASCII decimal digit ('0' through '9').
is_digit :: proc(r: rune) -> bool {
	if r < '0' || r > '9' {
		return false;
	}
	return true;
}
// skip_whitespace consumes consecutive whitespace runes (space, tab, CR, VT,
// FF and newline) and sets `has_space` if at least one was consumed.
skip_whitespace :: proc(t: ^Tokenizer) {
	is_space :: proc(r: rune) -> bool {
		return r == ' ' || r == '\t' || r == '\r' || r == '\v' || r == '\f' || r == '\n';
	}

	for is_space(t.ch) {
		t.has_space = true;
		advance_rune(t);
	}
}
// scan_comment consumes a `//` line comment or a `/* ... */` block comment and
// returns its source text.
// On entry the leading '/' has already been consumed and `t.ch` is the second
// character of the opener ('/' or '*'). A line comment stops before the
// terminating '\n' (which is left unconsumed); a block comment includes its
// closing "*/". An unterminated block comment is reported as an error and the
// text up to the end of input is returned.
scan_comment :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1; // step back onto the already-consumed leading '/'
	general: {
		if t.ch == '/' { // line comments
			advance_rune(t);
			for t.ch != '\n' && t.ch >= 0 {
				advance_rune(t);
			}
			break general;
		}
		/* style comment */
		advance_rune(t); // consume the '*' of the opener so "/*/" is not treated as closed
		for t.ch >= 0 {
			ch := t.ch;
			advance_rune(t);
			if ch == '*' && t.ch == '/' {
				advance_rune(t);
				break general;
			}
		}
		// Only reached when the input ended before "*/" was seen.
		error_offset(t, offset, "comment not terminated");
	}
	lit := t.src[offset : t.offset];
	// NOTE(bill): Strip CR for line comments
	for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1];
	}
	return string(lit);
}
// scan_identifier consumes runes for as long as is_ident1 accepts them and
// returns the consumed source text, starting at the current rune.
scan_identifier :: proc(t: ^Tokenizer) -> string {
	start := t.offset;
	for {
		if !is_ident1(t.ch) {
			break;
		}
		advance_rune(t);
	}
	return string(t.src[start : t.offset]);
}
// scan_string consumes the rest of a string literal and returns its source
// text. The returned slice starts one byte before the current offset and
// includes the closing '"' when present.
// A newline or end of input before the closing quote is reported as an error.
scan_string :: proc(t: ^Tokenizer) -> string {
	start := t.offset-1;
	scanning: for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			error_offset(t, start, "string literal was not terminated");
			break;
		}
		advance_rune(t);
		switch ch {
		case '"':
			break scanning;
		case '\\':
			// The escape's validity result is intentionally not inspected here.
			scan_escape(t);
		}
	}
	return string(t.src[start : t.offset]);
}
// digit_val returns the numeric value of `r` as a digit in bases up to 16
// ('0'-'9' -> 0-9, 'a'-'f' / 'A'-'F' -> 10-15). Any other rune yields 16,
// which is never a valid digit for the bases used by this tokenizer.
digit_val :: proc(r: rune) -> int {
	if '0' <= r && r <= '9' {
		return int(r - '0');
	}
	if 'A' <= r && r <= 'F' {
		return int(r - 'A') + 10;
	}
	if 'a' <= r && r <= 'f' {
		return int(r - 'a') + 10;
	}
	return 16;
}
// scan_escape consumes the body of an escape sequence; `t.ch` is the rune
// right after the backslash. It returns false when an error was reported.
//
// Simple escapes consume one rune. Octal escapes consume every following octal
// digit and `\x` consumes every following hex digit. `\u` and `\U` read up to
// 4 and 8 hex digits respectively and validate the resulting code point; a
// quote character ends the digit run early without an error.
// Any other rune is left unconsumed and accepted, unless the input has ended.
scan_escape :: proc(t: ^Tokenizer) -> bool {
	offset := t.offset;
	esc := t.ch;
	n: int;
	base, max: u32;
	switch esc {
	case 'a', 'b', 'e', 'f', 'n', 't', 'v', 'r', '\\', '\'', '"':
		advance_rune(t);
		return true;
	case '0'..'7':
		for digit_val(t.ch) < 8 {
			advance_rune(t);
		}
		return true;
	case 'x':
		advance_rune(t);
		for digit_val(t.ch) < 16 {
			advance_rune(t);
		}
		return true;
	case 'u':
		advance_rune(t);
		n, base, max = 4, 16, utf8.MAX_RUNE;
	case 'U':
		advance_rune(t);
		n, base, max = 8, 16, utf8.MAX_RUNE;
	case:
		if t.ch < 0 {
			error_offset(t, offset, "escape sequence was not terminated");
			return false;
		}
		// Unrecognised escape character: nothing is consumed and no error is raised.
		return true;
	}

	// Accumulate the hex digits of a \u / \U escape.
	x: u32;
	main_loop: for n > 0 {
		d := u32(digit_val(t.ch));
		if d >= base {
			if t.ch == '"' || t.ch == '\'' {
				// The literal ends here; validate whatever digits were read.
				break main_loop;
			}
			if t.ch < 0 {
				error_offset(t, t.offset, "escape sequence was not terminated");
			} else {
				error_offset(t, t.offset, "illegal character '%r' : %d in escape sequence", t.ch, t.ch);
			}
			return false;
		}
		x = x*base + d;
		advance_rune(t);
		n -= 1;
	}

	// Surrogates occupy U+D800..U+DFFF only; U+E000 is a valid code point, so
	// the upper bound is exclusive.
	if x > max || (0xd800 <= x && x < 0xe000) {
		error_offset(t, offset, "escape sequence is an invalid Unicode code point");
		return false;
	}
	return true;
}
// scan_rune consumes the rest of a character literal and returns its source
// text. The returned slice starts one byte before the current offset and
// includes the closing '\'' when present.
// Errors are reported for a literal cut short by a newline or end of input,
// and for a well-formed literal that does not contain exactly one character
// (an escape sequence counts as one).
scan_rune :: proc(t: ^Tokenizer) -> string {
	start := t.offset-1;
	well_formed := true;
	count := 0;
	for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			if well_formed {
				error_offset(t, start, "rune literal not terminated");
				well_formed = false;
			}
			break;
		}
		advance_rune(t);
		if ch == '\'' {
			break;
		}
		count += 1;
		// scan_escape reports its own errors; remember the failure so no
		// second diagnostic is emitted for the same literal.
		if ch == '\\' && !scan_escape(t) {
			well_formed = false;
		}
	}
	if well_formed && count != 1 {
		error_offset(t, start, "illegal rune literal");
	}
	return string(t.src[start : t.offset]);
}
// scan_number consumes a numeric literal and returns its kind (always .Number)
// together with its source text.
// When `seen_decimal_point` is true the caller has already consumed a leading
// '.', so the literal's start is moved back by one byte to include it.
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Token_Kind, string) {
// scan_mantissa consumes every following rune that is a valid digit in `base`.
scan_mantissa :: proc(t: ^Tokenizer, base: int) {
for digit_val(t.ch) < base {
advance_rune(t);
}
}
// scan_exponent consumes an optional exponent: one of e/E/p/P, an optional
// sign, then decimal digits. Missing digits are reported as an error.
scan_exponent :: proc(t: ^Tokenizer) {
if t.ch == 'e' || t.ch == 'E' || t.ch == 'p' || t.ch == 'P' {
advance_rune(t);
if t.ch == '-' || t.ch == '+' {
advance_rune(t);
}
if digit_val(t.ch) < 10 {
scan_mantissa(t, 10);
} else {
error_offset(t, t.offset, "illegal floating-point exponent");
}
}
}
// scan_fraction consumes a '.' followed by decimal digits.
// It returns true, consuming nothing, when the '.' is immediately followed by
// another '.'.
scan_fraction :: proc(t: ^Tokenizer) -> (early_exit: bool) {
if t.ch == '.' && peek(t) == '.' {
return true;
}
if t.ch == '.' {
advance_rune(t);
scan_mantissa(t, 10);
}
return false;
}
// check_end controls whether the trailing digits/fraction/exponent pass at the
// bottom of this procedure still runs.
check_end := true;
offset := t.offset;
// NOTE: seen_point is assigned below but never read again after this `if`.
seen_point := seen_decimal_point;
if seen_point {
// Include the '.' that the caller consumed.
offset -= 1;
scan_mantissa(t, 10);
scan_exponent(t);
} else {
if t.ch == '0' {
// int_base consumes the prefix letter and the digits of a prefixed integer,
// reporting `msg` when no digit follows the prefix.
int_base :: proc(t: ^Tokenizer, base: int, msg: string) {
prev := t.offset;
advance_rune(t);
scan_mantissa(t, base);
if t.offset - prev <= 1 {
error_offset(t, t.offset, msg);
}
}
advance_rune(t);
switch t.ch {
case 'b', 'B':
int_base(t, 2, "illegal binary integer");
case 'x', 'X':
int_base(t, 16, "illegal hexadecimal integer");
case:
// A leading '0' without a base prefix: decimal digits, then an optional
// fraction and exponent.
seen_point = false;
scan_mantissa(t, 10);
if t.ch == '.' {
seen_point = true;
if scan_fraction(t) {
check_end = false;
}
}
if check_end {
scan_exponent(t);
check_end = false;
}
}
}
}
// Remaining digits, fraction and exponent for every path that did not finish above.
if check_end {
scan_mantissa(t, 10);
if !scan_fraction(t) {
scan_exponent(t);
}
}
return .Number, string(t.src[offset : t.offset]);
}
// scan_punct classifies the punctuator that starts with `ch` and consumes any
// further runes that belong to it. `ch` itself has already been consumed by
// the caller, so `t.ch` is the rune that follows it.
// Returns .Punct for a recognised punctuator and .Invalid otherwise.
//
// Multi-character punctuators are matched greedily:
//   <<  >>  <=  >=  <<=  >>=     !=  *=  /=  %=  ^=  ==
//   ++  +=  --  -=  ->           &&  &=  ||  |=  ##  ...
scan_punct :: proc(t: ^Tokenizer, ch: rune) -> (kind: Token_Kind) {
	kind = .Punct;
	switch ch {
	case:
		kind = .Invalid;
	case '<', '>':
		// Shift operator first, then an optional '=' for <=, >=, <<=, >>=.
		if t.ch == ch {
			advance_rune(t);
		}
		if t.ch == '=' {
			advance_rune(t);
		}
	case '+':
		// "++" and "+=" are single C punctuators.
		if t.ch == '=' || t.ch == '+' {
			advance_rune(t);
		}
	case '-':
		// "--", "-=" and "->" are single C punctuators.
		if t.ch == '=' || t.ch == '-' || t.ch == '>' {
			advance_rune(t);
		}
	case '!', '*', '/', '%', '^', '=':
		if t.ch == '=' {
			advance_rune(t);
		}
	case '#':
		if t.ch == '#' {
			advance_rune(t);
		}
	case '&':
		if t.ch == '=' || t.ch == '&' {
			advance_rune(t);
		}
	case '|':
		if t.ch == '=' || t.ch == '|' {
			advance_rune(t);
		}
	case '(', ')', '[', ']', '{', '}':
		// okay
	case '~', ',', ':', ';', '?':
		// okay
	case '`':
		// okay
	case '.':
		// Only a full "..." is merged; ".." stays as two separate '.' tokens.
		if t.ch == '.' && peek(t) == '.' {
			advance_rune(t);
			advance_rune(t); // consume last '.'
		}
	}
	return;
}
// peek returns the byte that follows the current rune without consuming it,
// or 0 when there is no further input.
peek :: proc(t: ^Tokenizer) -> byte {
	if t.read_offset >= len(t.src) {
		return 0;
	}
	return t.src[t.read_offset];
}
// peek_str reports whether the input, starting at the current rune, begins
// with `str`. It always returns false when no byte follows the current rune.
peek_str :: proc(t: ^Tokenizer, str: string) -> bool {
	if t.read_offset >= len(t.src) {
		return false;
	}
	remaining := string(t.src[t.offset:]);
	return strings.has_prefix(remaining, str);
}
// scan_literal_prefix tries to match `str` (an encoding prefix followed by an
// opening quote) at the current position.
// On a match it consumes all of `str`, stores `str` minus its final byte in
// `prefix^` and returns true. Otherwise nothing is consumed and it returns
// false.
scan_literal_prefix :: proc(t: ^Tokenizer, str: string, prefix: ^string) -> bool {
	if !peek_str(t, str) {
		return false;
	}

	start := t.offset;
	for _ in str {
		advance_rune(t);
	}
	// Drop the trailing quote so only the encoding prefix is recorded.
	prefix^ = string(t.src[start:][:len(str)-1]);
	return true;
}
// allow_next_to_be_newline consumes a line ending ("\n" or "\r\n") if one
// starts at the current rune and reports whether it did so.
allow_next_to_be_newline :: proc(t: ^Tokenizer) -> bool {
	switch {
	case t.ch == '\n':
		advance_rune(t);
		return true;
	case t.ch == '\r' && peek(t) == '\n': // allow for MS-DOS style line endings
		advance_rune(t); // \r
		advance_rune(t); // \n
		return true;
	}
	return false;
}
// scan skips leading whitespace and returns the next token of `f`, allocated
// with `new`.
// Comments are consumed and discarded, and a backslash followed by a line
// ending is skipped; in both cases scanning continues with the following
// token. The token records the tokenizer's `at_bol` and `has_space` flags,
// which are then cleared.
scan :: proc(t: ^Tokenizer, f: ^File) -> ^Token {
skip_whitespace(t);
offset := t.offset;
kind: Token_Kind;
lit: string;
prefix: string;
// The cases are tried in order. scan_literal_prefix consumes input when it
// matches, so the prefixed-literal cases must come before the identifier case.
switch ch := t.ch; {
case scan_literal_prefix(t, `u8"`, &prefix):
kind = .String;
lit = scan_string(t);
case scan_literal_prefix(t, `u"`, &prefix):
kind = .String;
lit = scan_string(t);
case scan_literal_prefix(t, `L"`, &prefix):
kind = .String;
lit = scan_string(t);
case scan_literal_prefix(t, `U"`, &prefix):
kind = .String;
lit = scan_string(t);
case scan_literal_prefix(t, `u'`, &prefix):
kind = .Char;
lit = scan_rune(t);
case scan_literal_prefix(t, `L'`, &prefix):
kind = .Char;
lit = scan_rune(t);
case scan_literal_prefix(t, `U'`, &prefix):
kind = .Char;
lit = scan_rune(t);
case is_ident0(ch):
lit = scan_identifier(t);
kind = .Ident;
case '0' <= ch && ch <= '9':
kind, lit = scan_number(t, false);
case:
// Everything else is dispatched on `ch` after consuming it, so `t.ch` is the
// rune that follows.
advance_rune(t);
switch ch {
case -1:
kind = .EOF;
case '\\':
kind = .Punct;
// A backslash directly followed by a line ending is skipped entirely.
if allow_next_to_be_newline(t) {
t.at_bol = true;
t.has_space = false;
return scan(t, f);
}
case '.':
// ".5" style numbers: the '.' is already consumed, hence seen_decimal_point = true.
if is_digit(t.ch) {
kind, lit = scan_number(t, true);
} else {
kind = scan_punct(t, ch);
}
case '"':
kind = .String;
lit = scan_string(t);
case '\'':
kind = .Char;
lit = scan_rune(t);
case '/':
if t.ch == '/' || t.ch == '*' {
kind = .Comment;
lit = scan_comment(t);
// A comment counts as whitespace for the token that follows it.
t.has_space = true;
break;
}
// Not a comment: handle '/' as an ordinary punctuator.
fallthrough;
case:
kind = scan_punct(t, ch);
// A byte order mark is not reported here as an illegal character.
if kind == .Invalid && ch != utf8.RUNE_BOM {
error_offset(t, t.offset, "illegal character '%r': %d", ch, ch);
}
}
}
// Tokens that did not produce their own literal use the consumed source text.
if lit == "" {
lit = string(t.src[offset : t.offset]);
}
// Comments never become tokens; continue with whatever follows.
if kind == .Comment {
return scan(t, f);
}
tok := new(Token);
tok.kind = kind;
tok.lit = lit;
tok.pos = offset_to_pos(t, offset);
tok.file = f;
tok.prefix = prefix;
tok.at_bol = t.at_bol;
tok.has_space = t.has_space;
// The flags describe the gap before a token, so reset them for the next one.
t.at_bol, t.has_space = false, false;
return tok;
}
// tokenize resets the scanning state of `t` for the file `f`, scans the whole
// source and returns the first token of a singly linked list (chained through
// `next`) that ends with the .EOF token.
// The error counter is reset; the handlers and the warning counter are left
// as they are. A byte order mark at the very start of the input is skipped.
tokenize :: proc(t: ^Tokenizer, f: ^File) -> ^Token {
	// Reset the per-file state before reading the first rune.
	t.path = f.name;
	t.src = f.src;
	t.ch = ' ';
	t.offset, t.read_offset, t.line_offset = 0, 0, 0;
	t.line_count = 0;
	if len(t.src) > 0 {
		t.line_count = 1;
	}
	t.error_count = 0;

	advance_rune(t);
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t);
	}
	t.at_bol, t.has_space = true, false;

	// `head` is a dummy node so appending needs no special case for the first token.
	head: Token;
	tail := &head;
	for {
		tok := scan(t, f);
		if tok == nil {
			break;
		}
		tail.next = tok;
		tail = tok;
		if tok.kind == .EOF {
			break;
		}
	}
	return head.next;
}
// add_new_file allocates a File describing `src`, using `name` as both its
// name and its display name. The tokenizer argument is not used.
add_new_file :: proc(t: ^Tokenizer, name: string, src: []byte, id: int) -> ^File {
	file := new(File);
	file^ = File{
		id           = id,
		src          = src,
		name         = name,
		display_name = name,
	};
	return file;
}
// tokenize_file reads the file at `path` and tokenizes its contents, returning
// the first token of the resulting list, or nil when the file cannot be read.
tokenize_file :: proc(t: ^Tokenizer, path: string, id: int, loc := #caller_location) -> ^Token {
	if src, ok := os.read_entire_file(path); ok {
		file := add_new_file(t, path, src, id);
		return tokenize(t, file);
	}
	return nil;
}
// inline_tokenize tokenizes `src` as a fresh File. When `tok` belongs to a
// file, the new File takes that file's id and name (also as its display name).
inline_tokenize :: proc(t: ^Tokenizer, tok: ^Token, src: []byte) -> ^Token {
	file := new(File);
	file.src = src;
	if origin := tok.file; origin != nil {
		file.id = origin.id;
		file.name = origin.name;
		file.display_name = origin.name;
	}
	return tokenize(t, file);
}