package c_frontend_tokenizer
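
// Tokenizes C source into a singly linked list of ^Token values; see
// tokenize and tokenize_file for the entry points. The Pos, Token,
// Token_Kind, and File types, along with is_ident0/is_ident1, are defined
// elsewhere in this package.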

import "core:fmt"
import "core:os"
import "core:strings"
import "core:unicode/utf8"
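
// Error_Handler is the procedure type used to report diagnostics at a source
// position; `fmt` is a printf-style format string for `args`.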
Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
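
// Tokenizer holds the scanning state for a single source buffer. at_bol and
// has_space track whether the upcoming token begins a line or follows
// whitespace, and are copied onto each produced Token.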
Tokenizer :: struct {
	// Immutable data
	path: string,
	src:  []byte,

	// Tokenizing state
	ch:          rune,
	offset:      int,
	read_offset: int,
	line_offset: int,
	line_count:  int,

	// Extra information for tokens
	at_bol:    bool,
	has_space: bool,

	// Mutable data
	err:           Error_Handler,
	warn:          Error_Handler,
	error_count:   int,
	warning_count: int,
}
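
// init_defaults installs the default diagnostic handlers; callers may supply
// their own Error_Handler procedures instead.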
init_defaults :: proc(t: ^Tokenizer, err: Error_Handler = default_error_handler, warn: Error_Handler = default_warn_handler) {
	t.err = err
	t.warn = warn
}
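
// offset_to_pos converts a byte offset into a Pos. Note that line and column
// are derived from the tokenizer's current line state, so the result is only
// accurate for offsets on the current line.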
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> (pos: Pos) {
	pos.file   = t.path
	pos.offset = offset
	pos.line   = t.line_count
	pos.column = offset - t.line_offset + 1
	return
}
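
// The default handlers print "path(line:column) message" to stderr.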
default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}

default_warn_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) warning: ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}
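
// error_offset and warn_offset report a diagnostic at a byte offset; error
// and warn report at a token's recorded position. All four bump their
// counter even when no handler is installed.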
error_offset :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.err != nil {
		t.err(pos, msg, ..args)
	}
	t.error_count += 1
}

warn_offset :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.warn != nil {
		t.warn(pos, msg, ..args)
	}
	t.warning_count += 1
}

error :: proc(t: ^Tokenizer, tok: ^Token, msg: string, args: ..any) {
	pos := tok.pos
	if t.err != nil {
		t.err(pos, msg, ..args)
	}
	t.error_count += 1
}

warn :: proc(t: ^Tokenizer, tok: ^Token, msg: string, args: ..any) {
	pos := tok.pos
	if t.warn != nil {
		t.warn(pos, msg, ..args)
	}
	t.warning_count += 1
}
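
// advance_rune decodes the next Unicode code point into t.ch, maintaining
// line bookkeeping and reporting NUL bytes, invalid UTF-8, and stray byte
// order marks; at end of input t.ch is set to -1.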
advance_rune :: proc(t: ^Tokenizer) {
	if t.read_offset < len(t.src) {
		t.offset = t.read_offset
		if t.ch == '\n' {
			t.at_bol = true
			t.line_offset = t.offset
			t.line_count += 1
		}
		r, w := rune(t.src[t.read_offset]), 1
		switch {
		case r == 0:
			error_offset(t, t.offset, "illegal character NUL")
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune(t.src[t.read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				error_offset(t, t.offset, "illegal UTF-8 encoding")
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				error_offset(t, t.offset, "illegal byte order mark")
			}
		}
		t.read_offset += w
		t.ch = r
	} else {
		t.offset = len(t.src)
		if t.ch == '\n' {
			t.at_bol = true
			t.line_offset = t.offset
			t.line_count += 1
		}
		t.ch = -1
	}
}
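
// advance_rune_n advances the tokenizer by n runes; is_digit reports whether
// r is an ASCII decimal digit.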
advance_rune_n :: proc(t: ^Tokenizer, n: int) {
	for _ in 0..<n {
		advance_rune(t)
	}
}

is_digit :: proc(r: rune) -> bool {
	return '0' <= r && r <= '9'
}
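
// skip_whitespace consumes spaces, tabs, form feeds, and newlines, noting
// that the next token is preceded by whitespace.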
skip_whitespace :: proc(t: ^Tokenizer) {
	for {
		switch t.ch {
		case ' ', '\t', '\r', '\v', '\f', '\n':
			t.has_space = true
			advance_rune(t)
		case:
			return
		}
	}
}
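
// scan_comment scans a // line comment or a /* */ block comment whose
// leading '/' has already been consumed, returning its text with any
// trailing carriage return stripped from line comments.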
scan_comment :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1
	next := -1
	general: {
		if t.ch == '/' { // line comment
			advance_rune(t)
			for t.ch != '\n' && t.ch >= 0 {
				advance_rune(t)
			}

			next = t.offset
			if t.ch == '\n' {
				next += 1
			}
			break general
		}

		/* style comment */
		advance_rune(t)
		for t.ch >= 0 {
			ch := t.ch
			advance_rune(t)
			if ch == '*' && t.ch == '/' {
				advance_rune(t)
				next = t.offset
				break general
			}
		}

		error_offset(t, offset, "comment not terminated")
	}

	lit := t.src[offset : t.offset]

	// NOTE(bill): Strip CR for line comments
	for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
	}

	return string(lit)
}
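
// scan_identifier consumes a run of identifier characters (is_ident1); the
// caller is expected to have checked the first character with is_ident0.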
scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset

	for is_ident1(t.ch) {
		advance_rune(t)
	}

	return string(t.src[offset : t.offset])
}
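
// scan_string scans a double-quoted string literal whose opening '"' has
// already been consumed; the returned literal includes both quotes.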
scan_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1

	for {
		ch := t.ch
		if ch == '\n' || ch < 0 {
			error_offset(t, offset, "string literal was not terminated")
			break
		}
		advance_rune(t)
		if ch == '"' {
			break
		}
		if ch == '\\' {
			scan_escape(t)
		}
	}

	return string(t.src[offset : t.offset])
}
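
// digit_val returns the value of a hexadecimal digit, or 16 as a sentinel
// for non-digits, so `digit_val(ch) < base` doubles as a digit test.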
digit_val :: proc(r: rune) -> int {
	switch r {
	case '0'..='9':
		return int(r-'0')
	case 'A'..='F':
		return int(r-'A' + 10)
	case 'a'..='f':
		return int(r-'a' + 10)
	}
	return 16
}
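
// scan_escape validates a single escape sequence after a backslash. Simple,
// octal, and hex escapes are consumed without range checks; \u and \U escapes
// are decoded and rejected if they fall in the surrogate range or above
// utf8.MAX_RUNE.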
scan_escape :: proc(t: ^Tokenizer) -> bool {
	offset := t.offset

	esc := t.ch
	n: int
	base, max: u32
	switch esc {
	case 'a', 'b', 'e', 'f', 'n', 't', 'v', 'r', '\\', '\'', '"':
		advance_rune(t)
		return true

	case '0'..='7':
		for digit_val(t.ch) < 8 {
			advance_rune(t)
		}
		return true
	case 'x':
		advance_rune(t)
		for digit_val(t.ch) < 16 {
			advance_rune(t)
		}
		return true
	case 'u':
		advance_rune(t)
		n, base, max = 4, 16, utf8.MAX_RUNE
	case 'U':
		advance_rune(t)
		n, base, max = 8, 16, utf8.MAX_RUNE
	case:
		if t.ch < 0 {
			error_offset(t, offset, "escape sequence was not terminated")
		} else {
			break
		}
		return false
	}

	x: u32
	main_loop: for n > 0 {
		d := u32(digit_val(t.ch))
		if d >= base {
			if t.ch == '"' || t.ch == '\'' {
				break main_loop
			}
			if t.ch < 0 {
				error_offset(t, t.offset, "escape sequence was not terminated")
			} else {
				error_offset(t, t.offset, "illegal character '%r': %d in escape sequence", t.ch, t.ch)
			}
			return false
		}

		x = x*base + d
		advance_rune(t)
		n -= 1
	}

	// U+D800..U+DFFF are surrogate halves, never valid code points on their own
	if x > max || 0xD800 <= x && x < 0xE000 {
		error_offset(t, offset, "escape sequence is an invalid Unicode code point")
		return false
	}
	return true
}
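
// scan_rune scans a character literal whose opening '\'' has already been
// consumed, and reports literals that do not contain exactly one character.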
scan_rune :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1
	valid := true
	n := 0
	for {
		ch := t.ch
		if ch == '\n' || ch < 0 {
			if valid {
				error_offset(t, offset, "rune literal not terminated")
				valid = false
			}
			break
		}
		advance_rune(t)
		if ch == '\'' {
			break
		}
		n += 1
		if ch == '\\' {
			if !scan_escape(t) {
				valid = false
			}
		}
	}

	if valid && n != 1 {
		error_offset(t, offset, "illegal rune literal")
	}

	return string(t.src[offset : t.offset])
}
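
// scan_number scans an integer or floating-point literal, including 0b/0x
// prefixes and e/E/p/P exponents. `seen_decimal_point` is true when the
// caller has already consumed a leading '.'.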
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Token_Kind, string) {
	scan_mantissa :: proc(t: ^Tokenizer, base: int) {
		for digit_val(t.ch) < base {
			advance_rune(t)
		}
	}
	scan_exponent :: proc(t: ^Tokenizer) {
		if t.ch == 'e' || t.ch == 'E' || t.ch == 'p' || t.ch == 'P' {
			advance_rune(t)
			if t.ch == '-' || t.ch == '+' {
				advance_rune(t)
			}
			if digit_val(t.ch) < 10 {
				scan_mantissa(t, 10)
			} else {
				error_offset(t, t.offset, "illegal floating-point exponent")
			}
		}
	}
	scan_fraction :: proc(t: ^Tokenizer) -> (early_exit: bool) {
		if t.ch == '.' && peek(t) == '.' {
			return true
		}
		if t.ch == '.' {
			advance_rune(t)
			scan_mantissa(t, 10)
		}
		return false
	}

	check_end := true

	offset := t.offset
	seen_point := seen_decimal_point

	if seen_point {
		offset -= 1
		scan_mantissa(t, 10)
		scan_exponent(t)
	} else {
		if t.ch == '0' {
			int_base :: proc(t: ^Tokenizer, base: int, msg: string) {
				prev := t.offset
				advance_rune(t)
				scan_mantissa(t, base)
				if t.offset - prev <= 1 {
					error_offset(t, t.offset, msg)
				}
			}

			advance_rune(t)
			switch t.ch {
			case 'b', 'B':
				int_base(t, 2, "illegal binary integer")
			case 'x', 'X':
				int_base(t, 16, "illegal hexadecimal integer")
			case:
				seen_point = false
				scan_mantissa(t, 10)
				if t.ch == '.' {
					seen_point = true
					if scan_fraction(t) {
						check_end = false
					}
				}
				if check_end {
					scan_exponent(t)
					check_end = false
				}
			}
		}
	}

	if check_end {
		scan_mantissa(t, 10)

		if !scan_fraction(t) {
			scan_exponent(t)
		}
	}

	return .Number, string(t.src[offset : t.offset])
}
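
// scan_punct scans a punctuator whose first character `ch` has already been
// consumed, greedily extending to multi-character operators such as <<=, &&,
// ##, and ... .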
scan_punct :: proc(t: ^Tokenizer, ch: rune) -> (kind: Token_Kind) {
	kind = .Punct
	switch ch {
	case:
		kind = .Invalid

	case '<', '>':
		if t.ch == ch {
			advance_rune(t)
		}
		if t.ch == '=' {
			advance_rune(t)
		}
	case '!', '+', '-', '*', '/', '%', '^', '=':
		if t.ch == '=' {
			advance_rune(t)
		}
	case '#':
		if t.ch == '#' {
			advance_rune(t)
		}
	case '&':
		if t.ch == '=' || t.ch == '&' {
			advance_rune(t)
		}
	case '|':
		if t.ch == '=' || t.ch == '|' {
			advance_rune(t)
		}
	case '(', ')', '[', ']', '{', '}':
		// okay
	case '~', ',', ':', ';', '?':
		// okay
	case '`':
		// okay
	case '.':
		if t.ch == '.' && peek(t) == '.' {
			advance_rune(t)
			advance_rune(t) // consume last '.'
		}
	}
	return
}
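
// peek returns the byte following the current rune without advancing, or 0
// at end of input; peek_str reports whether the source at the current offset
// begins with `str`.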
peek :: proc(t: ^Tokenizer) -> byte {
	if t.read_offset < len(t.src) {
		return t.src[t.read_offset]
	}
	return 0
}

peek_str :: proc(t: ^Tokenizer, str: string) -> bool {
	if t.read_offset < len(t.src) {
		return strings.has_prefix(string(t.src[t.offset:]), str)
	}
	return false
}
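
// scan_literal_prefix consumes an encoding prefix such as `u8"`, `L"`, or
// `U'` when present, storing the prefix text (without the quote) through
// `prefix`.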
scan_literal_prefix :: proc(t: ^Tokenizer, str: string, prefix: ^string) -> bool {
	if peek_str(t, str) {
		offset := t.offset
		for _ in str {
			advance_rune(t)
		}
		prefix^ = string(t.src[offset:][:len(str)-1])
		return true
	}
	return false
}
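
// allow_next_to_be_newline consumes a single LF or CRLF pair, returning
// whether a newline was consumed; scan uses it for backslash line
// continuations.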
allow_next_to_be_newline :: proc(t: ^Tokenizer) -> bool {
	if t.ch == '\n' {
		advance_rune(t)
		return true
	} else if t.ch == '\r' && peek(t) == '\n' { // allow for MS-DOS style line endings
		advance_rune(t) // \r
		advance_rune(t) // \n
		return true
	}
	return false
}
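
// scan returns the next token from the stream. Comments and backslash-newline
// line continuations are skipped by recursing; the returned token records
// whether it began a line (at_bol) and whether whitespace preceded it
// (has_space), after which both flags are reset.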
scan :: proc(t: ^Tokenizer, f: ^File) -> ^Token {
	skip_whitespace(t)

	offset := t.offset

	kind: Token_Kind
	lit: string
	prefix: string

	switch ch := t.ch; {
	case scan_literal_prefix(t, `u8"`, &prefix):
		kind = .String
		lit = scan_string(t)
	case scan_literal_prefix(t, `u"`, &prefix):
		kind = .String
		lit = scan_string(t)
	case scan_literal_prefix(t, `L"`, &prefix):
		kind = .String
		lit = scan_string(t)
	case scan_literal_prefix(t, `U"`, &prefix):
		kind = .String
		lit = scan_string(t)
	case scan_literal_prefix(t, `u'`, &prefix):
		kind = .Char
		lit = scan_rune(t)
	case scan_literal_prefix(t, `L'`, &prefix):
		kind = .Char
		lit = scan_rune(t)
	case scan_literal_prefix(t, `U'`, &prefix):
		kind = .Char
		lit = scan_rune(t)

	case is_ident0(ch):
		lit = scan_identifier(t)
		kind = .Ident
	case '0' <= ch && ch <= '9':
		kind, lit = scan_number(t, false)
	case:
		advance_rune(t)
		switch ch {
		case -1:
			kind = .EOF
		case '\\':
			kind = .Punct
			if allow_next_to_be_newline(t) {
				t.at_bol = true
				t.has_space = false
				return scan(t, f)
			}

		case '.':
			if is_digit(t.ch) {
				kind, lit = scan_number(t, true)
			} else {
				kind = scan_punct(t, ch)
			}
		case '"':
			kind = .String
			lit = scan_string(t)
		case '\'':
			kind = .Char
			lit = scan_rune(t)
		case '/':
			if t.ch == '/' || t.ch == '*' {
				kind = .Comment
				lit = scan_comment(t)
				t.has_space = true
				break
			}
			fallthrough
		case:
			kind = scan_punct(t, ch)
			if kind == .Invalid && ch != utf8.RUNE_BOM {
				error_offset(t, t.offset, "illegal character '%r': %d", ch, ch)
			}
		}
	}

	if lit == "" {
		lit = string(t.src[offset : t.offset])
	}

	if kind == .Comment {
		return scan(t, f)
	}

	tok := new(Token)
	tok.kind = kind
	tok.lit = lit
	tok.pos = offset_to_pos(t, offset)
	tok.file = f
	tok.prefix = prefix
	tok.at_bol = t.at_bol
	tok.has_space = t.has_space

	t.at_bol, t.has_space = false, false

	return tok
}
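
// tokenize resets the tokenizer state over f.src, skips any leading byte
// order mark, and scans the entire file into a linked list of tokens
// terminated by an .EOF token; the head of the list is returned.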
tokenize :: proc(t: ^Tokenizer, f: ^File) -> ^Token {
	setup_tokenizer: {
		t.src = f.src
		t.ch = ' '
		t.offset = 0
		t.read_offset = 0
		t.line_offset = 0
		t.line_count = len(t.src) > 0 ? 1 : 0
		t.error_count = 0
		t.path = f.name

		advance_rune(t)
		if t.ch == utf8.RUNE_BOM {
			advance_rune(t)
		}
	}

	t.at_bol = true
	t.has_space = false

	head: Token
	curr := &head
	for {
		tok := scan(t, f)
		if tok == nil {
			break
		}
		curr.next = tok
		curr = curr.next
		if tok.kind == .EOF {
			break
		}
	}

	return head.next
}
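
// add_new_file allocates and fills in a File record for a named source buffer.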
add_new_file :: proc(t: ^Tokenizer, name: string, src: []byte, id: int) -> ^File {
	file := new(File)
	file.id = id
	file.src = src
	file.name = name
	file.display_name = name
	return file
}
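
// tokenize_file reads the file at `path` and tokenizes it, returning nil if
// the file cannot be read.
//
// Minimal usage sketch (the path and the printing loop are illustrative
// only, not part of this package):
//
//	t: Tokenizer
//	init_defaults(&t)
//	tok := tokenize_file(&t, "hello.c", 0)
//	for ; tok != nil && tok.kind != .EOF; tok = tok.next {
//		fmt.println(tok.kind, tok.lit)
//	}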
tokenize_file :: proc(t: ^Tokenizer, path: string, id: int, loc := #caller_location) -> ^Token {
	src, ok := os.read_entire_file(path)
	if !ok {
		return nil
	}
	return tokenize(t, add_new_file(t, path, src, id))
}
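
// inline_tokenize tokenizes `src` as though it belonged to tok's file,
// reusing that file's id and name for diagnostics (useful, for example,
// when re-scanning synthesized text).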
inline_tokenize :: proc(t: ^Tokenizer, tok: ^Token, src: []byte) -> ^Token {
	file := new(File)
	file.src = src
	if tok.file != nil {
		file.id = tok.file.id
		file.name = tok.file.name
		file.display_name = tok.file.name
	}

	return tokenize(t, file)
}