diff --git a/core/c/frontend/preprocessor/const_expr.odin b/core/c/frontend/preprocessor/const_expr.odin deleted file mode 100644 index ff13f6432..000000000 --- a/core/c/frontend/preprocessor/const_expr.odin +++ /dev/null @@ -1,25 +0,0 @@ -package c_frontend_preprocess - -import "core:c/frontend/tokenizer" - -const_expr :: proc(rest: ^^Token, tok: ^Token) -> i64 { - // TODO(bill): Handle const_expr correctly - // This is effectively a mini-parser - - assert(rest != nil) - assert(tok != nil) - rest^ = tokenizer.new_eof(tok) - switch v in tok.val { - case i64: - return v - case f64: - return i64(v) - case string: - return 0 - case []u16: - // TODO - case []u32: - // TODO - } - return 0 -} diff --git a/core/c/frontend/preprocessor/preprocess.odin b/core/c/frontend/preprocessor/preprocess.odin deleted file mode 100644 index b5eab0bb3..000000000 --- a/core/c/frontend/preprocessor/preprocess.odin +++ /dev/null @@ -1,1510 +0,0 @@ -package c_frontend_preprocess - -import "../tokenizer" - -import "core:strings" -import "core:strconv" -import "core:path/filepath" -import "core:unicode/utf8" -import "core:unicode/utf16" -import "core:os" -import "core:io" - -@(private) -Tokenizer :: tokenizer.Tokenizer -@(private) -Token :: tokenizer.Token - -Error_Handler :: tokenizer.Error_Handler - -Macro_Param :: struct { - next: ^Macro_Param, - name: string, -} - -Macro_Arg :: struct { - next: ^Macro_Arg, - name: string, - tok: ^Token, - is_va_args: bool, -} - -Macro_Kind :: enum u8 { - Function_Like, - Value_Like, -} - -Macro_Handler :: #type proc(^Preprocessor, ^Token) -> ^Token - -Macro :: struct { - name: string, - kind: Macro_Kind, - params: ^Macro_Param, - va_args_name: string, - body: ^Token, - handler: Macro_Handler, -} - -Cond_Incl_State :: enum u8 { - In_Then, - In_Elif, - In_Else, -} - -Cond_Incl :: struct { - next: ^Cond_Incl, - tok: ^Token, - state: Cond_Incl_State, - included: bool, -} - -Pragma_Handler :: #type proc(^Preprocessor, ^Token) - -Preprocessor :: struct { - // Lookup tables - macros: map[string]^Macro, - pragma_once: map[string]bool, - include_guards: map[string]string, - filepath_cache: map[string]string, - - // Include path data - include_paths: []string, - - // Counter for __COUNTER__ macro - counter: i64, - - // Include information - cond_incl: ^Cond_Incl, - include_level: int, - include_next_index: int, - - wide_char_size: int, - - // Mutable data - err: Error_Handler, - warn: Error_Handler, - pragma_handler: Pragma_Handler, - error_count: int, - warning_count: int, -} - -MAX_INCLUDE_LEVEL :: 1024 - -error :: proc(cpp: ^Preprocessor, tok: ^Token, msg: string, args: ..any) { - if cpp.err != nil { - cpp.err(tok.pos, msg, ..args) - } - cpp.error_count += 1 -} - -warn :: proc(cpp: ^Preprocessor, tok: ^Token, msg: string, args: ..any) { - if cpp.warn != nil { - cpp.warn(tok.pos, msg, ..args) - } - cpp.warning_count += 1 -} - -is_hash :: proc(tok: ^Token) -> bool { - return tok.at_bol && tok.lit == "#" -} - -skip_line :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Token { - tok := tok - if tok.at_bol { - return tok - } - warn(cpp, tok, "extra token") - for tok.at_bol { - tok = tok.next - } - return tok -} - - -append_token :: proc(a, b: ^Token) -> ^Token { - if a.kind == .EOF { - return b - } - - head: Token - curr := &head - - for tok := a; tok.kind != .EOF; tok = tok.next { - curr.next = tokenizer.copy_token(tok) - curr = curr.next - } - curr.next = b - return head.next -} - - -is_hex_digit :: proc(x: byte) -> bool { - switch x { - case '0'..='9', 'a'..='f', 'A'..='F': - return true - } 
- return false -} -from_hex :: proc(x: byte) -> i32 { - switch x { - case '0'..='9': - return i32(x) - '0' - case 'a'..='f': - return i32(x) - 'a' + 10 - case 'A'..='F': - return i32(x) - 'A' + 10 - } - return 16 -} - - -convert_pp_number :: proc(tok: ^Token) { - convert_pp_int :: proc(tok: ^Token) -> bool { - p := tok.lit - base := 10 - if len(p) > 2 { - if strings.equal_fold(p[:2], "0x") && is_hex_digit(p[2]) { - p = p[2:] - base = 16 - } else if strings.equal_fold(p[:2], "0b") && (p[2] == '0' || p[2] == '1') { - p = p[2:] - base = 2 - } - } - if base == 10 && p[0] == '0' { - base = 8 - } - - - tok.val, _ = strconv.parse_i64_of_base(p, base) - - l, u: int - - suf: [3]byte - suf_n := 0 - i := len(p)-1 - suffix_loop: for /**/; i >= 0 && suf_n < len(suf); i -= 1 { - switch p[i] { - case 'l', 'L': - suf[suf_n] = 'l' - l += 1 - suf_n += 1 - case 'u', 'U': - suf[suf_n] = 'u' - u += 1 - suf_n += 1 - case: - break suffix_loop - } - } - if i < len(p) { - if !is_hex_digit(p[i]) && p[i] != '.' { - return false - } - } - if u > 1 { - return false - } - - if l > 2 { - return false - } - - if u == 1 { - switch l { - case 0: tok.type_hint = .Unsigned_Int - case 1: tok.type_hint = .Unsigned_Long - case 2: tok.type_hint = .Unsigned_Long_Long - } - } else { - switch l { - case 0: tok.type_hint = .Int - case 1: tok.type_hint = .Long - case 2: tok.type_hint = .Long_Long - } - } - return true - } - - if convert_pp_int(tok) { - return - } - - fval, _ := strconv.parse_f64(tok.lit) - tok.val = fval - - end := tok.lit[len(tok.lit)-1] - switch end { - case 'f', 'F': - tok.type_hint = .Float - case 'l', 'L': - tok.type_hint = .Long_Double - case: - tok.type_hint = .Double - } - -} - -convert_pp_char :: proc(tok: ^Token) { - assert(len(tok.lit) >= 2) - r, _, _, _ := unquote_char(tok.lit, tok.lit[0]) - tok.val = i64(r) - - tok.type_hint = .Int - switch tok.prefix { - case "u": tok.type_hint = .UTF_16 - case "U": tok.type_hint = .UTF_32 - case "L": tok.type_hint = .UTF_Wide - } -} - -wide_char_size :: proc(cpp: ^Preprocessor) -> int { - char_size := 4 - if cpp.wide_char_size > 0 { - char_size = clamp(cpp.wide_char_size, 1, 4) - assert(char_size & (char_size-1) == 0) - } - return char_size -} - -convert_pp_string :: proc(cpp: ^Preprocessor, tok: ^Token) { - assert(len(tok.lit) >= 2) - str, _, _ := unquote_string(tok.lit) - tok.val = str - - char_size := 1 - - switch tok.prefix { - case "u8": - tok.type_hint = .UTF_8 - char_size = 1 - case "u": - tok.type_hint = .UTF_16 - char_size = 2 - case "U": - tok.type_hint = .UTF_32 - char_size = 4 - case "L": - tok.type_hint = .UTF_Wide - char_size = wide_char_size(cpp) - } - - switch char_size { - case 2: - n: int - buf := make([]u16, len(str)) - for c in str { - ch := c - if ch < 0x10000 { - buf[n] = u16(ch) - n += 1 - } else { - ch -= 0x10000 - buf[n+0] = 0xd800 + u16((ch >> 10) & 0x3ff) - buf[n+1] = 0xdc00 + u16(ch & 0x3ff) - n += 2 - } - } - tok.val = buf[:n] - case 4: - n: int - buf := make([]u32, len(str)) - for ch in str { - buf[n] = u32(ch) - n += 1 - } - tok.val = buf[:n] - } - -} - -convert_pp_token :: proc(cpp: ^Preprocessor, t: ^Token, is_keyword: tokenizer.Is_Keyword_Proc) { - switch { - case t.kind == .Char: - convert_pp_char(t) - case t.kind == .String: - convert_pp_string(cpp, t) - case is_keyword != nil && is_keyword(t): - t.kind = .Keyword - case t.kind == .PP_Number: - convert_pp_number(t) - } -} -convert_pp_tokens :: proc(cpp: ^Preprocessor, tok: ^Token, is_keyword: tokenizer.Is_Keyword_Proc) { - for t := tok; t != nil && t.kind != .EOF; t = t.next { - convert_pp_token(cpp, t, is_keyword) - } -} - 
-join_adjacent_string_literals :: proc(cpp: ^Preprocessor, initial_tok: ^Token) { - for tok1 := initial_tok; tok1.kind != .EOF; /**/ { - if tok1.kind != .String || tok1.next.kind != .String { - tok1 = tok1.next - continue - } - - type_hint := tokenizer.Token_Type_Hint.None - char_size := 1 - - start := tok1 - for t := tok1; t != nil && t.kind == .String; t = t.next { - if t.val == nil { - convert_pp_string(cpp, t) - } - tok1 = t.next - if type_hint != t.type_hint { - if t.type_hint != .None && type_hint != .None { - error(cpp, t, "unsupported non-standard concatenation of string literals of different types") - } - prev_char_size := char_size - - #partial switch type_hint { - case .UTF_8: char_size = max(char_size, 1) - case .UTF_16: char_size = max(char_size, 2) - case .UTF_32: char_size = max(char_size, 4) - case .UTF_Wide: char_size = max(char_size, wide_char_size(cpp)) - } - - if type_hint == .None || prev_char_size < char_size { - type_hint = t.type_hint - } - } - } - - // NOTE(bill): Verbose logic in order to correctly concantenate strings, even if they different in type - max_len := 0 - switch char_size { - case 1: - for t := start; t != nil && t.kind == .String; t = t.next { - #partial switch v in t.val { - case string: max_len += len(v) - case []u16: max_len += 2*len(v) - case []u32: max_len += 4*len(v) - } - } - n := 0 - buf := make([]byte, max_len) - for t := start; t != nil && t.kind == .String; t = t.next { - #partial switch v in t.val { - case string: - n += copy(buf[n:], v) - case []u16: - for i := 0; i < len(v); /**/ { - c1 := v[i] - r: rune - if !utf16.is_surrogate(rune(c1)) { - r = rune(c1) - i += 1 - } else if i+1 == len(v) { - r = utf16.REPLACEMENT_CHAR - i += 1 - } else { - c2 := v[i+1] - i += 2 - r = utf16.decode_surrogate_pair(rune(c1), rune(c2)) - } - - b, w := utf8.encode_rune(r) - n += copy(buf[n:], b[:w]) - } - case []u32: - for r in v { - b, w := utf8.encode_rune(rune(r)) - n += copy(buf[n:], b[:w]) - } - } - } - - new_tok := tokenizer.copy_token(start) - new_tok.lit = "" - new_tok.val = string(buf[:n]) - new_tok.next = tok1 - new_tok.type_hint = type_hint - start^ = new_tok^ - case 2: - for t := start; t != nil && t.kind == .String; t = t.next { - #partial switch v in t.val { - case string: max_len += len(v) - case []u16: max_len += len(v) - case []u32: max_len += 2*len(v) - } - } - n := 0 - buf := make([]u16, max_len) - for t := start; t != nil && t.kind == .String; t = t.next { - #partial switch v in t.val { - case string: - for r in v { - if r >= 0x10000 { - c1, c2 := utf16.encode_surrogate_pair(r) - buf[n+0] = u16(c1) - buf[n+1] = u16(c2) - n += 2 - } else { - buf[n] = u16(r) - n += 1 - } - } - case []u16: - n += copy(buf[n:], v) - case []u32: - for r in v { - if r >= 0x10000 { - c1, c2 := utf16.encode_surrogate_pair(rune(r)) - buf[n+0] = u16(c1) - buf[n+1] = u16(c2) - n += 2 - } else { - buf[n] = u16(r) - n += 1 - } - } - } - } - - new_tok := tokenizer.copy_token(start) - new_tok.lit = "" - new_tok.val = buf[:n] - new_tok.next = tok1 - new_tok.type_hint = type_hint - start^ = new_tok^ - case 4: - for t := start; t != nil && t.kind == .String; t = t.next { - #partial switch v in t.val { - case string: max_len += len(v) - case []u16: max_len += len(v) - case []u32: max_len += len(v) - } - } - n := 0 - buf := make([]u32, max_len) - for t := start; t != nil && t.kind == .String; t = t.next { - #partial switch v in t.val { - case string: - for r in v { - buf[n] = u32(r) - n += 1 - } - case []u16: - for i := 0; i < len(v); /**/ { - c1 := v[i] - if 
!utf16.is_surrogate(rune(c1)) { - buf[n] = u32(c1) - n += 1 - i += 1 - } else if i+1 == len(v) { - buf[n] = utf16.REPLACEMENT_CHAR - n += 1 - i += 1 - } else { - c2 := v[i+1] - i += 2 - r := utf16.decode_surrogate_pair(rune(c1), rune(c2)) - buf[n] = u32(r) - n += 1 - } - } - case []u32: - n += copy(buf[n:], v) - } - } - - new_tok := tokenizer.copy_token(start) - new_tok.lit = "" - new_tok.val = buf[:n] - new_tok.next = tok1 - new_tok.type_hint = type_hint - start^ = new_tok^ - } - } -} - - -quote_string :: proc(s: string) -> []byte { - b := strings.builder_make(0, len(s)+2) - io.write_quoted_string(strings.to_writer(&b), s, '"') - return b.buf[:] -} - - -_init_tokenizer_from_preprocessor :: proc(t: ^Tokenizer, cpp: ^Preprocessor) -> ^Tokenizer { - t.warn = cpp.warn - t.err = cpp.err - return t -} - -new_string_token :: proc(cpp: ^Preprocessor, str: string, tok: ^Token) -> ^Token { - assert(tok != nil) - assert(str != "") - t := _init_tokenizer_from_preprocessor(&Tokenizer{}, cpp) - src := quote_string(str) - return tokenizer.inline_tokenize(t, tok, src) -} - -stringize :: proc(cpp: ^Preprocessor, hash, arg: ^Token) -> ^Token { - s := join_tokens(arg, nil) - return new_string_token(cpp, s, hash) -} - - -new_number_token :: proc(cpp: ^Preprocessor, i: i64, tok: ^Token) -> ^Token { - t := _init_tokenizer_from_preprocessor(&Tokenizer{}, cpp) - buf: [32]byte - n := len(strconv.append_int(buf[:], i, 10)) - src := make([]byte, n) - copy(src, buf[:n]) - return tokenizer.inline_tokenize(t, tok, src) -} - - -find_macro :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Macro { - if tok.kind != .Ident { - return nil - } - return cpp.macros[tok.lit] -} - -add_macro :: proc(cpp: ^Preprocessor, name: string, kind: Macro_Kind, body: ^Token) -> ^Macro { - m := new(Macro) - m.name = name - m.kind = kind - m.body = body - cpp.macros[name] = m - return m -} - - -undef_macro :: proc(cpp: ^Preprocessor, name: string) { - delete_key(&cpp.macros, name) -} - -add_builtin :: proc(cpp: ^Preprocessor, name: string, handler: Macro_Handler) -> ^Macro { - m := add_macro(cpp, name, .Value_Like, nil) - m.handler = handler - return m -} - - -skip :: proc(cpp: ^Preprocessor, tok: ^Token, op: string) -> ^Token { - if tok.lit != op { - error(cpp, tok, "expected '%q'", op) - } - return tok.next -} - -consume :: proc(rest: ^^Token, tok: ^Token, lit: string) -> bool { - if tok.lit == lit { - rest^ = tok.next - return true - } - rest^ = tok - return false -} - -read_macro_params :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) -> (param: ^Macro_Param, va_args_name: string) { - head: Macro_Param - curr := &head - - tok := tok - for tok.lit != ")" && tok.kind != .EOF { - if curr != &head { - tok = skip(cpp, tok, ",") - } - - if tok.lit == "..." { - va_args_name = "__VA_ARGS__" - rest^ = skip(cpp, tok.next, ")") - param = head.next - return - } - - if tok.kind != .Ident { - error(cpp, tok, "expected an identifier") - } - - if tok.next.lit == "..." 
{ - va_args_name = tok.lit - rest^ = skip(cpp, tok.next.next, ")") - param = head.next - return - } - - m := new(Macro_Param) - m.name = tok.lit - curr.next = m - curr = curr.next - tok = tok.next - } - - - rest^ = tok.next - param = head.next - return -} - -copy_line :: proc(rest: ^^Token, tok: ^Token) -> ^Token { - head: Token - curr := &head - - tok := tok - for ; !tok.at_bol; tok = tok.next { - curr.next = tokenizer.copy_token(tok) - curr = curr.next - } - curr.next = tokenizer.new_eof(tok) - rest^ = tok - return head.next -} - -read_macro_definition :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) { - tok := tok - if tok.kind != .Ident { - error(cpp, tok, "macro name must be an identifier") - } - name := tok.lit - tok = tok.next - - if !tok.has_space && tok.lit == "(" { - params, va_args_name := read_macro_params(cpp, &tok, tok.next) - - m := add_macro(cpp, name, .Function_Like, copy_line(rest, tok)) - m.params = params - m.va_args_name = va_args_name - } else { - add_macro(cpp, name, .Value_Like, copy_line(rest, tok)) - } -} - - -join_tokens :: proc(tok, end: ^Token) -> string { - n := 1 - for t := tok; t != end && t.kind != .EOF; t = t.next { - if t != tok && t.has_space { - n += 1 - } - n += len(t.lit) - } - - buf := make([]byte, n) - - pos := 0 - for t := tok; t != end && t.kind != .EOF; t = t.next { - if t != tok && t.has_space { - buf[pos] = ' ' - pos += 1 - } - copy(buf[pos:], t.lit) - pos += len(t.lit) - } - - return string(buf[:pos]) -} - -read_include_filename :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) -> (filename: string, is_quote: bool) { - tok := tok - - if tok.kind == .String { - rest^ = skip_line(cpp, tok.next) - filename = tok.lit[1:len(tok.lit)-1] - is_quote = true - return - } - - if tok.lit == "<" { - start := tok - for ; tok.kind != .EOF; tok = tok.next { - if tok.at_bol || tok.kind == .EOF { - error(cpp, tok, "expected '>'") - } - is_quote = false - if tok.lit == ">" { - break - } - } - rest^ = skip_line(cpp, tok.next) - filename = join_tokens(start.next, tok) - return - } - - if tok.kind == .Ident { - tok2 := preprocess_internal(cpp, copy_line(rest, tok)) - return read_include_filename(cpp, &tok2, tok2) - } - - error(cpp, tok, "expected a filename") - return -} - -skip_cond_incl :: proc(tok: ^Token) -> ^Token { - next_skip :: proc(tok: ^Token) -> ^Token { - tok := tok - for tok.kind != .EOF { - if is_hash(tok) { - switch tok.next.lit { - case "if", "ifdef", "ifndef": - tok = next_skip(tok.next.next) - continue - - case "endif": - return tok.next.next - } - } - tok = tok.next - } - return tok - } - - tok := tok - - loop: for tok.kind != .EOF { - if is_hash(tok) { - switch tok.next.lit { - case "if", "ifdef", "ifndef": - tok = next_skip(tok.next.next) - continue loop - - case "elif", "else", "endif": - break loop - } - } - - tok = tok.next - } - return tok -} - -check_for_include_guard :: proc(tok: ^Token) -> (guard: string, ok: bool) { - if !is_hash(tok) || tok.next.lit != "ifndef" { - return - } - tok := tok - tok = tok.next.next - - if tok.kind != .Ident { - return - } - - m := tok.lit - tok = tok.next - - if !is_hash(tok) || tok.next.lit != "define" || tok.next.lit != "macro" { - return - } - - for tok.kind != .EOF { - if !is_hash(tok) { - tok = tok.next - continue - } - - if tok.next.lit == "endif" && tok.next.next.kind == .EOF { - return m, true - } - - switch tok.lit { - case "if", "ifdef", "ifndef": - tok = skip_cond_incl(tok.next) - case: - tok = tok.next - } - } - return -} - -include_file :: proc(cpp: ^Preprocessor, tok: ^Token, 
path: string, filename_tok: ^Token) -> ^Token { - if cpp.pragma_once[path] { - return tok - } - - guard_name, guard_name_found := cpp.include_guards[path] - if guard_name_found && cpp.macros[guard_name] != nil { - return tok - } - - if !os.exists(path) { - error(cpp, filename_tok, "%s: cannot open file", path) - return tok - } - - cpp.include_level += 1 - if cpp.include_level > MAX_INCLUDE_LEVEL { - error(cpp, tok, "exceeded maximum nest amount: %d", MAX_INCLUDE_LEVEL) - return tok - } - - t := _init_tokenizer_from_preprocessor(&Tokenizer{}, cpp) - tok2 := tokenizer.tokenize_file(t, path, /*file.id*/1) - if tok2 == nil { - error(cpp, filename_tok, "%s: cannot open file", path) - } - cpp.include_level -= 1 - - guard_name, guard_name_found = check_for_include_guard(tok2) - if guard_name_found { - cpp.include_guards[path] = guard_name - } - - return append_token(tok2, tok) -} - -find_arg :: proc(args: ^Macro_Arg, tok: ^Token) -> ^Macro_Arg { - for ap := args; ap != nil; ap = ap.next { - if tok.lit == ap.name { - return ap - } - } - return nil -} - -paste :: proc(cpp: ^Preprocessor, lhs, rhs: ^Token) -> ^Token { - buf := strings.concatenate({lhs.lit, rhs.lit}) - t := _init_tokenizer_from_preprocessor(&Tokenizer{}, cpp) - tok := tokenizer.inline_tokenize(t, lhs, transmute([]byte)buf) - if tok.next.kind != .EOF { - error(cpp, lhs, "pasting forms '%s', an invalid token", buf) - } - return tok -} - -has_varargs :: proc(args: ^Macro_Arg) -> bool { - for ap := args; ap != nil; ap = ap.next { - if ap.name == "__VA_ARGS__" { - return ap.tok.kind != .EOF - } - } - return false -} - -substitute_token :: proc(cpp: ^Preprocessor, tok: ^Token, args: ^Macro_Arg) -> ^Token { - head: Token - curr := &head - tok := tok - for tok.kind != .EOF { - if tok.lit == "#" { - arg := find_arg(args, tok.next) - if arg == nil { - error(cpp, tok.next, "'#' is not followed by a macro parameter") - } - arg_tok := arg.tok if arg != nil else tok.next - curr.next = stringize(cpp, tok, arg_tok) - curr = curr.next - tok = tok.next.next - continue - } - - if tok.lit == "," && tok.next.lit == "##" { - if arg := find_arg(args, tok.next.next); arg != nil && arg.is_va_args { - if arg.tok.kind == .EOF { - tok = tok.next.next.next - } else { - curr.next = tokenizer.copy_token(tok) - curr = curr.next - tok = tok.next.next - } - continue - } - } - - if tok.lit == "##" { - if curr == &head { - error(cpp, tok, "'##' cannot appear at start of macro expansion") - } - if tok.next.kind == .EOF { - error(cpp, tok, "'##' cannot appear at end of macro expansion") - } - - if arg := find_arg(args, tok.next); arg != nil { - if arg.tok.kind != .EOF { - curr^ = paste(cpp, curr, arg.tok)^ - for t := arg.tok.next; t.kind != .EOF; t = t.next { - curr.next = tokenizer.copy_token(t) - curr = curr.next - } - } - tok = tok.next.next - continue - } - - curr^ = paste(cpp, curr, tok.next)^ - tok = tok.next.next - continue - } - - arg := find_arg(args, tok) - - if arg != nil && tok.next.lit == "##" { - rhs := tok.next.next - - if arg.tok.kind == .EOF { - args2 := find_arg(args, rhs) - if args2 != nil { - for t := args.tok; t.kind != .EOF; t = t.next { - curr.next = tokenizer.copy_token(t) - curr = curr.next - } - } else { - curr.next = tokenizer.copy_token(rhs) - curr = curr.next - } - tok = rhs.next - continue - } - - for t := arg.tok; t.kind != .EOF; t = t.next { - curr.next = tokenizer.copy_token(t) - curr = curr.next - } - tok = tok.next - continue - } - - if tok.lit == "__VA_OPT__" && tok.next.lit == "(" { - opt_arg := read_macro_arg_one(cpp, &tok, 
tok.next.next, true) - if has_varargs(args) { - for t := opt_arg.tok; t.kind != .EOF; t = t.next { - curr.next = t - curr = curr.next - } - } - tok = skip(cpp, tok, ")") - continue - } - - if arg != nil { - t := preprocess_internal(cpp, arg.tok) - t.at_bol = tok.at_bol - t.has_space = tok.has_space - for ; t.kind != .EOF; t = t.next { - curr.next = tokenizer.copy_token(t) - curr = curr.next - } - tok = tok.next - continue - } - - curr.next = tokenizer.copy_token(tok) - curr = curr.next - tok = tok.next - continue - } - - curr.next = tok - return head.next -} - -read_macro_arg_one :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token, read_rest: bool) -> ^Macro_Arg { - tok := tok - head: Token - curr := &head - level := 0 - for { - if level == 0 && tok.lit == ")" { - break - } - if level == 0 && !read_rest && tok.lit == "," { - break - } - - if tok.kind == .EOF { - error(cpp, tok, "premature end of input") - } - - switch tok.lit { - case "(": level += 1 - case ")": level -= 1 - } - - curr.next = tokenizer.copy_token(tok) - curr = curr.next - tok = tok.next - } - curr.next = tokenizer.new_eof(tok) - - arg := new(Macro_Arg) - arg.tok = head.next - rest^ = tok - return arg -} - -read_macro_args :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token, params: ^Macro_Param, va_args_name: string) -> ^Macro_Arg { - tok := tok - start := tok - tok = tok.next.next - - head: Macro_Arg - curr := &head - - pp := params - for ; pp != nil; pp = pp.next { - if curr != &head { - tok = skip(cpp, tok, ",") - } - curr.next = read_macro_arg_one(cpp, &tok, tok, false) - curr = curr.next - curr.name = pp.name - } - - if va_args_name != "" { - arg: ^Macro_Arg - if tok.lit == ")" { - arg = new(Macro_Arg) - arg.tok = tokenizer.new_eof(tok) - } else { - if pp != params { - tok = skip(cpp, tok, ",") - } - arg = read_macro_arg_one(cpp, &tok, tok, true) - } - arg.name = va_args_name - arg.is_va_args = true - curr.next = arg - curr = curr.next - } else if pp != nil { - error(cpp, start, "too many arguments") - } - - skip(cpp, tok, ")") - rest^ = tok - return head.next -} - -expand_macro :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) -> bool { - if tokenizer.hide_set_contains(tok.hide_set, tok.lit) { - return false - } - tok := tok - m := find_macro(cpp, tok) - if m == nil { - return false - } - - if m.handler != nil { - rest^ = m.handler(cpp, tok) - rest^.next = tok.next - return true - } - - if m.kind == .Value_Like { - hs := tokenizer.hide_set_union(tok.hide_set, tokenizer.new_hide_set(m.name)) - body := tokenizer.add_hide_set(m.body, hs) - for t := body; t.kind != .EOF; t = t.next { - t.origin = tok - } - rest^ = append_token(body, tok.next) - rest^.at_bol = tok.at_bol - rest^.has_space = tok.has_space - return true - } - - if tok.next.lit != "(" { - return false - } - - macro_token := tok - args := read_macro_args(cpp, &tok, tok, m.params, m.va_args_name) - close_paren := tok - - hs := tokenizer.hide_set_intersection(macro_token.hide_set, close_paren.hide_set) - hs = tokenizer.hide_set_union(hs, tokenizer.new_hide_set(m.name)) - - body := substitute_token(cpp, m.body, args) - body = tokenizer.add_hide_set(body, hs) - for t := body; t.kind != .EOF; t = t.next { - t.origin = macro_token - } - rest^ = append_token(body, tok.next) - rest^.at_bol = macro_token.at_bol - rest^.has_space = macro_token.has_space - return true -} - -search_include_next :: proc(cpp: ^Preprocessor, filename: string) -> (path: string, ok: bool) { - for ; cpp.include_next_index < len(cpp.include_paths); cpp.include_next_index += 1 { - 
tpath := filepath.join({cpp.include_paths[cpp.include_next_index], filename}, allocator=context.temp_allocator) - if os.exists(tpath) { - return strings.clone(tpath), true - } - } - return -} - -search_include_paths :: proc(cpp: ^Preprocessor, filename: string) -> (path: string, ok: bool) { - if filepath.is_abs(filename) { - return filename, true - } - - if path, ok = cpp.filepath_cache[filename]; ok { - return - } - - for include_path in cpp.include_paths { - tpath := filepath.join({include_path, filename}, allocator=context.temp_allocator) - if os.exists(tpath) { - path, ok = strings.clone(tpath), true - cpp.filepath_cache[filename] = path - return - } - } - - return -} - -read_const_expr :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) -> ^Token { - tok := tok - tok = copy_line(rest, tok) - head: Token - curr := &head - for tok.kind != .EOF { - if tok.lit == "defined" { - start := tok - has_paren := consume(&tok, tok.next, "(") - if tok.kind != .Ident { - error(cpp, start, "macro name must be an identifier") - } - m := find_macro(cpp, tok) - tok = tok.next - - if has_paren { - tok = skip(cpp, tok, ")") - } - - curr.next = new_number_token(cpp, 1 if m != nil else 0, start) - curr = curr.next - continue - } - - curr.next = tok - curr = curr.next - tok = tok.next - } - - curr.next = tok - return head.next -} - -eval_const_expr :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) -> (val: i64) { - tok := tok - start := tok - expr := read_const_expr(cpp, rest, tok.next) - expr = preprocess_internal(cpp, expr) - - if expr.kind == .EOF { - error(cpp, start, "no expression") - } - - for t := expr; t.kind != .EOF; t = t.next { - if t.kind == .Ident { - next := t.next - t^ = new_number_token(cpp, 0, t)^ - t.next = next - } - } - - val = 1 - convert_pp_tokens(cpp, expr, tokenizer.default_is_keyword) - - rest2: ^Token - val = const_expr(&rest2, expr) - if rest2 != nil && rest2.kind != .EOF { - error(cpp, rest2, "extra token") - } - return -} - -push_cond_incl :: proc(cpp: ^Preprocessor, tok: ^Token, included: bool) -> ^Cond_Incl { - ci := new(Cond_Incl) - ci.next = cpp.cond_incl - ci.state = .In_Then - ci.tok = tok - ci.included = included - cpp.cond_incl = ci - return ci -} - -read_line_marker:: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) { - tok := tok - start := tok - tok = preprocess(cpp, copy_line(rest, tok)) - if tok.kind != .Number { - error(cpp, tok, "invalid line marker") - } - ival, _ := tok.val.(i64) - start.file.line_delta = int(ival - i64(start.pos.line)) - tok = tok.next - if tok.kind == .EOF { - return - } - - if tok.kind != .String { - error(cpp, tok, "filename expected") - } - start.file.display_name = tok.lit -} - -preprocess_internal :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Token { - head: Token - curr := &head - - tok := tok - for tok != nil && tok.kind != .EOF { - if expand_macro(cpp, &tok, tok) { - continue - } - - if !is_hash(tok) { - if tok.file != nil { - tok.line_delta = tok.file.line_delta - } - curr.next = tok - curr = curr.next - tok = tok.next - continue - } - - start := tok - tok = tok.next - - switch tok.lit { - case "include": - filename, is_quote := read_include_filename(cpp, &tok, tok.next) - is_absolute := filepath.is_abs(filename) - if is_absolute { - tok = include_file(cpp, tok, filename, start.next.next) - continue - } - - if is_quote { - dir := "" - if start.file != nil { - dir = filepath.dir(start.file.name) - } - path := filepath.join({dir, filename}) - if os.exists(path) { - tok = include_file(cpp, tok, path, start.next.next) - 
continue - } - } - - path, ok := search_include_paths(cpp, filename) - if !ok { - path = filename - } - tok = include_file(cpp, tok, path, start.next.next) - continue - - case "include_next": - filename, _ := read_include_filename(cpp, &tok, tok.next) - path, ok := search_include_next(cpp, filename) - if !ok { - path = filename - } - tok = include_file(cpp, tok, path, start.next.next) - continue - - case "define": - read_macro_definition(cpp, &tok, tok.next) - continue - - case "undef": - tok = tok.next - if tok.kind != .Ident { - error(cpp, tok, "macro name must be an identifier") - } - undef_macro(cpp, tok.lit) - tok = skip_line(cpp, tok.next) - continue - - case "if": - val := eval_const_expr(cpp, &tok, tok) - push_cond_incl(cpp, start, val != 0) - if val == 0 { - tok = skip_cond_incl(tok) - } - continue - - case "ifdef": - defined := find_macro(cpp, tok.next) - push_cond_incl(cpp, tok, defined != nil) - tok = skip_line(cpp, tok.next.next) - if defined == nil { - tok = skip_cond_incl(tok) - } - continue - - case "ifndef": - defined := find_macro(cpp, tok.next) - push_cond_incl(cpp, tok, defined == nil) - tok = skip_line(cpp, tok.next.next) - if defined != nil { - tok = skip_cond_incl(tok) - } - continue - - case "elif": - if cpp.cond_incl == nil || cpp.cond_incl.state == .In_Else { - error(cpp, start, "stray #elif") - } - if cpp.cond_incl != nil { - cpp.cond_incl.state = .In_Elif - } - - if (cpp.cond_incl != nil && !cpp.cond_incl.included) && eval_const_expr(cpp, &tok, tok) != 0 { - cpp.cond_incl.included = true - } else { - tok = skip_cond_incl(tok) - } - continue - - case "else": - if cpp.cond_incl == nil || cpp.cond_incl.state == .In_Else { - error(cpp, start, "stray #else") - } - if cpp.cond_incl != nil { - cpp.cond_incl.state = .In_Else - } - tok = skip_line(cpp, tok.next) - - if cpp.cond_incl != nil && cpp.cond_incl.included { - tok = skip_cond_incl(tok) - } - continue - - case "endif": - if cpp.cond_incl == nil { - error(cpp, start, "stray #endif") - } else { - cpp.cond_incl = cpp.cond_incl.next - } - tok = skip_line(cpp, tok.next) - continue - - case "line": - read_line_marker(cpp, &tok, tok.next) - continue - - case "pragma": - if tok.next.lit == "once" { - cpp.pragma_once[tok.pos.file] = true - tok = skip_line(cpp, tok.next.next) - continue - } - - pragma_tok, pragma_end := tok, tok - - for tok != nil && tok.kind != .EOF { - pragma_end = tok - tok = tok.next - if tok.at_bol { - break - } - } - pragma_end.next = tokenizer.new_eof(tok) - if cpp.pragma_handler != nil { - cpp.pragma_handler(cpp, pragma_tok.next) - continue - } - - continue - - case "error": - error(cpp, tok, "error") - } - - if tok.kind == .PP_Number { - read_line_marker(cpp, &tok, tok) - continue - } - - if !tok.at_bol { - error(cpp, tok, "invalid preprocessor directive") - } - } - - curr.next = tok - return head.next -} - - -preprocess :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Token { - tok := tok - tok = preprocess_internal(cpp, tok) - if cpp.cond_incl != nil { - error(cpp, tok, "unterminated conditional directive") - } - convert_pp_tokens(cpp, tok, tokenizer.default_is_keyword) - join_adjacent_string_literals(cpp, tok) - for t := tok; t != nil; t = t.next { - t.pos.line += t.line_delta - } - return tok -} - - -define_macro :: proc(cpp: ^Preprocessor, name, def: string) { - src := transmute([]byte)def - - file := new(tokenizer.File) - file.id = -1 - file.src = src - file.name = "" - file.display_name = file.name - - - t := _init_tokenizer_from_preprocessor(&Tokenizer{}, cpp) - tok := tokenizer.tokenize(t, file) - add_macro(cpp, 
name, .Value_Like, tok) -} - - -file_macro :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Token { - tok := tok - for tok.origin != nil { - tok = tok.origin - } - return new_string_token(cpp, tok.file.display_name, tok) -} -line_macro :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Token { - tok := tok - for tok.origin != nil { - tok = tok.origin - } - i := i64(tok.pos.line + tok.file.line_delta) - return new_number_token(cpp, i, tok) -} -counter_macro :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Token { - i := cpp.counter - cpp.counter += 1 - return new_number_token(cpp, i, tok) -} - -init_default_macros :: proc(cpp: ^Preprocessor) { - define_macro(cpp, "__C99_MACRO_WITH_VA_ARGS", "1") - define_macro(cpp, "__alignof__", "_Alignof") - define_macro(cpp, "__const__", "const") - define_macro(cpp, "__inline__", "inline") - define_macro(cpp, "__signed__", "signed") - define_macro(cpp, "__typeof__", "typeof") - define_macro(cpp, "__volatile__", "volatile") - - add_builtin(cpp, "__FILE__", file_macro) - add_builtin(cpp, "__LINE__", line_macro) - add_builtin(cpp, "__COUNTER__", counter_macro) -} - -init_lookup_tables :: proc(cpp: ^Preprocessor, allocator := context.allocator) { - context.allocator = allocator - reserve(&cpp.macros, max(16, cap(cpp.macros))) - reserve(&cpp.pragma_once, max(16, cap(cpp.pragma_once))) - reserve(&cpp.include_guards, max(16, cap(cpp.include_guards))) - reserve(&cpp.filepath_cache, max(16, cap(cpp.filepath_cache))) -} - - -init_defaults :: proc(cpp: ^Preprocessor, lookup_tables_allocator := context.allocator) { - if cpp.warn == nil { - cpp.warn = tokenizer.default_warn_handler - } - if cpp.err == nil { - cpp.err = tokenizer.default_error_handler - } - init_lookup_tables(cpp, lookup_tables_allocator) - init_default_macros(cpp) -} diff --git a/core/c/frontend/preprocessor/unquote.odin b/core/c/frontend/preprocessor/unquote.odin deleted file mode 100644 index 5869fa7ef..000000000 --- a/core/c/frontend/preprocessor/unquote.odin +++ /dev/null @@ -1,154 +0,0 @@ -package c_frontend_preprocess - -import "core:unicode/utf8" - -unquote_char :: proc(str: string, quote: byte) -> (r: rune, multiple_bytes: bool, tail_string: string, success: bool) { - hex_to_int :: proc(c: byte) -> int { - switch c { - case '0'..='9': return int(c-'0') - case 'a'..='f': return int(c-'a')+10 - case 'A'..='F': return int(c-'A')+10 - } - return -1 - } - w: int - - if str[0] == quote && quote == '"' { - return - } else if str[0] >= 0x80 { - r, w = utf8.decode_rune_in_string(str) - return r, true, str[w:], true - } else if str[0] != '\\' { - return rune(str[0]), false, str[1:], true - } - - if len(str) <= 1 { - return - } - s := str - c := s[1] - s = s[2:] - - switch c { - case: r = rune(c) - - case 'a': r = '\a' - case 'b': r = '\b' - case 'e': r = '\e' - case 'f': r = '\f' - case 'n': r = '\n' - case 'r': r = '\r' - case 't': r = '\t' - case 'v': r = '\v' - case '\\': r = '\\' - - case '"': r = '"' - case '\'': r = '\'' - - case '0'..='7': - v := int(c-'0') - if len(s) < 2 { - return - } - for i in 0..<2 { - d := int(s[i]-'0') - if d < 0 || d > 7 { - return - } - v = (v<<3) | d - } - s = s[2:] - if v > 0xff { - return - } - r = rune(v) - - case 'x', 'u', 'U': - count: int - switch c { - case 'x': count = 2 - case 'u': count = 4 - case 'U': count = 8 - } - - if len(s) < count { - return - } - - for i in 0..<count { - d := hex_to_int(s[i]) - if d < 0 { - return - } - r = r*16 + rune(d) - } - s = s[count:] - if r > 
utf8.MAX_RUNE { - return - } - multiple_bytes = true - } - - success = true - tail_string = s - return -} - -unquote_string :: proc(lit: string, allocator := context.allocator) -> (res: string, allocated, success: bool) { - contains_rune :: proc(s: string, r: rune) -> int { - for c, offset in s { - if c == r { - return offset - } - } - return -1 - } - - assert(len(lit) >= 2) - - s := lit - quote := '"' - - if s == `""` { - return "", false, true - } - - if contains_rune(s, '\n') >= 0 { - return s, false, false - } - - if contains_rune(s, '\\') < 0 && contains_rune(s, quote) < 0 { - if quote == '"' { - return s, false, true - } - } - s = s[1:len(s)-1] - - - buf_len := 3*len(s) / 2 - buf := make([]byte, buf_len, allocator) - offset := 0 - for len(s) > 0 { - r, multiple_bytes, tail_string, ok := unquote_char(s, byte(quote)) - if !ok { - delete(buf) - return s, false, false - } - s = tail_string - if r < 0x80 || !multiple_bytes { - buf[offset] = byte(r) - offset += 1 - } else { - b, w := utf8.encode_rune(r) - copy(buf[offset:], b[:w]) - offset += w - } - } - - new_string := string(buf[:offset]) - - return new_string, true, true -} diff --git a/core/c/frontend/tokenizer/doc.odin b/core/c/frontend/tokenizer/doc.odin deleted file mode 100644 index 43747dfe8..000000000 --- a/core/c/frontend/tokenizer/doc.odin +++ /dev/null @@ -1,31 +0,0 @@ -/* -Example: - package demo - - import tokenizer "core:c/frontend/tokenizer" - import preprocessor "core:c/frontend/preprocessor" - import "core:fmt" - - main :: proc() { - t := &tokenizer.Tokenizer{}; - tokenizer.init_defaults(t); - - cpp := &preprocessor.Preprocessor{}; - cpp.warn, cpp.err = t.warn, t.err; - preprocessor.init_lookup_tables(cpp); - preprocessor.init_default_macros(cpp); - cpp.include_paths = {"my/path/to/include"}; - - tok := tokenizer.tokenize_file(t, "the/source/file.c", 1); - - tok = preprocessor.preprocess(cpp, tok); - if tok != nil { - for t := tok; t.kind != .EOF; t = t.next { - fmt.println(t.lit); - } - } - - fmt.println("[Done]"); - } -*/ -package c_frontend_tokenizer diff --git a/core/c/frontend/tokenizer/hide_set.odin b/core/c/frontend/tokenizer/hide_set.odin deleted file mode 100644 index ec8b77e6e..000000000 --- a/core/c/frontend/tokenizer/hide_set.odin +++ /dev/null @@ -1,68 +0,0 @@ -package c_frontend_tokenizer - -// NOTE(bill): This is a really dumb approach for a hide set, -// but it's really simple and probably fast enough in practice - - -Hide_Set :: struct { - next: ^Hide_Set, - name: string, -} - - -new_hide_set :: proc(name: string) -> ^Hide_Set { - hs := new(Hide_Set) - hs.name = name - return hs -} - -hide_set_contains :: proc(hs: ^Hide_Set, name: string) -> bool { - for h := hs; h != nil; h = h.next { - if h.name == name { - return true - } - } - return false -} - - -hide_set_union :: proc(a, b: ^Hide_Set) -> ^Hide_Set { - head: Hide_Set - curr := &head - - for h := a; h != nil; h = h.next { - curr.next = new_hide_set(h.name) - curr = curr.next - } - curr.next = b - return head.next -} - - -hide_set_intersection :: proc(a, b: ^Hide_Set) -> ^Hide_Set { - head: Hide_Set - curr := &head - - for h := a; h != nil; h = h.next { - if hide_set_contains(b, h.name) { - curr.next = new_hide_set(h.name) - curr = curr.next - } - } - return head.next -} - - -add_hide_set :: proc(tok: ^Token, hs: ^Hide_Set) -> ^Token { - head: Token - curr := &head - - tok := tok - for ; tok != nil; tok = tok.next { - t := copy_token(tok) - t.hide_set = hide_set_union(t.hide_set, hs) - curr.next = t - curr = curr.next - } - return head.next -} diff 
--git a/core/c/frontend/tokenizer/token.odin b/core/c/frontend/tokenizer/token.odin deleted file mode 100644 index 1376a651f..000000000 --- a/core/c/frontend/tokenizer/token.odin +++ /dev/null @@ -1,169 +0,0 @@ -package c_frontend_tokenizer - - -Pos :: struct { - file: string, - line: int, - column: int, - offset: int, -} - -Token_Kind :: enum { - Invalid, - Ident, - Punct, - Keyword, - Char, - String, - Number, - PP_Number, - Comment, - EOF, -} - -File :: struct { - name: string, - id: int, - src: []byte, - - display_name: string, - line_delta: int, -} - - -Token_Type_Hint :: enum u8 { - None, - - Int, - Long, - Long_Long, - - Unsigned_Int, - Unsigned_Long, - Unsigned_Long_Long, - - Float, - Double, - Long_Double, - - UTF_8, - UTF_16, - UTF_32, - UTF_Wide, -} - -Token_Value :: union { - i64, - f64, - string, - []u16, - []u32, -} - -Token :: struct { - kind: Token_Kind, - next: ^Token, - lit: string, - - pos: Pos, - file: ^File, - line_delta: int, - at_bol: bool, - has_space: bool, - - type_hint: Token_Type_Hint, - val: Token_Value, - prefix: string, - - // Preprocessor values - hide_set: ^Hide_Set, - origin: ^Token, -} - -Is_Keyword_Proc :: #type proc(tok: ^Token) -> bool - -copy_token :: proc(tok: ^Token) -> ^Token { - t, _ := new_clone(tok^) - t.next = nil - return t -} - -new_eof :: proc(tok: ^Token) -> ^Token { - t, _ := new_clone(tok^) - t.kind = .EOF - t.lit = "" - return t -} - -default_is_keyword :: proc(tok: ^Token) -> bool { - if tok.kind == .Keyword { - return true - } - if len(tok.lit) > 0 { - return default_keyword_set[tok.lit] - } - return false -} - - -token_name := [Token_Kind]string { - .Invalid = "invalid", - .Ident = "ident", - .Punct = "punct", - .Keyword = "keyword", - .Char = "char", - .String = "string", - .Number = "number", - .PP_Number = "preprocessor number", - .Comment = "comment", - .EOF = "eof", -} - -default_keyword_set := map[string]bool{ - "auto" = true, - "break" = true, - "case" = true, - "char" = true, - "const" = true, - "continue" = true, - "default" = true, - "do" = true, - "double" = true, - "else" = true, - "enum" = true, - "extern" = true, - "float" = true, - "for" = true, - "goto" = true, - "if" = true, - "int" = true, - "long" = true, - "register" = true, - "restrict" = true, - "return" = true, - "short" = true, - "signed" = true, - "sizeof" = true, - "static" = true, - "struct" = true, - "switch" = true, - "typedef" = true, - "union" = true, - "unsigned" = true, - "void" = true, - "volatile" = true, - "while" = true, - "_Alignas" = true, - "_Alignof" = true, - "_Atomic" = true, - "_Bool" = true, - "_Generic" = true, - "_Noreturn" = true, - "_Thread_local" = true, - "__restrict" = true, - "typeof" = true, - "asm" = true, - "__restrict__" = true, - "__thread" = true, - "__attribute__" = true, -} diff --git a/core/c/frontend/tokenizer/tokenizer.odin b/core/c/frontend/tokenizer/tokenizer.odin deleted file mode 100644 index 558077717..000000000 --- a/core/c/frontend/tokenizer/tokenizer.odin +++ /dev/null @@ -1,667 +0,0 @@ -package c_frontend_tokenizer - -import "core:fmt" -import "core:os" -import "core:strings" -import "core:unicode/utf8" - - -Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any) - - -Tokenizer :: struct { - // Immutable data - path: string, - src: []byte, - - - // Tokenizing state - ch: rune, - offset: int, - read_offset: int, - line_offset: int, - line_count: int, - - // Extra information for tokens - at_bol: bool, - has_space: bool, - - // Mutable data - err: Error_Handler, - warn: Error_Handler, - error_count: int, - 
warning_count: int, -} - -init_defaults :: proc(t: ^Tokenizer, err: Error_Handler = default_error_handler, warn: Error_Handler = default_warn_handler) { - t.err = err - t.warn = warn -} - - -@(private) -offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> (pos: Pos) { - pos.file = t.path - pos.offset = offset - pos.line = t.line_count - pos.column = offset - t.line_offset + 1 - return -} - -default_error_handler :: proc(pos: Pos, msg: string, args: ..any) { - fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column) - fmt.eprintf(msg, ..args) - fmt.eprintf("\n") -} - -default_warn_handler :: proc(pos: Pos, msg: string, args: ..any) { - fmt.eprintf("%s(%d:%d) warning: ", pos.file, pos.line, pos.column) - fmt.eprintf(msg, ..args) - fmt.eprintf("\n") -} - -error_offset :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) { - pos := offset_to_pos(t, offset) - if t.err != nil { - t.err(pos, msg, ..args) - } - t.error_count += 1 -} - -warn_offset :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) { - pos := offset_to_pos(t, offset) - if t.warn != nil { - t.warn(pos, msg, ..args) - } - t.warning_count += 1 -} - -error :: proc(t: ^Tokenizer, tok: ^Token, msg: string, args: ..any) { - pos := tok.pos - if t.err != nil { - t.err(pos, msg, ..args) - } - t.error_count += 1 -} - -warn :: proc(t: ^Tokenizer, tok: ^Token, msg: string, args: ..any) { - pos := tok.pos - if t.warn != nil { - t.warn(pos, msg, ..args) - } - t.warning_count += 1 -} - - -advance_rune :: proc(t: ^Tokenizer) { - if t.read_offset < len(t.src) { - t.offset = t.read_offset - if t.ch == '\n' { - t.at_bol = true - t.line_offset = t.offset - t.line_count += 1 - } - r, w := rune(t.src[t.read_offset]), 1 - switch { - case r == 0: - error_offset(t, t.offset, "illegal character NUL") - case r >= utf8.RUNE_SELF: - r, w = utf8.decode_rune(t.src[t.read_offset:]) - if r == utf8.RUNE_ERROR && w == 1 { - error_offset(t, t.offset, "illegal UTF-8 encoding") - } else if r == utf8.RUNE_BOM && t.offset > 0 { - error_offset(t, t.offset, "illegal byte order mark") - } - } - t.read_offset += w - t.ch = r - } else { - t.offset = len(t.src) - if t.ch == '\n' { - t.at_bol = true - t.line_offset = t.offset - t.line_count += 1 - } - t.ch = -1 - } -} - -advance_rune_n :: proc(t: ^Tokenizer, n: int) { - for _ in 0..<n { - advance_rune(t) - } -} - -is_digit :: proc(r: rune) -> 
bool { - return '0' <= r && r <= '9' -} - -skip_whitespace :: proc(t: ^Tokenizer) { - for { - switch t.ch { - case ' ', '\t', '\r', '\v', '\f', '\n': - t.has_space = true - advance_rune(t) - case: - return - } - } -} - -scan_comment :: proc(t: ^Tokenizer) -> string { - offset := t.offset-1 - next := -1 - general: { - if t.ch == '/'{ // line comments - advance_rune(t) - for t.ch != '\n' && t.ch >= 0 { - advance_rune(t) - } - - next = t.offset - if t.ch == '\n' { - next += 1 - } - break general - } - - /* style comment */ - advance_rune(t) - for t.ch >= 0 { - ch := t.ch - advance_rune(t) - if ch == '*' && t.ch == '/' { - advance_rune(t) - next = t.offset - break general - } - } - - error_offset(t, offset, "comment not terminated") - } - - lit := t.src[offset : t.offset] - - // NOTE(bill): Strip CR for line comments - for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' { - lit = lit[:len(lit)-1] - } - - - return string(lit) -} - -scan_identifier :: proc(t: ^Tokenizer) -> string { - offset := t.offset - - for is_ident1(t.ch) { - advance_rune(t) - } - - return string(t.src[offset : t.offset]) -} - -scan_string :: proc(t: ^Tokenizer) -> string { - offset := t.offset-1 - - for { - ch := t.ch - if ch == '\n' || ch < 0 { - error_offset(t, offset, "string literal was not terminated") - break - } - advance_rune(t) - if ch == '"' { - break - } - if ch == '\\' { - scan_escape(t) - } - } - - return string(t.src[offset : t.offset]) -} - -digit_val :: proc(r: rune) -> int { - switch r { - case '0'..='9': - return int(r-'0') - case 'A'..='F': - return int(r-'A' + 10) - case 'a'..='f': - return int(r-'a' + 10) - } - return 16 -} - -scan_escape :: proc(t: ^Tokenizer) -> bool { - offset := t.offset - - esc := t.ch - n: int - base, max: u32 - switch esc { - case 'a', 'b', 'e', 'f', 'n', 't', 'v', 'r', '\\', '\'', '"': - advance_rune(t) - return true - - case '0'..='7': - for digit_val(t.ch) < 8 { - advance_rune(t) - } - return true - case 'x': - advance_rune(t) - for digit_val(t.ch) < 16 { - advance_rune(t) - } - return true - case 'u': - advance_rune(t) - n, base, max = 4, 16, utf8.MAX_RUNE - case 'U': - advance_rune(t) - n, base, max = 8, 16, utf8.MAX_RUNE - case: - if t.ch < 0 { - error_offset(t, offset, "escape sequence was not terminated") - } else { - break - } - return false - } - - x: u32 - main_loop: for n > 0 { - d := u32(digit_val(t.ch)) - if d >= base { - if t.ch == '"' || t.ch == '\'' { - break main_loop - } - if t.ch < 0 { - error_offset(t, t.offset, "escape sequence was not terminated") - } else { - error_offset(t, t.offset, "illegal character '%r' : %d in escape sequence", t.ch, t.ch) - } - return false - } - - x = x*base + d - advance_rune(t) - n -= 1 - } - - if x > max || 0xd800 <= x && x <= 0xdfff { - error_offset(t, offset, "escape sequence is an invalid Unicode code point") - return false - } - return true -} - -scan_rune :: proc(t: ^Tokenizer) -> string { - offset := t.offset-1 - valid := true - n := 0 - for { - ch := t.ch - if ch == '\n' || ch < 0 { - if valid { - error_offset(t, offset, "rune literal not terminated") - valid = false - } - break - } - advance_rune(t) - if ch == '\'' { - break - } - n += 1 - if ch == '\\' { - if !scan_escape(t) { - valid = false - } - } - } - - if valid && n != 1 { - error_offset(t, offset, "illegal rune literal") - } - - return string(t.src[offset : t.offset]) -} - -scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Token_Kind, string) { - scan_mantissa :: proc(t: ^Tokenizer, base: int) { - for digit_val(t.ch) < base { - 
advance_rune(t) - } - } - scan_exponent :: proc(t: ^Tokenizer) { - if t.ch == 'e' || t.ch == 'E' || t.ch == 'p' || t.ch == 'P' { - advance_rune(t) - if t.ch == '-' || t.ch == '+' { - advance_rune(t) - } - if digit_val(t.ch) < 10 { - scan_mantissa(t, 10) - } else { - error_offset(t, t.offset, "illegal floating-point exponent") - } - } - } - scan_fraction :: proc(t: ^Tokenizer) -> (early_exit: bool) { - if t.ch == '.' && peek(t) == '.' { - return true - } - if t.ch == '.' { - advance_rune(t) - scan_mantissa(t, 10) - } - return false - } - - check_end := true - - - offset := t.offset - seen_point := seen_decimal_point - - if seen_point { - offset -= 1 - scan_mantissa(t, 10) - scan_exponent(t) - } else { - if t.ch == '0' { - int_base :: proc(t: ^Tokenizer, base: int, msg: string) { - prev := t.offset - advance_rune(t) - scan_mantissa(t, base) - if t.offset - prev <= 1 { - error_offset(t, t.offset, msg) - } - } - - advance_rune(t) - switch t.ch { - case 'b', 'B': - int_base(t, 2, "illegal binary integer") - case 'x', 'X': - int_base(t, 16, "illegal hexadecimal integer") - case: - seen_point = false - scan_mantissa(t, 10) - if t.ch == '.' { - seen_point = true - if scan_fraction(t) { - check_end = false - } - } - if check_end { - scan_exponent(t) - check_end = false - } - } - } - } - - if check_end { - scan_mantissa(t, 10) - - if !scan_fraction(t) { - scan_exponent(t) - } - } - - return .Number, string(t.src[offset : t.offset]) -} - -scan_punct :: proc(t: ^Tokenizer, ch: rune) -> (kind: Token_Kind) { - kind = .Punct - switch ch { - case: - kind = .Invalid - - case '<', '>': - if t.ch == ch { - advance_rune(t) - } - if t.ch == '=' { - advance_rune(t) - } - case '!', '+', '-', '*', '/', '%', '^', '=': - if t.ch == '=' { - advance_rune(t) - } - case '#': - if t.ch == '#' { - advance_rune(t) - } - case '&': - if t.ch == '=' || t.ch == '&' { - advance_rune(t) - } - case '|': - if t.ch == '=' || t.ch == '|' { - advance_rune(t) - } - case '(', ')', '[', ']', '{', '}': - // okay - case '~', ',', ':', ';', '?': - // okay - case '`': - // okay - case '.': - if t.ch == '.' && peek(t) == '.' { - advance_rune(t) - advance_rune(t) // consume last '.' 
- } - } - return -} - -peek :: proc(t: ^Tokenizer) -> byte { - if t.read_offset < len(t.src) { - return t.src[t.read_offset] - } - return 0 -} -peek_str :: proc(t: ^Tokenizer, str: string) -> bool { - if t.read_offset < len(t.src) { - return strings.has_prefix(string(t.src[t.offset:]), str) - } - return false -} - -scan_literal_prefix :: proc(t: ^Tokenizer, str: string, prefix: ^string) -> bool { - if peek_str(t, str) { - offset := t.offset - for _ in str { - advance_rune(t) - } - prefix^ = string(t.src[offset:][:len(str)-1]) - return true - } - return false -} - - -allow_next_to_be_newline :: proc(t: ^Tokenizer) -> bool { - if t.ch == '\n' { - advance_rune(t) - return true - } else if t.ch == '\r' && peek(t) == '\n' { // allow for MS-DOS style line endings - advance_rune(t) // \r - advance_rune(t) // \n - return true - } - return false -} - -scan :: proc(t: ^Tokenizer, f: ^File) -> ^Token { - skip_whitespace(t) - - offset := t.offset - - kind: Token_Kind - lit: string - prefix: string - - switch ch := t.ch; { - case scan_literal_prefix(t, `u8"`, &prefix): - kind = .String - lit = scan_string(t) - case scan_literal_prefix(t, `u"`, &prefix): - kind = .String - lit = scan_string(t) - case scan_literal_prefix(t, `L"`, &prefix): - kind = .String - lit = scan_string(t) - case scan_literal_prefix(t, `U"`, &prefix): - kind = .String - lit = scan_string(t) - case scan_literal_prefix(t, `u'`, &prefix): - kind = .Char - lit = scan_rune(t) - case scan_literal_prefix(t, `L'`, &prefix): - kind = .Char - lit = scan_rune(t) - case scan_literal_prefix(t, `U'`, &prefix): - kind = .Char - lit = scan_rune(t) - - case is_ident0(ch): - lit = scan_identifier(t) - kind = .Ident - case '0' <= ch && ch <= '9': - kind, lit = scan_number(t, false) - case: - advance_rune(t) - switch ch { - case -1: - kind = .EOF - case '\\': - kind = .Punct - if allow_next_to_be_newline(t) { - t.at_bol = true - t.has_space = false - return scan(t, f) - } - - case '.': - if is_digit(t.ch) { - kind, lit = scan_number(t, true) - } else { - kind = scan_punct(t, ch) - } - case '"': - kind = .String - lit = scan_string(t) - case '\'': - kind = .Char - lit = scan_rune(t) - case '/': - if t.ch == '/' || t.ch == '*' { - kind = .Comment - lit = scan_comment(t) - t.has_space = true - break - } - fallthrough - case: - kind = scan_punct(t, ch) - if kind == .Invalid && ch != utf8.RUNE_BOM { - error_offset(t, t.offset, "illegal character '%r': %d", ch, ch) - } - } - } - - if lit == "" { - lit = string(t.src[offset : t.offset]) - } - - if kind == .Comment { - return scan(t, f) - } - - tok := new(Token) - tok.kind = kind - tok.lit = lit - tok.pos = offset_to_pos(t, offset) - tok.file = f - tok.prefix = prefix - tok.at_bol = t.at_bol - tok.has_space = t.has_space - - t.at_bol, t.has_space = false, false - - return tok -} - -tokenize :: proc(t: ^Tokenizer, f: ^File) -> ^Token { - setup_tokenizer: { - t.src = f.src - t.ch = ' ' - t.offset = 0 - t.read_offset = 0 - t.line_offset = 0 - t.line_count = len(t.src) > 0 ? 
1 : 0 - t.error_count = 0 - t.path = f.name - - - advance_rune(t) - if t.ch == utf8.RUNE_BOM { - advance_rune(t) - } - } - - - t.at_bol = true - t.has_space = false - - head: Token - curr := &head - for { - tok := scan(t, f) - if tok == nil { - break - } - curr.next = tok - curr = curr.next - if tok.kind == .EOF { - break - } - } - - return head.next -} - -add_new_file :: proc(t: ^Tokenizer, name: string, src: []byte, id: int) -> ^File { - file := new(File) - file.id = id - file.src = src - file.name = name - file.display_name = name - return file -} - -tokenize_file :: proc(t: ^Tokenizer, path: string, id: int, loc := #caller_location) -> ^Token { - src, ok := os.read_entire_file(path) - if !ok { - return nil - } - return tokenize(t, add_new_file(t, path, src, id)) -} - - -inline_tokenize :: proc(t: ^Tokenizer, tok: ^Token, src: []byte) -> ^Token { - file := new(File) - file.src = src - if tok.file != nil { - file.id = tok.file.id - file.name = tok.file.name - file.display_name = tok.file.name - } - - return tokenize(t, file) -} diff --git a/core/c/frontend/tokenizer/unicode.odin b/core/c/frontend/tokenizer/unicode.odin deleted file mode 100644 index 317ee160e..000000000 --- a/core/c/frontend/tokenizer/unicode.odin +++ /dev/null @@ -1,116 +0,0 @@ -package c_frontend_tokenizer - - -in_range :: proc(range: []rune, c: rune) -> bool #no_bounds_check { - for i := 0; range[i] != -1; i += 2 { - if range[i] <= c && c <= range[i+1] { - return true - } - } - return false -} - - -// [https://www.sigbus.info/n1570#D] C11 allows ASCII and some multibyte characters in certain Unicode ranges to be used in an identifier. -// -// is_ident0 returns true if a given character is acceptable as the first character of an identifier. -is_ident0 :: proc(c: rune) -> bool { - return in_range(_range_ident0, c) -} -// is_ident1 returns true if a given character is acceptable as a non-first character of an identifier. -is_ident1 :: proc(c: rune) -> bool { - return is_ident0(c) || in_range(_range_ident1, c) -} - -// Returns the number of columns needed to display a given character in a fixed-width font. 
-// Based on https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c -char_width :: proc(c: rune) -> int { - switch { - case in_range(_range_width0, c): - return 0 - case in_range(_range_width2, c): - return 2 - } - return 1 -} - -display_width :: proc(str: string) -> (w: int) { - for c in str { - w += char_width(c) - } - return -} - - - -_range_ident0 := []rune{ - '_', '_', 'a', 'z', 'A', 'Z', '$', '$', - 0x00A8, 0x00A8, 0x00AA, 0x00AA, 0x00AD, 0x00AD, 0x00AF, 0x00AF, - 0x00B2, 0x00B5, 0x00B7, 0x00BA, 0x00BC, 0x00BE, 0x00C0, 0x00D6, - 0x00D8, 0x00F6, 0x00F8, 0x00FF, 0x0100, 0x02FF, 0x0370, 0x167F, - 0x1681, 0x180D, 0x180F, 0x1DBF, 0x1E00, 0x1FFF, 0x200B, 0x200D, - 0x202A, 0x202E, 0x203F, 0x2040, 0x2054, 0x2054, 0x2060, 0x206F, - 0x2070, 0x20CF, 0x2100, 0x218F, 0x2460, 0x24FF, 0x2776, 0x2793, - 0x2C00, 0x2DFF, 0x2E80, 0x2FFF, 0x3004, 0x3007, 0x3021, 0x302F, - 0x3031, 0x303F, 0x3040, 0xD7FF, 0xF900, 0xFD3D, 0xFD40, 0xFDCF, - 0xFDF0, 0xFE1F, 0xFE30, 0xFE44, 0xFE47, 0xFFFD, - 0x10000, 0x1FFFD, 0x20000, 0x2FFFD, 0x30000, 0x3FFFD, 0x40000, 0x4FFFD, - 0x50000, 0x5FFFD, 0x60000, 0x6FFFD, 0x70000, 0x7FFFD, 0x80000, 0x8FFFD, - 0x90000, 0x9FFFD, 0xA0000, 0xAFFFD, 0xB0000, 0xBFFFD, 0xC0000, 0xCFFFD, - 0xD0000, 0xDFFFD, 0xE0000, 0xEFFFD, - -1, -} - -_range_ident1 := []rune{ - '0', '9', '$', '$', 0x0300, 0x036F, 0x1DC0, 0x1DFF, 0x20D0, 0x20FF, 0xFE20, 0xFE2F, - -1, -} - - -_range_width0 := []rune{ - 0x0000, 0x001F, 0x007f, 0x00a0, 0x0300, 0x036F, 0x0483, 0x0486, - 0x0488, 0x0489, 0x0591, 0x05BD, 0x05BF, 0x05BF, 0x05C1, 0x05C2, - 0x05C4, 0x05C5, 0x05C7, 0x05C7, 0x0600, 0x0603, 0x0610, 0x0615, - 0x064B, 0x065E, 0x0670, 0x0670, 0x06D6, 0x06E4, 0x06E7, 0x06E8, - 0x06EA, 0x06ED, 0x070F, 0x070F, 0x0711, 0x0711, 0x0730, 0x074A, - 0x07A6, 0x07B0, 0x07EB, 0x07F3, 0x0901, 0x0902, 0x093C, 0x093C, - 0x0941, 0x0948, 0x094D, 0x094D, 0x0951, 0x0954, 0x0962, 0x0963, - 0x0981, 0x0981, 0x09BC, 0x09BC, 0x09C1, 0x09C4, 0x09CD, 0x09CD, - 0x09E2, 0x09E3, 0x0A01, 0x0A02, 0x0A3C, 0x0A3C, 0x0A41, 0x0A42, - 0x0A47, 0x0A48, 0x0A4B, 0x0A4D, 0x0A70, 0x0A71, 0x0A81, 0x0A82, - 0x0ABC, 0x0ABC, 0x0AC1, 0x0AC5, 0x0AC7, 0x0AC8, 0x0ACD, 0x0ACD, - 0x0AE2, 0x0AE3, 0x0B01, 0x0B01, 0x0B3C, 0x0B3C, 0x0B3F, 0x0B3F, - 0x0B41, 0x0B43, 0x0B4D, 0x0B4D, 0x0B56, 0x0B56, 0x0B82, 0x0B82, - 0x0BC0, 0x0BC0, 0x0BCD, 0x0BCD, 0x0C3E, 0x0C40, 0x0C46, 0x0C48, - 0x0C4A, 0x0C4D, 0x0C55, 0x0C56, 0x0CBC, 0x0CBC, 0x0CBF, 0x0CBF, - 0x0CC6, 0x0CC6, 0x0CCC, 0x0CCD, 0x0CE2, 0x0CE3, 0x0D41, 0x0D43, - 0x0D4D, 0x0D4D, 0x0DCA, 0x0DCA, 0x0DD2, 0x0DD4, 0x0DD6, 0x0DD6, - 0x0E31, 0x0E31, 0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB1, 0x0EB1, - 0x0EB4, 0x0EB9, 0x0EBB, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19, - 0x0F35, 0x0F35, 0x0F37, 0x0F37, 0x0F39, 0x0F39, 0x0F71, 0x0F7E, - 0x0F80, 0x0F84, 0x0F86, 0x0F87, 0x0F90, 0x0F97, 0x0F99, 0x0FBC, - 0x0FC6, 0x0FC6, 0x102D, 0x1030, 0x1032, 0x1032, 0x1036, 0x1037, - 0x1039, 0x1039, 0x1058, 0x1059, 0x1160, 0x11FF, 0x135F, 0x135F, - 0x1712, 0x1714, 0x1732, 0x1734, 0x1752, 0x1753, 0x1772, 0x1773, - 0x17B4, 0x17B5, 0x17B7, 0x17BD, 0x17C6, 0x17C6, 0x17C9, 0x17D3, - 0x17DD, 0x17DD, 0x180B, 0x180D, 0x18A9, 0x18A9, 0x1920, 0x1922, - 0x1927, 0x1928, 0x1932, 0x1932, 0x1939, 0x193B, 0x1A17, 0x1A18, - 0x1B00, 0x1B03, 0x1B34, 0x1B34, 0x1B36, 0x1B3A, 0x1B3C, 0x1B3C, - 0x1B42, 0x1B42, 0x1B6B, 0x1B73, 0x1DC0, 0x1DCA, 0x1DFE, 0x1DFF, - 0x200B, 0x200F, 0x202A, 0x202E, 0x2060, 0x2063, 0x206A, 0x206F, - 0x20D0, 0x20EF, 0x302A, 0x302F, 0x3099, 0x309A, 0xA806, 0xA806, - 0xA80B, 0xA80B, 0xA825, 0xA826, 0xFB1E, 0xFB1E, 0xFE00, 0xFE0F, - 0xFE20, 0xFE23, 0xFEFF, 0xFEFF, 0xFFF9, 
0xFFFB, 0x10A01, 0x10A03, - 0x10A05, 0x10A06, 0x10A0C, 0x10A0F, 0x10A38, 0x10A3A, 0x10A3F, 0x10A3F, - 0x1D167, 0x1D169, 0x1D173, 0x1D182, 0x1D185, 0x1D18B, 0x1D1AA, 0x1D1AD, - 0x1D242, 0x1D244, 0xE0001, 0xE0001, 0xE0020, 0xE007F, 0xE0100, 0xE01EF, - -1, -} - -_range_width2 := []rune{ - 0x1100, 0x115F, 0x2329, 0x2329, 0x232A, 0x232A, 0x2E80, 0x303E, - 0x3040, 0xA4CF, 0xAC00, 0xD7A3, 0xF900, 0xFAFF, 0xFE10, 0xFE19, - 0xFE30, 0xFE6F, 0xFF00, 0xFF60, 0xFFE0, 0xFFE6, 0x1F000, 0x1F644, - 0x20000, 0x2FFFD, 0x30000, 0x3FFFD, - -1, -}