From 6ac0fb80a6eab6ba28838e8e577dc8ae8cba06a7 Mon Sep 17 00:00:00 2001 From: gingerBill Date: Wed, 27 May 2020 12:32:11 +0100 Subject: [PATCH] Minor tokenizer performance improvements --- src/common.cpp | 10 ++- src/llvm_backend.cpp | 9 ++- src/main.cpp | 2 +- src/parser.cpp | 13 +++- src/tokenizer.cpp | 168 +++++++++++++++++++++++++++++-------------- 5 files changed, 143 insertions(+), 59 deletions(-) diff --git a/src/common.cpp b/src/common.cpp index a0d69b308..7068eb333 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -158,7 +158,15 @@ GB_ALLOCATOR_PROC(heap_allocator_proc) { #include "range_cache.cpp" - +u32 fnv32a(void const *data, isize len) { + u8 const *bytes = cast(u8 const *)data; + u32 h = 0x811c9dc5; + for (isize i = 0; i < len; i++) { + u32 b = cast(u32)bytes[i]; + h = (h ^ b) * 0x01000193; + } + return h; +} u64 fnv64a(void const *data, isize len) { u8 const *bytes = cast(u8 const *)data; diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp index 8095b06f1..f152c7596 100644 --- a/src/llvm_backend.cpp +++ b/src/llvm_backend.cpp @@ -2034,6 +2034,11 @@ lbProcedure *lb_create_procedure(lbModule *m, Entity *entity) { LLVMSetLinkage(p->value, LLVMDLLExportLinkage); LLVMSetDLLStorageClass(p->value, LLVMDLLExportStorageClass); LLVMSetVisibility(p->value, LLVMDefaultVisibility); + + if (build_context.metrics.os == TargetOs_js) { + LLVMAddTargetDependentFunctionAttr(p->value, "wasm-export-name", alloc_cstring(heap_allocator(), p->name)); + LLVMAddTargetDependentFunctionAttr(p->value, "wasm-exported", nullptr); + } } // NOTE(bill): offset==0 is the return value @@ -12173,7 +12178,9 @@ void lb_generate_code(lbGenerator *gen) { TIME_SECTION("LLVM Object Generation"); - LLVMBool was_an_error = LLVMTargetMachineEmitToFile(target_machine, mod, cast(char *)filepath_obj.text, LLVMObjectFile, &llvm_error); + LLVMCodeGenFileType code_gen_file_type = LLVMObjectFile; + + LLVMBool was_an_error = LLVMTargetMachineEmitToFile(target_machine, mod, cast(char *)filepath_obj.text, code_gen_file_type, &llvm_error); if (was_an_error) { gb_printf_err("LLVM Error: %s\n", llvm_error); gb_exit(1); diff --git a/src/main.cpp b/src/main.cpp index 60ed95ced..0e3d5836d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1513,10 +1513,10 @@ int main(int arg_count, char const **arg_ptr) { init_string_buffer_memory(); init_string_interner(); init_global_error_collector(); + init_keyword_hash_table(); global_big_int_init(); arena_init(&global_ast_arena, heap_allocator()); - array_init(&library_collections, heap_allocator()); // NOTE(bill): 'core' cannot be (re)defined by the user add_library_collection(str_lit("core"), get_fullpath_relative(heap_allocator(), odin_root_dir(), str_lit("core"))); diff --git a/src/parser.cpp b/src/parser.cpp index 3d2041183..5bdadfd9a 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -4284,8 +4284,15 @@ ParseFileError init_ast_file(AstFile *f, String fullpath, TokenPos *err_pos) { } isize file_size = f->tokenizer.end - f->tokenizer.start; - isize init_token_cap = cast(isize)gb_max(next_pow2(cast(i64)(file_size/2ll)), 16); + + // NOTE(bill): Determine allocation size required for tokens + isize token_cap = file_size/3ll; + isize pow2_cap = gb_max(cast(isize)prev_pow2(cast(i64)token_cap)/2, 16); + token_cap = ((token_cap + pow2_cap-1)/pow2_cap) * pow2_cap; + + isize init_token_cap = gb_max(token_cap, 16); array_init(&f->tokens, heap_allocator(), 0, gb_max(init_token_cap, 16)); + isize cap0 = f->tokens.capacity; if (err == TokenizerInit_Empty) { Token token = {Token_EOF}; @@ -4314,8 
+4321,8 @@ ParseFileError init_ast_file(AstFile *f, String fullpath, TokenPos *err_pos) { f->prev_token = f->tokens[f->curr_token_index]; f->curr_token = f->tokens[f->curr_token_index]; - array_init(&f->comments, heap_allocator()); - array_init(&f->imports, heap_allocator()); + array_init(&f->comments, heap_allocator(), 0, 0); + array_init(&f->imports, heap_allocator(), 0, 0); f->curr_proc = nullptr; diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index e07071a27..4e0755742 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -135,6 +135,49 @@ String const token_strings[] = { }; +struct KeywordHashEntry { + u32 hash; + TokenKind kind; +}; + +enum { + KEYWORD_HASH_TABLE_COUNT = 1<<9, + KEYWORD_HASH_TABLE_MASK = KEYWORD_HASH_TABLE_COUNT-1, +}; +gb_global KeywordHashEntry keyword_hash_table[KEYWORD_HASH_TABLE_COUNT] = {}; +GB_STATIC_ASSERT(Token__KeywordEnd-Token__KeywordBegin <= gb_count_of(keyword_hash_table)); + +gb_inline u32 keyword_hash(u8 const *text, isize len) { + return fnv32a(text, len); +} +void add_keyword_hash_entry(String const &s, TokenKind kind) { + u32 hash = keyword_hash(s.text, s.len); + + // NOTE(bill): This is a bit of an empirical hack in order to speed things up + u32 index = hash & KEYWORD_HASH_TABLE_MASK; + KeywordHashEntry *entry = &keyword_hash_table[index]; + GB_ASSERT_MSG(entry->kind == Token_Invalid, "Keyword hash table initialtion collision: %.*s %.*s %08x %08x", LIT(s), LIT(token_strings[entry->kind]), hash, entry->hash); + entry->hash = hash; + entry->kind = kind; +} +void init_keyword_hash_table(void) { + for (i32 kind = Token__KeywordBegin+1; kind < Token__KeywordEnd; kind++) { + add_keyword_hash_entry(token_strings[kind], cast(TokenKind)kind); + } + + static struct { + String s; + TokenKind kind; + } const legacy_keywords[] = { + {str_lit("notin"), Token_not_in}, + }; + + for (i32 i = 0; i < gb_count_of(legacy_keywords); i++) { + add_keyword_hash_entry(legacy_keywords[i].s, legacy_keywords[i].kind); + } +} + + struct TokenPos { String file; isize offset; // starting at 0 @@ -215,7 +258,8 @@ void end_error_block(void) { u8 *text = gb_alloc_array(heap_allocator(), u8, n+1); gb_memmove(text, global_error_collector.error_buffer.data, n); text[n] = 0; - array_add(&global_error_collector.errors, make_string(text, n)); + String s = {text, n}; + array_add(&global_error_collector.errors, s); global_error_collector.error_buffer.count = 0; // gbFile *f = gb_file_get_standard(gbFileStandard_Error); @@ -539,10 +583,11 @@ void advance_to_next_rune(Tokenizer *t) { tokenizer_err(t, "Illegal character NUL"); } else if (rune >= 0x80) { // not ASCII width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune); - if (rune == GB_RUNE_INVALID && width == 1) + if (rune == GB_RUNE_INVALID && width == 1) { tokenizer_err(t, "Illegal UTF-8 encoding"); - else if (rune == GB_RUNE_BOM && t->curr-t->start > 0) + } else if (rune == GB_RUNE_BOM && t->curr-t->start > 0){ tokenizer_err(t, "Illegal byte order mark"); + } } t->read_curr += width; t->curr_rune = rune; @@ -609,21 +654,13 @@ gb_inline void destroy_tokenizer(Tokenizer *t) { array_free(&t->allocated_strings); } -void tokenizer_skip_whitespace(Tokenizer *t) { - while (t->curr_rune == ' ' || - t->curr_rune == '\t' || - t->curr_rune == '\n' || - t->curr_rune == '\r') { - advance_to_next_rune(t); - } -} - gb_inline i32 digit_value(Rune r) { - if (gb_char_is_digit(cast(char)r)) { + switch (r) { + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return r - '0'; - } else if 
(gb_is_between(cast(char)r, 'a', 'f')) { + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': return r - 'a' + 10; - } else if (gb_is_between(cast(char)r, 'A', 'F')) { + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': return r - 'A' + 10; } return 16; // NOTE(bill): Larger than highest possible @@ -645,7 +682,7 @@ u8 peek_byte(Tokenizer *t, isize offset=0) { Token scan_number_to_token(Tokenizer *t, bool seen_decimal_point) { Token token = {}; token.kind = Token_Integer; - token.string = make_string(t->curr, 1); + token.string = {t->curr, 1}; token.pos.file = t->fullpath; token.pos.line = t->line_count; token.pos.column = t->curr-t->line+1; @@ -662,37 +699,43 @@ Token scan_number_to_token(Tokenizer *t, bool seen_decimal_point) { if (t->curr_rune == '0') { u8 *prev = t->curr; advance_to_next_rune(t); - if (t->curr_rune == 'b') { // Binary + switch (t->curr_rune) { + case 'b': // Binary advance_to_next_rune(t); scan_mantissa(t, 2); if (t->curr - prev <= 2) { token.kind = Token_Invalid; } - } else if (t->curr_rune == 'o') { // Octal + goto end; + case 'o': // Octal advance_to_next_rune(t); scan_mantissa(t, 8); if (t->curr - prev <= 2) { token.kind = Token_Invalid; } - } else if (t->curr_rune == 'd') { // Decimal + goto end; + case 'd': // Decimal advance_to_next_rune(t); scan_mantissa(t, 10); if (t->curr - prev <= 2) { token.kind = Token_Invalid; } - } else if (t->curr_rune == 'z') { // Dozenal + goto end; + case 'z': // Dozenal advance_to_next_rune(t); scan_mantissa(t, 12); if (t->curr - prev <= 2) { token.kind = Token_Invalid; } - } else if (t->curr_rune == 'x') { // Hexadecimal + goto end; + case 'x': // Hexadecimal advance_to_next_rune(t); scan_mantissa(t, 16); if (t->curr - prev <= 2) { token.kind = Token_Invalid; } - } else if (t->curr_rune == 'h') { // Hexadecimal Float + goto end; + case 'h': // Hexadecimal Float token.kind = Token_Float; advance_to_next_rune(t); scan_mantissa(t, 16); @@ -716,13 +759,11 @@ Token scan_number_to_token(Tokenizer *t, bool seen_decimal_point) { break; } } - - } else { + goto end; + default: scan_mantissa(t, 10); goto fraction; } - - goto end; } scan_mantissa(t, 10); @@ -762,36 +803,47 @@ end: return token; } + bool scan_escape(Tokenizer *t) { isize len = 0; u32 base = 0, max = 0, x = 0; Rune r = t->curr_rune; - if (r == 'a' || - r == 'b' || - r == 'e' || - r == 'f' || - r == 'n' || - r == 'r' || - r == 't' || - r == 'v' || - r == '\\' || - r == '\'' || - r == '\"') { + switch (r) { + case 'a': + case 'b': + case 'e': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': + case '\\': + case '\'': + case '\"': advance_to_next_rune(t); return true; - } else if (gb_is_between(r, '0', '7')) { + + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': len = 3; base = 8; max = 255; - } else if (r == 'x') { + break; + + case 'x': advance_to_next_rune(t); len = 2; base = 16; max = 255; - } else if (r == 'u') { + break; + + case 'u': advance_to_next_rune(t); len = 4; base = 16; max = GB_RUNE_MAX; - } else if (r == 'U') { + break; + + case 'U': advance_to_next_rune(t); len = 8; base = 16; max = GB_RUNE_MAX; - } else { + break; + + default: if (t->curr_rune < 0) { tokenizer_err(t, "Escape sequence was not terminated"); } else { @@ -871,10 +923,21 @@ gb_inline TokenKind token_kind_dub_eq(Tokenizer *t, Rune sing_rune, TokenKind si Token tokenizer_get_token(Tokenizer *t) { - tokenizer_skip_whitespace(t); + // Skip whitespace + for (;;) { + switch (t->curr_rune) { + case ' ': + case '\t': + case '\n': + case '\r': + 
advance_to_next_rune(t); + continue; + } + break; + } Token token = {}; - token.string = make_string(t->curr, 1); + token.string = {t->curr, 1}; token.pos.file = t->fullpath; token.pos.line = t->line_count; token.pos.offset = t->curr - t->start; @@ -891,16 +954,15 @@ Token tokenizer_get_token(Tokenizer *t) { // NOTE(bill): All keywords are > 1 if (token.string.len > 1) { - for (i32 k = Token__KeywordBegin+1; k < Token__KeywordEnd; k++) { - if (token.string == token_strings[k]) { - token.kind = cast(TokenKind)k; - break; + u32 hash = keyword_hash(token.string.text, token.string.len); + u32 index = hash & KEYWORD_HASH_TABLE_MASK; + KeywordHashEntry *entry = &keyword_hash_table[index]; + if (entry->kind != Token_Invalid) { + String const &entry_text = token_strings[entry->kind]; + if (str_eq(entry_text, token.string)) { + token.kind = entry->kind; } } - - if (token.kind == Token_Ident && token.string == "notin") { - token.kind = Token_not_in; - } } } else if (gb_is_between(curr_rune, '0', '9')) { @@ -1142,7 +1204,7 @@ Token tokenizer_get_token(Tokenizer *t) { case '|': token.kind = token_kind_dub_eq(t, '|', Token_Or, Token_OrEq, Token_CmpOr, Token_CmpOrEq); break; default: - if (curr_rune != GB_RUNE_BOM) { + if (curr_rune != GB_RUNE_BOM) { u8 str[4] = {}; int len = cast(int)gb_utf8_encode_rune(str, curr_rune); tokenizer_err(t, "Illegal character: %.*s (%d) ", len, str, curr_rune);
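
The core of this change is the keyword hash table: instead of comparing every identifier against each keyword string in a linear scan, the tokenizer hashes the identifier with 32-bit FNV-1a (the fnv32a routine added to common.cpp) and probes a fixed 512-entry table (1<<9, masked with KEYWORD_HASH_TABLE_MASK), confirming a hit with a single string comparison. Collisions are ruled out once, at init_keyword_hash_table time, by the assert in add_keyword_hash_entry. Below is a minimal standalone sketch of that scheme under the same constants; the names lookup_keyword, Entry, TABLE_COUNT, and add_keyword are illustrative stand-ins, not the compiler's own identifiers.

// Standalone illustration of the patch's keyword lookup: pre-hash keywords
// with FNV-1a into a fixed power-of-two table, then classify identifiers
// with one hash, one masked index, and at most one string compare.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

enum TokenKind { Token_Invalid, Token_Ident, Token_if, Token_for, Token_return };

static uint32_t fnv32a(void const *data, size_t len) {
    uint8_t const *bytes = (uint8_t const *)data;
    uint32_t h = 0x811c9dc5u;             // FNV-1a 32-bit offset basis
    for (size_t i = 0; i < len; i++) {
        h = (h ^ bytes[i]) * 0x01000193u; // FNV-1a 32-bit prime
    }
    return h;
}

enum { TABLE_COUNT = 1 << 9, TABLE_MASK = TABLE_COUNT - 1 };

struct Entry { uint32_t hash; TokenKind kind; char const *text; };
static Entry table[TABLE_COUNT]; // zero-initialized: every slot starts as Token_Invalid

static void add_keyword(char const *text, TokenKind kind) {
    uint32_t hash = fnv32a(text, strlen(text));
    Entry *e = &table[hash & TABLE_MASK];
    // Mirrors the patch: collisions are a build-time error, detected at init.
    assert(e->kind == Token_Invalid && "keyword hash table collision");
    e->hash = hash;
    e->kind = kind;
    e->text = text;
}

static TokenKind lookup_keyword(char const *text, size_t len) {
    uint32_t hash = fnv32a(text, len);
    Entry *e = &table[hash & TABLE_MASK];
    // A populated slot is only a candidate; confirm with a full string compare.
    if (e->kind != Token_Invalid && strlen(e->text) == len && memcmp(e->text, text, len) == 0) {
        return e->kind;
    }
    return Token_Ident;
}

int main(void) {
    add_keyword("if",     Token_if);
    add_keyword("for",    Token_for);
    add_keyword("return", Token_return);
    printf("%d %d\n", lookup_keyword("return", 6), lookup_keyword("returns", 7)); // keyword vs. plain identifier
    return 0;
}

The design leans on the keyword set being small and fixed: a 512-slot table is comfortably larger than the keyword count, so a collision-free assignment can be asserted once at startup rather than handled at lookup time, and the legacy "notin" spelling is folded into the same table instead of being special-cased in the hot identifier path.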