From a8c6ea7c8a1bc09e055daf71b49d7cedba23b7c0 Mon Sep 17 00:00:00 2001 From: gingerBill Date: Tue, 17 Mar 2026 09:18:39 +0000 Subject: [PATCH] Implement a new `StringInterner` --- src/common.cpp | 43 +------- src/common_memory.cpp | 125 ++++++++++++++++++++--- src/main.cpp | 4 +- src/parser.cpp | 5 +- src/parser.hpp | 7 +- src/string_interner.cpp | 215 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 338 insertions(+), 61 deletions(-) create mode 100644 src/string_interner.cpp diff --git a/src/common.cpp b/src/common.cpp index d5fc1df4b..86ebb0fa8 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -366,6 +366,7 @@ gb_global bool global_module_path_set = false; #include "string_set.cpp" #include "priority_queue.cpp" #include "thread_pool.cpp" +#include "string_interner.cpp" gb_internal String obfuscate_string(String const &s, char const *prefix) { @@ -388,48 +389,6 @@ gb_internal i32 obfuscate_i32(i32 i) { } - -struct StringIntern { - StringIntern *next; - isize len; - char str[1]; -}; - -PtrMap string_intern_map = {}; // Key: u64 -gb_global Arena string_intern_arena = {}; - -gb_internal char const *string_intern(char const *text, isize len) { - u64 hash = gb_fnv64a(text, len); - uintptr key = cast(uintptr)(hash ? hash : 1); - StringIntern **found = map_get(&string_intern_map, key); - if (found) { - for (StringIntern *it = *found; it != nullptr; it = it->next) { - if (it->len == len && gb_strncmp(it->str, (char *)text, len) == 0) { - return it->str; - } - } - } - - StringIntern *new_intern = cast(StringIntern *)arena_alloc(&string_intern_arena, gb_offset_of(StringIntern, str) + len + 1, gb_align_of(StringIntern)); - new_intern->len = len; - new_intern->next = found ? 
*found : nullptr; - gb_memmove(new_intern->str, text, len); - new_intern->str[len] = 0; - map_set(&string_intern_map, key, new_intern); - return new_intern->str; -} - -gb_internal char const *string_intern(String const &string) { - return string_intern(cast(char const *)string.text, string.len); -} - -gb_internal void init_string_interner(void) { - map_init(&string_intern_map); -} - - - - gb_internal i32 next_pow2(i32 n) { if (n <= 0) { return 0; diff --git a/src/common_memory.cpp b/src/common_memory.cpp index addd43687..ba38720b3 100644 --- a/src/common_memory.cpp +++ b/src/common_memory.cpp @@ -40,6 +40,7 @@ struct MemoryBlock { u8 * base; isize size; isize used; + isize committed; }; struct Arena { @@ -48,13 +49,14 @@ struct Arena { // BlockingMutex mutex; isize temp_count; Thread * parent_thread; + bool custom_arena; }; enum { DEFAULT_MINIMUM_BLOCK_SIZE = 8ll*1024ll*1024ll }; gb_global isize DEFAULT_PAGE_SIZE = 4096; -gb_internal MemoryBlock *virtual_memory_alloc(isize size); +gb_internal MemoryBlock *virtual_memory_alloc(isize size, bool commit); gb_internal void virtual_memory_dealloc(MemoryBlock *block); gb_internal void *arena_alloc(Arena *arena, isize min_size, isize alignment); gb_internal void arena_free_all(Arena *arena); @@ -82,7 +84,7 @@ gb_internal void thread_init_arenas(Thread *t) { gb_internal void *arena_alloc(Arena *arena, isize min_size, isize alignment) { GB_ASSERT(gb_is_power_of_two(alignment)); - GB_ASSERT(arena->parent_thread == get_current_thread()); + GB_ASSERT(arena->custom_arena || arena->parent_thread == get_current_thread()); isize size = 0; if (arena->curr_block != nullptr) { @@ -95,7 +97,7 @@ gb_internal void *arena_alloc(Arena *arena, isize min_size, isize alignment) { isize block_size = gb_max(size, arena->minimum_block_size); - MemoryBlock *new_block = virtual_memory_alloc(block_size); + MemoryBlock *new_block = virtual_memory_alloc(block_size, true); new_block->prev = arena->curr_block; arena->curr_block = new_block; } @@ -113,6 
+115,62 @@ gb_internal void *arena_alloc(Arena *arena, isize min_size, isize alignment) { return ptr; } +gb_internal void *platform_virtual_memory_alloc_internal(isize total_size, bool commit); +gb_internal bool platform_virtual_memory_commit_internal(void *data, isize commit_amount); + +struct StaticArena { + u8 * data; + isize used; + isize committed; + isize reserved; + isize commit_block_size; +}; + +enum {STATIC_ARENA_DEFAULT_COMMIT_BLOCK_SIZE = 8<<20}; + +gb_internal void static_arena_init(StaticArena *arena, isize reserve_size, isize commit_block_size) { + GB_ASSERT(gb_is_power_of_two(reserve_size)); + GB_ASSERT(gb_is_power_of_two(commit_block_size)); + GB_ASSERT(commit_block_size <= reserve_size); + arena->data = cast(u8 *)platform_virtual_memory_alloc_internal(reserve_size, false); + arena->reserved = reserve_size; + arena->commit_block_size = commit_block_size; +} + +gb_internal void static_arena_commit_memory(StaticArena *arena, isize amount) { + isize blocks = (amount + arena->commit_block_size-1)/arena->commit_block_size; + isize total_amount = blocks * arena->commit_block_size; + + if (total_amount > arena->reserved - arena->committed) { + total_amount = arena->reserved - arena->committed; + } + + platform_virtual_memory_commit_internal(arena->data + arena->committed, total_amount); + arena->committed += total_amount; +} + +gb_internal void *static_arena_alloc(StaticArena *arena, isize size, isize alignment) { + GB_ASSERT(gb_is_power_of_two(alignment)); + + size = align_formula_isize(size, alignment); + + u8 *curr = arena->data + arena->used; + curr = cast(u8 *)align_formula_ptr(curr, alignment); + + u8 *end = curr + size; + if (end-arena->data > arena->committed) { + isize needed = (end - arena->data) - arena->committed; + static_arena_commit_memory(arena, needed); + } + GB_ASSERT_MSG(end-arena->data <= arena->committed, "out of memory for the static arena"); + + arena->used = end - arena->data; + + return curr; +} + + + template gb_internal T 
*arena_alloc_item(Arena *arena) { @@ -138,10 +196,13 @@ struct PlatformMemoryBlock { gb_global std::atomic global_platform_memory_total_usage; gb_global PlatformMemoryBlock global_platform_memory_block_sentinel; -gb_internal PlatformMemoryBlock *platform_virtual_memory_alloc(isize total_size); +gb_internal PlatformMemoryBlock *platform_virtual_memory_alloc(isize total_size, bool commit); gb_internal void platform_virtual_memory_free(PlatformMemoryBlock *block); gb_internal void platform_virtual_memory_protect(void *memory, isize size); +gb_internal void *platform_virtual_memory_alloc_internal(isize total_size, bool commit); +gb_internal bool platform_virtual_memory_commit_internal(void *data, isize commit_amount); + #if defined(GB_SYSTEM_WINDOWS) gb_internal void platform_virtual_memory_init(void) { global_platform_memory_block_sentinel.prev = &global_platform_memory_block_sentinel; @@ -153,14 +214,20 @@ gb_internal void platform_virtual_memory_protect(void *memory, isize size); GB_ASSERT(gb_is_power_of_two(DEFAULT_PAGE_SIZE)); } - gb_internal PlatformMemoryBlock *platform_virtual_memory_alloc(isize total_size) { - PlatformMemoryBlock *pmblock = (PlatformMemoryBlock *)VirtualAlloc(0, total_size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); - if (pmblock == nullptr) { + gb_internal void *platform_virtual_memory_alloc_internal(isize total_size, bool commit) { + DWORD flags = commit ? 
MEM_RESERVE|MEM_COMMIT : MEM_RESERVE; + void *mem = VirtualAlloc(0, total_size, flags, PAGE_READWRITE); + if (mem == nullptr) { gb_printf_err("Out of Virtual memory, oh no...\n"); gb_printf_err("Requested: %lld bytes\n", cast(long long)total_size); gb_printf_err("Total Usage: %lld bytes\n", cast(long long)global_platform_memory_total_usage); - GB_ASSERT_MSG(pmblock != nullptr, "Out of Virtual Memory, oh no..."); + GB_ASSERT_MSG(mem != nullptr, "Out of Virtual Memory, oh no..."); } + return mem; + } + + gb_internal PlatformMemoryBlock *platform_virtual_memory_alloc(isize total_size, bool commit) { + PlatformMemoryBlock *pmblock = cast(PlatformMemoryBlock *)platform_virtual_memory_alloc_internal(total_size, commit); global_platform_memory_total_usage.fetch_add(total_size); return pmblock; } @@ -173,6 +240,16 @@ gb_internal void platform_virtual_memory_protect(void *memory, isize size); BOOL is_protected = VirtualProtect(memory, size, PAGE_NOACCESS, &old_protect); GB_ASSERT(is_protected); } + + gb_internal bool platform_virtual_memory_commit_internal(void *data, isize commit_amount) { + void *res = VirtualAlloc(data, commit_amount, MEM_COMMIT, PAGE_READWRITE); + if (res == nullptr) { + GB_PANIC("Out of Virtual Memory, oh no...\n"); + GB_ASSERT_MSG(res != nullptr, "Out of Virtual Memory, oh no..."); + return false; + } + return true; + } #else #if !defined(MAP_ANONYMOUS) && defined(MAP_ANON) #define MAP_ANONYMOUS MAP_ANON @@ -185,9 +262,20 @@ gb_internal void platform_virtual_memory_protect(void *memory, isize size); DEFAULT_PAGE_SIZE = gb_max(DEFAULT_PAGE_SIZE, cast(isize)sysconf(_SC_PAGE_SIZE)); GB_ASSERT(gb_is_power_of_two(DEFAULT_PAGE_SIZE)); } + + gb_internal void *platform_virtual_memory_alloc_internal(isize total_size, bool commit) { + void *mem = mmap(nullptr, total_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (mem == MAP_FAILED) { // mmap signals failure with MAP_FAILED, never nullptr + gb_printf_err("Out of Virtual memory, oh no...\n"); + 
gb_printf_err("Requested: %lld bytes\n", cast(long 
long)total_size); + gb_printf_err("Total Usage: %lld bytes\n", cast(long long)global_platform_memory_total_usage); + GB_ASSERT_MSG(mem != MAP_FAILED, "Out of Virtual Memory, oh no..."); + } + return mem; + } - gb_internal PlatformMemoryBlock *platform_virtual_memory_alloc(isize total_size) { - PlatformMemoryBlock *pmblock = (PlatformMemoryBlock *)mmap(nullptr, total_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + gb_internal PlatformMemoryBlock *platform_virtual_memory_alloc(isize total_size, bool commit) { + PlatformMemoryBlock *pmblock = cast(PlatformMemoryBlock *)platform_virtual_memory_alloc_internal(total_size, commit); if (pmblock == nullptr) { gb_printf_err("Out of Virtual memory, oh no...\n"); gb_printf_err("Requested: %lld bytes\n", cast(long long)total_size); @@ -197,6 +285,9 @@ gb_internal void platform_virtual_memory_protect(void *memory, isize size); global_platform_memory_total_usage.fetch_add(total_size); return pmblock; } + gb_internal PlatformMemoryBlock *platform_virtual_memory_alloc_uncommited(isize total_size) { + return platform_virtual_memory_alloc(total_size, false); + } gb_internal void platform_virtual_memory_free(PlatformMemoryBlock *block) { isize size = block->total_size; global_platform_memory_total_usage.fetch_sub(size); @@ -206,9 +297,19 @@ gb_internal void platform_virtual_memory_protect(void *memory, isize size); int err = mprotect(memory, size, PROT_NONE); GB_ASSERT(err == 0); } + + gb_internal bool platform_virtual_memory_commit_internal(void *data, isize commit_amount) { + int err = mprotect(data, commit_amount, PROT_READ | PROT_WRITE); + if (err != 0) { + GB_PANIC("Out of Virtual Memory, oh no...\n"); + GB_ASSERT_MSG(err == 0, "Out of Virtual Memory, oh no..."); + return false; + } + return true; + } #endif -gb_internal MemoryBlock *virtual_memory_alloc(isize size) { +gb_internal MemoryBlock *virtual_memory_alloc(isize size, bool commit) { isize const page_size = DEFAULT_PAGE_SIZE; isize total_size = size + 
gb_size_of(PlatformMemoryBlock); @@ -224,7 +325,7 @@ gb_internal MemoryBlock *virtual_memory_alloc(isize size) { do_protection = true; } - PlatformMemoryBlock *pmblock = platform_virtual_memory_alloc(total_size); + PlatformMemoryBlock *pmblock = platform_virtual_memory_alloc(total_size, commit); GB_ASSERT_MSG(pmblock != nullptr, "Out of Virtual Memory, oh no..."); pmblock->block.base = cast(u8 *)pmblock + base_offset; diff --git a/src/main.cpp b/src/main.cpp index ee3f42660..1f0df6add 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1086,7 +1086,7 @@ gb_internal bool parse_build_flags(Array args) { break; } - char const *key = string_intern(name); + char const *key = string_intern_cstring(name); if (map_get(&build_context.defined_values, key) != nullptr) { gb_printf_err("Defined constant '%.*s' already exists\n", LIT(name)); @@ -3588,7 +3588,7 @@ int main(int arg_count, char const **arg_ptr) { MAIN_TIME_SECTION("initialization"); - init_string_interner(); + g_string_interner = string_interner_create(); init_global_error_collector(); init_keyword_hash_table(); init_terminal(); diff --git a/src/parser.cpp b/src/parser.cpp index f79287ceb..101167e45 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -773,8 +773,9 @@ gb_internal Ast *ast_matrix_index_expr(AstFile *f, Ast *expr, Token open, Token gb_internal Ast *ast_ident(AstFile *f, Token token) { Ast *result = alloc_ast_node(f, Ast_Ident); - result->Ident.token = token; - result->Ident.hash = string_hash(token.string); + result->Ident.token = token; + result->Ident.hash = string_hash(token.string); + result->Ident.interned = string_interner_insert(token.string); return result; } diff --git a/src/parser.hpp b/src/parser.hpp index c68b3614f..149cf6330 100644 --- a/src/parser.hpp +++ b/src/parser.hpp @@ -424,9 +424,10 @@ struct AstSplitArgs { #define AST_KINDS \ AST_KIND(Ident, "identifier", struct { \ - Token token; \ - std::atomic entity; \ - u32 hash; \ + Token token; \ + std::atomic entity; \ + u32 hash; \ + 
InternedString interned; \ }) \ AST_KIND(Implicit, "implicit", Token) \ AST_KIND(Uninit, "uninitialized value", Token) \ diff --git a/src/string_interner.cpp b/src/string_interner.cpp new file mode 100644 index 000000000..f8496e3b0 --- /dev/null +++ b/src/string_interner.cpp @@ -0,0 +1,215 @@ +#define STRING_INTERNER_CELL_WIDTH 8 +#define STRING_INTERNER_MUTEX_STRIPE_COUNT 1024 +#define STRING_INTERNER_MUTEX_STRIPE_MASK (STRING_INTERNER_MUTEX_STRIPE_COUNT - 1) +#define STRING_INTERNER_THREAD_LOCAL_SIZE (1024 * 1024 * 2) +#define STRING_INTERN_CACHE_LINE (2*GB_CACHE_LINE_SIZE) + +struct InternedString { + u32 value; + bool operator==(InternedString other) const { + return this->value == other.value; + } + + String load() const; + char const *load_cstring() const; +}; +struct alignas(STRING_INTERN_CACHE_LINE) StringInternCell { + std::atomic hashes [STRING_INTERNER_CELL_WIDTH]; + InternedString offsets[STRING_INTERNER_CELL_WIDTH]; + std::atomic next; +}; + +struct alignas(STRING_INTERN_CACHE_LINE) PaddedMutex { + BlockingMutex m; +}; + +struct alignas(STRING_INTERN_CACHE_LINE) PaddedI64 { + std::atomic value; +}; + +struct StringInterner { + StringInternCell *cells; + u64 cell_mask; + PaddedMutex mutexes[STRING_INTERNER_MUTEX_STRIPE_COUNT]; + StaticArena arena; + PaddedMutex arena_mutex; + bool track_count; + PaddedI64 count; +}; + +gb_internal StringInterner *string_interner_create(); +gb_internal InternedString string_interner_insert(String str, u32 hash=0); +gb_internal String string_interner_load(InternedString interned); + +gb_global StringInterner *g_string_interner; + +struct StringInternerThreadLocalArena { + u8 *data; + u64 cursor; +}; +gb_thread_local gb_global StringInternerThreadLocalArena g_interner_arena; + +gb_internal void string_interner_thread_local_arena_init(StringInternerThreadLocalArena *tl_arena); +gb_internal void *string_interner_thread_local_arena_alloc(StringInternerThreadLocalArena *tl_arena, isize size, isize alignment); + +gb_internal 
StringInterner *string_interner_create() { + StaticArena arena = {}; + static_arena_init(&arena, 1<<30, STATIC_ARENA_DEFAULT_COMMIT_BLOCK_SIZE); + + StringInterner *interner = cast(StringInterner *)static_arena_alloc(&arena, gb_size_of(StringInterner), STRING_INTERN_CACHE_LINE); + interner->arena = arena; + u64 cell_size = 1llu << 17llu; + u64 cell_mask = cell_size - 1; + interner->cell_mask = cell_mask; + interner->cells = cast(StringInternCell *)static_arena_alloc(&interner->arena, cell_size * gb_size_of(StringInternCell), STRING_INTERN_CACHE_LINE); + interner->track_count = false; + return interner; +} + +gb_internal String string_interner_load(InternedString interned) { + StringInterner* interner = g_string_interner; + if (interned.value == 0) { + return {}; + } + u8 *base = cast(u8 *)interner + interned.value; + u32 str_len = *cast(u32 *)base; + u8 *text = base + 4; + String str = { text, str_len }; + return str; +} + +gb_internal char const *string_interner_load_cstring(InternedString interned) { + StringInterner* interner = g_string_interner; + if (interned.value == 0) { + return ""; + } + u8 *base = cast(u8 *)interner + interned.value; + // u32 str_len = *cast(u32 *)base; + u8 *text = base + 4; + return cast(char const *)text; +} + +String InternedString::load() const { + return string_interner_load(*this); +} +char const *InternedString::load_cstring() const { + return string_interner_load_cstring(*this); +} + +gb_internal InternedString string_interner_insert(String str, u32 hash) { + StringInterner* interner = g_string_interner; + if (str.len == 0) { + return {}; + } + + if (hash == 0) { hash = string_hash(str); } + // NOTE: 0 marks an empty slot in a cell, so a stored hash must never be 0 + if (hash == 0) { hash = 1; } + + u64 cell_idx = hash & interner->cell_mask; + StringInternCell *cell = &interner->cells[cell_idx]; + while (true) { + StringInternCell *next = cell->next.load(std::memory_order_acquire); + + for (i32 i = 0; i < STRING_INTERNER_CELL_WIDTH; i 
+= 1) { + if (cell->hashes[i].load(std::memory_order_acquire) == hash) { + String to_compare = 
string_interner_load(cell->offsets[i]); + if (to_compare == str) { + return cell->offsets[i]; + } + } + } + if (next == nullptr) { + break; + } + cell = next; + } + + u64 mutex_cell = cell_idx & STRING_INTERNER_MUTEX_STRIPE_MASK; + PaddedMutex* m = &interner->mutexes[mutex_cell]; + MUTEX_GUARD(&m->m); + + StringInternCell *load_cell = nullptr; + while (cell) { + for (i32 i = 0; i < STRING_INTERNER_CELL_WIDTH; i += 1) { + if (cell->hashes[i].load(std::memory_order_relaxed) == hash) { + // string check + String to_compare = string_interner_load(cell->offsets[i]); + if (to_compare == str) { + return cell->offsets[i]; + } + } + } + load_cell = cell; + cell = cell->next.load(std::memory_order_relaxed); + } + + u64 data_to_allocate = 4 + str.len + 1; + u8 *data = cast(u8 *)string_interner_thread_local_arena_alloc(&g_interner_arena, data_to_allocate, 8); + u32 str_len = cast(u32)str.len; + gb_memcopy(data, &str_len, 4); + gb_memcopy(&data[4], str.text, str_len); + data[4+str_len] = 0; + InternedString offset = { cast(u32)(cast(u8 *)data - cast(u8 *)interner) }; + + for (i32 i = 0; i < STRING_INTERNER_CELL_WIDTH; i += 1) { + if (load_cell->hashes[i].load(std::memory_order_relaxed) == 0) { + load_cell->offsets[i] = offset; + load_cell->hashes[i].store(hash, std::memory_order_release); + if (interner->track_count) { + interner->count.value.fetch_add(1, std::memory_order_relaxed); + } + return offset; + } + } + + StringInternCell *new_cell = cast(StringInternCell *)string_interner_thread_local_arena_alloc(&g_interner_arena, gb_size_of(StringInternCell), STRING_INTERN_CACHE_LINE); + new_cell->offsets[0] = offset; + new_cell->hashes[0].store(hash, std::memory_order_relaxed); + load_cell->next.store(new_cell, std::memory_order_release); + + if (interner->track_count) { + interner->count.value.fetch_add(1, std::memory_order_relaxed); + } + + return offset; +} + +gb_internal char const *string_intern_cstring(String str) { + InternedString i = string_interner_insert(str, 0); + 
return string_interner_load_cstring(i); +} + + +gb_internal String string_intern_string(String str) { + InternedString i = string_interner_insert(str, 0); + return string_interner_load(i); +} + + + + +gb_internal void string_interner_thread_local_arena_init(StringInternerThreadLocalArena *tl_arena) { + *tl_arena = { + nullptr, + STRING_INTERNER_THREAD_LOCAL_SIZE, + }; +} + +gb_internal void *string_interner_thread_local_arena_alloc(StringInternerThreadLocalArena *tl_arena, isize size, isize alignment) { + if (tl_arena->data == nullptr) { + tl_arena->cursor = STRING_INTERNER_THREAD_LOCAL_SIZE; + } + isize new_head = align_formula_isize(tl_arena->cursor, alignment); + isize cursor = new_head + size; + if (cursor > STRING_INTERNER_THREAD_LOCAL_SIZE) { + mutex_lock(&g_string_interner->arena_mutex.m); + tl_arena->data = cast(u8 *)static_arena_alloc(&g_string_interner->arena, STRING_INTERNER_THREAD_LOCAL_SIZE, 4096); + tl_arena->cursor = 0; + mutex_unlock(&g_string_interner->arena_mutex.m); + return string_interner_thread_local_arena_alloc(tl_arena, size, alignment); + } + u8 *return_head = tl_arena->data + new_head; + tl_arena->cursor = cursor; + return return_head; +} \ No newline at end of file