From df372dbd5be1fd219322f0b0b8f141e036fb5203 Mon Sep 17 00:00:00 2001 From: gingerBill Date: Thu, 19 Aug 2021 15:38:21 +0100 Subject: [PATCH] Migrate and remove more from gb.h --- src/checker.cpp | 30 +- src/common.cpp | 130 +---- src/gb/gb.h | 1056 +--------------------------------- src/llvm_backend.hpp | 6 +- src/llvm_backend_const.cpp | 2 +- src/llvm_backend_general.cpp | 8 +- src/thread_pool.cpp | 26 +- src/threading.cpp | 348 +++++++++++ 8 files changed, 388 insertions(+), 1218 deletions(-) create mode 100644 src/threading.cpp diff --git a/src/checker.cpp b/src/checker.cpp index a207ed78e..4ae5f5b1c 100644 --- a/src/checker.cpp +++ b/src/checker.cpp @@ -4129,7 +4129,7 @@ struct ThreadProcCheckerSection { }; -void check_with_workers(Checker *c, gbThreadProc *proc, isize total_count) { +void check_with_workers(Checker *c, ThreadProc *proc, isize total_count) { isize thread_count = gb_max(build_context.thread_count, 1); isize worker_count = thread_count-1; // NOTE(bill): The main thread will also be used for work if (!build_context.threaded_checker) { @@ -4143,7 +4143,7 @@ void check_with_workers(Checker *c, gbThreadProc *proc, isize total_count) { section_all.checker = c; section_all.offset = 0; section_all.count = total_count; - gbThread dummy_main_thread = {}; + Thread dummy_main_thread = {}; dummy_main_thread.user_data = §ion_all; proc(&dummy_main_thread); return; @@ -4162,27 +4162,27 @@ void check_with_workers(Checker *c, gbThreadProc *proc, isize total_count) { } GB_ASSERT(remaining_count <= 0); - gbThread *threads = gb_alloc_array(permanent_allocator(), gbThread, worker_count); + Thread *threads = gb_alloc_array(permanent_allocator(), Thread, worker_count); for (isize i = 0; i < worker_count; i++) { - gb_thread_init(threads+i); + thread_init(threads+i); } for (isize i = 0; i < worker_count; i++) { - gb_thread_start(threads+i, proc, thread_data+i); + thread_start(threads+i, proc, thread_data+i); } - gbThread dummy_main_thread = {}; + Thread dummy_main_thread = {}; dummy_main_thread.user_data = thread_data+worker_count; proc(&dummy_main_thread); semaphore_wait(&c->info.collect_semaphore); for (isize i = 0; i < worker_count; i++) { - gb_thread_destroy(threads+i); + thread_destroy(threads+i); } } -GB_THREAD_PROC(thread_proc_collect_entities) { +THREAD_PROC(thread_proc_collect_entities) { auto *data = cast(ThreadProcCheckerSection *)thread->user_data; Checker *c = data->checker; CheckerContext collect_entity_ctx = make_checker_context(c); @@ -4231,7 +4231,7 @@ void check_export_entities_in_pkg(CheckerContext *ctx, AstPackage *pkg, UntypedE } } -GB_THREAD_PROC(thread_proc_check_export_entities) { +THREAD_PROC(thread_proc_check_export_entities) { auto data = cast(ThreadProcCheckerSection *)thread->user_data; Checker *c = data->checker; @@ -4720,7 +4720,7 @@ struct ThreadProcBodyData { ThreadProcBodyData *all_data; }; -GB_THREAD_PROC(thread_proc_body) { +THREAD_PROC(thread_proc_body) { ThreadProcBodyData *data = cast(ThreadProcBodyData *)thread->user_data; Checker *c = data->checker; GB_ASSERT(c != nullptr); @@ -4797,22 +4797,22 @@ void check_procedure_bodies(Checker *c) { semaphore_post(&c->procs_to_check_semaphore, cast(i32)thread_count); - gbThread *threads = gb_alloc_array(permanent_allocator(), gbThread, worker_count); + Thread *threads = gb_alloc_array(permanent_allocator(), Thread, worker_count); for (isize i = 0; i < worker_count; i++) { - gb_thread_init(threads+i); + thread_init(threads+i); } for (isize i = 0; i < worker_count; i++) { - gb_thread_start(threads+i, 
thread_proc_body, thread_data+i); + thread_start(threads+i, thread_proc_body, thread_data+i); } - gbThread dummy_main_thread = {}; + Thread dummy_main_thread = {}; dummy_main_thread.user_data = thread_data+worker_count; thread_proc_body(&dummy_main_thread); semaphore_wait(&c->procs_to_check_semaphore); for (isize i = 0; i < worker_count; i++) { - gb_thread_destroy(threads+i); + thread_destroy(threads+i); } isize global_remaining = c->procs_to_check_queue.count.load(std::memory_order_relaxed); diff --git a/src/common.cpp b/src/common.cpp index fd6268a82..ab6dbabb1 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -29,135 +29,9 @@ #include #include // Because I wanted the C++11 memory order semantics, of which gb.h does not offer (because it was a C89 library) +gbAllocator heap_allocator(void); -#if defined(GB_SYSTEM_WINDOWS) - struct BlockingMutex { - SRWLOCK srwlock; - }; - void mutex_init(BlockingMutex *m) { - } - void mutex_destroy(BlockingMutex *m) { - } - void mutex_lock(BlockingMutex *m) { - AcquireSRWLockExclusive(&m->srwlock); - } - bool mutex_try_lock(BlockingMutex *m) { - return !!TryAcquireSRWLockExclusive(&m->srwlock); - } - void mutex_unlock(BlockingMutex *m) { - ReleaseSRWLockExclusive(&m->srwlock); - } - - struct RecursiveMutex { - CRITICAL_SECTION win32_critical_section; - }; - void mutex_init(RecursiveMutex *m) { - InitializeCriticalSection(&m->win32_critical_section); - } - void mutex_destroy(RecursiveMutex *m) { - DeleteCriticalSection(&m->win32_critical_section); - } - void mutex_lock(RecursiveMutex *m) { - EnterCriticalSection(&m->win32_critical_section); - } - bool mutex_try_lock(RecursiveMutex *m) { - return TryEnterCriticalSection(&m->win32_critical_section) != 0; - } - void mutex_unlock(RecursiveMutex *m) { - LeaveCriticalSection(&m->win32_critical_section); - } - - struct Semaphore { - void *win32_handle; - }; - - gb_inline void semaphore_init(Semaphore *s) { - s->win32_handle = CreateSemaphoreA(NULL, 0, I32_MAX, NULL); - } - gb_inline void semaphore_destroy(Semaphore *s) { - CloseHandle(s->win32_handle); - } - gb_inline void semaphore_post(Semaphore *s, i32 count) { - ReleaseSemaphore(s->win32_handle, count, NULL); - } - gb_inline void semaphore_wait(Semaphore *s) { - WaitForSingleObjectEx(s->win32_handle, INFINITE, FALSE); - } - - gb_inline void semaphore_release(Semaphore *s) { - semaphore_post(s, 1); - } - -#else - struct BlockingMutex { - pthread_mutex_t pthread_mutex; - }; - void mutex_init(BlockingMutex *m) { - pthread_mutex_init(&m->pthread_mutex, nullptr); - } - void mutex_destroy(BlockingMutex *m) { - pthread_mutex_destroy(&m->pthread_mutex); - } - void mutex_lock(BlockingMutex *m) { - pthread_mutex_lock(&m->pthread_mutex); - } - bool mutex_try_lock(BlockingMutex *m) { - return pthread_mutex_trylock(&m->pthread_mutex) == 0; - } - void mutex_unlock(BlockingMutex *m) { - pthread_mutex_unlock(&m->pthread_mutex); - } - - struct RecursiveMutex { - pthread_mutex_t pthread_mutex; - pthread_mutexattr_t pthread_mutexattr; - }; - void mutex_init(RecursiveMutex *m) { - pthread_mutexattr_init(&m->pthread_mutexattr); - pthread_mutexattr_settype(&m->pthread_mutexattr, PTHREAD_MUTEX_RECURSIVE); - pthread_mutex_init(&m->pthread_mutex, &m->pthread_mutexattr); - } - void mutex_destroy(RecursiveMutex *m) { - pthread_mutex_destroy(&m->pthread_mutex); - } - void mutex_lock(RecursiveMutex *m) { - pthread_mutex_lock(&m->pthread_mutex); - } - bool mutex_try_lock(RecursiveMutex *m) { - return pthread_mutex_trylock(&m->pthread_mutex) == 0; - } - void mutex_unlock(RecursiveMutex 
*m) { - pthread_mutex_unlock(&m->pthread_mutex); - } - - #if defined(GB_SYSTEM_OSX) - struct Semaphore { - semaphore_t osx_handle; - }; - - gb_inline void semaphore_init (Semaphore *s) { semaphore_create(mach_task_self(), &s->osx_handle, SYNC_POLICY_FIFO, 0); } - gb_inline void semaphore_destroy(Semaphore *s) { semaphore_destroy(mach_task_self(), s->osx_handle); } - gb_inline void semaphore_post (Semaphore *s, i32 count) { while (count --> 0) semaphore_signal(s->osx_handle); } - gb_inline void semaphore_wait (Semaphore *s) { semaphore_wait(s->osx_handle); } - #elif defined(GB_SYSTEM_UNIX) - struct Semaphore { - sem_t unix_handle; - }; - - gb_inline void semaphore_init (Semaphore *s) { sem_init(&s->unix_handle, 0, 0); } - gb_inline void semaphore_destroy(Semaphore *s) { sem_destroy(&s->unix_handle); } - gb_inline void semaphore_post (Semaphore *s, i32 count) { while (count --> 0) sem_post(&s->unix_handle); } - gb_inline void semaphore_wait (Semaphore *s) { int i; do { i = sem_wait(&s->unix_handle); } while (i == -1 && errno == EINTR); } - #else - #error - #endif - - gb_inline void semaphore_release(Semaphore *s) { - semaphore_post(s, 1); - } -#endif - - +#include "threading.cpp" gb_inline void zero_size(void *ptr, isize len) { memset(ptr, 0, len); diff --git a/src/gb/gb.h b/src/gb/gb.h index a67e0a076..1ffaa81e1 100644 --- a/src/gb/gb.h +++ b/src/gb/gb.h @@ -747,171 +747,8 @@ GB_DEF void const *gb_memrchr (void const *data, u8 byte_value, isize size); - -// Atomics - -// TODO(bill): Be specific with memory order? -// e.g. relaxed, acquire, release, acquire_release - -#if defined(GB_COMPILER_MSVC) -typedef struct gbAtomic32 { i32 volatile value; } gbAtomic32; -typedef struct gbAtomic64 { i64 volatile value; } gbAtomic64; -typedef struct gbAtomicPtr { void *volatile value; } gbAtomicPtr; -#else - #if defined(GB_ARCH_32_BIT) - #define GB_ATOMIC_PTR_ALIGNMENT 4 - #elif defined(GB_ARCH_64_BIT) - #define GB_ATOMIC_PTR_ALIGNMENT 8 - #else - #error Unknown architecture - #endif - -typedef struct gbAtomic32 { i32 volatile value; } __attribute__ ((aligned(4))) gbAtomic32; -typedef struct gbAtomic64 { i64 volatile value; } __attribute__ ((aligned(8))) gbAtomic64; -typedef struct gbAtomicPtr { void *volatile value; } __attribute__ ((aligned(GB_ATOMIC_PTR_ALIGNMENT))) gbAtomicPtr; -#endif - -GB_DEF i32 gb_atomic32_load (gbAtomic32 const volatile *a); -GB_DEF void gb_atomic32_store (gbAtomic32 volatile *a, i32 value); -GB_DEF i32 gb_atomic32_compare_exchange(gbAtomic32 volatile *a, i32 expected, i32 desired); -GB_DEF i32 gb_atomic32_exchanged (gbAtomic32 volatile *a, i32 desired); -GB_DEF i32 gb_atomic32_fetch_add (gbAtomic32 volatile *a, i32 operand); -GB_DEF i32 gb_atomic32_fetch_and (gbAtomic32 volatile *a, i32 operand); -GB_DEF i32 gb_atomic32_fetch_or (gbAtomic32 volatile *a, i32 operand); -GB_DEF b32 gb_atomic32_spin_lock (gbAtomic32 volatile *a, isize time_out); // NOTE(bill): time_out = -1 as default -GB_DEF void gb_atomic32_spin_unlock (gbAtomic32 volatile *a); -GB_DEF b32 gb_atomic32_try_acquire_lock(gbAtomic32 volatile *a); - - -GB_DEF i64 gb_atomic64_load (gbAtomic64 const volatile *a); -GB_DEF void gb_atomic64_store (gbAtomic64 volatile *a, i64 value); -GB_DEF i64 gb_atomic64_compare_exchange(gbAtomic64 volatile *a, i64 expected, i64 desired); -GB_DEF i64 gb_atomic64_exchanged (gbAtomic64 volatile *a, i64 desired); -GB_DEF i64 gb_atomic64_fetch_add (gbAtomic64 volatile *a, i64 operand); -GB_DEF i64 gb_atomic64_fetch_and (gbAtomic64 volatile *a, i64 operand); -GB_DEF i64 gb_atomic64_fetch_or 
(gbAtomic64 volatile *a, i64 operand); -GB_DEF b32 gb_atomic64_spin_lock (gbAtomic64 volatile *a, isize time_out); // NOTE(bill): time_out = -1 as default -GB_DEF void gb_atomic64_spin_unlock (gbAtomic64 volatile *a); -GB_DEF b32 gb_atomic64_try_acquire_lock(gbAtomic64 volatile *a); - - -GB_DEF void *gb_atomic_ptr_load (gbAtomicPtr const volatile *a); -GB_DEF void gb_atomic_ptr_store (gbAtomicPtr volatile *a, void *value); -GB_DEF void *gb_atomic_ptr_compare_exchange(gbAtomicPtr volatile *a, void *expected, void *desired); -GB_DEF void *gb_atomic_ptr_exchanged (gbAtomicPtr volatile *a, void *desired); -GB_DEF void *gb_atomic_ptr_fetch_add (gbAtomicPtr volatile *a, void *operand); -GB_DEF void *gb_atomic_ptr_fetch_and (gbAtomicPtr volatile *a, void *operand); -GB_DEF void *gb_atomic_ptr_fetch_or (gbAtomicPtr volatile *a, void *operand); -GB_DEF b32 gb_atomic_ptr_spin_lock (gbAtomicPtr volatile *a, isize time_out); // NOTE(bill): time_out = -1 as default -GB_DEF void gb_atomic_ptr_spin_unlock (gbAtomicPtr volatile *a); -GB_DEF b32 gb_atomic_ptr_try_acquire_lock(gbAtomicPtr volatile *a); - - // Fences -GB_DEF void gb_yield_thread(void); -GB_DEF void gb_mfence (void); -GB_DEF void gb_sfence (void); -GB_DEF void gb_lfence (void); - - -#if defined(GB_SYSTEM_WINDOWS) -typedef struct gbSemaphore { void *win32_handle;} gbSemaphore; -#elif defined(GB_SYSTEM_OSX) -typedef struct gbSemaphore { semaphore_t osx_handle; } gbSemaphore; -#elif defined(GB_SYSTEM_UNIX) -typedef struct gbSemaphore { sem_t unix_handle; } gbSemaphore; -#else -#error -#endif - -GB_DEF void gb_semaphore_init (gbSemaphore *s); -GB_DEF void gb_semaphore_destroy(gbSemaphore *s); -GB_DEF void gb_semaphore_post (gbSemaphore *s, i32 count); -GB_DEF void gb_semaphore_release(gbSemaphore *s); -GB_DEF void gb_semaphore_wait (gbSemaphore *s); - - -// Mutex -typedef struct gbMutex { -#if defined(GB_SYSTEM_WINDOWS) - CRITICAL_SECTION win32_critical_section; -#else - pthread_mutex_t pthread_mutex; - pthread_mutexattr_t pthread_mutexattr; -#endif -} gbMutex; - -GB_DEF void gb_mutex_init (gbMutex *m); -GB_DEF void gb_mutex_destroy (gbMutex *m); -GB_DEF void gb_mutex_lock (gbMutex *m); -GB_DEF b32 gb_mutex_try_lock(gbMutex *m); -GB_DEF void gb_mutex_unlock (gbMutex *m); - -// NOTE(bill): If you wanted a Scoped Mutex in C++, why not use the defer() construct? -// No need for a silly wrapper class and it's clear! -#if 0 -gbMutex m = {0}; -gb_mutex_init(&m); -{ - gb_mutex_lock(&m); - defer (gb_mutex_unlock(&m)); - - // Do whatever as the mutex is now scoped based! 
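
The defer() pattern in the removed note above carries over unchanged to the BlockingMutex API this patch introduces in src/threading.cpp. A minimal sketch, assuming the codebase's defer() construct (the one the note refers to) is in scope:

	BlockingMutex m = {};
	mutex_init(&m);
	{
		mutex_lock(&m);
		defer (mutex_unlock(&m));
		// Work under the lock; the unlock runs automatically on scope exit.
	}
	mutex_destroy(&m);
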
-} -#endif - - - -#define GB_THREAD_PROC(name) isize name(struct gbThread *thread) -typedef GB_THREAD_PROC(gbThreadProc); - -typedef struct gbThread { -#if defined(GB_SYSTEM_WINDOWS) - void * win32_handle; -#else - pthread_t posix_handle; -#endif - - gbThreadProc * proc; - void * user_data; - isize user_index; - isize volatile return_value; - - gbSemaphore semaphore; - isize stack_size; - b32 volatile is_running; -} gbThread; - -GB_DEF void gb_thread_init (gbThread *t); -GB_DEF void gb_thread_destroy (gbThread *t); -GB_DEF void gb_thread_start (gbThread *t, gbThreadProc *proc, void *data); -GB_DEF void gb_thread_start_with_stack(gbThread *t, gbThreadProc *proc, void *data, isize stack_size); -GB_DEF void gb_thread_join (gbThread *t); -GB_DEF b32 gb_thread_is_running (gbThread const *t); -GB_DEF u32 gb_thread_current_id (void); -GB_DEF void gb_thread_set_name (gbThread *t, char const *name); - - -// NOTE(bill): Thread Merge Operation -// Based on Sean Barrett's stb_sync -typedef struct gbSync { - i32 target; // Target Number of threads - i32 current; // Threads to hit - i32 waiting; // Threads waiting - - gbMutex start; - gbMutex mutex; - gbSemaphore release; -} gbSync; - -GB_DEF void gb_sync_init (gbSync *s); -GB_DEF void gb_sync_destroy (gbSync *s); -GB_DEF void gb_sync_set_target (gbSync *s, i32 count); -GB_DEF void gb_sync_release (gbSync *s); -GB_DEF i32 gb_sync_reach (gbSync *s); -GB_DEF void gb_sync_reach_and_wait(gbSync *s); - - +GB_DEF u32 gb_thread_current_id(void); #if defined(GB_SYSTEM_WINDOWS) @@ -2049,8 +1886,7 @@ GB_DEF f64 gb_random_range_f64 (gbRandom *r, f64 lower_inc, f64 higher_inc -GB_DEF void gb_exit (u32 code); -GB_DEF void gb_yield (void); +GB_DEF void gb_exit(u32 code); GB_DEF char const *gb_get_env (char const *name, gbAllocator allocator); GB_DEF void gb_set_env (char const *name, char const *value); GB_DEF void gb_unset_env(char const *name); @@ -3628,772 +3464,6 @@ gb_inline void *gb_default_resize_align(gbAllocator a, void *old_memory, isize o // Concurrency // // -// IMPORTANT TODO(bill): Use compiler intrinsics for the atomics - -#if defined(GB_COMPILER_MSVC) && !defined(GB_COMPILER_CLANG) -gb_inline i32 gb_atomic32_load (gbAtomic32 const volatile *a) { return a->value; } -gb_inline void gb_atomic32_store(gbAtomic32 volatile *a, i32 value) { a->value = value; } - -gb_inline i32 gb_atomic32_compare_exchange(gbAtomic32 volatile *a, i32 expected, i32 desired) { - return _InterlockedCompareExchange(cast(long volatile *)a, desired, expected); -} -gb_inline i32 gb_atomic32_exchanged(gbAtomic32 volatile *a, i32 desired) { - return _InterlockedExchange(cast(long volatile *)a, desired); -} -gb_inline i32 gb_atomic32_fetch_add(gbAtomic32 volatile *a, i32 operand) { - return _InterlockedExchangeAdd(cast(long volatile *)a, operand); -} -gb_inline i32 gb_atomic32_fetch_and(gbAtomic32 volatile *a, i32 operand) { - return _InterlockedAnd(cast(long volatile *)a, operand); -} -gb_inline i32 gb_atomic32_fetch_or(gbAtomic32 volatile *a, i32 operand) { - return _InterlockedOr(cast(long volatile *)a, operand); -} - -gb_inline i64 gb_atomic64_load(gbAtomic64 const volatile *a) { -#if defined(GB_ARCH_64_BIT) - return a->value; -#elif GB_CPU_X86 - // NOTE(bill): The most compatible way to get an atomic 64-bit load on x86 is with cmpxchg8b - i64 result; - __asm { - mov esi, a; - mov ebx, eax; - mov ecx, edx; - lock cmpxchg8b [esi]; - mov dword ptr result, eax; - mov dword ptr result[4], edx; - } - return result; -#else -#error TODO(bill): atomics for this CPU -#endif -} - -gb_inline 
void gb_atomic64_store(gbAtomic64 volatile *a, i64 value) { -#if defined(GB_ARCH_64_BIT) - a->value = value; -#elif GB_CPU_X86 - // NOTE(bill): The most compatible way to get an atomic 64-bit store on x86 is with cmpxchg8b - __asm { - mov esi, a; - mov ebx, dword ptr value; - mov ecx, dword ptr value[4]; - retry: - cmpxchg8b [esi]; - jne retry; - } -#else -#error TODO(bill): atomics for this CPU -#endif -} - -gb_inline i64 gb_atomic64_compare_exchange(gbAtomic64 volatile *a, i64 expected, i64 desired) { - return _InterlockedCompareExchange64(cast(i64 volatile *)a, desired, expected); -} - -gb_inline i64 gb_atomic64_exchanged(gbAtomic64 volatile *a, i64 desired) { -#if defined(GB_ARCH_64_BIT) - return _InterlockedExchange64(cast(i64 volatile *)a, desired); -#elif GB_CPU_X86 - i64 expected = a->value; - for (;;) { - i64 original = _InterlockedCompareExchange64(cast(i64 volatile *)a, desired, expected); - if (original == expected) - return original; - expected = original; - } -#else -#error TODO(bill): atomics for this CPU -#endif -} - -gb_inline i64 gb_atomic64_fetch_add(gbAtomic64 volatile *a, i64 operand) { -#if defined(GB_ARCH_64_BIT) - return _InterlockedExchangeAdd64(cast(i64 volatile *)a, operand); -#elif GB_CPU_X86 - i64 expected = a->value; - for (;;) { - i64 original = _InterlockedCompareExchange64(cast(i64 volatile *)a, expected + operand, expected); - if (original == expected) - return original; - expected = original; - } -#else -#error TODO(bill): atomics for this CPU -#endif -} - -gb_inline i64 gb_atomic64_fetch_and(gbAtomic64 volatile *a, i64 operand) { -#if defined(GB_ARCH_64_BIT) - return _InterlockedAnd64(cast(i64 volatile *)a, operand); -#elif GB_CPU_X86 - i64 expected = a->value; - for (;;) { - i64 original = _InterlockedCompareExchange64(cast(i64 volatile *)a, expected & operand, expected); - if (original == expected) - return original; - expected = original; - } -#else -#error TODO(bill): atomics for this CPU -#endif -} - -gb_inline i64 gb_atomic64_fetch_or(gbAtomic64 volatile *a, i64 operand) { -#if defined(GB_ARCH_64_BIT) - return _InterlockedOr64(cast(i64 volatile *)a, operand); -#elif GB_CPU_X86 - i64 expected = a->value; - for (;;) { - i64 original = _InterlockedCompareExchange64(cast(i64 volatile *)a, expected | operand, expected); - if (original == expected) - return original; - expected = original; - } -#else -#error TODO(bill): atomics for this CPU -#endif -} - - - -#elif defined(GB_CPU_X86) - -gb_inline i32 gb_atomic32_load (gbAtomic32 const volatile *a) { return a->value; } -gb_inline void gb_atomic32_store(gbAtomic32 volatile *a, i32 value) { a->value = value; } - -gb_inline i32 gb_atomic32_compare_exchange(gbAtomic32 volatile *a, i32 expected, i32 desired) { - i32 original; - __asm__ volatile( - "lock; cmpxchgl %2, %1" - : "=a"(original), "+m"(a->value) - : "q"(desired), "0"(expected) - ); - return original; -} - -gb_inline i32 gb_atomic32_exchanged(gbAtomic32 volatile *a, i32 desired) { - // NOTE(bill): No lock prefix is necessary for xchgl - i32 original; - __asm__ volatile( - "xchgl %0, %1" - : "=r"(original), "+m"(a->value) - : "0"(desired) - ); - return original; -} - -gb_inline i32 gb_atomic32_fetch_add(gbAtomic32 volatile *a, i32 operand) { - i32 original; - __asm__ volatile( - "lock; xaddl %0, %1" - : "=r"(original), "+m"(a->value) - : "0"(operand) - ); - return original; -} - -gb_inline i32 gb_atomic32_fetch_and(gbAtomic32 volatile *a, i32 operand) { - i32 original; - i32 tmp; - __asm__ volatile( - "1: movl %1, %0\n" - " movl %0, %2\n" - " andl %3, 
%2\n" - " lock; cmpxchgl %2, %1\n" - " jne 1b" - : "=&a"(original), "+m"(a->value), "=&r"(tmp) - : "r"(operand) - ); - return original; -} - -gb_inline i32 gb_atomic32_fetch_or(gbAtomic32 volatile *a, i32 operand) { - i32 original; - i32 temp; - __asm__ volatile( - "1: movl %1, %0\n" - " movl %0, %2\n" - " orl %3, %2\n" - " lock; cmpxchgl %2, %1\n" - " jne 1b" - : "=&a"(original), "+m"(a->value), "=&r"(temp) - : "r"(operand) - ); - return original; -} - - -gb_inline i64 gb_atomic64_load(gbAtomic64 const volatile *a) { -#if defined(GB_ARCH_64_BIT) - return a->value; -#else - i64 original; - __asm__ volatile( - "movl %%ebx, %%eax\n" - "movl %%ecx, %%edx\n" - "lock; cmpxchg8b %1" - : "=&A"(original) - : "m"(a->value) - ); - return original; -#endif -} - -gb_inline void gb_atomic64_store(gbAtomic64 volatile *a, i64 value) { -#if defined(GB_ARCH_64_BIT) - a->value = value; -#else - i64 expected = a->value; - __asm__ volatile( - "1: cmpxchg8b %0\n" - " jne 1b" - : "=m"(a->value) - : "b"((i32)value), "c"((i32)(value >> 32)), "A"(expected) - ); -#endif -} - -gb_inline i64 gb_atomic64_compare_exchange(gbAtomic64 volatile *a, i64 expected, i64 desired) { -#if defined(GB_ARCH_64_BIT) - i64 original; - __asm__ volatile( - "lock; cmpxchgq %2, %1" - : "=a"(original), "+m"(a->value) - : "q"(desired), "0"(expected) - ); - return original; -#else - i64 original; - __asm__ volatile( - "lock; cmpxchg8b %1" - : "=A"(original), "+m"(a->value) - : "b"((i32)desired), "c"((i32)(desired >> 32)), "0"(expected) - ); - return original; -#endif -} - -gb_inline i64 gb_atomic64_exchanged(gbAtomic64 volatile *a, i64 desired) { -#if defined(GB_ARCH_64_BIT) - i64 original; - __asm__ volatile( - "xchgq %0, %1" - : "=r"(original), "+m"(a->value) - : "0"(desired) - ); - return original; -#else - i64 original = a->value; - for (;;) { - i64 previous = gb_atomic64_compare_exchange(a, original, desired); - if (original == previous) - return original; - original = previous; - } -#endif -} - -gb_inline i64 gb_atomic64_fetch_add(gbAtomic64 volatile *a, i64 operand) { -#if defined(GB_ARCH_64_BIT) - i64 original; - __asm__ volatile( - "lock; xaddq %0, %1" - : "=r"(original), "+m"(a->value) - : "0"(operand) - ); - return original; -#else - for (;;) { - i64 original = a->value; - if (gb_atomic64_compare_exchange(a, original, original + operand) == original) - return original; - } -#endif -} - -gb_inline i64 gb_atomic64_fetch_and(gbAtomic64 volatile *a, i64 operand) { -#if defined(GB_ARCH_64_BIT) - i64 original; - i64 tmp; - __asm__ volatile( - "1: movq %1, %0\n" - " movq %0, %2\n" - " andq %3, %2\n" - " lock; cmpxchgq %2, %1\n" - " jne 1b" - : "=&a"(original), "+m"(a->value), "=&r"(tmp) - : "r"(operand) - ); - return original; -#else - for (;;) { - i64 original = a->value; - if (gb_atomic64_compare_exchange(a, original, original & operand) == original) - return original; - } -#endif -} - -gb_inline i64 gb_atomic64_fetch_or(gbAtomic64 volatile *a, i64 operand) { -#if defined(GB_ARCH_64_BIT) - i64 original; - i64 temp; - __asm__ volatile( - "1: movq %1, %0\n" - " movq %0, %2\n" - " orq %3, %2\n" - " lock; cmpxchgq %2, %1\n" - " jne 1b" - : "=&a"(original), "+m"(a->value), "=&r"(temp) - : "r"(operand) - ); - return original; -#else - for (;;) { - i64 original = a->value; - if (gb_atomic64_compare_exchange(a, original, original | operand) == original) - return original; - } -#endif -} - -#elif defined(GB_CPU_ARM) - -gb_inline i32 gb_atomic32_load (gbAtomic32 const volatile *a) { - return __atomic_load_n(&a->value, __ATOMIC_SEQ_CST); -} 
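
A note on the removed implementations above: wherever the target lacks a single read-modify-write instruction, gb.h falls back to a compare-exchange retry loop (the cmpxchg8b paths and the 32-bit fetch_and/fetch_or fallbacks). The portable C++11 equivalent of that loop, written against the std::atomic API this patch migrates to, looks roughly like this. A sketch only, using gb.h's i64 typedef; atomic_fetch_and_i64 is an illustrative name, not code from this patch:

	#include <atomic>

	// CAS retry loop: read the current value, compute the desired result,
	// and retry until no other thread has modified the value in between.
	// On failure, compare_exchange_weak reloads 'expected' with the value
	// it actually observed, so each iteration re-reads for free.
	static i64 atomic_fetch_and_i64(std::atomic<i64> *a, i64 operand) {
		i64 expected = a->load(std::memory_order_relaxed);
		while (!a->compare_exchange_weak(expected, expected & operand)) {
			// 'expected' now holds the competing thread's value; try again.
		}
		return expected; // value before the AND, matching the removed API
	}

In practice std::atomic supplies fetch_add/fetch_and/fetch_or directly, which is why the converted call sites later in this patch collapse to plain fetch_add(1) calls.
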
-gb_inline void gb_atomic32_store(gbAtomic32 volatile *a, i32 value) { - __atomic_store_n(&a->value, value, __ATOMIC_SEQ_CST); -} - -gb_inline i32 gb_atomic32_compare_exchange(gbAtomic32 volatile *a, i32 expected, i32 desired) { - i32 expected_copy = expected; - auto result = __atomic_compare_exchange_n(&a->value, &expected_copy, desired, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); - if (result) { - return expected; - } else { - return expected_copy; - } -} - -gb_inline i32 gb_atomic32_exchanged(gbAtomic32 volatile *a, i32 desired) { - return __atomic_exchange_n(&a->value, desired, __ATOMIC_SEQ_CST); -} - -gb_inline i32 gb_atomic32_fetch_add(gbAtomic32 volatile *a, i32 operand) { - return __atomic_fetch_add(&a->value, operand, __ATOMIC_SEQ_CST); -} - -gb_inline i32 gb_atomic32_fetch_and(gbAtomic32 volatile *a, i32 operand) { - return __atomic_fetch_and(&a->value, operand, __ATOMIC_SEQ_CST); -} - -gb_inline i32 gb_atomic32_fetch_or(gbAtomic32 volatile *a, i32 operand) { - return __atomic_fetch_or(&a->value, operand, __ATOMIC_SEQ_CST); -} - -gb_inline i64 gb_atomic64_load(gbAtomic64 const volatile *a) { - return __atomic_load_n(&a->value, __ATOMIC_SEQ_CST); -} - -gb_inline void gb_atomic64_store(gbAtomic64 volatile *a, i64 value) { - __atomic_store_n(&a->value, value, __ATOMIC_SEQ_CST); -} - -gb_inline i64 gb_atomic64_compare_exchange(gbAtomic64 volatile *a, i64 expected, i64 desired) { - i64 expected_copy = expected; - auto result = __atomic_compare_exchange_n(&a->value, &expected_copy, desired, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); - if (result) { - return expected; - } else { - return expected_copy; - } -} - -gb_inline i64 gb_atomic64_exchanged(gbAtomic64 volatile *a, i64 desired) { - return __atomic_exchange_n(&a->value, desired, __ATOMIC_SEQ_CST); -} - -gb_inline i64 gb_atomic64_fetch_add(gbAtomic64 volatile *a, i64 operand) { - return __atomic_fetch_add(&a->value, operand, __ATOMIC_SEQ_CST); -} - -gb_inline i64 gb_atomic64_fetch_and(gbAtomic64 volatile *a, i64 operand) { - return __atomic_fetch_and(&a->value, operand, __ATOMIC_SEQ_CST); -} - -gb_inline i64 gb_atomic64_fetch_or(gbAtomic64 volatile *a, i64 operand) { - return __atomic_fetch_or(&a->value, operand, __ATOMIC_SEQ_CST); -} - - -#else -#error TODO(bill): Implement Atomics for this CPU -#endif - -gb_inline b32 gb_atomic32_spin_lock(gbAtomic32 volatile *a, isize time_out) { - i32 old_value = gb_atomic32_compare_exchange(a, 1, 0); - i32 counter = 0; - while (old_value != 0 && (time_out < 0 || counter++ < time_out)) { - gb_yield_thread(); - old_value = gb_atomic32_compare_exchange(a, 1, 0); - gb_mfence(); - } - return old_value == 0; -} -gb_inline void gb_atomic32_spin_unlock(gbAtomic32 volatile *a) { - gb_atomic32_store(a, 0); - gb_mfence(); -} - -gb_inline b32 gb_atomic64_spin_lock(gbAtomic64 volatile *a, isize time_out) { - i64 old_value = gb_atomic64_compare_exchange(a, 1, 0); - i64 counter = 0; - while (old_value != 0 && (time_out < 0 || counter++ < time_out)) { - gb_yield_thread(); - old_value = gb_atomic64_compare_exchange(a, 1, 0); - gb_mfence(); - } - return old_value == 0; -} - -gb_inline void gb_atomic64_spin_unlock(gbAtomic64 volatile *a) { - gb_atomic64_store(a, 0); - gb_mfence(); -} - -gb_inline b32 gb_atomic32_try_acquire_lock(gbAtomic32 volatile *a) { - i32 old_value; - gb_yield_thread(); - old_value = gb_atomic32_compare_exchange(a, 1, 0); - gb_mfence(); - return old_value == 0; -} - -gb_inline b32 gb_atomic64_try_acquire_lock(gbAtomic64 volatile *a) { - i64 old_value; - gb_yield_thread(); - old_value = 
gb_atomic64_compare_exchange(a, 1, 0); - gb_mfence(); - return old_value == 0; -} - - - -#if defined(GB_ARCH_32_BIT) - -gb_inline void *gb_atomic_ptr_load(gbAtomicPtr const volatile *a) { - return cast(void *)cast(intptr)gb_atomic32_load(cast(gbAtomic32 const volatile *)a); -} -gb_inline void gb_atomic_ptr_store(gbAtomicPtr volatile *a, void *value) { - gb_atomic32_store(cast(gbAtomic32 volatile *)a, cast(i32)cast(intptr)value); -} -gb_inline void *gb_atomic_ptr_compare_exchange(gbAtomicPtr volatile *a, void *expected, void *desired) { - return cast(void *)cast(intptr)gb_atomic32_compare_exchange(cast(gbAtomic32 volatile *)a, cast(i32)cast(intptr)expected, cast(i32)cast(intptr)desired); -} -gb_inline void *gb_atomic_ptr_exchanged(gbAtomicPtr volatile *a, void *desired) { - return cast(void *)cast(intptr)gb_atomic32_exchanged(cast(gbAtomic32 volatile *)a, cast(i32)cast(intptr)desired); -} -gb_inline void *gb_atomic_ptr_fetch_add(gbAtomicPtr volatile *a, void *operand) { - return cast(void *)cast(intptr)gb_atomic32_fetch_add(cast(gbAtomic32 volatile *)a, cast(i32)cast(intptr)operand); -} -gb_inline void *gb_atomic_ptr_fetch_and(gbAtomicPtr volatile *a, void *operand) { - return cast(void *)cast(intptr)gb_atomic32_fetch_and(cast(gbAtomic32 volatile *)a, cast(i32)cast(intptr)operand); -} -gb_inline void *gb_atomic_ptr_fetch_or(gbAtomicPtr volatile *a, void *operand) { - return cast(void *)cast(intptr)gb_atomic32_fetch_or(cast(gbAtomic32 volatile *)a, cast(i32)cast(intptr)operand); -} -gb_inline b32 gb_atomic_ptr_spin_lock(gbAtomicPtr volatile *a, isize time_out) { - return gb_atomic32_spin_lock(cast(gbAtomic32 volatile *)a, time_out); -} -gb_inline void gb_atomic_ptr_spin_unlock(gbAtomicPtr volatile *a) { - gb_atomic32_spin_unlock(cast(gbAtomic32 volatile *)a); -} -gb_inline b32 gb_atomic_ptr_try_acquire_lock(gbAtomicPtr volatile *a) { - return gb_atomic32_try_acquire_lock(cast(gbAtomic32 volatile *)a); -} - -#elif defined(GB_ARCH_64_BIT) - -gb_inline void *gb_atomic_ptr_load(gbAtomicPtr const volatile *a) { - return cast(void *)cast(intptr)gb_atomic64_load(cast(gbAtomic64 const volatile *)a); -} -gb_inline void gb_atomic_ptr_store(gbAtomicPtr volatile *a, void *value) { - gb_atomic64_store(cast(gbAtomic64 volatile *)a, cast(i64)cast(intptr)value); -} -gb_inline void *gb_atomic_ptr_compare_exchange(gbAtomicPtr volatile *a, void *expected, void *desired) { - return cast(void *)cast(intptr)gb_atomic64_compare_exchange(cast(gbAtomic64 volatile *)a, cast(i64)cast(intptr)expected, cast(i64)cast(intptr)desired); -} -gb_inline void *gb_atomic_ptr_exchanged(gbAtomicPtr volatile *a, void *desired) { - return cast(void *)cast(intptr)gb_atomic64_exchanged(cast(gbAtomic64 volatile *)a, cast(i64)cast(intptr)desired); -} -gb_inline void *gb_atomic_ptr_fetch_add(gbAtomicPtr volatile *a, void *operand) { - return cast(void *)cast(intptr)gb_atomic64_fetch_add(cast(gbAtomic64 volatile *)a, cast(i64)cast(intptr)operand); -} -gb_inline void *gb_atomic_ptr_fetch_and(gbAtomicPtr volatile *a, void *operand) { - return cast(void *)cast(intptr)gb_atomic64_fetch_and(cast(gbAtomic64 volatile *)a, cast(i64)cast(intptr)operand); -} -gb_inline void *gb_atomic_ptr_fetch_or(gbAtomicPtr volatile *a, void *operand) { - return cast(void *)cast(intptr)gb_atomic64_fetch_or(cast(gbAtomic64 volatile *)a, cast(i64)cast(intptr)operand); -} -gb_inline b32 gb_atomic_ptr_spin_lock(gbAtomicPtr volatile *a, isize time_out) { - return gb_atomic64_spin_lock(cast(gbAtomic64 volatile *)a, time_out); -} -gb_inline void 
gb_atomic_ptr_spin_unlock(gbAtomicPtr volatile *a) { - gb_atomic64_spin_unlock(cast(gbAtomic64 volatile *)a); -} -gb_inline b32 gb_atomic_ptr_try_acquire_lock(gbAtomicPtr volatile *a) { - return gb_atomic64_try_acquire_lock(cast(gbAtomic64 volatile *)a); -} -#endif - - -gb_inline void gb_yield_thread(void) { -#if defined(GB_SYSTEM_WINDOWS) - _mm_pause(); -#elif defined(GB_SYSTEM_OSX) - #if defined(GB_CPU_X86) - __asm__ volatile ("" : : : "memory"); - #elif defined(GB_CPU_ARM) - __asm__ volatile ("yield" : : : "memory"); - #endif -#elif defined(GB_CPU_X86) - _mm_pause(); -#else -#error Unknown architecture -#endif -} - -gb_inline void gb_mfence(void) { -#if defined(GB_SYSTEM_WINDOWS) - #if defined(__clang__) - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wdeprecated-declarations" - _ReadWriteBarrier(); - #pragma clang diagnostic pop - #else - _ReadWriteBarrier(); - #endif -#elif defined(GB_SYSTEM_OSX) - #if defined(GB_CPU_X86) - __sync_synchronize(); - #elif defined(GB_CPU_ARM) - __atomic_thread_fence(__ATOMIC_SEQ_CST); - #endif -#elif defined(GB_CPU_X86) - _mm_mfence(); -#else -#error Unknown architecture -#endif -} - -gb_inline void gb_sfence(void) { -#if defined(GB_SYSTEM_WINDOWS) - #if defined(__clang__) - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wdeprecated-declarations" - _WriteBarrier(); - #pragma clang diagnostic pop - #else - _WriteBarrier(); - #endif -#elif defined(GB_SYSTEM_OSX) - #if defined(GB_CPU_X86) - __asm__ volatile ("" : : : "memory"); - #elif defined(GB_CPU_ARM) - // TODO(bill): is this correct? - __atomic_thread_fence(__ATOMIC_SEQ_CST); - #endif -#elif defined(GB_CPU_X86) - _mm_sfence(); -#else -#error Unknown architecture -#endif -} - -gb_inline void gb_lfence(void) { -#if defined(GB_SYSTEM_WINDOWS) - #if defined(__clang__) - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wdeprecated-declarations" - _ReadBarrier(); - #pragma clang diagnostic pop - #else - _ReadBarrier(); - #endif -#elif defined(GB_SYSTEM_OSX) - __asm__ volatile ("" : : : "memory"); -#elif defined(GB_CPU_X86) - _mm_lfence(); -#else -#error Unknown architecture -#endif -} - - -gb_inline void gb_semaphore_release(gbSemaphore *s) { gb_semaphore_post(s, 1); } - -#if defined(GB_SYSTEM_WINDOWS) - gb_inline void gb_semaphore_init(gbSemaphore *s) { - s->win32_handle = CreateSemaphoreA(NULL, 0, I32_MAX, NULL); - } - gb_inline void gb_semaphore_destroy(gbSemaphore *s) { - CloseHandle(s->win32_handle); - } - gb_inline void gb_semaphore_post(gbSemaphore *s, i32 count) { - ReleaseSemaphore(s->win32_handle, count, NULL); - } - gb_inline void gb_semaphore_wait(gbSemaphore *s) { - WaitForSingleObjectEx(s->win32_handle, INFINITE, FALSE); - } - -#elif defined(GB_SYSTEM_OSX) - gb_inline void gb_semaphore_init (gbSemaphore *s) { semaphore_create(mach_task_self(), &s->osx_handle, SYNC_POLICY_FIFO, 0); } - gb_inline void gb_semaphore_destroy(gbSemaphore *s) { semaphore_destroy(mach_task_self(), s->osx_handle); } - gb_inline void gb_semaphore_post (gbSemaphore *s, i32 count) { while (count --> 0) semaphore_signal(s->osx_handle); } - gb_inline void gb_semaphore_wait (gbSemaphore *s) { semaphore_wait(s->osx_handle); } - -#elif defined(GB_SYSTEM_UNIX) - gb_inline void gb_semaphore_init (gbSemaphore *s) { sem_init(&s->unix_handle, 0, 0); } - gb_inline void gb_semaphore_destroy(gbSemaphore *s) { sem_destroy(&s->unix_handle); } - gb_inline void gb_semaphore_post (gbSemaphore *s, i32 count) { while (count --> 0) sem_post(&s->unix_handle); } - gb_inline void 
gb_semaphore_wait (gbSemaphore *s) { int i; do { i = sem_wait(&s->unix_handle); } while (i == -1 && errno == EINTR); } - -#else -#error -#endif - -gb_inline void gb_mutex_init(gbMutex *m) { -#if defined(GB_SYSTEM_WINDOWS) - InitializeCriticalSection(&m->win32_critical_section); -#else - pthread_mutexattr_init(&m->pthread_mutexattr); - pthread_mutexattr_settype(&m->pthread_mutexattr, PTHREAD_MUTEX_RECURSIVE); - pthread_mutex_init(&m->pthread_mutex, &m->pthread_mutexattr); -#endif -} - -gb_inline void gb_mutex_destroy(gbMutex *m) { -#if defined(GB_SYSTEM_WINDOWS) - DeleteCriticalSection(&m->win32_critical_section); -#else - pthread_mutex_destroy(&m->pthread_mutex); -#endif -} - -gb_inline void gb_mutex_lock(gbMutex *m) { -#if defined(GB_SYSTEM_WINDOWS) - EnterCriticalSection(&m->win32_critical_section); -#else - pthread_mutex_lock(&m->pthread_mutex); -#endif -} - -gb_inline b32 gb_mutex_try_lock(gbMutex *m) { -#if defined(GB_SYSTEM_WINDOWS) - return TryEnterCriticalSection(&m->win32_critical_section) != 0; -#else - return pthread_mutex_trylock(&m->pthread_mutex) == 0; -#endif -} - -gb_inline void gb_mutex_unlock(gbMutex *m) { -#if defined(GB_SYSTEM_WINDOWS) - LeaveCriticalSection(&m->win32_critical_section); -#else - pthread_mutex_unlock(&m->pthread_mutex); -#endif -} - - - - - - - -void gb_thread_init(gbThread *t) { - gb_zero_item(t); -#if defined(GB_SYSTEM_WINDOWS) - t->win32_handle = INVALID_HANDLE_VALUE; -#else - t->posix_handle = 0; -#endif - gb_semaphore_init(&t->semaphore); -} - -void gb_thread_destroy(gbThread *t) { - if (t->is_running) gb_thread_join(t); - gb_semaphore_destroy(&t->semaphore); -} - - -gb_inline void gb__thread_run(gbThread *t) { - gb_semaphore_release(&t->semaphore); - t->return_value = t->proc(t); -} - -#if defined(GB_SYSTEM_WINDOWS) - gb_inline DWORD __stdcall gb__thread_proc(void *arg) { - gbThread *t = cast(gbThread *)arg; - gb__thread_run(t); - t->is_running = false; - return 0; - } -#else - gb_inline void * gb__thread_proc(void *arg) { - gbThread *t = cast(gbThread *)arg; - gb__thread_run(t); - t->is_running = false; - return NULL; - } -#endif - -gb_inline void gb_thread_start(gbThread *t, gbThreadProc *proc, void *user_data) { gb_thread_start_with_stack(t, proc, user_data, 0); } - -gb_inline void gb_thread_start_with_stack(gbThread *t, gbThreadProc *proc, void *user_data, isize stack_size) { - GB_ASSERT(!t->is_running); - GB_ASSERT(proc != NULL); - t->proc = proc; - t->user_data = user_data; - t->stack_size = stack_size; - t->is_running = true; - -#if defined(GB_SYSTEM_WINDOWS) - t->win32_handle = CreateThread(NULL, stack_size, gb__thread_proc, t, 0, NULL); - GB_ASSERT_MSG(t->win32_handle != NULL, "CreateThread: GetLastError"); -#else - { - pthread_attr_t attr; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - if (stack_size != 0) { - pthread_attr_setstacksize(&attr, stack_size); - } - pthread_create(&t->posix_handle, &attr, gb__thread_proc, t); - pthread_attr_destroy(&attr); - } -#endif - - gb_semaphore_wait(&t->semaphore); -} - -gb_inline void gb_thread_join(gbThread *t) { - if (!t->is_running) return; - -#if defined(GB_SYSTEM_WINDOWS) - WaitForSingleObject(t->win32_handle, INFINITE); - CloseHandle(t->win32_handle); - t->win32_handle = INVALID_HANDLE_VALUE; -#else - pthread_join(t->posix_handle, NULL); - t->posix_handle = 0; -#endif - t->is_running = false; -} - -gb_inline b32 gb_thread_is_running(gbThread const *t) { return t->is_running != 0; } gb_inline u32 gb_thread_current_id(void) { u32 thread_id; @@ -4421,116 
+3491,6 @@ gb_inline u32 gb_thread_current_id(void) { -void gb_thread_set_name(gbThread *t, char const *name) { -#if defined(GB_COMPILER_MSVC) - #pragma pack(push, 8) - typedef struct { - DWORD type; - char const *name; - DWORD id; - DWORD flags; - } gbprivThreadName; - #pragma pack(pop) - gbprivThreadName tn; - tn.type = 0x1000; - tn.name = name; - tn.id = GetThreadId(cast(HANDLE)t->win32_handle); - tn.flags = 0; - - __try { - RaiseException(0x406d1388, 0, gb_size_of(tn)/4, cast(ULONG_PTR *)&tn); - } __except(1 /*EXCEPTION_EXECUTE_HANDLER*/) { - } - -#elif defined(GB_SYSTEM_WINDOWS) && !defined(GB_COMPILER_MSVC) - // IMPORTANT TODO(bill): Set thread name for GCC/Clang on windows - return; -#elif defined(GB_SYSTEM_OSX) - // TODO(bill): Test if this works - pthread_setname_np(name); -#elif defined(GB_SYSTEM_FREEBSD) - pthread_set_name_np(t->posix_handle, name); -#else - // TODO(bill): Test if this works - pthread_setname_np(t->posix_handle, name); -#endif -} - - - - -void gb_sync_init(gbSync *s) { - gb_zero_item(s); - gb_mutex_init(&s->mutex); - gb_mutex_init(&s->start); - gb_semaphore_init(&s->release); -} - -void gb_sync_destroy(gbSync *s) { - if (s->waiting) - GB_PANIC("Cannot destroy while threads are waiting!"); - - gb_mutex_destroy(&s->mutex); - gb_mutex_destroy(&s->start); - gb_semaphore_destroy(&s->release); -} - -void gb_sync_set_target(gbSync *s, i32 count) { - gb_mutex_lock(&s->start); - - gb_mutex_lock(&s->mutex); - GB_ASSERT(s->target == 0); - s->target = count; - s->current = 0; - s->waiting = 0; - gb_mutex_unlock(&s->mutex); -} - -void gb_sync_release(gbSync *s) { - if (s->waiting) { - gb_semaphore_release(&s->release); - } else { - s->target = 0; - gb_mutex_unlock(&s->start); - } -} - -i32 gb_sync_reach(gbSync *s) { - i32 n; - gb_mutex_lock(&s->mutex); - GB_ASSERT(s->current < s->target); - n = ++s->current; // NOTE(bill): Record this value to avoid possible race if `return s->current` was done - if (s->current == s->target) - gb_sync_release(s); - gb_mutex_unlock(&s->mutex); - return n; -} - -void gb_sync_reach_and_wait(gbSync *s) { - gb_mutex_lock(&s->mutex); - GB_ASSERT(s->current < s->target); - s->current++; - if (s->current == s->target) { - gb_sync_release(s); - gb_mutex_unlock(&s->mutex); - } else { - s->waiting++; // NOTE(bill): Waiting, so one more waiter - gb_mutex_unlock(&s->mutex); // NOTE(bill): Release the mutex to other threads - - gb_semaphore_wait(&s->release); // NOTE(bill): Wait for merge completion - - gb_mutex_lock(&s->mutex); // NOTE(bill): On merge completion, lock mutex - s->waiting--; // NOTE(bill): Done waiting - gb_sync_release(s); // NOTE(bill): Restart the next waiter - gb_mutex_unlock(&s->mutex); - } -} - - - - - - gb_inline gbAllocator gb_heap_allocator(void) { @@ -8565,7 +7525,7 @@ gb_inline gbDllProc gb_dll_proc_address(gbDllHandle dll, char const *proc_name) // // -gb_global gbAtomic32 gb__random_shared_counter = {0}; +gb_global i32 gb__random_shared_counter; gb_internal u32 gb__get_noise_from_time(void) { u32 accum = 0; @@ -8612,7 +7572,7 @@ void gb_random_init(gbRandom *r) { r->value = 0; r->offsets[0] = gb__get_noise_from_time(); - r->offsets[1] = gb_atomic32_fetch_add(&gb__random_shared_counter, 1); + r->offsets[1] = gb__random_shared_counter++; // doesn't matter if it's not thread safe r->offsets[2] = gb_thread_current_id(); r->offsets[3] = gb_thread_current_id() * 3 + 1; time = gb_utc_time_now(); @@ -8730,14 +7690,6 @@ gb_inline void gb_exit(u32 code) { ExitProcess(code); } gb_inline void gb_exit(u32 code) { exit(code); } #endif 
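
One behavioural change in the hunk above deserves a note: the RNG seed counter drops from gb_atomic32_fetch_add to a plain gb__random_shared_counter++. The comment is right that a racy seed value is harmless in practice, though an unsynchronized concurrent increment is formally a data race in C++. If that ever matters, a relaxed atomic keeps roughly the same cost without the race. A sketch of that alternative (using gb.h's i32 typedef), not what this patch does:

	#include <atomic>

	// Relaxed ordering guarantees the increment itself is atomic and
	// nothing more -- which is all a seed counter needs.
	static std::atomic<i32> random_shared_counter{0};

	static i32 next_random_offset(void) {
		return random_shared_counter.fetch_add(1, std::memory_order_relaxed);
	}
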
-gb_inline void gb_yield(void) {
-#if defined(GB_SYSTEM_WINDOWS)
-	YieldProcessor();
-#else
-	sched_yield();
-#endif
-}
-
 char const *gb_get_env(char const *name, gbAllocator allocator) {
 #if defined(GB_SYSTEM_WINDOWS)
 	if (!name || !*name) {
diff --git a/src/llvm_backend.hpp b/src/llvm_backend.hpp
index c5f5897c1..19e5ffdb6 100644
--- a/src/llvm_backend.hpp
+++ b/src/llvm_backend.hpp
@@ -136,8 +136,6 @@ struct lbModule {
 struct lbGenerator {
 	CheckerInfo *info;
 
-	gbMutex mutex;
-
 	Array<String> output_object_paths;
 	Array<String> output_temp_paths;
 	String output_base;
@@ -148,8 +146,8 @@ struct lbGenerator {
 
 	Map<lbProcedure *> anonymous_proc_lits; // Key: Ast *
 
-	gbAtomic32 global_array_index;
-	gbAtomic32 global_generated_index;
+	std::atomic<u32> global_array_index;
+	std::atomic<u32> global_generated_index;
 };
diff --git a/src/llvm_backend_const.cpp b/src/llvm_backend_const.cpp
index f05202e79..5ad2b09b6 100644
--- a/src/llvm_backend_const.cpp
+++ b/src/llvm_backend_const.cpp
@@ -400,7 +400,7 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc
 	} else {
 		isize max_len = 7+8+1;
 		char *str = gb_alloc_array(permanent_allocator(), char, max_len);
-		u32 id = cast(u32)gb_atomic32_fetch_add(&m->gen->global_array_index, 1);
+		u32 id = m->gen->global_array_index.fetch_add(1);
 		isize len = gb_snprintf(str, max_len, "csba$%x", id);
 
 		String name = make_string(cast(u8 *)str, len-1);
diff --git a/src/llvm_backend_general.cpp b/src/llvm_backend_general.cpp
index 6907bfe2c..20b3aba25 100644
--- a/src/llvm_backend_general.cpp
+++ b/src/llvm_backend_general.cpp
@@ -127,8 +127,6 @@ bool lb_init_generator(lbGenerator *gen, Checker *c) {
 	map_init(&gen->modules_through_ctx, permanent_allocator(), gen->info->packages.entries.count*2);
 	map_init(&gen->anonymous_proc_lits, heap_allocator(), 1024);
 
-	gb_mutex_init(&gen->mutex);
-
 	if (USE_SEPARATE_MODULES) {
 		for_array(i, gen->info->packages.entries) {
 			AstPackage *pkg = gen->info->packages.entries[i].value;
@@ -2163,7 +2161,7 @@ LLVMValueRef lb_find_or_add_entity_string_ptr(lbModule *m, String const &str) {
 
 	isize max_len = 7+8+1;
 	char *name = gb_alloc_array(permanent_allocator(), char, max_len);
-	u32 id = cast(u32)gb_atomic32_fetch_add(&m->gen->global_array_index, 1);
+	u32 id = m->gen->global_array_index.fetch_add(1);
 	isize len = gb_snprintf(name, max_len, "csbs$%x", id);
 	len -= 1;
 
@@ -2205,7 +2203,7 @@ lbValue lb_find_or_add_entity_string_byte_slice(lbModule *m, String const &str)
 	{
 		isize max_len = 7+8+1;
 		name = gb_alloc_array(permanent_allocator(), char, max_len);
-		u32 id = cast(u32)gb_atomic32_fetch_add(&m->gen->global_array_index, 1);
+		u32 id = m->gen->global_array_index.fetch_add(1);
 		isize len = gb_snprintf(name, max_len, "csbs$%x", id);
 		len -= 1;
 	}
@@ -2317,7 +2315,7 @@ lbAddr lb_add_global_generated(lbModule *m, Type *type, lbValue value) {
 
 	isize max_len = 7+8+1;
 	u8 *str = cast(u8 *)gb_alloc_array(permanent_allocator(), u8, max_len);
-	u32 id = cast(u32)gb_atomic32_fetch_add(&m->gen->global_generated_index, 1);
+	u32 id = m->gen->global_generated_index.fetch_add(1);
 	isize len = gb_snprintf(cast(char *)str, max_len, "ggv$%x", id);
 	String name = make_string(str, len-1);
diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp
index be4c3122b..738ef19b6 100644
--- a/src/thread_pool.cpp
+++ b/src/thread_pool.cpp
@@ -20,7 +20,7 @@ struct ThreadPool {
 	MPMCQueue<WorkerTask> tasks;
 
-	gbThread *threads;
+	Thread *threads;
 	isize thread_count;
 
 	char worker_prefix[10];
@@ -32,13 +32,13 @@ void thread_pool_destroy(ThreadPool *pool);
 void thread_pool_start(ThreadPool *pool);
 void 
thread_pool_join(ThreadPool *pool); void thread_pool_add_task(ThreadPool *pool, WorkerTaskProc *proc, void *data); -GB_THREAD_PROC(worker_thread_internal); +THREAD_PROC(worker_thread_internal); void thread_pool_init(ThreadPool *pool, gbAllocator const &a, isize thread_count, char const *worker_prefix) { pool->allocator = a; mpmc_init(&pool->tasks, a, 1024); pool->thread_count = gb_max(thread_count, 0); - pool->threads = gb_alloc_array(a, gbThread, pool->thread_count); + pool->threads = gb_alloc_array(a, Thread, pool->thread_count); mutex_init(&pool->mutex); semaphore_init(&pool->sem_available); pool->is_running = true; @@ -52,15 +52,15 @@ void thread_pool_init(ThreadPool *pool, gbAllocator const &a, isize thread_count } for (isize i = 0; i < pool->thread_count; i++) { - gbThread *t = &pool->threads[i]; - gb_thread_init(t); + Thread *t = &pool->threads[i]; + thread_init(t); t->user_index = i; #if 0 // TODO(bill): Fix this on Linux as it causes a seg-fault if (pool->worker_prefix_len > 0) { char worker_name[16] = {}; gb_snprintf(worker_name, gb_size_of(worker_name), "%.*s%u", pool->worker_prefix_len, pool->worker_prefix, cast(u16)i); - gb_thread_set_name(t, worker_name); + thread_set_name(t, worker_name); } #endif } @@ -68,8 +68,8 @@ void thread_pool_init(ThreadPool *pool, gbAllocator const &a, isize thread_count void thread_pool_start(ThreadPool *pool) { for (isize i = 0; i < pool->thread_count; i++) { - gbThread *t = &pool->threads[i]; - gb_thread_start(t, worker_thread_internal, pool); + Thread *t = &pool->threads[i]; + thread_start(t, worker_thread_internal, pool); } } @@ -78,11 +78,11 @@ void thread_pool_join(ThreadPool *pool) { semaphore_post(&pool->sem_available, cast(i32)pool->thread_count); - gb_yield(); + yield(); for (isize i = 0; i < pool->thread_count; i++) { - gbThread *t = &pool->threads[i]; - gb_thread_join(t); + Thread *t = &pool->threads[i]; + thread_join(t); } } @@ -144,14 +144,14 @@ void thread_pool_wait_to_process(ThreadPool *pool) { mutex_unlock(&pool->mutex); } - gb_yield(); + yield(); } thread_pool_join(pool); } -GB_THREAD_PROC(worker_thread_internal) { +THREAD_PROC(worker_thread_internal) { ThreadPool *pool = cast(ThreadPool *)thread->user_data; while (pool->is_running) { semaphore_wait(&pool->sem_available); diff --git a/src/threading.cpp b/src/threading.cpp new file mode 100644 index 000000000..803cdb662 --- /dev/null +++ b/src/threading.cpp @@ -0,0 +1,348 @@ +struct BlockingMutex; +struct RecursiveMutex; +struct Semaphore; +struct Thread; + +#define THREAD_PROC(name) isize name(struct Thread *thread) +typedef THREAD_PROC(ThreadProc); + +struct Thread { +#if defined(GB_SYSTEM_WINDOWS) + void * win32_handle; +#else + pthread_t posix_handle; +#endif + + ThreadProc * proc; + void * user_data; + isize user_index; + isize volatile return_value; + + Semaphore *semaphore; + isize stack_size; + b32 volatile is_running; +}; + + +void mutex_init (BlockingMutex *m); +void mutex_destroy (BlockingMutex *m); +void mutex_lock (BlockingMutex *m); +bool mutex_try_lock(BlockingMutex *m); +void mutex_unlock (BlockingMutex *m); +void mutex_init (RecursiveMutex *m); +void mutex_destroy (RecursiveMutex *m); +void mutex_lock (RecursiveMutex *m); +bool mutex_try_lock(RecursiveMutex *m); +void mutex_unlock (RecursiveMutex *m); + +void semaphore_init (Semaphore *s); +void semaphore_destroy(Semaphore *s); +void semaphore_post (Semaphore *s, i32 count); +void semaphore_wait (Semaphore *s); +void semaphore_release(Semaphore *s) { semaphore_post(s, 1); } + +u32 thread_current_id(void); + +void 
thread_init            (Thread *t);
+void thread_destroy         (Thread *t);
+void thread_start           (Thread *t, ThreadProc *proc, void *data);
+void thread_start_with_stack(Thread *t, ThreadProc *proc, void *data, isize stack_size);
+void thread_join            (Thread *t);
+bool thread_is_running      (Thread const *t);
+void thread_set_name        (Thread *t, char const *name);
+
+void yield_thread(void);
+void yield(void);
+
+
+#if defined(GB_SYSTEM_WINDOWS)
+	struct BlockingMutex {
+		SRWLOCK srwlock;
+	};
+	void mutex_init(BlockingMutex *m) {
+	}
+	void mutex_destroy(BlockingMutex *m) {
+	}
+	void mutex_lock(BlockingMutex *m) {
+		AcquireSRWLockExclusive(&m->srwlock);
+	}
+	bool mutex_try_lock(BlockingMutex *m) {
+		return !!TryAcquireSRWLockExclusive(&m->srwlock);
+	}
+	void mutex_unlock(BlockingMutex *m) {
+		ReleaseSRWLockExclusive(&m->srwlock);
+	}
+
+	struct RecursiveMutex {
+		CRITICAL_SECTION win32_critical_section;
+	};
+	void mutex_init(RecursiveMutex *m) {
+		InitializeCriticalSection(&m->win32_critical_section);
+	}
+	void mutex_destroy(RecursiveMutex *m) {
+		DeleteCriticalSection(&m->win32_critical_section);
+	}
+	void mutex_lock(RecursiveMutex *m) {
+		EnterCriticalSection(&m->win32_critical_section);
+	}
+	bool mutex_try_lock(RecursiveMutex *m) {
+		return TryEnterCriticalSection(&m->win32_critical_section) != 0;
+	}
+	void mutex_unlock(RecursiveMutex *m) {
+		LeaveCriticalSection(&m->win32_critical_section);
+	}
+
+	struct Semaphore {
+		void *win32_handle;
+	};
+
+	void semaphore_init(Semaphore *s) {
+		s->win32_handle = CreateSemaphoreA(NULL, 0, I32_MAX, NULL);
+	}
+	void semaphore_destroy(Semaphore *s) {
+		CloseHandle(s->win32_handle);
+	}
+	void semaphore_post(Semaphore *s, i32 count) {
+		ReleaseSemaphore(s->win32_handle, count, NULL);
+	}
+	void semaphore_wait(Semaphore *s) {
+		WaitForSingleObjectEx(s->win32_handle, INFINITE, FALSE);
+	}
+
+#else
+	struct BlockingMutex {
+		pthread_mutex_t pthread_mutex;
+	};
+	void mutex_init(BlockingMutex *m) {
+		pthread_mutex_init(&m->pthread_mutex, nullptr);
+	}
+	void mutex_destroy(BlockingMutex *m) {
+		pthread_mutex_destroy(&m->pthread_mutex);
+	}
+	void mutex_lock(BlockingMutex *m) {
+		pthread_mutex_lock(&m->pthread_mutex);
+	}
+	bool mutex_try_lock(BlockingMutex *m) {
+		return pthread_mutex_trylock(&m->pthread_mutex) == 0;
+	}
+	void mutex_unlock(BlockingMutex *m) {
+		pthread_mutex_unlock(&m->pthread_mutex);
+	}
+
+	struct RecursiveMutex {
+		pthread_mutex_t pthread_mutex;
+		pthread_mutexattr_t pthread_mutexattr;
+	};
+	void mutex_init(RecursiveMutex *m) {
+		pthread_mutexattr_init(&m->pthread_mutexattr);
+		pthread_mutexattr_settype(&m->pthread_mutexattr, PTHREAD_MUTEX_RECURSIVE);
+		pthread_mutex_init(&m->pthread_mutex, &m->pthread_mutexattr);
+	}
+	void mutex_destroy(RecursiveMutex *m) {
+		pthread_mutex_destroy(&m->pthread_mutex);
+	}
+	void mutex_lock(RecursiveMutex *m) {
+		pthread_mutex_lock(&m->pthread_mutex);
+	}
+	bool mutex_try_lock(RecursiveMutex *m) {
+		return pthread_mutex_trylock(&m->pthread_mutex) == 0;
+	}
+	void mutex_unlock(RecursiveMutex *m) {
+		pthread_mutex_unlock(&m->pthread_mutex);
+	}
+
+	#if defined(GB_SYSTEM_OSX)
+		struct Semaphore {
+			semaphore_t osx_handle;
+		};
+
+		void semaphore_init   (Semaphore *s) { semaphore_create(mach_task_self(), &s->osx_handle, SYNC_POLICY_FIFO, 0); }
+		void semaphore_destroy(Semaphore *s) { semaphore_destroy(mach_task_self(), s->osx_handle); }
+		void semaphore_post   (Semaphore *s, i32 count) { while (count --> 0) semaphore_signal(s->osx_handle); }
+		void semaphore_wait   (Semaphore *s) { semaphore_wait(s->osx_handle); }
+	
#elif defined(GB_SYSTEM_UNIX) + struct Semaphore { + sem_t unix_handle; + }; + + void semaphore_init (Semaphore *s) { sem_init(&s->unix_handle, 0, 0); } + void semaphore_destroy(Semaphore *s) { sem_destroy(&s->unix_handle); } + void semaphore_post (Semaphore *s, i32 count) { while (count --> 0) sem_post(&s->unix_handle); } + void semaphore_wait (Semaphore *s) { int i; do { i = sem_wait(&s->unix_handle); } while (i == -1 && errno == EINTR); } + #else + #error + #endif +#endif + + + + +u32 thread_current_id(void) { + u32 thread_id; +#if defined(GB_SYSTEM_WINDOWS) + #if defined(GB_ARCH_32_BIT) && defined(GB_CPU_X86) + thread_id = (cast(u32 *)__readfsdword(24))[9]; + #elif defined(GB_ARCH_64_BIT) && defined(GB_CPU_X86) + thread_id = (cast(u32 *)__readgsqword(48))[18]; + #else + thread_id = GetCurrentThreadId(); + #endif + +#elif defined(GB_SYSTEM_OSX) && defined(GB_ARCH_64_BIT) + thread_id = pthread_mach_thread_np(pthread_self()); +#elif defined(GB_ARCH_32_BIT) && defined(GB_CPU_X86) + __asm__("mov %%gs:0x08,%0" : "=r"(thread_id)); +#elif defined(GB_ARCH_64_BIT) && defined(GB_CPU_X86) + __asm__("mov %%fs:0x10,%0" : "=r"(thread_id)); +#else + #error Unsupported architecture for thread_current_id() +#endif + + return thread_id; +} + + +gb_inline void yield_thread(void) { +#if defined(GB_SYSTEM_WINDOWS) + _mm_pause(); +#elif defined(GB_SYSTEM_OSX) + #if defined(GB_CPU_X86) + __asm__ volatile ("" : : : "memory"); + #elif defined(GB_CPU_ARM) + __asm__ volatile ("yield" : : : "memory"); + #endif +#elif defined(GB_CPU_X86) + _mm_pause(); +#else +#error Unknown architecture +#endif +} + +gb_inline void yield(void) { +#if defined(GB_SYSTEM_WINDOWS) + YieldProcessor(); +#else + sched_yield(); +#endif +} + + +void thread_init(Thread *t) { + gb_zero_item(t); +#if defined(GB_SYSTEM_WINDOWS) + t->win32_handle = INVALID_HANDLE_VALUE; +#else + t->posix_handle = 0; +#endif + t->semaphore = gb_alloc_item(heap_allocator(), Semaphore); + semaphore_init(t->semaphore); +} + +void thread_destroy(Thread *t) { + if (t->is_running) thread_join(t); + semaphore_destroy(t->semaphore); + gb_free(heap_allocator(), t->semaphore); +} + + +void gb__thread_run(Thread *t) { + semaphore_release(t->semaphore); + t->return_value = t->proc(t); +} + +#if defined(GB_SYSTEM_WINDOWS) + DWORD __stdcall internal_thread_proc(void *arg) { + Thread *t = cast(Thread *)arg; + gb__thread_run(t); + t->is_running = false; + return 0; + } +#else + void * internal_thread_proc(void *arg) { + Thread *t = cast(Thread *)arg; + gb__thread_run(t); + t->is_running = false; + return NULL; + } +#endif + +void thread_start(Thread *t, ThreadProc *proc, void *user_data) { thread_start_with_stack(t, proc, user_data, 0); } + +void thread_start_with_stack(Thread *t, ThreadProc *proc, void *user_data, isize stack_size) { + GB_ASSERT(!t->is_running); + GB_ASSERT(proc != NULL); + t->proc = proc; + t->user_data = user_data; + t->stack_size = stack_size; + t->is_running = true; + +#if defined(GB_SYSTEM_WINDOWS) + t->win32_handle = CreateThread(NULL, stack_size, internal_thread_proc, t, 0, NULL); + GB_ASSERT_MSG(t->win32_handle != NULL, "CreateThread: GetLastError"); +#else + { + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + if (stack_size != 0) { + pthread_attr_setstacksize(&attr, stack_size); + } + pthread_create(&t->posix_handle, &attr, internal_thread_proc, t); + pthread_attr_destroy(&attr); + } +#endif + + semaphore_wait(t->semaphore); +} + +void thread_join(Thread *t) { + if (!t->is_running) 
return; + +#if defined(GB_SYSTEM_WINDOWS) + WaitForSingleObject(t->win32_handle, INFINITE); + CloseHandle(t->win32_handle); + t->win32_handle = INVALID_HANDLE_VALUE; +#else + pthread_join(t->posix_handle, NULL); + t->posix_handle = 0; +#endif + t->is_running = false; +} + +bool thread_is_running(Thread const *t) { return t->is_running != 0; } + +void thread_set_name(Thread *t, char const *name) { +#if defined(GB_COMPILER_MSVC) + #pragma pack(push, 8) + typedef struct { + DWORD type; + char const *name; + DWORD id; + DWORD flags; + } gbprivThreadName; + #pragma pack(pop) + gbprivThreadName tn; + tn.type = 0x1000; + tn.name = name; + tn.id = GetThreadId(cast(HANDLE)t->win32_handle); + tn.flags = 0; + + __try { + RaiseException(0x406d1388, 0, gb_size_of(tn)/4, cast(ULONG_PTR *)&tn); + } __except(1 /*EXCEPTION_EXECUTE_HANDLER*/) { + } + +#elif defined(GB_SYSTEM_WINDOWS) && !defined(GB_COMPILER_MSVC) + // IMPORTANT TODO(bill): Set thread name for GCC/Clang on windows + return; +#elif defined(GB_SYSTEM_OSX) + // TODO(bill): Test if this works + pthread_setname_np(name); +#elif defined(GB_SYSTEM_FREEBSD) + pthread_set_name_np(t->posix_handle, name); +#else + // TODO(bill): Test if this works + pthread_setname_np(t->posix_handle, name); +#endif +} +
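
Taken together, src/threading.cpp reproduces the gb.h thread API one-for-one minus the gb_ prefix, with the semaphore moved behind a pointer so the forward-declared Semaphore works inside the Thread struct. A minimal usage sketch following the checker.cpp call sites above; example_worker and run_example are illustrative names, not part of this patch:

	// THREAD_PROC declares the worker's entry point; user_data carries the
	// payload, and the isize return value lands in thread->return_value.
	THREAD_PROC(example_worker) {
		int *value = cast(int *)thread->user_data;
		*value += 1; // the actual work
		return 0;
	}

	void run_example(void) {
		int shared = 0;

		Thread t = {};
		thread_init(&t);                           // allocates t->semaphore
		thread_start(&t, example_worker, &shared); // returns once the worker has started
		thread_join(&t);                           // wait for completion
		thread_destroy(&t);                        // joins if still running, frees the semaphore
	}

The gb.h note about defer-based scoped locking applies equally to the new BlockingMutex: pairing mutex_lock with defer (mutex_unlock(&m)) gives scope-based unlocking with no wrapper class.
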