Merge branch 'odin-lang:master' into core-image-tga

2026-02-14 23:33:15 +00:00 · 2025-12-23 16:12:53 -06:00
parent 729d0a8e8a 57352d9933
commit 550e57aba9
598 changed files with 51089 additions and 21234 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -74,43 +74,52 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        # MacOS 13 runs on Intel, 14 runs on ARM
-        os: [macos-14, ubuntu-latest]
+        os: [macos-15-intel, macos-latest, ubuntu-latest, ubuntu-24.04-arm]
    runs-on: ${{ matrix.os }}
-    name: ${{ matrix.os == 'macos-14' && 'MacOS ARM' || (matrix.os == 'macos-13' && 'MacOS Intel') || (matrix.os == 'ubuntu-latest' && 'Ubuntu') }} Build, Check, and Test
+    name: ${{ matrix.os == 'macos-latest' && 'MacOS ARM' || (matrix.os == 'macos-15-intel' && 'MacOS Intel') || (matrix.os == 'ubuntu-latest' && 'Ubuntu') || (matrix.os == 'ubuntu-24.04-arm' && 'Ubuntu ARM') }} Build, Check, and Test
    timeout-minutes: 15
    steps:

      - uses: actions/checkout@v4

      - name: Download LLVM (MacOS Intel)
-        if: matrix.os == 'macos-13'
+        if: matrix.os == 'macos-15-intel'
        run: |
          brew update
          brew install llvm@20 lua@5.4 lld
          echo "$(brew --prefix llvm@20)/bin" >> $GITHUB_PATH

      - name: Download LLVM (MacOS ARM)
-        if: matrix.os == 'macos-14'
+        if: matrix.os == 'macos-latest'
        run: |
          brew update
          brew install llvm@20 wasmtime lua@5.4 lld
          echo "$(brew --prefix llvm@20)/bin" >> $GITHUB_PATH

      - name: Download LLVM (Ubuntu)
-        if: matrix.os == 'ubuntu-latest'
+        if: matrix.os == 'ubuntu-latest' || matrix.os == 'ubuntu-24.04-arm'
        run: |
          wget https://apt.llvm.org/llvm.sh
          chmod +x llvm.sh
          sudo ./llvm.sh 20
          echo "/usr/lib/llvm-20/bin" >> $GITHUB_PATH
-
      - name: Build Odin
        run: ./build_odin.sh release
      - name: Odin version
        run: ./odin version
      - name: Odin report
        run: ./odin report
+      - name: Get needed vendor libs
+        if: matrix.os == 'ubuntu-24.04-arm'
+        run: sudo apt-get install -y liblua5.4-dev
+      - name: Install libcurl (Ubuntu)
+        if: matrix.os == 'ubuntu-latest' || matrix.os == 'ubuntu-24.04-arm'
+        run: |
+          sudo apt-get install libcurl4-openssl-dev libmbedtls-dev
+      - name: Install libcurl (macOS)
+        if: matrix.os == 'macos-15-intel' || matrix.os == 'macos-latest'
+        run: |
+          brew install curl mbedtls
      - name: Compile needed Vendor
        run: |
          make -C vendor/stb/src
@@ -143,7 +152,7 @@ jobs:
        run: |
          ./odin build examples/demo -target:wasi_wasm32 -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -out:demo
          wasmtime ./demo.wasm
-        if: matrix.os == 'macos-14'
+        if: matrix.os == 'macos-latest'

      - name: Check benchmarks
        run: ./odin check tests/benchmark -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -no-entry-point
@@ -277,6 +286,10 @@ jobs:
          sudo ./llvm.sh 18
          echo "/usr/lib/llvm-18/bin" >> $GITHUB_PATH

+      - name: Install libcurl
+        run: |
+          sudo apt-get install libcurl4-openssl-dev libmbedtls-dev
+
      - name: Build Odin
        run: ./build_odin.sh release

--- a/.github/workflows/cover.yml
+++ b/.github/workflows/cover.yml
@@ -1,60 +0,0 @@
-name: Test Coverage
-on: [push, pull_request, workflow_dispatch]
-
-jobs:
-  build_linux_amd64:
-    runs-on: ubuntu-latest
-    name: Linux AMD64 Test Coverage
-    timeout-minutes: 60
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Download LLVM (Ubuntu)
-        if: matrix.os == 'ubuntu-latest'
-        run: |
-          wget https://apt.llvm.org/llvm.sh
-          chmod +x llvm.sh
-          sudo ./llvm.sh 20
-          echo "/usr/lib/llvm-20/bin" >> $GITHUB_PATH
-
-      - name: Install kcov
-        run: |
-          sudo apt-get update
-          sudo apt-get install binutils-dev build-essential cmake libssl-dev libcurl4-openssl-dev libelf-dev libstdc++-12-dev zlib1g-dev libdw-dev libiberty-dev
-          git clone https://github.com/SimonKagstrom/kcov.git
-          mkdir kcov/build
-          cd kcov/build
-          cmake ..
-          sudo make
-          sudo make install
-          cd ../..
-          kcov --version
-
-      - name: Build Odin
-        run: ./build_odin.sh release
-
-      - name: Odin report
-        run: ./odin report
-
-      - name: Normal Core library tests
-        run: |
-          ./odin build tests/core/normal.odin -build-mode:test -debug -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -target:linux_amd64
-          mkdir kcov-out
-          kcov --exclude-path=tests,/usr kcov-out ./normal.bin .
-
-      - name: Optimized Core library tests
-        run: |
-          ./odin build tests/core/speed.odin -build-mode:test -debug -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -target:linux_amd64
-          kcov --exclude-path=tests,/usr kcov-out ./speed.bin .
-
-      - name: Internals tests
-        run: |
-          ./odin build tests/internal -build-mode:test -debug -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -target:linux_amd64
-          kcov --exclude-path=tests,/usr kcov-out ./internal .
-
-      - uses: codecov/codecov-action@v5
-        with:
-          name: Ubuntu Coverage # optional
-          token: ${{ secrets.CODECOV_TOKEN }}
-          verbose: true # optional (default = false
-          directory: kcov-out/kcov-merged
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -124,13 +124,13 @@ jobs:
  build_macos:
    name: MacOS Build
    if: github.repository == 'odin-lang/Odin'
-    runs-on: macos-13
+    runs-on: macos-15-intel
    steps:
      - uses: actions/checkout@v4
      - name: Download LLVM and setup PATH
        run: |
          brew update
-          brew install llvm@20 dylibbundler lld
+          brew install llvm@20 dylibbundler lld@20

      - name: build odin
        # These -L makes the linker prioritize system libraries over LLVM libraries, this is mainly to
@@ -163,13 +163,13 @@ jobs:
  build_macos_arm:
    name: MacOS ARM Build
    if: github.repository == 'odin-lang/Odin'
-    runs-on: macos-14 # ARM machine
+    runs-on: macos-latest # ARM machine
    steps:
      - uses: actions/checkout@v4
      - name: Download LLVM and setup PATH
        run: |
          brew update
-          brew install llvm@20 dylibbundler lld
+          brew install llvm@20 dylibbundler lld@20

      - name: build odin
        # These -L makes the linker prioritize system libraries over LLVM libraries, this is mainly to
--- a/.gitignore
+++ b/.gitignore
@@ -302,3 +302,9 @@ misc/featuregen/featuregen
 .cache/
 .clangd
 compile_commands.json
+
+# Dev cmake helpers
+build/
+cmake-build*/
+CMakeLists.txt
+sandbox/
--- a/37
+++ b/37
@@ -1,26 +1,17 @@
-Copyright (c) 2016-2024 Ginger Bill. All rights reserved.
+Copyright (c) 2016-2025 Ginger Bill. All rights reserved.

-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.

-1. Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:

-2. Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
--- a/base/intrinsics/intrinsics.odin
+++ b/base/intrinsics/intrinsics.odin
@@ -138,10 +138,13 @@ type_is_rune       :: proc($T: typeid) -> bool ---
 type_is_float      :: proc($T: typeid) -> bool ---
 type_is_complex    :: proc($T: typeid) -> bool ---
 type_is_quaternion :: proc($T: typeid) -> bool ---
-type_is_string     :: proc($T: typeid) -> bool ---
 type_is_typeid     :: proc($T: typeid) -> bool ---
 type_is_any        :: proc($T: typeid) -> bool ---
+type_is_string     :: proc($T: typeid) -> bool ---
 type_is_string16   :: proc($T: typeid) -> bool ---
+type_is_cstring    :: proc($T: typeid) -> bool ---
+type_is_cstring16  :: proc($T: typeid) -> bool ---
+

 type_is_endian_platform       :: proc($T: typeid) -> bool ---
 type_is_endian_little         :: proc($T: typeid) -> bool ---
@@ -154,6 +157,7 @@ type_is_indexable             :: proc($T: typeid) -> bool ---
 type_is_sliceable             :: proc($T: typeid) -> bool ---
 type_is_comparable            :: proc($T: typeid) -> bool ---
 type_is_simple_compare        :: proc($T: typeid) -> bool --- // easily compared using memcmp (== and !=)
+type_is_nearly_simple_compare :: proc($T: typeid) -> bool --- // easily compared using memcmp (including floats)
 type_is_dereferenceable       :: proc($T: typeid) -> bool ---
 type_is_valid_map_key         :: proc($T: typeid) -> bool ---
 type_is_valid_matrix_elements :: proc($T: typeid) -> bool ---
@@ -211,7 +215,8 @@ type_polymorphic_record_parameter_value :: proc($T: typeid, index: int) -> $V --
 type_is_specialized_polymorphic_record   :: proc($T: typeid) -> bool ---
 type_is_unspecialized_polymorphic_record :: proc($T: typeid) -> bool ---

-type_is_subtype_of :: proc($T, $U: typeid) -> bool ---
+type_is_subtype_of  :: proc($T, $U: typeid) -> bool ---
+type_is_superset_of :: proc($Super, $Sub: typeid) -> bool ---

 type_field_index_of :: proc($T: typeid, $name: string) -> uintptr ---

@@ -240,6 +245,11 @@ constant_utf16_cstring :: proc($literal: string) -> [^]u16 ---

 constant_log2 :: proc($v: $T) -> T where type_is_integer(T) ---

+constant_floor :: proc($v: $T) -> T where type_is_integer(T) || type_is_float(T) ---
+constant_trunc :: proc($v: $T) -> T where type_is_integer(T) || type_is_float(T) ---
+constant_ceil  :: proc($v: $T) -> T where type_is_integer(T) || type_is_float(T) ---
+constant_round :: proc($v: $T) -> T where type_is_integer(T) || type_is_float(T) ---
+
 // SIMD related
 simd_add  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
 simd_sub  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
@@ -344,6 +354,9 @@ simd_lanes_rotate_right :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
 has_target_feature :: proc($test: $T) -> bool where type_is_string(T) || type_is_proc(T) ---


+// Utility Calls
+concatentate :: proc(x, y: $T, z: ..T) -> T where type_is_array(T) || type_is_slice(T) ---
+
 // Returns the value of the procedure where `x` must be a call expression
 procedure_of :: proc(x: $T) -> T where type_is_proc(T) ---

@@ -374,10 +387,11 @@ objc_selector :: struct{}
 objc_class    :: struct{}
 objc_ivar     :: struct{}

-objc_id    :: ^objc_object
-objc_SEL   :: ^objc_selector
-objc_Class :: ^objc_class
-objc_Ivar  :: ^objc_ivar
+objc_id           :: ^objc_object
+objc_SEL          :: ^objc_selector
+objc_Class        :: ^objc_class
+objc_Ivar         :: ^objc_ivar
+objc_instancetype :: distinct objc_id

 objc_find_selector     :: proc($name: string) -> objc_SEL   ---
 objc_register_selector :: proc($name: string) -> objc_SEL   ---
@@ -385,6 +399,7 @@ objc_find_class        :: proc($name: string) -> objc_Class ---
 objc_register_class    :: proc($name: string) -> objc_Class ---
 objc_ivar_get          :: proc(self: ^$T) -> ^$U ---
 objc_block             :: proc(invoke: $T, ..any) -> ^Objc_Block(T) where type_is_proc(T) ---
+objc_super             :: proc(obj: ^$T) -> ^$U where type_is_subtype_of(T, objc_object) && type_is_subtype_of(U, objc_object) ---

 valgrind_client_request :: proc(default: uintptr, request: uintptr, a0, a1, a2, a3, a4: uintptr) -> uintptr ---

--- a/base/runtime/core.odin
+++ b/base/runtime/core.odin
@@ -15,7 +15,7 @@
 //
 // IMPORTANT NOTE(bill): `type_info_of` cannot be used within a
 // #shared_global_scope due to  the internals of the compiler.
-// This could change at a later date if the all these data structures are
+// This could change at a later date if all these data structures are
 // implemented within the compiler rather than in this "preload" file
 //
 #+no-instrumentation
@@ -118,10 +118,10 @@ Type_Info_Parameters :: struct { // Only used for procedures parameters and resu

 Type_Info_Struct_Flags :: distinct bit_set[Type_Info_Struct_Flag; u8]
 Type_Info_Struct_Flag :: enum u8 {
-	packed    = 0,
-	raw_union = 1,
-	_         = 2,
-	align     = 3,
+	packed      = 0,
+	raw_union   = 1,
+	all_or_none = 2,
+	align       = 3,
 }

 Type_Info_Struct :: struct {
@@ -636,6 +636,8 @@ _cleanup_runtime_contextless :: proc "contextless" () {
 /////////////////////////////


+// type_info_base returns the base-type of a `^Type_Info` stripping the `distinct`ness from the first level
+@(require_results)
 type_info_base :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
 	if info == nil {
 		return nil
@@ -652,6 +654,10 @@ type_info_base :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
 }


+// type_info_core returns the core-type of a `^Type_Info` stripping the `distinct`ness from the first level AND/OR
+// returns the backing integer type of an enum or bit_set `^Type_Info`.
+// This is also aliased as `type_info_base_without_enum`
+@(require_results)
 type_info_core :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
 	if info == nil {
 		return nil
@@ -668,6 +674,10 @@ type_info_core :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
 	}
 	return base
 }
+
+// type_info_base_without_enum returns the core-type of a `^Type_Info` stripping the `distinct`ness from the first level AND/OR
+// returns the backing integer type of an enum or bit_set `^Type_Info`.
+// This is also aliased as `type_info_core`
 type_info_base_without_enum :: type_info_core

 __type_info_of :: proc "contextless" (id: typeid) -> ^Type_Info #no_bounds_check {
@@ -684,15 +694,23 @@ __type_info_of :: proc "contextless" (id: typeid) -> ^Type_Info #no_bounds_check
 }

 when !ODIN_NO_RTTI {
+	// typeid_base returns the base-type of a `typeid` stripping the `distinct`ness from the first level
 	typeid_base :: proc "contextless" (id: typeid) -> typeid {
 		ti := type_info_of(id)
 		ti = type_info_base(ti)
 		return ti.id
 	}
+	// typeid_core returns the core-type of a `typeid` stripping the `distinct`ness from the first level AND/OR
+	// returns the backing integer type of an enum or bit_set `typeid`.
+	// This is also aliased as `typeid_base_without_enum`
 	typeid_core :: proc "contextless" (id: typeid) -> typeid {
 		ti := type_info_core(type_info_of(id))
 		return ti.id
 	}
+
+	// typeid_base_without_enum returns the core-type of a `typeid` stripping the `distinct`ness from the first level AND/OR
+	// returns the backing integer type of an enum or bit_set `typeid`.
+	// This is also aliased as `typeid_core`
 	typeid_base_without_enum :: typeid_core
 }

@@ -708,11 +726,15 @@ default_logger_proc :: proc(data: rawptr, level: Logger_Level, text: string, opt
 	// Nothing
 }

+// Returns the default logger used by `context.logger`
+@(require_results)
 default_logger :: proc() -> Logger {
 	return Logger{default_logger_proc, nil, Logger_Level.Debug, nil}
 }


+// Returns the default `context`
+@(require_results)
 default_context :: proc "contextless" () -> Context {
 	c: Context
 	__init_context(&c)
--- a/base/runtime/core_builtin.odin
+++ b/base/runtime/core_builtin.odin
@@ -54,9 +54,16 @@ container_of :: #force_inline proc "contextless" (ptr: $P/^$Field_Type, $T: type


 when !NO_DEFAULT_TEMP_ALLOCATOR {
-	@thread_local global_default_temp_allocator_data: Default_Temp_Allocator
+	when ODIN_ARCH == .i386 && ODIN_OS == .Windows {
+		// Thread-local storage is problematic on Windows i386
+		global_default_temp_allocator_data: Default_Temp_Allocator
+	} else {
+		@thread_local global_default_temp_allocator_data: Default_Temp_Allocator
+	}
 }

+// Initializes the global temporary allocator used as the default `context.temp_allocator`.
+// This is ignored when `NO_DEFAULT_TEMP_ALLOCATOR` is true.
@(builtin, disabled=NO_DEFAULT_TEMP_ALLOCATOR)
 init_global_temporary_allocator :: proc(size: int, backup_allocator := context.allocator) {
 	when !NO_DEFAULT_TEMP_ALLOCATOR {
@@ -65,31 +72,33 @@ init_global_temporary_allocator :: proc(size: int, backup_allocator := context.a
 }


+@(require_results)
+copy_slice_raw :: proc "contextless" (dst, src: rawptr, dst_len, src_len, elem_size: int) -> int {
+	n := min(dst_len, src_len)
+	if n > 0 {
+		intrinsics.mem_copy(dst, src, n*elem_size)
+	}
+	return n
+}
+
 // `copy_slice` is a built-in procedure that copies elements from a source slice `src` to a destination slice `dst`.
 // The source and destination may overlap. Copy returns the number of elements copied, which will be the minimum
 // of len(src) and len(dst).
 //
 // Prefer the procedure group `copy`.
@builtin
-copy_slice :: proc "contextless" (dst, src: $T/[]$E) -> int {
-	n := min(len(dst), len(src))
-	if n > 0 {
-		intrinsics.mem_copy(raw_data(dst), raw_data(src), n*size_of(E))
-	}
-	return n
+copy_slice :: #force_inline proc "contextless" (dst, src: $T/[]$E) -> int {
+	return copy_slice_raw(raw_data(dst), raw_data(src), len(dst), len(src), size_of(E))
 }
+
 // `copy_from_string` is a built-in procedure that copies elements from a source string `src` to a destination slice `dst`.
 // The source and destination may overlap. Copy returns the number of elements copied, which will be the minimum
 // of len(src) and len(dst).
 //
 // Prefer the procedure group `copy`.
@builtin
-copy_from_string :: proc "contextless" (dst: $T/[]$E/u8, src: $S/string) -> int {
-	n := min(len(dst), len(src))
-	if n > 0 {
-		intrinsics.mem_copy(raw_data(dst), raw_data(src), n)
-	}
-	return n
+copy_from_string :: #force_inline proc "contextless" (dst: $T/[]$E/u8, src: $S/string) -> int {
+	return copy_slice_raw(raw_data(dst), raw_data(src), len(dst), len(src), 1)
 }

 // `copy_from_string16` is a built-in procedure that copies elements from a source string `src` to a destination slice `dst`.
@@ -98,12 +107,8 @@ copy_from_string :: proc "contextless" (dst: $T/[]$E/u8, src: $S/string) -> int
 //
 // Prefer the procedure group `copy`.
@builtin
-copy_from_string16 :: proc "contextless" (dst: $T/[]$E/u16, src: $S/string16) -> int {
-	n := min(len(dst), len(src))
-	if n > 0 {
-		intrinsics.mem_copy(raw_data(dst), raw_data(src), n*size_of(u16))
-	}
-	return n
+copy_from_string16 :: #force_inline proc "contextless" (dst: $T/[]$E/u16, src: $S/string16) -> int {
+	return copy_slice_raw(raw_data(dst), raw_data(src), len(dst), len(src), 2)
 }

 // `copy` is a built-in procedure that copies elements from a source slice/string `src` to a destination slice `dst`.
@@ -166,11 +171,17 @@ remove_range :: proc(array: ^$D/[dynamic]$T, #any_int lo, hi: int, loc := #calle
@builtin
 pop :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
 	assert(len(array) > 0, loc=loc)
-	res = array[len(array)-1]
-	(^Raw_Dynamic_Array)(array).len -= 1
+	_pop_type_erased(&res, (^Raw_Dynamic_Array)(array), size_of(E))
 	return res
 }

+_pop_type_erased :: proc(res: rawptr, array: ^Raw_Dynamic_Array, elem_size: int, loc := #caller_location) {
+	end := rawptr(uintptr(array.data) + uintptr(elem_size*(array.len-1)))
+	intrinsics.mem_copy_non_overlapping(res, end, elem_size)
+	array.len -= 1
+}
+
+

 // `pop_safe` trys to remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
 // If the operation is not possible, it will return false.
@@ -256,7 +267,10 @@ non_zero_resize :: proc{

 // Shrinks the capacity of a dynamic array or map down to the current length, or the given capacity.
@builtin
-shrink :: proc{shrink_dynamic_array, shrink_map}
+shrink :: proc{
+	shrink_dynamic_array,
+	shrink_map,
+}

 // `free` will try to free the passed pointer, with the given `allocator` if the allocator supports this operation.
@builtin
@@ -334,20 +348,19 @@ delete :: proc{
 // The new built-in procedure allocates memory. The first argument is a type, not a value, and the value
 // return is a pointer to a newly allocated value of that type using the specified allocator, default is context.allocator
@(builtin, require_results)
-new :: proc($T: typeid, allocator := context.allocator, loc := #caller_location) -> (^T, Allocator_Error) #optional_allocator_error {
-	return new_aligned(T, align_of(T), allocator, loc)
+new :: proc($T: typeid, allocator := context.allocator, loc := #caller_location) -> (t: ^T, err: Allocator_Error) #optional_allocator_error {
+	t = (^T)(raw_data(mem_alloc_bytes(size_of(T), align_of(T), allocator, loc) or_return))
+	return
 }
@(require_results)
 new_aligned :: proc($T: typeid, alignment: int, allocator := context.allocator, loc := #caller_location) -> (t: ^T, err: Allocator_Error) {
-	data := mem_alloc_bytes(size_of(T), alignment, allocator, loc) or_return
-	t = (^T)(raw_data(data))
+	t = (^T)(raw_data(mem_alloc_bytes(size_of(T), alignment, allocator, loc) or_return))
 	return
 }

@(builtin, require_results)
 new_clone :: proc(data: $T, allocator := context.allocator, loc := #caller_location) -> (t: ^T, err: Allocator_Error) #optional_allocator_error {
-	t_data := mem_alloc_bytes(size_of(T), align_of(T), allocator, loc) or_return
-	t = (^T)(raw_data(t_data))
+	t = (^T)(raw_data(mem_alloc_bytes(size_of(T), align_of(T), allocator, loc) or_return))
 	if t != nil {
 		t^ = data
 	}
@@ -357,14 +370,21 @@ new_clone :: proc(data: $T, allocator := context.allocator, loc := #caller_locat
 DEFAULT_DYNAMIC_ARRAY_CAPACITY :: 8

@(require_results)
-make_aligned :: proc($T: typeid/[]$E, #any_int len: int, alignment: int, allocator := context.allocator, loc := #caller_location) -> (T, Allocator_Error) #optional_allocator_error {
+make_aligned :: proc($T: typeid/[]$E, #any_int len: int, alignment: int, allocator := context.allocator, loc := #caller_location) -> (res: T, err: Allocator_Error) #optional_allocator_error {
+	err = _make_aligned_type_erased(&res, size_of(E), len, alignment, allocator, loc)
+	return
+}
+
+@(require_results)
+_make_aligned_type_erased :: proc(slice: rawptr, elem_size: int, len: int, alignment: int, allocator: Allocator, loc := #caller_location) -> Allocator_Error {
 	make_slice_error_loc(loc, len)
-	data, err := mem_alloc_bytes(size_of(E)*len, alignment, allocator, loc)
-	if data == nil && size_of(E) != 0 {
-		return nil, err
+	data, err := mem_alloc_bytes(elem_size*len, alignment, allocator, loc)
+	if data == nil && elem_size != 0 {
+		return err
 	}
-	s := Raw_Slice{raw_data(data), len}
-	return transmute(T)s, err
+	(^Raw_Slice)(slice).data = raw_data(data)
+	(^Raw_Slice)(slice).len  = len
+	return err
 }

 // `make_slice` allocates and initializes a slice. Like `new`, the first argument is a type, not a value.
@@ -372,24 +392,27 @@ make_aligned :: proc($T: typeid/[]$E, #any_int len: int, alignment: int, allocat
 //
 // Note: Prefer using the procedure group `make`.
@(builtin, require_results)
-make_slice :: proc($T: typeid/[]$E, #any_int len: int, allocator := context.allocator, loc := #caller_location) -> (T, Allocator_Error) #optional_allocator_error {
-	return make_aligned(T, len, align_of(E), allocator, loc)
+make_slice :: proc($T: typeid/[]$E, #any_int len: int, allocator := context.allocator, loc := #caller_location) -> (res: T, err: Allocator_Error) #optional_allocator_error {
+	err = _make_aligned_type_erased(&res, size_of(E), len, align_of(E), allocator, loc)
+	return
 }
 // `make_dynamic_array` allocates and initializes a dynamic array. Like `new`, the first argument is a type, not a value.
 // Unlike `new`, `make`'s return value is the same as the type of its argument, not a pointer to it.
 //
 // Note: Prefer using the procedure group `make`.
@(builtin, require_results)
-make_dynamic_array :: proc($T: typeid/[dynamic]$E, allocator := context.allocator, loc := #caller_location) -> (T, Allocator_Error) #optional_allocator_error {
-	return make_dynamic_array_len_cap(T, 0, 0, allocator, loc)
+make_dynamic_array :: proc($T: typeid/[dynamic]$E, allocator := context.allocator, loc := #caller_location) -> (array: T, err: Allocator_Error) #optional_allocator_error {
+	err = _make_dynamic_array_len_cap((^Raw_Dynamic_Array)(&array), size_of(E), align_of(E), 0, 0, allocator, loc)
+	return
 }
 // `make_dynamic_array_len` allocates and initializes a dynamic array. Like `new`, the first argument is a type, not a value.
 // Unlike `new`, `make`'s return value is the same as the type of its argument, not a pointer to it.
 //
 // Note: Prefer using the procedure group `make`.
@(builtin, require_results)
-make_dynamic_array_len :: proc($T: typeid/[dynamic]$E, #any_int len: int, allocator := context.allocator, loc := #caller_location) -> (T, Allocator_Error) #optional_allocator_error {
-	return make_dynamic_array_len_cap(T, len, len, allocator, loc)
+make_dynamic_array_len :: proc($T: typeid/[dynamic]$E, #any_int len: int, allocator := context.allocator, loc := #caller_location) -> (array: T, err: Allocator_Error) #optional_allocator_error {
+	err = _make_dynamic_array_len_cap((^Raw_Dynamic_Array)(&array), size_of(E), align_of(E), len, len, allocator, loc)
+	return
 }
 // `make_dynamic_array_len_cap` allocates and initializes a dynamic array. Like `new`, the first argument is a type, not a value.
 // Unlike `new`, `make`'s return value is the same as the type of its argument, not a pointer to it.
@@ -494,7 +517,7 @@ clear_map :: proc "contextless" (m: ^$T/map[$K]$V) {
 // Note: Prefer the procedure group `reserve`
@builtin
 reserve_map :: proc(m: ^$T/map[$K]$V, #any_int capacity: int, loc := #caller_location) -> Allocator_Error {
-	return __dynamic_map_reserve((^Raw_Map)(m), map_info(T), uint(capacity), loc) if m != nil else nil
+	return __dynamic_map_reserve((^Raw_Map)(m), map_info(T), uint(capacity), loc)
 }

 // Shrinks the capacity of a map down to the current length.
@@ -523,7 +546,7 @@ delete_key :: proc(m: ^$T/map[$K]$V, key: K) -> (deleted_key: K, deleted_value:
 	return
 }

-_append_elem :: #force_inline proc(array: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, arg_ptr: rawptr, should_zero: bool, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+_append_elem :: #force_no_inline proc(array: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, arg_ptr: rawptr, should_zero: bool, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	if array == nil {
 		return
 	}
@@ -546,6 +569,7 @@ _append_elem :: #force_inline proc(array: ^Raw_Dynamic_Array, size_of_elem, alig
 	return
 }

+// `append_elem` appends an element to the end of a dynamic array.
@builtin
 append_elem :: proc(array: ^$T/[dynamic]$E, #no_broadcast arg: E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	when size_of(E) == 0 {
@@ -557,6 +581,9 @@ append_elem :: proc(array: ^$T/[dynamic]$E, #no_broadcast arg: E, loc := #caller
 	}
 }

+// `non_zero_append_elem` appends an element to the end of a dynamic array, without zeroing any reserved memory
+//
+// Note: Prefer using the procedure group `non_zero_append`.
@builtin
 non_zero_append_elem :: proc(array: ^$T/[dynamic]$E, #no_broadcast arg: E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	when size_of(E) == 0 {
@@ -568,7 +595,7 @@ non_zero_append_elem :: proc(array: ^$T/[dynamic]$E, #no_broadcast arg: E, loc :
 	}
 }

-_append_elems :: #force_inline proc(array: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, should_zero: bool, loc := #caller_location, args: rawptr, arg_len: int) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+_append_elems :: #force_no_inline proc(array: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, should_zero: bool, loc := #caller_location, args: rawptr, arg_len: int) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	if array == nil {
 		return 0, nil
 	}
@@ -595,6 +622,9 @@ _append_elems :: #force_inline proc(array: ^Raw_Dynamic_Array, size_of_elem, ali
 	return arg_len, err
 }

+// `append_elems` appends `args` to the end of a dynamic array.
+//
+// Note: Prefer using the procedure group `append`.
@builtin
 append_elems :: proc(array: ^$T/[dynamic]$E, #no_broadcast args: ..E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	when size_of(E) == 0 {
@@ -606,6 +636,9 @@ append_elems :: proc(array: ^$T/[dynamic]$E, #no_broadcast args: ..E, loc := #ca
 	}
 }

+// `non_zero_append_elems` appends `args` to the end of a dynamic array, without zeroing any reserved memory
+//
+// Note: Prefer using the procedure group `non_zero_append`.
@builtin
 non_zero_append_elems :: proc(array: ^$T/[dynamic]$E, #no_broadcast args: ..E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	when size_of(E) == 0 {
@@ -622,10 +655,16 @@ _append_elem_string :: proc(array: ^$T/[dynamic]$E/u8, arg: $A/string, should_ze
 	return _append_elems((^Raw_Dynamic_Array)(array), 1, 1, should_zero, loc, raw_data(arg), len(arg))
 }

+// `append_elem_string` appends a string to the end of a dynamic array of bytes
+//
+// Note: Prefer using the procedure group `append`.
@builtin
 append_elem_string :: proc(array: ^$T/[dynamic]$E/u8, arg: $A/string, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	return _append_elem_string(array, arg, true, loc)
 }
+// `non_zero_append_elem_string` appends a string to the end of a dynamic array of bytes, without zeroing any reserved memory
+//
+// Note: Prefer using the procedure group `non_zero_append`.
@builtin
 non_zero_append_elem_string :: proc(array: ^$T/[dynamic]$E/u8, arg: $A/string, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	return _append_elem_string(array, arg, false, loc)
@@ -633,6 +672,8 @@ non_zero_append_elem_string :: proc(array: ^$T/[dynamic]$E/u8, arg: $A/string, l


 // The append_string built-in procedure appends multiple strings to the end of a [dynamic]u8 like type
+//
+// Note: Prefer using the procedure group `append`.
@builtin
 append_string :: proc(array: ^$T/[dynamic]$E/u8, args: ..string, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	n_arg: int
@@ -647,7 +688,8 @@ append_string :: proc(array: ^$T/[dynamic]$E/u8, args: ..string, loc := #caller_
 }

 // The append built-in procedure appends elements to the end of a dynamic array
-@builtin append :: proc{
+@builtin
+append :: proc{
 	append_elem,
 	append_elems,
 	append_elem_string,
@@ -656,7 +698,8 @@ append_string :: proc(array: ^$T/[dynamic]$E/u8, args: ..string, loc := #caller_
 	append_soa_elems,
 }

-@builtin non_zero_append :: proc{
+@builtin
+non_zero_append :: proc{
 	non_zero_append_elem,
 	non_zero_append_elems,
 	non_zero_append_elem_string,
@@ -666,6 +709,8 @@ append_string :: proc(array: ^$T/[dynamic]$E/u8, args: ..string, loc := #caller_
 }


+// `append_nothing` appends an empty value to a dynamic array. It returns `1, nil` if successful, and `0, err` when it was not possible,
+// whatever `err` happens to be.
@builtin
 append_nothing :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	if array == nil {
@@ -677,6 +722,7 @@ append_nothing :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (n: i
 }


+// `inject_at_elem` injects an element in a dynamic array at a specified index and moves the previous elements after that index "across"
@builtin
 inject_at_elem :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadcast arg: E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	when !ODIN_NO_BOUNDS_CHECK {
@@ -698,6 +744,7 @@ inject_at_elem :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadcas
 	return
 }

+// `inject_at_elems` injects multiple elements in a dynamic array at a specified index and moves the previous elements after that index "across"
@builtin
 inject_at_elems :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	when !ODIN_NO_BOUNDS_CHECK {
@@ -724,6 +771,7 @@ inject_at_elems :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadca
 	return
 }

+// `inject_at_elem_string` injects a string into a dynamic array at a specified index and moves the previous elements after that index "across"
@builtin
 inject_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, #any_int index: int, arg: string, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	when !ODIN_NO_BOUNDS_CHECK {
@@ -748,10 +796,17 @@ inject_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, #any_int index: int, ar
 	return
 }

-@builtin inject_at :: proc{inject_at_elem, inject_at_elems, inject_at_elem_string}
+// `inject_at` injects something into a dynamic array at a specified index and moves the previous elements after that index "across"
+@builtin inject_at :: proc{
+	inject_at_elem,
+	inject_at_elems,
+	inject_at_elem_string,
+}



+// `assign_at_elem` assigns a value at a given index. If the requested index is not smaller than the current
+// length of the dynamic array, it will attempt to `resize` to a new length of `index+1` and then assign at `index`.
@builtin
 assign_at_elem :: proc(array: ^$T/[dynamic]$E, #any_int index: int, arg: E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	if index < len(array) {
@@ -766,6 +821,8 @@ assign_at_elem :: proc(array: ^$T/[dynamic]$E, #any_int index: int, arg: E, loc
 }


+// `assign_at_elems` assigns values starting at a given index. If the dynamic array is too short to hold them,
+// it will attempt to `resize` to a new length of `index+len(args)` and then assign at `index`.
@builtin
 assign_at_elems :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	new_size := index + len(args)
@@ -782,7 +839,8 @@ assign_at_elems :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadca
 	return
 }

-
+// `assign_at_elem_string` assigns a string starting at a given index. If the dynamic array is too short to hold it,
+// it will attempt to `resize` to a new length of `index+len(arg)` and then assign at `index`.
@builtin
 assign_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, #any_int index: int, arg: string, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	new_size := index + len(arg)
@@ -799,8 +857,14 @@ assign_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, #any_int index: int, ar
 	return
 }

-@builtin assign_at :: proc{assign_at_elem, assign_at_elems, assign_at_elem_string}
-
+// `assign_at` assigns a value at a given index. If the dynamic array is too short to hold it,
+// it will attempt to `resize` to a new length of `index+size_needed` and then assign at `index`.
+@builtin
+assign_at :: proc{
+	assign_at_elem,
+	assign_at_elems,
+	assign_at_elem_string,
+}



@@ -816,8 +880,10 @@ clear_dynamic_array :: proc "contextless" (array: ^$T/[dynamic]$E) {

 // `reserve_dynamic_array` will try to reserve memory of a passed dynamic array or map to the requested element count (setting the `cap`).
 //
+// When a memory resize allocation is required, the memory will be asked to be zeroed only if `should_zero` is true.
+//
 // Note: Prefer the procedure group `reserve`.
-_reserve_dynamic_array :: #force_inline proc(a: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, capacity: int, should_zero: bool, loc := #caller_location) -> Allocator_Error {
+_reserve_dynamic_array :: #force_no_inline proc(a: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, capacity: int, should_zero: bool, loc := #caller_location) -> Allocator_Error {
 	if a == nil {
 		return nil
 	}
@@ -850,18 +916,28 @@ _reserve_dynamic_array :: #force_inline proc(a: ^Raw_Dynamic_Array, size_of_elem
 	return nil
 }

+// `reserve_dynamic_array` will try to reserve memory of a passed dynamic array or map to the requested element count (setting the `cap`).
+//
+// When a memory resize allocation is required, the memory will be asked to be zeroed (i.e. it calls `mem_resize`).
+//
+// Note: Prefer the procedure group `reserve`.
@builtin
 reserve_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int capacity: int, loc := #caller_location) -> Allocator_Error {
 	return _reserve_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), capacity, true, loc)
 }

+// `non_zero_reserve_dynamic_array` will try to reserve memory of a passed dynamic array or map to the requested element count (setting the `cap`).
+//
+// When a memory resize allocation is required, the memory will be asked to not be zeroed (i.e. it calls `non_zero_mem_resize`).
+//
+// Note: Prefer the procedure group `non_zero_reserve`.
@builtin
 non_zero_reserve_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int capacity: int, loc := #caller_location) -> Allocator_Error {
 	return _reserve_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), capacity, false, loc)
 }


-_resize_dynamic_array :: #force_inline proc(a: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, length: int, should_zero: bool, loc := #caller_location) -> Allocator_Error {
+_resize_dynamic_array :: #force_no_inline proc(a: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, length: int, should_zero: bool, loc := #caller_location) -> Allocator_Error {
 	if a == nil {
 		return nil
 	}
@@ -903,28 +979,34 @@ _resize_dynamic_array :: #force_inline proc(a: ^Raw_Dynamic_Array, size_of_elem,

 // `resize_dynamic_array` will try to resize memory of a passed dynamic array or map to the requested element count (setting the `len`, and possibly `cap`).
 //
+// When a memory resize allocation is required, the memory will be asked to be zeroed (i.e. it calls `mem_resize`).
+//
 // Note: Prefer the procedure group `resize`
@builtin
 resize_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int length: int, loc := #caller_location) -> Allocator_Error {
 	return _resize_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), length, true, loc=loc)
 }

+// `non_zero_resize_dynamic_array` will try to resize memory of a passed dynamic array or map to the requested element count (setting the `len`, and possibly `cap`).
+//
+// When a memory resize allocation is required, the memory will be asked to not be zeroed (i.e. it calls `non_zero_mem_resize`).
+//
+// Note: Prefer the procedure group `non_zero_resize`
@builtin
 non_zero_resize_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int length: int, loc := #caller_location) -> Allocator_Error {
 	return _resize_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), length, false, loc=loc)
 }

-/*
-	Shrinks the capacity of a dynamic array down to the current length, or the given capacity.
-
-	If `new_cap` is negative, then `len(array)` is used.
-
-	Returns false if `cap(array) < new_cap`, or the allocator report failure.
-
-	If `len(array) < new_cap`, then `len(array)` will be left unchanged.
-
-	Note: Prefer the procedure group `shrink`
-*/
+// Shrinks the capacity of a dynamic array down to the current length, or the given capacity.
+//
+// If `new_cap` is negative, then `len(array)` is used.
+//
+// Returns false if `cap(array) < new_cap`, or the allocator reports failure.
+//
+// If `len(array) < new_cap`, then `len(array)` will be left unchanged.
+//
+// Note: Prefer the procedure group `shrink`
+@builtin
 shrink_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int new_cap := -1, loc := #caller_location) -> (did_shrink: bool, err: Allocator_Error) {
 	return _shrink_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), new_cap, loc)
 }
@@ -1005,6 +1087,7 @@ map_entry :: proc(m: ^$T/map[$K]$V, key: K, loc := #caller_location) -> (key_ptr
 }


+// `card` returns the number of bits that are set in a bit_set—its cardinality
@builtin
 card :: proc "contextless" (s: $S/bit_set[$E; $U]) -> int {
 	return int(intrinsics.count_ones(transmute(intrinsics.type_bit_set_underlying_type(S))s))
@@ -1012,6 +1095,10 @@ card :: proc "contextless" (s: $S/bit_set[$E; $U]) -> int {



+// Evaluates the condition and panics the program iff the condition is false.
+// This uses the `context.assertion_failure_proc` to assert.
+//
+// This routine will be ignored when `ODIN_DISABLE_ASSERT` is true.
@builtin
@(disabled=ODIN_DISABLE_ASSERT)
 assert :: proc(condition: bool, message := #caller_expression(condition), loc := #caller_location) {
@@ -1032,9 +1119,9 @@ assert :: proc(condition: bool, message := #caller_expression(condition), loc :=
 	}
 }

-// Evaluates the condition and aborts the program iff the condition is
-// false.  This routine ignores `ODIN_DISABLE_ASSERT`, and will always
-// execute.
+// Evaluates the condition and panics the program iff the condition is false.
+// This uses the `context.assertion_failure_proc` to assert.
+// This routine ignores `ODIN_DISABLE_ASSERT`, and will always execute.
@builtin
 ensure :: proc(condition: bool, message := #caller_expression(condition), loc := #caller_location) {
 	if !condition {
@@ -1050,6 +1137,8 @@ ensure :: proc(condition: bool, message := #caller_expression(condition), loc :=
 	}
 }

+// Panics the program with a message.
+// This uses the `context.assertion_failure_proc` to panic.
@builtin
 panic :: proc(message: string, loc := #caller_location) -> ! {
 	p := context.assertion_failure_proc
@@ -1059,6 +1148,8 @@ panic :: proc(message: string, loc := #caller_location) -> ! {
 	p("panic", message, loc)
 }

+// Panics the program with a message to indicate something has yet to be implemented.
+// This uses the `context.assertion_failure_proc` to panic.
@builtin
 unimplemented :: proc(message := "", loc := #caller_location) -> ! {
 	p := context.assertion_failure_proc
@@ -1068,7 +1159,10 @@ unimplemented :: proc(message := "", loc := #caller_location) -> ! {
 	p("not yet implemented", message, loc)
 }

-
+// Evaluates the condition and panics the program iff the condition is false.
+// This uses the `default_assertion_contextless_failure_proc` to assert.
+//
+// This routine will be ignored when `ODIN_DISABLE_ASSERT` is true.
@builtin
@(disabled=ODIN_DISABLE_ASSERT)
 assert_contextless :: proc "contextless" (condition: bool, message := #caller_expression(condition), loc := #caller_location) {
@@ -1085,6 +1179,8 @@ assert_contextless :: proc "contextless" (condition: bool, message := #caller_ex
 	}
 }

+// Evaluates the condition and panics the program iff the condition is false.
+// This uses the `default_assertion_contextless_failure_proc` to assert, and ignores `ODIN_DISABLE_ASSERT`.
@builtin
 ensure_contextless :: proc "contextless" (condition: bool, message := #caller_expression(condition), loc := #caller_location) {
 	if !condition {
@@ -1096,11 +1192,15 @@ ensure_contextless :: proc "contextless" (condition: bool, message := #caller_ex
 	}
 }

+// Panics the program with a message.
+// This uses the `default_assertion_contextless_failure_proc` to panic.
@builtin
 panic_contextless :: proc "contextless" (message: string, loc := #caller_location) -> ! {
 	default_assertion_contextless_failure_proc("panic", message, loc)
 }

+// Panics the program with a message to indicate something has yet to be implemented.
+// This uses the `default_assertion_contextless_failure_proc` to panic.
@builtin
 unimplemented_contextless :: proc "contextless" (message := "", loc := #caller_location) -> ! {
 	default_assertion_contextless_failure_proc("not yet implemented", message, loc)
--- a/base/runtime/core_builtin_soa.odin
+++ b/base/runtime/core_builtin_soa.odin
@@ -501,6 +501,121 @@ append_soa :: proc{
 }


+// `append_nothing_soa` appends an empty value to a dynamic SOA array. It returns `1, nil` if successful, `0, err` when the
+// resize was not possible, whatever `err` happens to be, and `0, nil` when `array` is `nil`.
+@builtin
+append_nothing_soa :: proc(array: ^$T/#soa[dynamic]$E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+	if array == nil {
+		return 0, nil
+	}
+	prev_len := len(array)
+	resize_soa(array, len(array)+1, loc) or_return // grow by one element; an allocator error goes straight back to the caller
+	return len(array)-prev_len, nil // number of elements actually added
+}
+
+
+// `inject_at_elem_soa` inserts `arg` into a dynamic SOA array at `index`, moving the elements from `index` onwards "across" by one; an `index` past the end grows the array to fit, and a `nil` array yields `false, nil`
+@builtin
+inject_at_elem_soa :: proc(array: ^$T/#soa[dynamic]$E, #any_int index: int, #no_broadcast arg: E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
+	when !ODIN_NO_BOUNDS_CHECK {
+		ensure(index >= 0, "Index must be positive.", loc)
+	}
+	if array == nil {
+		return
+	}
+	n := max(len(array), index) // element count before insertion; an index past the end extends it to `index`
+	m :: 1 // number of elements being inserted
+	new_len := n + m
+
+	resize_soa(array, new_len, loc) or_return
+
+	when size_of(E) != 0 { // zero-sized elements have no field data to move
+		ti := type_info_base(type_info_of(typeid_of(T)))
+		si := &ti.variant.(Type_Info_Struct)
+
+		field_count := len(E) when intrinsics.type_is_array(E) else intrinsics.type_struct_field_count(E)
+
+		item_offset := 0
+
+		arg_copy := arg // local copy so the argument's address can be taken
+		arg_ptr := &arg_copy
+
+		for i in 0..<field_count {
+			data := (^uintptr)(uintptr(array) + uintptr(si.offsets[i]))^ // data pointer stored for field i
+			type := si.types[i].variant.(Type_Info_Multi_Pointer).elem
+			item_offset = align_forward_int(item_offset, type.align) // byte offset of field i inside `arg_copy`
+
+			src := data + uintptr(index * type.size)
+			dst := data + uintptr((index + m) * type.size)
+			mem_copy(rawptr(dst), rawptr(src), (n - index) * type.size) // move the elements from `index` onwards up by `m` slots
+
+			mem_copy(rawptr(src), rawptr(uintptr(arg_ptr) + uintptr(item_offset)), type.size) // write field i of `arg` into the gap
+
+			item_offset += type.size
+		}
+	}
+
+	ok = true
+	return
+}
+
+// `inject_at_elems_soa` inserts `args` into a dynamic SOA array at `index`, moving the elements from `index` onwards "across" by `len(args)`; an `index` past the end grows the array to fit, and a `nil` array yields `false, nil`
+@builtin
+inject_at_elems_soa :: proc(array: ^$T/#soa[dynamic]$E, #any_int index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
+	when !ODIN_NO_BOUNDS_CHECK {
+		ensure(index >= 0, "Index must be positive.", loc)
+	}
+	if array == nil {
+		return
+	}
+	if len(args) == 0 { // nothing to inject counts as success
+		ok = true
+		return
+	}
+
+	n := max(len(array), index) // element count before insertion; an index past the end extends it to `index`
+	m := len(args) // number of elements being inserted
+	new_len := n + m
+
+	resize_soa(array, new_len, loc) or_return
+
+	when size_of(E) != 0 { // zero-sized elements have no field data to move
+		ti := type_info_base(type_info_of(typeid_of(T)))
+		si := &ti.variant.(Type_Info_Struct)
+
+		field_count := len(E) when intrinsics.type_is_array(E) else intrinsics.type_struct_field_count(E)
+
+		item_offset := 0
+
+		args_ptr := &args[0] // safe: `args` was checked to be non-empty above
+
+		for i in 0..<field_count {
+			data := (^uintptr)(uintptr(array) + uintptr(si.offsets[i]))^ // data pointer stored for field i
+			type := si.types[i].variant.(Type_Info_Multi_Pointer).elem
+			item_offset = align_forward_int(item_offset, type.align) // byte offset of field i inside one `E`
+
+			src := data + uintptr(index * type.size)
+			dst := data + uintptr((index + m) * type.size)
+			mem_copy(rawptr(dst), rawptr(src), (n - index) * type.size) // move the elements from `index` onwards up by `m` slots
+
+			for j in 0..<len(args) {
+				d := rawptr(src + uintptr(j*type.size)) // slot `index+j` of field i
+				s := rawptr(uintptr(args_ptr) + uintptr(item_offset) + uintptr(j*size_of(E))) // field i of `args[j]`
+				mem_copy(d, s, type.size)
+			}
+
+			item_offset += type.size
+		}
+	}
+
+	ok = true
+	return
+}
+
+// `inject_at_soa` injects something into a dynamic SOA array at a specified index and moves the previous elements after that index "across"
+@builtin inject_at_soa :: proc{inject_at_elem_soa, inject_at_elems_soa}
+
+@builtin
 delete_soa_slice :: proc(array: $T/#soa[]$E, allocator := context.allocator, loc := #caller_location) -> Allocator_Error {
 	field_count :: len(E) when intrinsics.type_is_array(E) else intrinsics.type_struct_field_count(E)
 	when field_count != 0 {
@@ -511,6 +626,7 @@ delete_soa_slice :: proc(array: $T/#soa[]$E, allocator := context.allocator, loc
 	return nil
 }

+@builtin
 delete_soa_dynamic_array :: proc(array: $T/#soa[dynamic]$E, loc := #caller_location) -> Allocator_Error {
 	field_count :: len(E) when intrinsics.type_is_array(E) else intrinsics.type_struct_field_count(E)
 	when field_count != 0 {
@@ -529,7 +645,7 @@ delete_soa :: proc{
 	delete_soa_dynamic_array,
 }

-
+@builtin
 clear_soa_dynamic_array :: proc(array: ^$T/#soa[dynamic]$E) {
 	field_count :: len(E) when intrinsics.type_is_array(E) else intrinsics.type_struct_field_count(E)
 	when field_count != 0 {
--- a/base/runtime/default_allocators_nil.odin
+++ b/base/runtime/default_allocators_nil.odin
@@ -23,6 +23,14 @@ nil_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
 	return nil, .None
 }

+// nil_allocator returns an allocator which will return `nil` for any result.
+// * `.Alloc`, `.Alloc_Non_Zeroed`, `.Resize`, `.Resize_Non_Zeroed` will return `nil, .Out_Of_Memory`
+// * `.Free` will return `nil, .None`
+// * `.Free_All` will return `nil, .Mode_Not_Implemented`
+// * `.Query_Features`, `.Query_Info` will return `nil, .Mode_Not_Implemented`
+//
+// This is extremely useful for creating a dynamic array from a buffer which does nothing
+// on a resize/reserve beyond the originally allocated memory.
@(require_results)
 nil_allocator :: proc "contextless" () -> Allocator {
 	return Allocator{
@@ -73,6 +81,9 @@ panic_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
 	return nil, nil
 }

+// panic_allocator returns an allocator which will panic for any non-zero-sized allocation or `query_info`
+//
+// This is extremely useful to check when something does a memory operation when it should not, and thus panic.
@(require_results)
 panic_allocator :: proc() -> Allocator {
 	return Allocator{
--- a/base/runtime/default_temp_allocator_arena.odin
+++ b/base/runtime/default_temp_allocator_arena.odin
@@ -235,6 +235,7 @@ arena_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
 				if start < old_end && old_end == block.used && new_end <= block.capacity {
 					// grow data in-place, adjusting next allocation
 					block.used = uint(new_end)
+					arena.total_used = uint(new_end)
 					data = block.base[start:new_end]
 					// sanitizer.address_unpoison(data)
 					return
--- a/base/runtime/default_temporary_allocator.odin
+++ b/base/runtime/default_temporary_allocator.odin
@@ -4,6 +4,7 @@ DEFAULT_TEMP_ALLOCATOR_BACKING_SIZE: int : #config(DEFAULT_TEMP_ALLOCATOR_BACKIN
 NO_DEFAULT_TEMP_ALLOCATOR: bool : ODIN_OS == .Freestanding || ODIN_DEFAULT_TO_NIL_ALLOCATOR

 when NO_DEFAULT_TEMP_ALLOCATOR {
+	// `Default_Temp_Allocator` is a `nil_allocator` when `NO_DEFAULT_TEMP_ALLOCATOR` is `true`.
 	Default_Temp_Allocator :: struct {}
 	
 	default_temp_allocator_init :: proc(s: ^Default_Temp_Allocator, size: int, backing_allocator := context.allocator) {}
@@ -20,6 +21,11 @@ when NO_DEFAULT_TEMP_ALLOCATOR {
 	default_temp_allocator_temp_end :: proc(temp: Arena_Temp, loc := #caller_location) {
 	}
 } else {
+	// `Default_Temp_Allocator` is an `Arena` based type of allocator. See `runtime.Arena` for its implementation.
+	// The default `context.temp_allocator` is typically called with `free_all(context.temp_allocator)` once per "frame-loop"
+	// to prevent it from "leaking" memory.
+	//
+	// Note: `Default_Temp_Allocator` is a `nil_allocator` when `NO_DEFAULT_TEMP_ALLOCATOR` is `true`.
 	Default_Temp_Allocator :: struct {
 		arena: Arena,
 	}
--- a/base/runtime/dynamic_map_internal.odin
+++ b/base/runtime/dynamic_map_internal.odin
@@ -6,8 +6,6 @@ _ :: intrinsics
 // High performance, cache-friendly, open-addressed Robin Hood hashing hash map
 // data structure with various optimizations for Odin.
 //
-// Copyright 2022 (c) Dale Weiler
-//
 // The core of the hash map data structure is the Raw_Map struct which is a
 // type-erased representation of the map. This type-erased representation is
 // used in two ways: static and dynamic. When static type information is known,
@@ -985,6 +983,9 @@ __dynamic_map_entry :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_
 // IMPORTANT: USED WITHIN THE COMPILER
@(private)
 __dynamic_map_reserve :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, new_capacity: uint, loc := #caller_location) -> Allocator_Error {
+	if m == nil { // a nil map pointer has nothing to reserve; report no error instead of forwarding it
+		return nil
+	}
 	return map_reserve_dynamic(m, info, uintptr(new_capacity), loc)
 }

--- a/base/runtime/entry_windows.odin
+++ b/base/runtime/entry_windows.odin
@@ -28,7 +28,19 @@ when ODIN_BUILD_MODE == .Dynamic {
 		return true
 	}
 } else when !ODIN_TEST && !ODIN_NO_ENTRY_POINT {
-	when ODIN_ARCH == .i386 || ODIN_NO_CRT {
+	when ODIN_ARCH == .i386 && !ODIN_NO_CRT {
+		// Windows i386 with CRT: libcmt provides mainCRTStartup which calls _main
+		// Note: "c" calling convention adds underscore prefix automatically on i386
+		@(link_name="main", linkage="strong", require)
+		main :: proc "c" (argc: i32, argv: [^]cstring) -> i32 {
+			args__ = argv[:argc] // record the C argument vector in the runtime's `args__`
+			context = default_context() // establish a context before any Odin code runs
+			#force_no_inline _startup_runtime()
+			intrinsics.__entry_point() // run the program's entry point
+			#force_no_inline _cleanup_runtime()
+			return 0 // always reports success to the CRT
+		}
+	} else when ODIN_NO_CRT {
 		@(link_name="mainCRTStartup", linkage="strong", require)
 		mainCRTStartup :: proc "system" () -> i32 {
 			context = default_context()
--- a/base/runtime/heap_allocator.odin
+++ b/base/runtime/heap_allocator.odin
@@ -71,10 +71,12 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,

 		new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return

-		// NOTE: heap_resize does not zero the new memory, so we do it
-		if zero_memory && new_size > old_size {
-			new_region := raw_data(new_memory[old_size:])
-			intrinsics.mem_zero(new_region, new_size - old_size)
+		when ODIN_OS != .Windows {
+			// NOTE: heap_resize does not zero the new memory, so we do it
+			if zero_memory && new_size > old_size {
+				new_region := raw_data(new_memory[old_size:])
+				conditional_mem_zero(new_region, new_size - old_size)
+			}
 		}
 		return
 	}
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -7,10 +7,11 @@ import "base:intrinsics"
 IS_WASM :: ODIN_ARCH == .wasm32 || ODIN_ARCH == .wasm64p32

@(private)
-RUNTIME_LINKAGE :: "strong" when (
-	ODIN_USE_SEPARATE_MODULES || 
-	ODIN_BUILD_MODE == .Dynamic ||
-	!ODIN_NO_CRT) else "internal"
+RUNTIME_LINKAGE :: "strong"   when ODIN_USE_SEPARATE_MODULES else
+                   "internal" when ODIN_NO_ENTRY_POINT && (ODIN_BUILD_MODE == .Static || ODIN_BUILD_MODE == .Dynamic || ODIN_BUILD_MODE == .Object) else
+                   "strong"   when ODIN_BUILD_MODE == .Dynamic else
+                   "strong"   when !ODIN_NO_CRT else
+                   "internal"
 RUNTIME_REQUIRE :: false // !ODIN_TILDE

@(private)
@@ -123,7 +124,7 @@ mem_copy_non_overlapping :: proc "contextless" (dst, src: rawptr, len: int) -> r

 DEFAULT_ALIGNMENT :: 2*align_of(rawptr)

-mem_alloc_bytes :: #force_inline proc(size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, loc := #caller_location) -> ([]byte, Allocator_Error) {
+mem_alloc_bytes :: #force_no_inline proc(size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, loc := #caller_location) -> ([]byte, Allocator_Error) {
 	assert(is_power_of_two_int(alignment), "Alignment must be a power of two", loc)
 	if size == 0 || allocator.procedure == nil{
 		return nil, nil
@@ -131,7 +132,7 @@ mem_alloc_bytes :: #force_inline proc(size: int, alignment: int = DEFAULT_ALIGNM
 	return allocator.procedure(allocator.data, .Alloc, size, alignment, nil, 0, loc)
 }

-mem_alloc :: #force_inline proc(size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, loc := #caller_location) -> ([]byte, Allocator_Error) {
+mem_alloc :: #force_no_inline proc(size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, loc := #caller_location) -> ([]byte, Allocator_Error) {
 	assert(is_power_of_two_int(alignment), "Alignment must be a power of two", loc)
 	if size == 0 || allocator.procedure == nil {
 		return nil, nil
@@ -139,7 +140,7 @@ mem_alloc :: #force_inline proc(size: int, alignment: int = DEFAULT_ALIGNMENT, a
 	return allocator.procedure(allocator.data, .Alloc, size, alignment, nil, 0, loc)
 }

-mem_alloc_non_zeroed :: #force_inline proc(size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, loc := #caller_location) -> ([]byte, Allocator_Error) {
+mem_alloc_non_zeroed :: #force_no_inline proc(size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, loc := #caller_location) -> ([]byte, Allocator_Error) {
 	assert(is_power_of_two_int(alignment), "Alignment must be a power of two", loc)
 	if size == 0 || allocator.procedure == nil {
 		return nil, nil
@@ -147,7 +148,8 @@ mem_alloc_non_zeroed :: #force_inline proc(size: int, alignment: int = DEFAULT_A
 	return allocator.procedure(allocator.data, .Alloc_Non_Zeroed, size, alignment, nil, 0, loc)
 }

-mem_free :: #force_inline proc(ptr: rawptr, allocator := context.allocator, loc := #caller_location) -> Allocator_Error {
+@builtin
+mem_free :: #force_no_inline proc(ptr: rawptr, allocator := context.allocator, loc := #caller_location) -> Allocator_Error {
 	if ptr == nil || allocator.procedure == nil {
 		return nil
 	}
@@ -155,7 +157,7 @@ mem_free :: #force_inline proc(ptr: rawptr, allocator := context.allocator, loc
 	return err
 }

-mem_free_with_size :: #force_inline proc(ptr: rawptr, byte_count: int, allocator := context.allocator, loc := #caller_location) -> Allocator_Error {
+mem_free_with_size :: #force_no_inline proc(ptr: rawptr, byte_count: int, allocator := context.allocator, loc := #caller_location) -> Allocator_Error {
 	if ptr == nil || allocator.procedure == nil {
 		return nil
 	}
@@ -163,7 +165,7 @@ mem_free_with_size :: #force_inline proc(ptr: rawptr, byte_count: int, allocator
 	return err
 }

-mem_free_bytes :: #force_inline proc(bytes: []byte, allocator := context.allocator, loc := #caller_location) -> Allocator_Error {
+mem_free_bytes :: #force_no_inline proc(bytes: []byte, allocator := context.allocator, loc := #caller_location) -> Allocator_Error {
 	if bytes == nil || allocator.procedure == nil {
 		return nil
 	}
@@ -171,15 +173,15 @@ mem_free_bytes :: #force_inline proc(bytes: []byte, allocator := context.allocat
 	return err
 }

-
-mem_free_all :: #force_inline proc(allocator := context.allocator, loc := #caller_location) -> (err: Allocator_Error) {
+@builtin
+mem_free_all :: #force_no_inline proc(allocator := context.allocator, loc := #caller_location) -> (err: Allocator_Error) {
 	if allocator.procedure != nil {
 		_, err = allocator.procedure(allocator.data, .Free_All, 0, 0, nil, 0, loc)
 	}
 	return
 }

-_mem_resize :: #force_inline proc(ptr: rawptr, old_size, new_size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, should_zero: bool, loc := #caller_location) -> (data: []byte, err: Allocator_Error) {
+_mem_resize :: #force_no_inline proc(ptr: rawptr, old_size, new_size: int, alignment: int = DEFAULT_ALIGNMENT, allocator := context.allocator, should_zero: bool, loc := #caller_location) -> (data: []byte, err: Allocator_Error) {
 	assert(is_power_of_two_int(alignment), "Alignment must be a power of two", loc)
 	if allocator.procedure == nil {
 		return nil, nil
@@ -230,6 +232,55 @@ non_zero_mem_resize :: proc(ptr: rawptr, old_size, new_size: int, alignment: int
 	return _mem_resize(ptr, old_size, new_size, alignment, allocator, false, loc)
 }

+conditional_mem_zero :: proc "contextless" (data: rawptr, n_: int) #no_bounds_check {
+	// Zeroes `n_` bytes at `data`, writing only to the words/bytes that are not already zero.
+	//
+	// When acquiring memory from the OS for the first time it's likely that the OS already
+	// gives the zero page mapped multiple times for the request. Physical pages are only
+	// allocated once written to, causing a page-fault. This is often called COW (Copy on Write)
+	//
+	// You do not want to actually zero out memory in this case because it would
+	// cause a bunch of page faults decreasing the speed of allocations and
+	// increase the amount of actual resident physical memory used.
+	//
+	// Instead a better technique is to check if memory is zeroed before zeroing
+	// it. This turns out to be an important optimization in practice, saving
+	// nearly half (or more) the amount of physical memory used by an application.
+	// This is why every implementation of calloc in libc does this optimization.
+	//
+	// It may seem counter-intuitive but most allocations in an application are
+	// wasted and never used. When you consider something like a [dynamic]T which
+	// always doubles in capacity on resize but you rarely ever actually use the
+	// full capacity of a dynamic array it means you have a lot of resident waste
+	// if you actually zeroed the remainder of the memory.
+	//
+	// Keep in mind the OS is already guaranteed to give you zeroed memory by
+	// mapping in this zero page multiple times so in the best case there is no
+	// need to actually zero anything. As for testing all this memory for a zero
+	// value, it costs nothing because the same zero page is used for the
+	// whole allocation and will exist in L1 cache for the entire zero checking
+	// process.
+
+	if n_ <= 0 { // nothing to do for an empty or negative length
+		return
+	}
+	n := uint(n_)
+
+	n_words := n / size_of(uintptr) // number of whole pointer-sized words in the region
+	p_words := ([^]uintptr)(data)[:n_words]
+	p_bytes := ([^]byte)(data)[size_of(uintptr) * n_words:n] // trailing bytes that do not fill a whole word
+	for &p_word in p_words {
+		if p_word != 0 { // read first; only write when the word is actually non-zero
+			p_word = 0
+		}
+	}
+	for &p_byte in p_bytes {
+		if p_byte != 0 { // same read-before-write check for the tail bytes
+			p_byte = 0
+		}
+	}
+}
+
 memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 	switch {
 	case n == 0: return true
@@ -291,7 +342,7 @@ memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_ch
 	case y == nil: return +1
 	}
 	a, b := cast([^]byte)x, cast([^]byte)y
-	
+
 	n := uint(n)
 	i := uint(0)
 	m := uint(0)
@@ -667,7 +718,7 @@ quaternion256_eq :: #force_inline proc "contextless" (a, b: quaternion256) -> bo
 quaternion256_ne :: #force_inline proc "contextless" (a, b: quaternion256) -> bool { return real(a) != real(b) || imag(a) != imag(b) || jmag(a) != jmag(b) || kmag(a) != kmag(b) }


-string_decode_rune :: #force_inline proc "contextless" (s: string) -> (rune, int) {
+string_decode_rune :: proc "contextless" (s: string) -> (rune, int) {
 	// NOTE(bill): Duplicated here to remove dependency on package unicode/utf8

 	@(static, rodata) accept_sizes := [256]u8{
@@ -782,7 +833,7 @@ string_decode_last_rune :: proc "contextless" (s: string) -> (rune, int) {
 }


-string16_decode_rune :: #force_inline proc "contextless" (s: string16) -> (rune, int) {
+string16_decode_rune :: proc "contextless" (s: string16) -> (rune, int) {
 	REPLACEMENT_CHAR :: '\ufffd'
 	_surr1           :: 0xd800
 	_surr2           :: 0xdc00
@@ -1359,4 +1410,3 @@ when .Address in ODIN_SANITIZER_FLAGS {
 		__asan_unpoison_memory_region :: proc "system" (address: rawptr, size: uint) ---
 	}
 }
-
--- a/base/runtime/os_specific.odin
+++ b/base/runtime/os_specific.odin
@@ -2,6 +2,20 @@ package runtime

 _OS_Errno :: distinct int

+HAS_RAND_BYTES :: _HAS_RAND_BYTES
+
 stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	return _stderr_write(data)
 }
+
+rand_bytes :: proc "contextless" (dst: []byte) { // Fills `dst` through the platform-specific `_rand_bytes`.
+	when HAS_RAND_BYTES {
+		_rand_bytes(dst)
+	} else { // Compile-time branch for targets whose _HAS_RAND_BYTES is false.
+		panic_contextless("base/runtime: no runtime entropy source")
+	}
+}
+
+exit :: proc "contextless" (code: int) -> ! { // Never returns; forwards `code` to the platform-specific `_exit`.
+	_exit(code)
+}
--- a/base/runtime/os_specific_bsd.odin
+++ b/base/runtime/os_specific_bsd.odin
@@ -4,6 +4,8 @@ package runtime

 foreign import libc "system:c"

+_HAS_RAND_BYTES :: true
+
@(default_calling_convention="c")
 foreign libc {
 	@(link_name="write")
@@ -14,6 +16,8 @@ foreign libc {
 	} else {
 		__error :: proc() -> ^i32 ---
 	}
+
+	arc4random_buf :: proc(buf: [^]byte, nbytes: uint) ---
 }

 _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
@@ -24,3 +28,15 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	}
 	return int(ret), 0
 }
+
+_rand_bytes :: proc "contextless" (dst: []byte) { // Fills `dst` using libc's arc4random_buf.
+	arc4random_buf(raw_data(dst), len(dst))
+}
+
+_exit :: proc "contextless" (code: int) -> ! { // Ends the process by calling libc's `exit`.
+	@(default_calling_convention="c")
+	foreign libc {
+		exit :: proc(status: i32) -> ! ---
+	}
+	exit(i32(code)) // The foreign `exit` takes an i32 status, so `code` is converted.
+}
--- a/base/runtime/os_specific_darwin.odin
+++ b/base/runtime/os_specific_darwin.odin
@@ -4,6 +4,8 @@ package runtime

 import "base:intrinsics"

+_HAS_RAND_BYTES :: true
+
 _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	STDERR :: 2
 	when ODIN_NO_CRT {
@@ -26,3 +28,25 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 		return 0, _OS_Errno(__error()^)
 	}
 }
+
+foreign import libc "system:System"
+
+_rand_bytes :: proc "contextless" (dst: []byte) { // Fills `dst` using arc4random_buf from the system library.
+	// This procedure used to use Security/RandomCopyBytes, however
+	// on every version of MacOS (>= 10.12) that we care about,
+	// arc4random is implemented securely.
+
+	@(default_calling_convention="c")
+	foreign libc {
+		arc4random_buf :: proc(buf: [^]byte, nbytes: uint) ---
+	}
+	arc4random_buf(raw_data(dst), len(dst))
+}
+
+_exit :: proc "contextless" (code: int) -> ! { // Ends the process by calling `exit` from the system library.
+	@(default_calling_convention="c")
+	foreign libc {
+		exit :: proc(status: i32) -> ! ---
+	}
+	exit(i32(code)) // The foreign `exit` takes an i32 status, so `code` is converted.
+}
--- a/base/runtime/os_specific_freestanding.odin
+++ b/base/runtime/os_specific_freestanding.odin
@@ -2,7 +2,13 @@
 #+private
 package runtime

+_HAS_RAND_BYTES :: false
+
 // TODO(bill): reimplement `os.write`
 _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	return 0, -1
 }
+
+_exit :: proc "contextless" (code: int) -> ! { // Freestanding backend: `code` is unused.
+	trap() // No exit routine is called on this target; execution traps instead.
+}
--- a/base/runtime/os_specific_haiku.odin
+++ b/base/runtime/os_specific_haiku.odin
@@ -4,11 +4,15 @@ package runtime

 foreign import libc "system:c"

+_HAS_RAND_BYTES :: true
+
 foreign libc {
 	@(link_name="write")
 	_unix_write :: proc(fd: i32, buf: rawptr, size: int) -> int ---

 	_errnop :: proc() -> ^i32 ---
+
+	arc4random_buf :: proc(buf: [^]byte, nbytes: uint) ---
 }

 _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
@@ -19,3 +23,11 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	}
 	return int(ret), 0
 }
+
+_rand_bytes :: proc "contextless" (dst: []byte) { // Fills `dst` using libc's arc4random_buf.
+	arc4random_buf(raw_data(dst), len(dst))
+}
+
+_exit :: proc "contextless" (code: int) -> ! { // Haiku backend: `code` is unused.
+	trap() // NOTE(review): libc is linked in this file; confirm whether calling its `exit` was intended instead of trapping.
+}
--- a/base/runtime/os_specific_js.odin
+++ b/base/runtime/os_specific_js.odin
@@ -4,6 +4,8 @@ package runtime

 foreign import "odin_env"

+_HAS_RAND_BYTES :: true
+
 _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	foreign odin_env {
 		write :: proc "contextless" (fd: u32, p: []byte) ---
@@ -11,3 +13,24 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	write(1, data)
 	return len(data), 0
 }
+
+_rand_bytes :: proc "contextless" (dst: []byte) { // Fills `dst` by calling the host's `rand_bytes` import in bounded chunks.
+	foreign odin_env {
+		@(link_name = "rand_bytes")
+		env_rand_bytes :: proc "contextless" (buf: []byte) ---
+	}
+
+	MAX_PER_CALL_BYTES :: 65536 // 64kiB
+
+	dst := dst // Shadow the parameter so the slice can be advanced.
+	for len(dst) > 0 {
+		to_read := min(len(dst), MAX_PER_CALL_BYTES) // Never hand the host more than one chunk at a time.
+		env_rand_bytes(dst[:to_read])
+
+		dst = dst[to_read:]
+	}
+}
+
+_exit :: proc "contextless" (code: int) -> ! { // JS backend: `code` is unused.
+	trap() // No exit routine is called on this target; execution traps instead.
+}
--- a/base/runtime/os_specific_linux.odin
+++ b/base/runtime/os_specific_linux.odin
@@ -3,6 +3,8 @@ package runtime

 import "base:intrinsics"

+_HAS_RAND_BYTES :: true
+
 _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	when ODIN_ARCH == .amd64 {
 		SYS_write :: uintptr(1)
@@ -24,3 +26,64 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	}
 	return ret, 0
 }
+
+_rand_bytes :: proc "contextless" (dst: []byte) { // Fills `dst` via the raw getrandom syscall; panics on unrecoverable errors.
+	when ODIN_ARCH == .amd64 {
+		SYS_getrandom :: uintptr(318)
+	} else when ODIN_ARCH == .arm64 {
+		SYS_getrandom :: uintptr(278)
+	} else when ODIN_ARCH == .i386 {
+		SYS_getrandom :: uintptr(355)
+	} else when ODIN_ARCH == .arm32 {
+		SYS_getrandom :: uintptr(384)
+	} else when ODIN_ARCH == .riscv64 {
+		SYS_getrandom :: uintptr(278)
+	} else {
+		#panic("base/runtime: no SYS_getrandom definition for target")
+	}
+
+	ERR_EINTR :: 4
+	ERR_ENOSYS :: 38
+
+	MAX_PER_CALL_BYTES :: 33554431 // 2^25 - 1
+
+	dst := dst // Shadow the parameter so the slice can be advanced.
+	l := len(dst)
+
+	for l > 0 {
+		to_read := min(l, MAX_PER_CALL_BYTES)
+		ret := int(intrinsics.syscall(SYS_getrandom, uintptr(raw_data(dst[:to_read])), uintptr(to_read), uintptr(0))) // Third argument (flags) is 0.
+		switch ret {
+		case -ERR_EINTR:
+			// Call interrupted by a signal handler, just retry the
+			// request.
+			continue
+		case -ERR_ENOSYS:
+			// The kernel is apparently prehistoric (< 3.17 circa 2014)
+			// and does not support getrandom.
+			panic_contextless("base/runtime: getrandom not available in kernel")
+		case:
+			if ret < 0 {
+				// All other failures are things that should NEVER happen
+				// unless the kernel interface changes (ie: the Linux
+				// developers break userland).
+				panic_contextless("base/runtime: getrandom failed")
+			}
+		}
+		l -= ret // `ret` is the number of bytes actually written; it may be less than `to_read`.
+		dst = dst[ret:]
+	}
+}
+
+_exit :: proc "contextless" (code: int) -> ! { // Ends the process with the raw exit_group syscall.
+	SYS_exit_group ::
+		231 when ODIN_ARCH == .amd64 else
+		248 when ODIN_ARCH == .arm32 else
+		94  when ODIN_ARCH == .arm64 else
+		252 when ODIN_ARCH == .i386  else
+		94  when ODIN_ARCH == .riscv64 else
+		0 // NOTE(review): unlisted architectures silently get syscall number 0 — confirm whether a #panic is preferable.
+
+	intrinsics.syscall(uintptr(SYS_exit_group), uintptr(i32(code)))
+	unreachable() // Control is not expected to come back from the syscall above.
+}
--- a/base/runtime/os_specific_orca.odin
+++ b/base/runtime/os_specific_orca.odin
@@ -4,6 +4,8 @@ package runtime

 import "base:intrinsics"

+_HAS_RAND_BYTES :: false
+
 // Constants allowing to specify the level of logging verbosity.
 log_level :: enum u32 {
 	// Only errors are logged.
@@ -41,3 +43,8 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {

 	return len(data), 0
 }
+
+
+_exit :: proc "contextless" (code: int) -> ! { // Orca backend: `code` is unused.
+	trap() // No exit routine is called on this target; execution traps instead.
+}
--- a/base/runtime/os_specific_wasi.odin
+++ b/base/runtime/os_specific_wasi.odin
@@ -4,6 +4,8 @@ package runtime

 foreign import wasi "wasi_snapshot_preview1"

+_HAS_RAND_BYTES :: true
+
@(default_calling_convention="contextless")
 foreign wasi {
 	fd_write :: proc(
@@ -23,6 +25,12 @@ foreign wasi {
 		argv:     [^]cstring,
 		argv_buf: [^]byte,
 	) -> u16 ---
+
+	@(private="file")
+	proc_exit :: proc(rval: u32) -> ! ---
+
+	@(private ="file")
+	random_get :: proc(buf: []u8) -> u16 ---
 }

 _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
@@ -31,6 +39,12 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 	return int(n), _OS_Errno(err)
 }

+_rand_bytes :: proc "contextless" (dst: []byte) { // Fills `dst` via WASI `random_get`; panics on a non-zero errno.
+	if errno := random_get(dst); errno != 0 {
+		panic_contextless("base/runtime: wasi.random_get failed")
+	}
+}
+
 _wasi_setup_args :: proc() {
 	num_of_args, size_of_args: uint
 	if errno := args_sizes_get(&num_of_args, &size_of_args); errno != 0 {
@@ -53,3 +67,8 @@ _wasi_setup_args :: proc() {
 		delete(args_buf)
 	}
 }
+
+
+_exit :: proc "contextless" (code: int) -> ! { // Ends the process via WASI `proc_exit`.
+	proc_exit(u32(code)) // `proc_exit` takes a u32, so `code` is converted.
+}
--- a/base/runtime/os_specific_windows.odin
+++ b/base/runtime/os_specific_windows.odin
@@ -2,8 +2,11 @@
 #+private
 package runtime

+foreign import bcrypt "system:Bcrypt.lib"
 foreign import kernel32 "system:Kernel32.lib"

+_HAS_RAND_BYTES :: true
+
@(private="file")
@(default_calling_convention="system")
 foreign kernel32 {
@@ -14,6 +17,14 @@ foreign kernel32 {
 	SetHandleInformation :: proc(hObject: rawptr, dwMask: u32, dwFlags: u32) -> b32 ---
 	WriteFile            :: proc(hFile: rawptr, lpBuffer: rawptr, nNumberOfBytesToWrite: u32, lpNumberOfBytesWritten: ^u32, lpOverlapped: rawptr) -> b32 ---
 	GetLastError         :: proc() -> u32 ---
+
+	ExitProcess          :: proc(code: u32) -> ! ---
+}
+
+@(private="file")
+@(default_calling_convention="system")
+foreign bcrypt { // Bindings into Bcrypt.lib used by `_rand_bytes` in this file.
+	BCryptGenRandom :: proc(hAlgorithm: rawptr, pBuffer: [^]u8, cbBuffer: u32, dwFlags: u32) -> i32 --- // Returns 0 on success.
+}

 _stderr_write :: proc "contextless" (data: []byte) -> (n: int, err: _OS_Errno) #no_bounds_check {
@@ -49,3 +60,31 @@ _stderr_write :: proc "contextless" (data: []byte) -> (n: int, err: _OS_Errno) #
 	n = int(total_write)
 	return
 }
+
+_rand_bytes :: proc "contextless" (dst: []byte) { // Fills `dst` from the system-preferred RNG; panics on any failure.
+	ensure_contextless(u64(len(dst)) <= u64(max(u32)), "base/runtime: oversized rand_bytes request") // cbBuffer is a u32.
+
+	BCRYPT_USE_SYSTEM_PREFERRED_RNG :: 0x00000002
+
+	STATUS_INVALID_HANDLE :: 0xC0000008 // BCryptGenRandom returns NTSTATUS codes, not Win32 ERROR_* values.
+	STATUS_INVALID_PARAMETER :: 0xC000000D
+
+	ret := BCryptGenRandom(nil, raw_data(dst), u32(len(dst)), BCRYPT_USE_SYSTEM_PREFERRED_RNG)
+	switch u32(ret) { // Compare the NTSTATUS bit pattern as an unsigned value.
+	case 0:
+	case STATUS_INVALID_HANDLE:
+		// The handle to the first parameter is invalid.
+		// This should not happen here, since we explicitly pass nil to it
+		panic_contextless("base/runtime: BCryptGenRandom Invalid handle for hAlgorithm")
+	case STATUS_INVALID_PARAMETER:
+		// One of the parameters was invalid
+		panic_contextless("base/runtime: BCryptGenRandom Invalid parameter")
+	case:
+		// Unknown error
+		panic_contextless("base/runtime: BCryptGenRandom failed")
+	}
+}
+
+_exit :: proc "contextless" (code: int) -> ! { // Ends the process via kernel32's ExitProcess.
+	ExitProcess(u32(code)) // ExitProcess takes a u32, so `code` is converted.
+}
--- a/base/runtime/print.odin
+++ b/base/runtime/print.odin
@@ -184,10 +184,11 @@ print_rune :: #force_no_inline proc "contextless" (r: rune) -> int #no_bounds_ch


 print_u64 :: #force_no_inline proc "contextless" (x: u64) #no_bounds_check {
+	b :: u64(10)
+	u := x
+
 	a: [129]byte
 	i := len(a)
-	b := u64(10)
-	u := x
 	for u >= b {
 		i -= 1; a[i] = _INTEGER_DIGITS_VAR[u % b]
 		u /= b
@@ -199,11 +200,9 @@ print_u64 :: #force_no_inline proc "contextless" (x: u64) #no_bounds_check {


 print_i64 :: #force_no_inline proc "contextless" (x: i64) #no_bounds_check {
-	b :: i64(10)
-
-	u := x
-	neg := u < 0
-	u = abs(u)
+	b :: u64(10)
+	u := u64(abs(x))
+	neg := x < 0

 	a: [129]byte
 	i := len(a)
@@ -408,9 +407,9 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
 		}

 		print_string("struct ")
-		if .packed    in info.flags { print_string("#packed ") }
-		if .raw_union in info.flags { print_string("#raw_union ") }
-		// if .no_copy   in info.flags { print_string("#no_copy ") }
+		if .packed      in info.flags { print_string("#packed ") }
+		if .raw_union   in info.flags { print_string("#raw_union ") }
+		if .all_or_none in info.flags { print_string("#all_or_none ") }
 		if .align in info.flags {
 			print_string("#align(")
 			print_u64(u64(ti.align))
--- a/base/runtime/procs_darwin.odin
+++ b/base/runtime/procs_darwin.odin
@@ -15,16 +15,25 @@ objc_SEL   :: ^intrinsics.objc_selector
 objc_Ivar  :: ^intrinsics.objc_ivar
 objc_BOOL  :: bool

+objc_super :: struct { // Argument struct accepted by `objc_msgSendSuper2_stret`.
+	receiver: 	 objc_id, // The object the message is sent to.
+	super_class: objc_Class, // NOTE(review): presumably the class whose superclass lookup starts from — confirm against the objc runtime ABI.
+}

 objc_IMP :: proc "c" (object: objc_id, sel: objc_SEL, #c_vararg args: ..any) -> objc_id

 foreign ObjC {
 	sel_registerName :: proc "c" (name: cstring) -> objc_SEL ---

-	objc_msgSend        :: proc "c" (self: objc_id, op: objc_SEL, #c_vararg args: ..any) ---
-	objc_msgSend_fpret  :: proc "c" (self: objc_id, op: objc_SEL, #c_vararg args: ..any) -> f64 ---
-	objc_msgSend_fp2ret :: proc "c" (self: objc_id, op: objc_SEL, #c_vararg args: ..any) -> complex128 ---
-	objc_msgSend_stret  :: proc "c" (self: objc_id, op: objc_SEL, #c_vararg args: ..any) ---
+	objc_msgSend             :: proc "c" (self: objc_id, op: objc_SEL, #c_vararg args: ..any) ---
+	objc_msgSend_fpret       :: proc "c" (self: objc_id, op: objc_SEL, #c_vararg args: ..any) -> f64 ---
+	objc_msgSend_fp2ret      :: proc "c" (self: objc_id, op: objc_SEL, #c_vararg args: ..any) -> complex128 ---
+	objc_msgSend_stret       :: proc "c" (self: objc_id, op: objc_SEL, #c_vararg args: ..any) ---
+
+	// See: https://github.com/opensource-apple/objc4/blob/cd5e62a5597ea7a31dccef089317abb3a661c154/runtime/objc-abi.h#L111
+	objc_msgSendSuper2       :: proc "c" (super: rawptr, op: objc_SEL, #c_vararg args: ..any) -> objc_id ---
+	objc_msgSendSuper2_stret :: proc "c" (super: ^objc_super, op: objc_SEL, #c_vararg args: ..any) ---
+

 	objc_lookUpClass          :: proc "c" (name: cstring) -> objc_Class ---
 	objc_allocateClassPair    :: proc "c" (superclass: objc_Class, name: cstring, extraBytes: uint) -> objc_Class ---
@@ -33,6 +42,7 @@ foreign ObjC {
 	class_addIvar             :: proc "c" (cls: objc_Class, name: cstring, size: uint, alignment: u8, types: cstring) -> objc_BOOL ---
 	class_getInstanceVariable :: proc "c" (cls : objc_Class, name: cstring) -> objc_Ivar ---
 	class_getInstanceSize     :: proc "c" (cls : objc_Class) -> uint ---
+	class_getSuperclass       :: proc "c" (cls : objc_Class) -> objc_Class ---
 	ivar_getOffset            :: proc "c" (v: objc_Ivar) -> uintptr ---
 	object_getClass           :: proc "c" (obj: objc_id) -> objc_Class ---
 }
--- a/base/runtime/random_generator.odin
+++ b/base/runtime/random_generator.odin
@@ -41,88 +41,3 @@ random_generator_reset_u64 :: proc(rg: Random_Generator, p: u64) {
 		rg.procedure(rg.data, .Reset, ([^]byte)(&p)[:size_of(p)])
 	}
 }
-
-
-Default_Random_State :: struct {
-	state: u64,
-	inc:   u64,
-}
-
-default_random_generator_proc :: proc(data: rawptr, mode: Random_Generator_Mode, p: []byte) {
-	@(require_results)
-	read_u64 :: proc "contextless" (r: ^Default_Random_State) -> u64 {
-		old_state := r.state
-		r.state = old_state * 6364136223846793005 + (r.inc|1)
-		xor_shifted := (((old_state >> 59) + 5) ~ old_state) * 12605985483714917081
-		rot := (old_state >> 59)
-		return (xor_shifted >> rot) | (xor_shifted << ((-rot) & 63))
-	}
-
-	@(thread_local)
-	global_rand_seed: Default_Random_State
-
-	init :: proc "contextless" (r: ^Default_Random_State, seed: u64) {
-		seed := seed
-		if seed == 0 {
-			seed = u64(intrinsics.read_cycle_counter())
-		}
-		r.state = 0
-		r.inc = (seed << 1) | 1
-		_ = read_u64(r)
-		r.state += seed
-		_ = read_u64(r)
-	}
-
-	r: ^Default_Random_State = ---
-	if data == nil {
-		r = &global_rand_seed
-	} else {
-		r = cast(^Default_Random_State)data
-	}
-
-	switch mode {
-	case .Read:
-		if r.state == 0 && r.inc == 0 {
-			init(r, 0)
-		}
-
-		switch len(p) {
-		case size_of(u64):
-			// Fast path for a 64-bit destination.
-			intrinsics.unaligned_store((^u64)(raw_data(p)), read_u64(r))
-		case:
-			// All other cases.
-			pos := i8(0)
-			val := u64(0)
-			for &v in p {
-				if pos == 0 {
-					val = read_u64(r)
-					pos = 8
-				}
-				v = byte(val)
-				val >>= 8
-				pos -= 1
-			}
-		}
-
-	case .Reset:
-		seed: u64
-		mem_copy_non_overlapping(&seed, raw_data(p), min(size_of(seed), len(p)))
-		init(r, seed)
-
-	case .Query_Info:
-		if len(p) != size_of(Random_Generator_Query_Info) {
-			return
-		}
-		info := (^Random_Generator_Query_Info)(raw_data(p))
-		info^ += {.Uniform, .Resettable}
-	}
-}
-
-@(require_results)
-default_random_generator :: proc "contextless" (state: ^Default_Random_State = nil) -> Random_Generator {
-	return {
-		procedure = default_random_generator_proc,
-		data = state,
-	}
-}
--- a/base/runtime/random_generator_chacha8.odin
+++ b/base/runtime/random_generator_chacha8.odin
@@ -0,0 +1,164 @@
+package runtime
+
+import "base:intrinsics"
+
+// This is an implementation of the Chacha8Rand DRBG, as specified
+// in https://github.com/C2SP/C2SP/blob/main/chacha8rand.md
+//
+// There is a tradeoff to be made between state-size and performance,
+// in terms of the amount of rng output buffered.
+//
+// The sensible buffer sizes are:
+// - 256-bytes:  128-bit SIMD with 16x vector registers (SSE2)
+// - 512-bytes:  128-bit SIMD with 32x vector registers (ARMv8),
+//               256-bit SIMD with 16x vector registers (AVX2),
+// - 1024-bytes: AVX-512
+//
+// Notes:
+//  - Smaller than 256-bytes is possible but would require redundant
+//    calls to the ChaCha8 function, which is prohibitively expensive.
+//  - Larger than 1024-bytes is possible but pointless as the construct
+//    is defined around 992-bytes of RNG output and 32-bytes of input
+//    per iteration.
+//
+// This implementation opts for a 1024-byte buffer for simplicity,
+// under the rationale that modern extremely memory constrained targets
+// provide suitable functionality in hardware, and the language makes
+// supporting the various SIMD flavors easy.
+
+@(private = "file")
+RNG_SEED_SIZE :: 32
+@(private)
+RNG_OUTPUT_PER_ITER :: 1024 - RNG_SEED_SIZE
+
+@(private)
+CHACHA_SIGMA_0: u32 : 0x61707865
+@(private)
+CHACHA_SIGMA_1: u32 : 0x3320646e
+@(private)
+CHACHA_SIGMA_2: u32 : 0x79622d32
+@(private)
+CHACHA_SIGMA_3: u32 : 0x6b206574
+@(private)
+CHACHA_ROUNDS :: 8
+
+Default_Random_State :: struct { // State for the ChaCha8Rand-based default generator.
+	_buf:    [1024]byte, // RNG_OUTPUT_PER_ITER bytes of buffered output, followed by the RNG_SEED_SIZE-byte next seed.
+	_off:    int, // Read offset into `_buf`; reaching RNG_OUTPUT_PER_ITER triggers a refill.
+	_seeded: bool, // True once a seed has been placed in the tail of `_buf`.
+}
+
+@(require_results)
+default_random_generator :: proc "contextless" (state: ^Default_Random_State = nil) -> Random_Generator { // Wraps `state` in a Random_Generator.
+	return {
+		procedure = default_random_generator_proc,
+		data = state, // A nil `data` makes the procedure fall back to its thread-local state.
+	}
+}
+
+default_random_generator_proc :: proc(data: rawptr, mode: Random_Generator_Mode, p: []byte) { // Handles .Read, .Reset and .Query_Info for the default generator.
+	@(thread_local)
+	state: Default_Random_State
+
+	r: ^Default_Random_State = &state // Default to the thread-local state unless the caller supplied one.
+	if data != nil {
+		r = cast(^Default_Random_State)data
+	}
+	next_seed := r._buf[RNG_OUTPUT_PER_ITER:] // Tail of the buffer: the key consumed by the next refill.
+
+	switch mode {
+	case .Read:
+		if !r._seeded { // Unlikely.
+			rand_bytes(next_seed)
+			r._off = RNG_OUTPUT_PER_ITER // Force refill.
+			r._seeded = true
+		}
+
+		assert(r._off <= RNG_OUTPUT_PER_ITER, "chacha8rand/BUG: outputed key material")
+		if r._off >= RNG_OUTPUT_PER_ITER { // Unlikely.
+			chacha8rand_refill(r)
+		}
+
+		// We are guaranteed to have at least some RNG output buffered.
+		//
+		// As an invariant each read will consume a multiple of 8-bytes
+		// of output at a time.
+		assert(r._off <= RNG_OUTPUT_PER_ITER - 8, "chacha8rand/BUG: less than 8-bytes of output available")
+		assert(r._off % 8 == 0, "chacha8rand/BUG: buffered output is not a multiple of 8-bytes")
+
+		p_len := len(p)
+		if p_len == size_of(u64) {
+			#no_bounds_check {
+				// Fast path for a 64-bit destination.
+				src := (^u64)(raw_data(r._buf[r._off:]))
+				intrinsics.unaligned_store((^u64)(raw_data(p)), src^)
+				src^ = 0 // Erasure (backtrack resistance)
+				r._off += 8
+			}
+			return
+		}
+
+		p_ := p
+		for remaining := p_len; remaining > 0; {
+			sz := min(remaining, RNG_OUTPUT_PER_ITER - r._off) // Take no more than the output still buffered.
+			#no_bounds_check {
+				copy(p_[:sz], r._buf[r._off:])
+				p_ = p_[sz:]
+				remaining -= sz
+			}
+			rounded_sz := ((sz + 7) / 8) * 8 // Round up so `_off` stays a multiple of 8.
+			new_off := r._off + rounded_sz
+			#no_bounds_check if new_off < RNG_OUTPUT_PER_ITER {
+				// Erasure (backtrack resistance)
+				intrinsics.mem_zero(raw_data(r._buf[r._off:]), rounded_sz)
+				r._off = new_off
+			} else {
+				// Can omit erasure since we are overwriting the entire
+				// buffer.
+				chacha8rand_refill(r)
+			}
+		}
+
+	case .Reset:
+		// If no seed is passed, the next call to .Read will attempt to
+		// reseed from the system entropy source.
+		if len(p) == 0 {
+			r._seeded = false
+			return
+		}
+
+		// The cryptographic security of the output depends entirely
+		// on the quality of the entropy in the seed, we will allow
+		// re-seeding (as it makes testing easier), but callers that
+		// decide to provide arbitrary seeds are on their own as far
+		// as ensuring high-quality entropy.
+		intrinsics.mem_zero(raw_data(next_seed), RNG_SEED_SIZE) // Zero first so seeds shorter than RNG_SEED_SIZE are zero-padded.
+		copy(next_seed, p)
+		r._seeded = true
+		r._off = RNG_OUTPUT_PER_ITER // Force a refill.
+
+	case .Query_Info:
+		if len(p) != size_of(Random_Generator_Query_Info) { // Ignore requests whose buffer has the wrong size.
+			return
+		}
+		info := (^Random_Generator_Query_Info)(raw_data(p))
+		info^ += {.Uniform, .Cryptographic, .Resettable}
+	}
+}
+
+@(private = "file")
+chacha8rand_refill :: proc(r: ^Default_Random_State) { // Regenerates the buffer with the best available backend and rewinds `_off`.
+	assert(r._seeded == true, "chacha8rand/BUG: unseeded refill")
+
+	// i386 has insufficient vector registers to use the
+	// accelerated path at the moment.
+	when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+		chacha8rand_refill_simd256(r)
+	} else when HAS_HARDWARE_SIMD && ODIN_ARCH != .i386 {
+		chacha8rand_refill_simd128(r)
+	} else { // Portable scalar fallback.
+		chacha8rand_refill_ref(r)
+	}
+
+	r._off = 0 // Fresh output starts at the beginning of the buffer.
+}
--- a/base/runtime/random_generator_chacha8_ref.odin
+++ b/base/runtime/random_generator_chacha8_ref.odin
@@ -0,0 +1,145 @@
+package runtime
+
+import "base:intrinsics"
+
+@(private)
+chacha8rand_refill_ref :: proc(r: ^Default_Random_State) { // Portable scalar refill: rewrites all of r._buf from the key stored in its tail.
+	// Initialize the base state.
+	k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
+	when ODIN_ENDIAN == .Little {
+		s4 := k[0]
+		s5 := k[1]
+		s6 := k[2]
+		s7 := k[3]
+		s8 := k[4]
+		s9 := k[5]
+		s10 := k[6]
+		s11 := k[7]
+	} else { // Key words are stored little-endian; swap them on big-endian hosts.
+		s4 := intrinsics.byte_swap(k[0])
+		s5 := intrinsics.byte_swap(k[1])
+		s6 := intrinsics.byte_swap(k[2])
+		s7 := intrinsics.byte_swap(k[3])
+		s8 := intrinsics.byte_swap(k[4])
+		s9 := intrinsics.byte_swap(k[5])
+		s10 := intrinsics.byte_swap(k[6])
+		s11 := intrinsics.byte_swap(k[7])
+	}
+	s12: u32           // Counter starts at 0.
+	s13, s14, s15: u32 // IV of all 0s.
+
+	dst: [^]u32 = (^u32)(raw_data(r._buf[:]))
+
+	// At least with LLVM21 force_inline produces identical perf to
+	// manual inlining, yay.
+	quarter_round := #force_inline proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) {
+		a, b, c, d := a, b, c, d
+
+		a += b
+		d ~= a
+		d = rotl(d, 16)
+
+		c += d
+		b ~= c
+		b = rotl(b, 12)
+
+		a += b
+		d ~= a
+		d = rotl(d, 8)
+
+		c += d
+		b ~= c
+		b = rotl(b, 7)
+
+		return a, b, c, d
+	}
+
+	// Filippo Valsorda made an observation that only one of the column
+	// rounds depends on the counter (s12), so it is worth precomputing
+	// and reusing across multiple blocks.  As far as I know, only Go's
+	// chacha implementation does this.
+
+	p1, p5, p9, p13 := quarter_round(CHACHA_SIGMA_1, s5, s9, s13)
+	p2, p6, p10, p14 := quarter_round(CHACHA_SIGMA_2, s6, s10, s14)
+	p3, p7, p11, p15 := quarter_round(CHACHA_SIGMA_3, s7, s11, s15)
+
+	// 4 groups
+	for g := 0; g < 4; g = g + 1 {
+		// 4 blocks per group
+		for n := 0; n < 4; n = n + 1 {
+			// First column round that depends on the counter
+			p0, p4, p8, p12 := quarter_round(CHACHA_SIGMA_0, s4, s8, s12)
+
+			// First diagonal round
+			x0, x5, x10, x15 := quarter_round(p0, p5, p10, p15)
+			x1, x6, x11, x12 := quarter_round(p1, p6, p11, p12)
+			x2, x7, x8, x13 := quarter_round(p2, p7, p8, p13)
+			x3, x4, x9, x14 := quarter_round(p3, p4, p9, p14)
+
+			for i := CHACHA_ROUNDS - 2; i > 0; i = i - 2 { // Remaining rounds, one column + diagonal pair per iteration.
+				x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
+				x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
+				x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
+				x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)
+
+				x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
+				x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
+				x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
+				x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
+			}
+
+			// Interleave 4 blocks
+			// NB: The additions of sigma and the counter are omitted
+			STRIDE :: 4
+			d_ := dst[n:] // Block n of the group lands in lane n of every 4-word stripe.
+			when ODIN_ENDIAN == .Little {
+				d_[STRIDE*0] = x0
+				d_[STRIDE*1] = x1
+				d_[STRIDE*2] = x2
+				d_[STRIDE*3] = x3
+				d_[STRIDE*4] = x4 + s4
+				d_[STRIDE*5] = x5 + s5
+				d_[STRIDE*6] = x6 + s6
+				d_[STRIDE*7] = x7 + s7
+				d_[STRIDE*8] = x8 + s8
+				d_[STRIDE*9] = x9 + s9
+				d_[STRIDE*10] = x10 + s10
+				d_[STRIDE*11] = x11 + s11
+				d_[STRIDE*12] = x12
+				d_[STRIDE*13] = x13 + s13
+				d_[STRIDE*14] = x14 + s14
+				d_[STRIDE*15] = x15 + s15
+			} else {
+				d_[STRIDE*0] = intrinsics.byte_swap(x0)
+				d_[STRIDE*1] = intrinsics.byte_swap(x1)
+				d_[STRIDE*2] = intrinsics.byte_swap(x2)
+				d_[STRIDE*3] = intrinsics.byte_swap(x3)
+				d_[STRIDE*4] = intrinsics.byte_swap(x4 + s4)
+				d_[STRIDE*5] = intrinsics.byte_swap(x5 + s5)
+				d_[STRIDE*6] = intrinsics.byte_swap(x6 + s6)
+				d_[STRIDE*7] = intrinsics.byte_swap(x7 + s7)
+				d_[STRIDE*8] = intrinsics.byte_swap(x8 + s8)
+				d_[STRIDE*9] = intrinsics.byte_swap(x9 + s9)
+				d_[STRIDE*10] = intrinsics.byte_swap(x10 + s10)
+				d_[STRIDE*11] = intrinsics.byte_swap(x11 + s11)
+				d_[STRIDE*12] = intrinsics.byte_swap(x12)
+				d_[STRIDE*13] = intrinsics.byte_swap(x13 + s13)
+				d_[STRIDE*14] = intrinsics.byte_swap(x14 + s14)
+				d_[STRIDE*15] = intrinsics.byte_swap(x15 + s15)
+			}
+
+			s12 = s12 + 1 // Increment the counter
+		}
+
+		dst = dst[16*4:] // Advance past the 4 interleaved 16-word blocks just written.
+	}
+}
+
+// This replicates `rotate_left32` from `core:math/bits`, under the
+// assumption that this will live in `base:runtime`.
+@(require_results, private = "file")
+rotl :: #force_inline proc "contextless" (x: u32, k: int) -> u32 { // Rotates `x` left by `k` bits.
+	n :: 32
+	s := uint(k) & (n-1) // Reduce the rotation count to the range 0..31.
+	return x << s | x >> (n-s)
+}
--- a/base/runtime/random_generator_chacha8_simd128.odin
+++ b/base/runtime/random_generator_chacha8_simd128.odin
@@ -0,0 +1,290 @@
+#+build !i386
+package runtime
+
+import "base:intrinsics"
+
+@(private = "file")
+u32x4 :: #simd[4]u32
+
+@(private = "file")
+S0: u32x4 : {CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0}
+@(private = "file")
+S1: u32x4 : {CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1}
+@(private = "file")
+S2: u32x4 : {CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2}
+@(private = "file")
+S3: u32x4 : {CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3}
+
+@(private = "file")
+_ROT_7L: u32x4 : {7, 7, 7, 7}
+@(private = "file")
+_ROT_7R: u32x4 : {25, 25, 25, 25}
+@(private = "file")
+_ROT_12L: u32x4 : {12, 12, 12, 12}
+@(private = "file")
+_ROT_12R: u32x4 : {20, 20, 20, 20}
+@(private = "file")
+_ROT_8L: u32x4 : {8, 8, 8, 8}
+@(private = "file")
+_ROT_8R: u32x4 : {24, 24, 24, 24}
+@(private = "file")
+_ROT_16: u32x4 : {16, 16, 16, 16}
+@(private = "file")
+_CTR_INC_4: u32x4 : {4, 4, 4, 4}
+@(private = "file")
+_CTR_INC_8: u32x4 : {8, 8, 8, 8}
+
+when ODIN_ENDIAN == .Big {
+	@(private = "file")
+	_byteswap_u32x4 :: #force_inline proc "contextless" (v: u32x4) -> u32x4 { // Reverses the byte order inside each 32-bit lane.
+		u8x16 :: #simd[16]u8
+		return(
+			transmute(u32x4)intrinsics.simd_shuffle( // Only `base:intrinsics` is imported here, so call the intrinsic directly.
+				transmute(u8x16)v,
+				transmute(u8x16)v,
+				3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+			)
+		)
+	}
+}
+
+@(private)
+chacha8rand_refill_simd128 :: proc(r: ^Default_Random_State) {
+	// Initialize the base state.
+	k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
+	when ODIN_ENDIAN == .Little {
+		s4_ := k[0]
+		s5_ := k[1]
+		s6_ := k[2]
+		s7_ := k[3]
+		s8_ := k[4]
+		s9_ := k[5]
+		s10_ := k[6]
+		s11_ := k[7]
+	} else {
+		s4_ := intrinsics.byte_swap(k[0])
+		s5_ := intrinsics.byte_swap(k[1])
+		s6_ := intrinsics.byte_swap(k[2])
+		s7_ := intrinsics.byte_swap(k[3])
+		s8_ := intrinsics.byte_swap(k[4])
+		s9_ := intrinsics.byte_swap(k[5])
+		s10_ := intrinsics.byte_swap(k[6])
+		s11_ := intrinicss.byte_swap(k[7])
+	}
+
+	// 4-lane ChaCha8.
+	s4 := u32x4{s4_, s4_, s4_, s4_}
+	s5 := u32x4{s5_, s5_, s5_, s5_}
+	s6 := u32x4{s6_, s6_, s6_, s6_}
+	s7 := u32x4{s7_, s7_, s7_, s7_}
+	s8 := u32x4{s8_, s8_, s8_, s8_}
+	s9 := u32x4{s9_, s9_, s9_, s9_}
+	s10 := u32x4{s10_, s10_, s10_, s10_}
+	s11 := u32x4{s11_, s11_, s11_, s11_}
+	s12 := u32x4{0, 1, 2, 3}
+	s13, s14, s15: u32x4
+
+	dst: [^]u32x4 = (^u32x4)(raw_data(r._buf[:]))
+
+	quarter_round := #force_inline proc "contextless" (a, b, c, d: u32x4) -> (u32x4, u32x4, u32x4, u32x4) {
+		a, b, c, d := a, b, c, d
+
+		a = intrinsics.simd_add(a, b)
+		d = intrinsics.simd_bit_xor(d, a)
+		d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_16), intrinsics.simd_shr(d, _ROT_16))
+
+		c = intrinsics.simd_add(c, d)
+		b = intrinsics.simd_bit_xor(b, c)
+		b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_12L), intrinsics.simd_shr(b, _ROT_12R))
+
+		a = intrinsics.simd_add(a, b)
+		d = intrinsics.simd_bit_xor(d, a)
+		d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_8L), intrinsics.simd_shr(d, _ROT_8R))
+
+		c = intrinsics.simd_add(c, d)
+		b = intrinsics.simd_bit_xor(b, c)
+		b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_7L), intrinsics.simd_shr(b, _ROT_7R))
+
+		return a, b, c, d
+	}
+
+	// 8 blocks at a time.
+	//
+	// Note:
+	// This uses a ton of registers so it is only worth it on targets
+	// that have something like 32 128-bit registers.  This is currently
+	// all ARMv8 targets, and RISC-V Zvl128b (`V` application profile)
+	// targets.
+	//
+	// While our current definition of `.arm32` is 32-bit ARMv8, this
+	// may change in the future (ARMv7 is still relevant), and things
+	// like Cortex-A8/A9 do "pretend" 128-bit SIMD 64-bits at a time,
+	// and thus need benchmarking.
+	when ODIN_ARCH == .arm64 || ODIN_ARCH == .riscv64 {
+		for _ in 0..<2 {
+			x0_0, x1_0, x2_0, x3_0 := S0, S1, S2, S3
+			x4_0, x5_0, x6_0, x7_0 := s4, s5, s6, s7
+			x8_0, x9_0, x10_0, x11_0 := s8, s9, s10, s11
+			x12_0, x13_0, x14_0, x15_0 := s12, s13, s14, s15
+
+			x0_1, x1_1, x2_1, x3_1 := S0, S1, S2, S3
+			x4_1, x5_1, x6_1, x7_1 := s4, s5, s6, s7
+			x8_1, x9_1, x10_1, x11_1 := s8, s9, s10, s11
+			x12_1 := intrinsics.simd_add(s12, _CTR_INC_4)
+			x13_1, x14_1, x15_1 := s13, s14, s15
+
+			for i := CHACHA_ROUNDS; i > 0; i = i - 2 {
+				x0_0, x4_0, x8_0, x12_0 = quarter_round(x0_0, x4_0, x8_0, x12_0)
+				x0_1, x4_1, x8_1, x12_1 = quarter_round(x0_1, x4_1, x8_1, x12_1)
+				x1_0, x5_0, x9_0, x13_0 = quarter_round(x1_0, x5_0, x9_0, x13_0)
+				x1_1, x5_1, x9_1, x13_1 = quarter_round(x1_1, x5_1, x9_1, x13_1)
+				x2_0, x6_0, x10_0, x14_0 = quarter_round(x2_0, x6_0, x10_0, x14_0)
+				x2_1, x6_1, x10_1, x14_1 = quarter_round(x2_1, x6_1, x10_1, x14_1)
+				x3_0, x7_0, x11_0, x15_0 = quarter_round(x3_0, x7_0, x11_0, x15_0)
+				x3_1, x7_1, x11_1, x15_1 = quarter_round(x3_1, x7_1, x11_1, x15_1)
+
+				x0_0, x5_0, x10_0, x15_0 = quarter_round(x0_0, x5_0, x10_0, x15_0)
+				x0_1, x5_1, x10_1, x15_1 = quarter_round(x0_1, x5_1, x10_1, x15_1)
+				x1_0, x6_0, x11_0, x12_0 = quarter_round(x1_0, x6_0, x11_0, x12_0)
+				x1_1, x6_1, x11_1, x12_1 = quarter_round(x1_1, x6_1, x11_1, x12_1)
+				x2_0, x7_0, x8_0, x13_0 = quarter_round(x2_0, x7_0, x8_0, x13_0)
+				x2_1, x7_1, x8_1, x13_1 = quarter_round(x2_1, x7_1, x8_1, x13_1)
+				x3_0, x4_0, x9_0, x14_0 = quarter_round(x3_0, x4_0, x9_0, x14_0)
+				x3_1, x4_1, x9_1, x14_1 = quarter_round(x3_1, x4_1, x9_1, x14_1)
+			}
+
+			when ODIN_ENDIAN == .Little {
+				intrinsics.unaligned_store((^u32x4)(dst[0:]), x0_0)
+				intrinsics.unaligned_store((^u32x4)(dst[1:]), x1_0)
+				intrinsics.unaligned_store((^u32x4)(dst[2:]), x2_0)
+				intrinsics.unaligned_store((^u32x4)(dst[3:]), x3_0)
+				intrinsics.unaligned_store((^u32x4)(dst[4:]), intrinsics.simd_add(x4_0, s4))
+				intrinsics.unaligned_store((^u32x4)(dst[5:]), intrinsics.simd_add(x5_0, s5))
+				intrinsics.unaligned_store((^u32x4)(dst[6:]), intrinsics.simd_add(x6_0, s6))
+				intrinsics.unaligned_store((^u32x4)(dst[7:]), intrinsics.simd_add(x7_0, s7))
+				intrinsics.unaligned_store((^u32x4)(dst[8:]), intrinsics.simd_add(x8_0, s8))
+				intrinsics.unaligned_store((^u32x4)(dst[9:]), intrinsics.simd_add(x9_0, s9))
+				intrinsics.unaligned_store((^u32x4)(dst[10:]), intrinsics.simd_add(x10_0, s10))
+				intrinsics.unaligned_store((^u32x4)(dst[11:]), intrinsics.simd_add(x11_0, s11))
+				intrinsics.unaligned_store((^u32x4)(dst[12:]), x12_0)
+				intrinsics.unaligned_store((^u32x4)(dst[13:]), intrinsics.simd_add(x13_0, s13))
+				intrinsics.unaligned_store((^u32x4)(dst[14:]), intrinsics.simd_add(x14_0, s14))
+				intrinsics.unaligned_store((^u32x4)(dst[15:]), intrinsics.simd_add(x15_0, s15))
+
+				intrinsics.unaligned_store((^u32x4)(dst[16:]), x0_1)
+				intrinsics.unaligned_store((^u32x4)(dst[17:]), x1_1)
+				intrinsics.unaligned_store((^u32x4)(dst[18:]), x2_1)
+				intrinsics.unaligned_store((^u32x4)(dst[19:]), x3_1)
+				intrinsics.unaligned_store((^u32x4)(dst[20:]), intrinsics.simd_add(x4_1, s4))
+				intrinsics.unaligned_store((^u32x4)(dst[21:]), intrinsics.simd_add(x5_1, s5))
+				intrinsics.unaligned_store((^u32x4)(dst[22:]), intrinsics.simd_add(x6_1, s6))
+				intrinsics.unaligned_store((^u32x4)(dst[23:]), intrinsics.simd_add(x7_1, s7))
+				intrinsics.unaligned_store((^u32x4)(dst[24:]), intrinsics.simd_add(x8_1, s8))
+				intrinsics.unaligned_store((^u32x4)(dst[25:]), intrinsics.simd_add(x9_1, s9))
+				intrinsics.unaligned_store((^u32x4)(dst[26:]), intrinsics.simd_add(x10_1, s10))
+				intrinsics.unaligned_store((^u32x4)(dst[27:]), intrinsics.simd_add(x11_1, s11))
+				intrinsics.unaligned_store((^u32x4)(dst[28:]), x12_1)
+				intrinsics.unaligned_store((^u32x4)(dst[29:]), intrinsics.simd_add(x13_1, s13))
+				intrinsics.unaligned_store((^u32x4)(dst[30:]), intrinsics.simd_add(x14_1, s14))
+				intrinsics.unaligned_store((^u32x4)(dst[31:]), intrinsics.simd_add(x15_1, s15))
+			} else {
+				intrinsics.unaligned_store((^u32x4)(dst[0:]), _byteswap_u32x4(x0_0))
+				intrinsics.unaligned_store((^u32x4)(dst[1:]), _byteswap_u32x4(x1_0))
+				intrinsics.unaligned_store((^u32x4)(dst[2:]), _byteswap_u32x4(x2_0))
+				intrinsics.unaligned_store((^u32x4)(dst[3:]), _byteswap_u32x4(x3_0))
+				intrinsics.unaligned_store((^u32x4)(dst[4:]), _byteswap_u32x4(intrinsics.simd_add(x4_0, s4)))
+				intrinsics.unaligned_store((^u32x4)(dst[5:]), _byteswap_u32x4(intrinsics.simd_add(x5_0, s5)))
+				intrinsics.unaligned_store((^u32x4)(dst[6:]), _byteswap_u32x4(intrinsics.simd_add(x6_0, s6)))
+				intrinsics.unaligned_store((^u32x4)(dst[7:]), _byteswap_u32x4(intrinsics.simd_add(x7_0, s7)))
+				intrinsics.unaligned_store((^u32x4)(dst[8:]), _byteswap_u32x4(intrinsics.simd_add(x8_0, s8)))
+				intrinsics.unaligned_store((^u32x4)(dst[9:]), _byteswap_u32x4(intrinsics.simd_add(x9_0, s9)))
+				intrinsics.unaligned_store((^u32x4)(dst[10:]), _byteswap_u32x4(intrinsics.simd_add(x10_0, s10)))
+				intrinsics.unaligned_store((^u32x4)(dst[11:]), _byteswap_u32x4(intrinsics.simd_add(x11_0, s11)))
+				intrinsics.unaligned_store((^u32x4)(dst[12:]), _byteswap_u32x4(x12_0))
+				intrinsics.unaligned_store((^u32x4)(dst[13:]), _byteswap_u32x4(intrinsics.simd_add(x13_0, s13)))
+				intrinsics.unaligned_store((^u32x4)(dst[14:]), _byteswap_u32x4(intrinsics.simd_add(x14_0, s14)))
+				intrinsics.unaligned_store((^u32x4)(dst[15:]), _byteswap_u32x4(intrinsics.simd_add(x15_0, s15)))
+
+				intrinsics.unaligned_store((^u32x4)(dst[16:]), _byteswap_u32x4(x0_1))
+				intrinsics.unaligned_store((^u32x4)(dst[17:]), _byteswap_u32x4(x1_1))
+				intrinsics.unaligned_store((^u32x4)(dst[18:]), _byteswap_u32x4(x2_1))
+				intrinsics.unaligned_store((^u32x4)(dst[19:]), _byteswap_u32x4(x3_1))
+				intrinsics.unaligned_store((^u32x4)(dst[20:]), _byteswap_u32x4(intrinsics.simd_add(x4_1, s4)))
+				intrinsics.unaligned_store((^u32x4)(dst[21:]), _byteswap_u32x4(intrinsics.simd_add(x5_1, s5)))
+				intrinsics.unaligned_store((^u32x4)(dst[22:]), _byteswap_u32x4(intrinsics.simd_add(x6_1, s6)))
+				intrinsics.unaligned_store((^u32x4)(dst[23:]), _byteswap_u32x4(intrinsics.simd_add(x7_1, s7)))
+				intrinsics.unaligned_store((^u32x4)(dst[24:]), _byteswap_u32x4(intrinsics.simd_add(x8_1, s8)))
+				intrinsics.unaligned_store((^u32x4)(dst[25:]), _byteswap_u32x4(intrinsics.simd_add(x9_1, s9)))
+				intrinsics.unaligned_store((^u32x4)(dst[26:]), _byteswap_u32x4(intrinsics.simd_add(x10_1, s10)))
+				intrinsics.unaligned_store((^u32x4)(dst[27:]), _byteswap_u32x4(intrinsics.simd_add(x11_1, s11)))
+				intrinsics.unaligned_store((^u32x4)(dst[28:]), _byteswap_u32x4(x12_1))
+				intrinsics.unaligned_store((^u32x4)(dst[29:]), _byteswap_u32x4(intrinsics.simd_add(x13_1, s13)))
+				intrinsics.unaligned_store((^u32x4)(dst[30:]), _byteswap_u32x4(intrinsics.simd_add(x14_1, s14)))
+				intrinsics.unaligned_store((^u32x4)(dst[31:]), _byteswap_u32x4(intrinsics.simd_add(x15_1, s15)))
+			}
+
+			s12 = intrinsics.simd_add(s12, _CTR_INC_8)
+
+			dst = dst[32:]
+		}
+	} else {
+		for _ in 0..<4 {
+			x0, x1, x2, x3 := S0, S1, S2, S3
+			x4, x5, x6, x7 := s4, s5, s6, s7
+			x8, x9, x10, x11 := s8, s9, s10, s11
+			x12, x13, x14, x15 := s12, s13, s14, s15
+
+			for i := CHACHA_ROUNDS; i > 0; i = i - 2 {
+				x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
+				x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
+				x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
+				x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)
+
+				x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
+				x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
+				x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
+				x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
+			}
+
+			when ODIN_ENDIAN == .Little {
+				intrinsics.unaligned_store((^u32x4)(dst[0:]), x0)
+				intrinsics.unaligned_store((^u32x4)(dst[1:]), x1)
+				intrinsics.unaligned_store((^u32x4)(dst[2:]), x2)
+				intrinsics.unaligned_store((^u32x4)(dst[3:]), x3)
+				intrinsics.unaligned_store((^u32x4)(dst[4:]), intrinsics.simd_add(x4, s4))
+				intrinsics.unaligned_store((^u32x4)(dst[5:]), intrinsics.simd_add(x5, s5))
+				intrinsics.unaligned_store((^u32x4)(dst[6:]), intrinsics.simd_add(x6, s6))
+				intrinsics.unaligned_store((^u32x4)(dst[7:]), intrinsics.simd_add(x7, s7))
+				intrinsics.unaligned_store((^u32x4)(dst[8:]), intrinsics.simd_add(x8, s8))
+				intrinsics.unaligned_store((^u32x4)(dst[9:]), intrinsics.simd_add(x9, s9))
+				intrinsics.unaligned_store((^u32x4)(dst[10:]), intrinsics.simd_add(x10, s10))
+				intrinsics.unaligned_store((^u32x4)(dst[11:]), intrinsics.simd_add(x11, s11))
+				intrinsics.unaligned_store((^u32x4)(dst[12:]), x12)
+				intrinsics.unaligned_store((^u32x4)(dst[13:]), intrinsics.simd_add(x13, s13))
+				intrinsics.unaligned_store((^u32x4)(dst[14:]), intrinsics.simd_add(x14, s14))
+				intrinsics.unaligned_store((^u32x4)(dst[15:]), intrinsics.simd_add(x15, s15))
+			} else {
+				intrinsics.unaligned_store((^u32x4)(dst[0:]), _byteswap_u32x4(x0))
+				intrinsics.unaligned_store((^u32x4)(dst[1:]), _byteswap_u32x4(x1))
+				intrinsics.unaligned_store((^u32x4)(dst[2:]), _byteswap_u32x4(x2))
+				intrinsics.unaligned_store((^u32x4)(dst[3:]), _byteswap_u32x4(x3))
+				intrinsics.unaligned_store((^u32x4)(dst[4:]), _byteswap_u32x4(intrinsics.simd_add(x4, s4)))
+				intrinsics.unaligned_store((^u32x4)(dst[5:]), _byteswap_u32x4(intrinsics.simd_add(x5, s5)))
+				intrinsics.unaligned_store((^u32x4)(dst[6:]), _byteswap_u32x4(intrinsics.simd_add(x6, s6)))
+				intrinsics.unaligned_store((^u32x4)(dst[7:]), _byteswap_u32x4(intrinsics.simd_add(x7, s7)))
+				intrinsics.unaligned_store((^u32x4)(dst[8:]), _byteswap_u32x4(intrinsics.simd_add(x8, s8)))
+				intrinsics.unaligned_store((^u32x4)(dst[9:]), _byteswap_u32x4(intrinsics.simd_add(x9, s9)))
+				intrinsics.unaligned_store((^u32x4)(dst[10:]), _byteswap_u32x4(intrinsics.simd_add(x10, s10)))
+				intrinsics.unaligned_store((^u32x4)(dst[11:]), _byteswap_u32x4(intrinsics.simd_add(x11, s11)))
+				intrinsics.unaligned_store((^u32x4)(dst[12:]), _byteswap_u32x4(x12))
+				intrinsics.unaligned_store((^u32x4)(dst[13:]), _byteswap_u32x4(intrinsics.simd_add(x13, s13)))
+				intrinsics.unaligned_store((^u32x4)(dst[14:]), _byteswap_u32x4(intrinsics.simd_add(x14, s14)))
+				intrinsics.unaligned_store((^u32x4)(dst[15:]), _byteswap_u32x4(intrinsics.simd_add(x15, s15)))
+			}
+
+			s12 = intrinsics.simd_add(s12, _CTR_INC_4)
+
+			dst = dst[16:]
+		}
+	}
+}
--- a/base/runtime/random_generator_chacha8_simd256.odin
+++ b/base/runtime/random_generator_chacha8_simd256.odin
@@ -0,0 +1,197 @@
+#+build amd64
+package runtime
+
+import "base:intrinsics"
+
+#assert(ODIN_ENDIAN == .Little)
+
+@(private = "file")
+u32x8 :: #simd[8]u32 // 8 x u32 = 256-bit vector; one state word from each of 8 lanes
+@(private = "file")
+u32x4 :: #simd[4]u32 // 4 x u32 = 128-bit vector; the unit used for output stores
+
+@(private = "file")
+S0: u32x8 : { // CHACHA_SIGMA_0 broadcast to all 8 lanes
+	CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0,
+	CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0,
+}
+@(private = "file")
+S1: u32x8 : { // CHACHA_SIGMA_1 broadcast to all 8 lanes
+	CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1,
+	CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1,
+}
+@(private = "file")
+S2: u32x8 : { // CHACHA_SIGMA_2 broadcast to all 8 lanes
+	CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2,
+	CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2,
+}
+@(private = "file")
+S3: u32x8 : { // CHACHA_SIGMA_3 broadcast to all 8 lanes
+	CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3,
+	CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3,
+}
+
+@(private = "file")
+_ROT_7L: u32x8 : {7, 7, 7, 7, 7, 7, 7, 7} // left-shift half of a rotate-left by 7
+@(private = "file")
+_ROT_7R: u32x8 : {25, 25, 25, 25, 25, 25, 25, 25} // right-shift half: 32 - 7
+@(private = "file")
+_ROT_12L: u32x8 : {12, 12, 12, 12, 12, 12, 12, 12} // left-shift half of a rotate-left by 12
+@(private = "file")
+_ROT_12R: u32x8 : {20, 20, 20, 20, 20, 20, 20, 20} // right-shift half: 32 - 12
+@(private = "file")
+_ROT_8L: u32x8 : {8, 8, 8, 8, 8, 8, 8, 8} // left-shift half of a rotate-left by 8
+@(private = "file")
+_ROT_8R: u32x8 : {24, 24, 24, 24, 24, 24, 24, 24} // right-shift half: 32 - 8
+@(private = "file")
+_ROT_16: u32x8 : {16, 16, 16, 16, 16, 16, 16, 16} // rotate by 16 uses the same count for both shifts
+@(private = "file")
+_CTR_INC_8: u32x8 : {8, 8, 8, 8, 8, 8, 8, 8} // added to the counter lanes after each 8-block pass
+
+// chacha8rand_refill_simd256 refills `r._buf` using 8-lane (256-bit) ChaCha8.
+// To the best of my knowledge this is only really useful on
+// modern x86-64 as most ARM silicon is missing support for SVE2.
+@(private, enable_target_feature = "avx,avx2")
+chacha8rand_refill_simd256 :: proc(r: ^Default_Random_State) {
+	// Initialize the base state: the 8 key words are read from `r._buf` starting at RNG_OUTPUT_PER_ITER.
+	k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
+	s4_ := k[0]
+	s5_ := k[1]
+	s6_ := k[2]
+	s7_ := k[3]
+	s8_ := k[4]
+	s9_ := k[5]
+	s10_ := k[6]
+	s11_ := k[7]
+
+	// 8-lane ChaCha8: each key word is broadcast to all lanes, only the counter word (s12) differs per lane.
+	s4 := u32x8{s4_, s4_, s4_, s4_, s4_, s4_, s4_, s4_}
+	s5 := u32x8{s5_, s5_, s5_, s5_, s5_, s5_, s5_, s5_}
+	s6 := u32x8{s6_, s6_, s6_, s6_, s6_, s6_, s6_, s6_}
+	s7 := u32x8{s7_, s7_, s7_, s7_, s7_, s7_, s7_, s7_}
+	s8 := u32x8{s8_, s8_, s8_, s8_, s8_, s8_, s8_, s8_}
+	s9 := u32x8{s9_, s9_, s9_, s9_, s9_, s9_, s9_, s9_}
+	s10 := u32x8{s10_, s10_, s10_, s10_, s10_, s10_, s10_, s10_}
+	s11 := u32x8{s11_, s11_, s11_, s11_, s11_, s11_, s11_, s11_}
+	s12 := u32x8{0, 1, 2, 3, 4, 5, 6, 7} // lane i starts at block counter i
+	s13, s14, s15: u32x8 // left zero-initialized
+
+	u32x4 :: #simd[4]u32 // NOTE(review): identical to the file-level `u32x4` alias; this local redeclaration looks redundant
+	dst: [^]u32x4 = (^u32x4)(raw_data(r._buf[:])) // output cursor over `r._buf`, in 128-bit units
+
+	quarter_round := #force_inline proc "contextless" (a, b, c, d: u32x8) -> (u32x8, u32x8, u32x8, u32x8) {
+		a, b, c, d := a, b, c, d // mutable copies of the parameters
+
+		a = intrinsics.simd_add(a, b)
+		d = intrinsics.simd_bit_xor(d, a)
+		d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_16), intrinsics.simd_shr(d, _ROT_16)) // d = rotl(d, 16)
+
+		c = intrinsics.simd_add(c, d)
+		b = intrinsics.simd_bit_xor(b, c)
+		b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_12L), intrinsics.simd_shr(b, _ROT_12R)) // b = rotl(b, 12)
+
+		a = intrinsics.simd_add(a, b)
+		d = intrinsics.simd_bit_xor(d, a)
+		d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_8L), intrinsics.simd_shr(d, _ROT_8R)) // d = rotl(d, 8)
+
+		c = intrinsics.simd_add(c, d)
+		b = intrinsics.simd_bit_xor(b, c)
+		b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_7L), intrinsics.simd_shr(b, _ROT_7R)) // b = rotl(b, 7)
+
+		return a, b, c, d
+	}
+
+	for _ in 0..<2 { // two passes, each producing 8 blocks (one per lane)
+		x0, x1, x2, x3 := S0, S1, S2, S3
+		x4, x5, x6, x7 := s4, s5, s6, s7
+		x8, x9, x10, x11 := s8, s9, s10, s11
+		x12, x13, x14, x15 := s12, s13, s14, s15
+
+		for i := CHACHA_ROUNDS; i > 0; i = i - 2 { // each iteration is a double round: 4 column + 4 diagonal quarter rounds
+			x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
+			x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
+			x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
+			x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)
+
+			x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
+			x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
+			x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
+			x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
+		}
+
+		x4 = intrinsics.simd_add(x4, s4) // add the input state back into x4..x11 and x13..x15
+		x5 = intrinsics.simd_add(x5, s5)
+		x6 = intrinsics.simd_add(x6, s6)
+		x7 = intrinsics.simd_add(x7, s7)
+		x8 = intrinsics.simd_add(x8, s8)
+		x9 = intrinsics.simd_add(x9, s9)
+		x10 = intrinsics.simd_add(x10, s10)
+		x11 = intrinsics.simd_add(x11, s11)
+		x13 = intrinsics.simd_add(x13, s13) // note: x0..x3 and x12 get no such addition before being stored
+		x14 = intrinsics.simd_add(x14, s14)
+		x15 = intrinsics.simd_add(x15, s15)
+
+		// Ok, now we have x0->x15 with 8 lanes, but we need to
+		// output the first 4 blocks, then the second 4 blocks.
+		//
+		// LLVM appears not to consider "this instruction is totally
+		// awful on the given microarchitecture", which leads to
+		// `VPCOMPRESSD` being generated iff AVX512 support is
+		// enabled for `intrinsics.simd_masked_compress_store`.
+		// On Zen 4, this leads to a 50% performance regression vs
+		// the 128-bit SIMD code.
+		//
+		// The fake intrinsic (because LLVM doesn't appear to have
+		// an amd64 specific one), doesn't generate `VEXTRACTI128`,
+		// but instead does cleverness without horrible regressions.
+
+		intrinsics.unaligned_store((^u32x4)(dst[0:]), _mm_mm256_extracti128_si256(x0, 0)) // lanes 0..3 of every word: first 4 blocks
+		intrinsics.unaligned_store((^u32x4)(dst[1:]), _mm_mm256_extracti128_si256(x1, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[2:]), _mm_mm256_extracti128_si256(x2, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[3:]), _mm_mm256_extracti128_si256(x3, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[4:]), _mm_mm256_extracti128_si256(x4, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[5:]), _mm_mm256_extracti128_si256(x5, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[6:]), _mm_mm256_extracti128_si256(x6, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[7:]), _mm_mm256_extracti128_si256(x7, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[8:]), _mm_mm256_extracti128_si256(x8, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[9:]), _mm_mm256_extracti128_si256(x9, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[10:]), _mm_mm256_extracti128_si256(x10, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[11:]), _mm_mm256_extracti128_si256(x11, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[12:]), _mm_mm256_extracti128_si256(x12, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[13:]), _mm_mm256_extracti128_si256(x13, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[14:]), _mm_mm256_extracti128_si256(x14, 0))
+		intrinsics.unaligned_store((^u32x4)(dst[15:]), _mm_mm256_extracti128_si256(x15, 0))
+
+		intrinsics.unaligned_store((^u32x4)(dst[16:]), _mm_mm256_extracti128_si256(x0, 1)) // lanes 4..7 of every word: second 4 blocks
+		intrinsics.unaligned_store((^u32x4)(dst[17:]), _mm_mm256_extracti128_si256(x1, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[18:]), _mm_mm256_extracti128_si256(x2, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[19:]), _mm_mm256_extracti128_si256(x3, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[20:]), _mm_mm256_extracti128_si256(x4, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[21:]), _mm_mm256_extracti128_si256(x5, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[22:]), _mm_mm256_extracti128_si256(x6, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[23:]), _mm_mm256_extracti128_si256(x7, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[24:]), _mm_mm256_extracti128_si256(x8, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[25:]), _mm_mm256_extracti128_si256(x9, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[26:]), _mm_mm256_extracti128_si256(x10, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[27:]), _mm_mm256_extracti128_si256(x11, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[28:]), _mm_mm256_extracti128_si256(x12, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[29:]), _mm_mm256_extracti128_si256(x13, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[30:]), _mm_mm256_extracti128_si256(x14, 1))
+		intrinsics.unaligned_store((^u32x4)(dst[31:]), _mm_mm256_extracti128_si256(x15, 1))
+
+		s12 = intrinsics.simd_add(s12, _CTR_INC_8) // advance every lane's counter past the 8 blocks just produced
+
+		dst = dst[32:] // 32 x u32x4 were written this pass
+	}
+}
+
+@(private = "file", require_results, enable_target_feature="avx2")
+_mm_mm256_extracti128_si256 :: #force_inline proc "c" (a: u32x8, $OFFSET: int) -> u32x4 {
+	when OFFSET == 1 { // upper 128 bits: lanes 4..7 of `a`
+		return intrinsics.simd_shuffle(a, a, 4, 5, 6, 7)
+	} else when OFFSET == 0 { // lower 128 bits: lanes 0..3 of `a`
+		return intrinsics.simd_shuffle(a, a, 0, 1, 2, 3)
+	} else { // OFFSET is a compile-time parameter; anything but 0 or 1 is rejected here
+		#panic("chacha8rand: invalid offset")
+	}
+}
--- a/base/runtime/wasm_allocator.odin
+++ b/base/runtime/wasm_allocator.odin
@@ -43,7 +43,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-WASM_Allocator :: struct #no_copy {
+WASM_Allocator :: struct {
 	// The minimum alignment of allocations.
 	alignment: uint,
 	// A region that contains as payload a single forward linked list of pointers to
--- a/bin/radlink.exe
+++ b/bin/radlink.exe
--- a/build.bat
+++ b/build.bat
@@ -79,7 +79,7 @@ set compiler_flags= %compiler_flags% /utf-8
 set compiler_defines= -DODIN_VERSION_RAW=\"%odin_version_raw%\" -DGIT_SHA=\"%GIT_SHA%\"

 rem fileversion is defined as {Major,Minor,Build,Private: u16} so a bit limited
-set rc_flags="-DGIT_SHA=%GIT_SHA% -DVP=dev-%V1%-%V2%:%GIT_SHA% nologo -DV1=%V1% -DV2=%V2% -DV3=%V3% -DV4=%V4% -DVF=%odin_version_full% -DNIGHTLY=%nightly%"
+set rc_flags=-nologo "-DGIT_SHA=%GIT_SHA% -DVP=dev-%V1%-%V2%:%GIT_SHA% nologo -DV1=%V1% -DV2=%V2% -DV3=%V3% -DV4=%V4% -DVF=%odin_version_full% -DNIGHTLY=%nightly%"

 if %nightly% equ 1 set compiler_defines=%compiler_defines% -DNIGHTLY

--- a/build_odin.sh
+++ b/build_odin.sh
@@ -28,7 +28,8 @@ error() {

 # Brew advises people not to add llvm to their $PATH, so try and use brew to find it.
 if [ -z "$LLVM_CONFIG" ] &&  [ -n "$(command -v brew)" ]; then
-    if   [ -n "$(command -v $(brew --prefix llvm@20)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@20)/bin/llvm-config"
+    if   [ -n "$(command -v $(brew --prefix llvm@21)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@21)/bin/llvm-config"
+    elif [ -n "$(command -v $(brew --prefix llvm@20)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@20)/bin/llvm-config"
    elif [ -n "$(command -v $(brew --prefix llvm@19)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@19)/bin/llvm-config"
    elif [ -n "$(command -v $(brew --prefix llvm@18)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@18)/bin/llvm-config"
    elif [ -n "$(command -v $(brew --prefix llvm@17)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@17)/bin/llvm-config"
@@ -38,23 +39,19 @@ fi

 if [ -z "$LLVM_CONFIG" ]; then
 	# darwin, linux, openbsd
-	if   [ -n "$(command -v llvm-config-20)" ]; then LLVM_CONFIG="llvm-config-20"
+	if   [ -n "$(command -v llvm-config-21)" ]; then LLVM_CONFIG="llvm-config-21"
+	elif [ -n "$(command -v llvm-config-20)" ]; then LLVM_CONFIG="llvm-config-20"
 	elif [ -n "$(command -v llvm-config-19)" ]; then LLVM_CONFIG="llvm-config-19"
 	elif [ -n "$(command -v llvm-config-18)" ]; then LLVM_CONFIG="llvm-config-18"
 	elif [ -n "$(command -v llvm-config-17)" ]; then LLVM_CONFIG="llvm-config-17"
 	elif [ -n "$(command -v llvm-config-14)" ]; then LLVM_CONFIG="llvm-config-14"
-	elif [ -n "$(command -v llvm-config-13)" ]; then LLVM_CONFIG="llvm-config-13"
-	elif [ -n "$(command -v llvm-config-12)" ]; then LLVM_CONFIG="llvm-config-12"
-	elif [ -n "$(command -v llvm-config-11)" ]; then LLVM_CONFIG="llvm-config-11"
 	# freebsd
+	elif [ -n "$(command -v llvm-config21)" ]; then  LLVM_CONFIG="llvm-config21"
 	elif [ -n "$(command -v llvm-config20)" ]; then  LLVM_CONFIG="llvm-config20"
 	elif [ -n "$(command -v llvm-config19)" ]; then  LLVM_CONFIG="llvm-config19"
 	elif [ -n "$(command -v llvm-config18)" ]; then  LLVM_CONFIG="llvm-config18"
 	elif [ -n "$(command -v llvm-config17)" ]; then  LLVM_CONFIG="llvm-config17"
 	elif [ -n "$(command -v llvm-config14)" ]; then  LLVM_CONFIG="llvm-config14"
-	elif [ -n "$(command -v llvm-config13)" ]; then  LLVM_CONFIG="llvm-config13"
-	elif [ -n "$(command -v llvm-config12)" ]; then  LLVM_CONFIG="llvm-config12"
-	elif [ -n "$(command -v llvm-config11)" ]; then  LLVM_CONFIG="llvm-config11"
 	# fallback
 	elif [ -n "$(command -v llvm-config)" ]; then LLVM_CONFIG="llvm-config"
 	else
@@ -75,18 +72,12 @@ LLVM_VERSION_MAJOR="$(echo $LLVM_VERSION | awk -F. '{print $1}')"
 LLVM_VERSION_MINOR="$(echo $LLVM_VERSION | awk -F. '{print $2}')"
 LLVM_VERSION_PATCH="$(echo $LLVM_VERSION | awk -F. '{print $3}')"

-if [ $LLVM_VERSION_MAJOR -lt 11 ] || ([ $LLVM_VERSION_MAJOR -gt 14 ] && [ $LLVM_VERSION_MAJOR -lt 17 ]) || [ $LLVM_VERSION_MAJOR -gt 20 ]; then
-	error "Invalid LLVM version $LLVM_VERSION: must be 11, 12, 13, 14, 17, 18, 19 or 20"
+if [ $LLVM_VERSION_MAJOR -lt 14 ] || ([ $LLVM_VERSION_MAJOR -gt 14 ] && [ $LLVM_VERSION_MAJOR -lt 17 ]) || [ $LLVM_VERSION_MAJOR -gt 21 ]; then
+	error "Invalid LLVM version $LLVM_VERSION: must be 14, 17, 18, 19, 20, or 21"
 fi

 case "$OS_NAME" in
 Darwin)
-	if [ "$OS_ARCH" = "arm64" ]; then
-		if [ $LLVM_VERSION_MAJOR -lt 13 ]; then
-			error "Invalid LLVM version $LLVM_VERSION: Darwin Arm64 requires LLVM 13, 14, 17, 18, 19 or 20"
-		fi
-	fi
-
 	darwin_sysroot=
 	if [ $(which xcrun) ]; then
 		darwin_sysroot="--sysroot $(xcrun --sdk macosx --show-sdk-path)"
--- a/codecov.yml
+++ b/codecov.yml
@@ -1 +1,6 @@
-comment: false
+comment: false
+coverage:
+  status:
+    project:
+      default:
+        threshold: 1%
--- a/core/bufio/doc.odin
+++ b/core/bufio/doc.odin
@@ -0,0 +1,2 @@
+// Wraps an `io.Stream` interface to provide buffered I/O.
+package bufio
--- a/core/bufio/read_writer.odin
+++ b/core/bufio/read_writer.odin
@@ -38,5 +38,5 @@ _read_writer_procedure := proc(stream_data: rawptr, mode: io.Stream_Mode, p: []b
 	case .Query:
 		return io.query_utility({.Flush, .Read, .Write, .Query})
 	}
-	return 0, .Empty
+	return 0, .Unsupported
 }
--- a/core/bufio/reader.odin
+++ b/core/bufio/reader.odin
@@ -29,6 +29,7 @@ MIN_READ_BUFFER_SIZE :: 16
@(private)
 DEFAULT_MAX_CONSECUTIVE_EMPTY_READS :: 128

+// reader_init initializes using an `allocator`
 reader_init :: proc(b: ^Reader, rd: io.Reader, size: int = DEFAULT_BUF_SIZE, allocator := context.allocator, loc := #caller_location) {
 	size := size
 	size = max(size, MIN_READ_BUFFER_SIZE)
@@ -37,6 +38,7 @@ reader_init :: proc(b: ^Reader, rd: io.Reader, size: int = DEFAULT_BUF_SIZE, all
 	b.buf = make([]byte, size, allocator, loc)
 }

+// reader_init_with_buf initializes using a user provided bytes buffer `buf`
 reader_init_with_buf :: proc(b: ^Reader, rd: io.Reader, buf: []byte) {
 	reader_reset(b, rd)
 	b.buf_allocator = {}
@@ -49,10 +51,12 @@ reader_destroy :: proc(b: ^Reader) {
 	b^ = {}
 }

+// reader_size returns the number of bytes in the backing buffer
 reader_size :: proc(b: ^Reader) -> int {
 	return len(b.buf)
 }

+// reader_reset resets the read and write positions, and the error values
 reader_reset :: proc(b: ^Reader, r: io.Reader) {
 	b.rd = r
 	b.r, b.w = 0, 0
@@ -343,7 +347,7 @@ _reader_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte, offse
 	case .Query:
 		return io.query_utility({.Read, .Destroy, .Query})
 	}
-	return 0, .Empty
+	return 0, .Unsupported
 }

 //
--- a/core/bufio/scanner.odin
+++ b/core/bufio/scanner.odin
@@ -46,6 +46,7 @@ DEFAULT_MAX_SCAN_TOKEN_SIZE :: 1<<16
@(private)
 _INIT_BUF_SIZE :: 4096

+// Initializes a Scanner whose buffer uses the allocator `buf_allocator`
 scanner_init :: proc(s: ^Scanner, r: io.Reader, buf_allocator := context.allocator) -> ^Scanner {
 	s.r = r
 	s.split = scan_lines
@@ -53,6 +54,8 @@ scanner_init :: proc(s: ^Scanner, r: io.Reader, buf_allocator := context.allocat
 	s.buf.allocator = buf_allocator
 	return s
 }
+
+// Initializes a Scanner with a user provided bytes buffer `buf`
 scanner_init_with_buffer :: proc(s: ^Scanner, r: io.Reader, buf: []byte) -> ^Scanner {
 	s.r = r
 	s.split = scan_lines
@@ -75,24 +78,27 @@ scanner_error :: proc(s: ^Scanner) -> Scanner_Error {
 	return s._err
 }

-// Returns the most recent token created by scanner_scan.
+// Returns the most recent token created by 'scan'.
 // The underlying array may point to data that may be overwritten
-// by another call to scanner_scan.
+// by another call to 'scan'.
 // Treat the returned value as if it is immutable.
 scanner_bytes :: proc(s: ^Scanner) -> []byte {
 	return s.token
 }

-// Returns the most recent token created by scanner_scan.
+// Returns the most recent token created by 'scan'.
 // The underlying array may point to data that may be overwritten
-// by another call to scanner_scan.
+// by another call to 'scan'.
 // Treat the returned value as if it is immutable.
 scanner_text :: proc(s: ^Scanner) -> string {
 	return string(s.token)
 }

-// scanner_scan advances the scanner
-scanner_scan :: proc(s: ^Scanner) -> bool {
+// scanner_scan is an alias of scan
+scanner_scan :: scan
+
+// scan advances the Scanner
+scan :: proc(s: ^Scanner) -> bool {
 	set_err :: proc(s: ^Scanner, err: Scanner_Error) {
 		switch s._err {
 		case nil, .EOF:
@@ -229,6 +235,7 @@ scanner_scan :: proc(s: ^Scanner) -> bool {
 	}
 }

+// scan_bytes is a splitting procedure that returns each byte as a token
 scan_bytes :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
 	if at_eof && len(data) == 0 {
 		return
@@ -236,6 +243,10 @@ scan_bytes :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte,
 	return 1, data[0:1], nil, false
 }

+// scan_runes is a splitting procedure that returns each UTF-8 encoded rune as a token.
+// The list of runes returned is equivalent to that of iterating over a string in a 'for in' loop, meaning any
+// erroneous UTF-8 encodings will be returned as U+FFFD. Unfortunately this means it is impossible for the "client"
+// to know whether a U+FFFD is an expected replacement rune or an encoding of an error.
 scan_runes :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
 	if at_eof && len(data) == 0 {
 		return
@@ -264,7 +275,8 @@ scan_runes :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte,
 	token = ERROR_RUNE
 	return
 }
-
+// scan_words is a splitting procedure that returns each Unicode-space-separated word of text, excluding the surrounding spaces.
+// It will never return an empty string.
 scan_words :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
 	is_space :: proc "contextless" (r:  rune) -> bool {
 		switch r {
@@ -312,6 +324,8 @@ scan_words :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte,
 	return
 }

+// scan_lines is a splitting procedure that returns each line of text, stripped of any trailing newline and an optional preceding carriage return (\r?\n).
+// The returned line may be empty.
 scan_lines :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
 	trim_carriage_return :: proc "contextless" (data: []byte) -> []byte {
 		if len(data) > 0 && data[len(data)-1] == '\r' {
--- a/core/bufio/writer.odin
+++ b/core/bufio/writer.odin
@@ -19,6 +19,7 @@ Writer :: struct {

 }

+// Initializes a Writer with an `allocator`
 writer_init :: proc(b: ^Writer, wr: io.Writer, size: int = DEFAULT_BUF_SIZE, allocator := context.allocator) {
 	size := size
 	size = max(size, MIN_READ_BUFFER_SIZE)
@@ -27,6 +28,7 @@ writer_init :: proc(b: ^Writer, wr: io.Writer, size: int = DEFAULT_BUF_SIZE, all
 	b.buf = make([]byte, size, allocator)
 }

+// Initializes a Writer with a user provided buffer `buf`
 writer_init_with_buf :: proc(b: ^Writer, wr: io.Writer, buf: []byte) {
 	writer_reset(b, wr)
 	b.buf_allocator = {}
@@ -247,5 +249,5 @@ _writer_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte, offse
 	case .Query:
 		return io.query_utility({.Flush, .Write, .Destroy, .Query})
 	}
-	return 0, .Empty
+	return 0, .Unsupported
 }
--- a/core/bytes/buffer.odin
+++ b/core/bytes/buffer.odin
@@ -434,5 +434,5 @@ _buffer_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte, offse
 	case .Query:
 		return io.query_utility({.Read, .Read_At, .Write, .Write_At, .Seek, .Size, .Destroy, .Query})
 	}
-	return 0, .Empty
+	return 0, .Unsupported
 }
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -1,3 +1,4 @@
+// Procedures for manipulation of `[]byte` slices.
 package bytes

 import "base:intrinsics"
@@ -134,8 +135,13 @@ equal_fold :: proc(u, v: []byte) -> bool {
 			return false
 		}

-		// TODO(bill): Unicode folding
-
+		r := unicode.simple_fold(sr) // general case: start walking sr's simple case-fold orbit
+		for r != sr && r < tr {
+			r = unicode.simple_fold(r) // step from r, not sr: re-folding sr never changes r, so the loop could not terminate
+		}
+		if r == tr { // tr was reached in sr's fold orbit, so the two runes are equal under folding
+			continue loop
+		}
 		return false
 	}

--- a/core/bytes/reader.odin
+++ b/core/bytes/reader.odin
@@ -160,6 +160,6 @@ _reader_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte, offse
 	case .Query:
 		return io.query_utility({.Read, .Read_At, .Seek, .Size, .Query})
 	}
-	return 0, .Empty
+	return 0, .Unsupported
 }

--- a/core/c/c.odin
+++ b/core/c/c.odin
@@ -1,3 +1,4 @@
+// Defines the basic types used by `C` programs for foreign function and data structure interop.
 package c

 import builtin "base:builtin"
@@ -48,7 +49,7 @@ int_least64_t  :: builtin.i64
 uint_least64_t :: builtin.u64

 // Same on Windows, Linux, and FreeBSD
-when ODIN_ARCH == .i386 || ODIN_ARCH == .amd64 {
+when ODIN_ARCH == .i386 {
 	int_fast8_t    :: builtin.i8
 	uint_fast8_t   :: builtin.u8
 	int_fast16_t   :: builtin.i32
@@ -57,6 +58,15 @@ when ODIN_ARCH == .i386 || ODIN_ARCH == .amd64 {
 	uint_fast32_t  :: builtin.u32
 	int_fast64_t   :: builtin.i64
 	uint_fast64_t  :: builtin.u64
+} else when ODIN_ARCH == .amd64 {
+	int_fast8_t    :: builtin.i8
+	uint_fast8_t   :: builtin.u8
+	int_fast16_t   :: long
+	uint_fast16_t  :: ulong
+	int_fast32_t   :: long
+	uint_fast32_t  :: ulong
+	int_fast64_t   :: builtin.i64
+	uint_fast64_t  :: builtin.u64
 } else {
 	int_fast8_t    :: builtin.i8
 	uint_fast8_t   :: builtin.u8
--- a/core/c/libc/doc.odin
+++ b/core/c/libc/doc.odin
@@ -0,0 +1,2 @@
+// Declares the commonly used things in `libc` (`C` standard library).
+package libc
--- a/core/c/libc/stdio.odin
+++ b/core/c/libc/stdio.odin
@@ -275,7 +275,7 @@ foreign libc {
 	// 7.21.7 Character input/output functions
 	fgetc     :: proc(stream: ^FILE) -> int ---
 	fgets     :: proc(s: [^]char, n: int, stream: ^FILE) -> [^]char ---
-	fputc     :: proc(s: cstring, stream: ^FILE) -> int ---
+	fputc     :: proc(s: c.int, stream: ^FILE) -> int ---
 	getc      :: proc(stream: ^FILE) -> int ---
 	getchar   :: proc() -> int ---
 	putc      :: proc(c: int, stream: ^FILE) -> int ---
@@ -390,7 +390,7 @@ to_stream :: proc(file: ^FILE) -> io.Stream {
 			}

 		case .Destroy:
-			return 0, .Empty
+			return 0, .Unsupported
 		
 		case .Query:
 			return io.query_utility({ .Close, .Flush, .Read, .Read_At, .Write, .Write_At, .Seek, .Size, .Query })
--- a/core/compress/common.odin
+++ b/core/compress/common.odin
@@ -1,15 +1,14 @@
+// A collection of utilities to aid with other `compress`ion packages.
+package compress
+
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-3 license.
+	Made available under Odin's license.

 	List of contributors:
 		Jeroen van Rijn: Initial implementation, optimization.
 */

-
-// package compress is a collection of utilities to aid with other compression packages
-package compress
-
 import "core:io"
 import "core:bytes"
 import "base:runtime"
@@ -298,7 +297,7 @@ peek_data_from_stream :: #force_inline proc(z: ^Context_Stream_Input, $T: typeid
 	curr := z.input->impl_seek(0, .Current) or_return
 	r, e1 := io.to_reader_at(z.input)
 	if !e1 {
-		return T{}, .Empty
+		return T{}, .Unsupported
 	}
 	when size <= 128 {
 		b: [size]u8
@@ -307,7 +306,7 @@ peek_data_from_stream :: #force_inline proc(z: ^Context_Stream_Input, $T: typeid
 	}
 	_, e2 := io.read_at(r, b[:], curr)
 	if e2 != .None {
-		return T{}, .Empty
+		return T{}, .Unsupported
 	}

 	res = (^T)(&b[0])^
@@ -325,7 +324,7 @@ peek_data_at_offset_from_stream :: #force_inline proc(z: ^Context_Stream_Input,

 	r, e3 := io.to_reader_at(z.input)
 	if !e3 {
-		return T{}, .Empty
+		return T{}, .Unsupported
 	}
 	when size <= 128 {
 		b: [size]u8
@@ -334,7 +333,7 @@ peek_data_at_offset_from_stream :: #force_inline proc(z: ^Context_Stream_Input,
 	}
 	_, e4 := io.read_at(r, b[:], pos)
 	if e4 != .None {
-		return T{}, .Empty
+		return T{}, .Unsupported
 	}

 	// Return read head to original position.
--- a/core/compress/gzip/doc.odin
+++ b/core/compress/gzip/doc.odin
@@ -1,15 +1,6 @@
 /*
-	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-3 license.
+A small `GZIP` unpacker.

-	List of contributors:
-		Jeroen van Rijn: Initial implementation.
-		Ginger Bill:     Cosmetic changes.
-
-	A small GZIP implementation as an example.
-*/
-
-/*
 Example:
 	import "core:bytes"
 	import "core:os"
@@ -88,3 +79,14 @@ Example:
 	}
 */
 package compress_gzip
+
+/*
+	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's license.
+
+	List of contributors:
+		Jeroen van Rijn: Initial implementation.
+		Ginger Bill:     Cosmetic changes.
+
+	A small GZIP implementation as an example.
+*/
--- a/core/compress/gzip/gzip.odin
+++ b/core/compress/gzip/gzip.odin
@@ -2,7 +2,7 @@ package compress_gzip

 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-3 license.
+	Made available under Odin's license.

 	List of contributors:
 		Jeroen van Rijn: Initial implementation.
--- a/core/compress/shoco/model.odin
+++ b/core/compress/shoco/model.odin
@@ -1,11 +1,11 @@
+package compress_shoco
+
 /*
 	This file was generated, so don't edit this by hand.
 	Transliterated from https://github.com/Ed-von-Schleck/shoco/blob/master/shoco_model.h,
 	which is an English word model.
 */

-package compress_shoco
-
 DEFAULT_MODEL :: Shoco_Model {
 	min_char = 39,
 	max_char = 122,
--- a/core/compress/shoco/shoco.odin
+++ b/core/compress/shoco/shoco.odin
@@ -1,6 +1,9 @@
+// `Shoco` short string compression and decompression.
+package compress_shoco
+
 /*
 	Copyright 2022 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-3 license.
+	Made available under Odin's license.

 	List of contributors:
 		Jeroen van Rijn: Initial implementation.
@@ -8,9 +11,6 @@
 	An implementation of [shoco](https://github.com/Ed-von-Schleck/shoco) by Christian Schramm.
 */

-// package shoco is an implementation of the shoco short string compressor.
-package compress_shoco
-
 import "base:intrinsics"
 import "core:compress"

--- a/core/compress/zlib/doc.odin
+++ b/core/compress/zlib/doc.odin
@@ -1,14 +1,6 @@
 /*
-	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-3 license.
+`Deflate` decompression of raw and `ZLIB`-type streams.

-	List of contributors:
-		Jeroen van Rijn: Initial implementation.
-
-	An example of how to use `zlib.inflate`.
-*/
-
-/*
 Example:
 	package main

@@ -49,3 +41,13 @@ Example:
 	}
 */
 package compress_zlib
+
+/*
+	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's license.
+
+	List of contributors:
+		Jeroen van Rijn: Initial implementation.
+
+	An example of how to use `zlib.inflate`.
+*/
--- a/core/compress/zlib/zlib.odin
+++ b/core/compress/zlib/zlib.odin
@@ -3,7 +3,7 @@ package compress_zlib

 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-3 license.
+	Made available under Odin's license.

 	List of contributors:
 		Jeroen van Rijn: Initial implementation, optimization.
--- a/core/container/avl/avl.odin
+++ b/core/container/avl/avl.odin
@@ -1,8 +1,4 @@
-/*
-package avl implements an AVL tree.
-
-The implementation is non-intrusive, and non-recursive.
-*/
+// A non-intrusive and non-recursive implementation of `AVL` trees.
 package container_avl

@(require) import "base:intrinsics"
--- a/core/container/bit_array/doc.odin
+++ b/core/container/bit_array/doc.odin
@@ -1,7 +1,9 @@
 /*
+A dynamically-sized array of bits.
+
 The Bit Array can be used in several ways:

-By default you don't need to instantiate a Bit Array.
+By default you don't need to instantiate a `Bit_Array`.
 Example:
 	package test

@@ -18,11 +20,11 @@ Example:

 		// returns `false`, `false`, because this Bit Array wasn't created to allow negative indices.
 		was_set, was_retrieved := get(&bits, -1)
-		fmt.println(was_set, was_retrieved) 
+		fmt.println(was_set, was_retrieved)
 		destroy(&bits)
 	}

-A Bit Array can optionally allow for negative indices, if the minimum value was given during creation.
+A `Bit_Array` can optionally allow for negative indices, if the minimum value was given during creation.
 Example:
 	package test

@@ -49,4 +51,4 @@ Example:
 		fmt.printf("Freed.\n")
 	}
 */
-package container_dynamic_bit_array
+package container_dynamic_bit_array
--- a/core/container/intrusive/list/doc.odin
+++ b/core/container/intrusive/list/doc.odin
@@ -1,7 +1,7 @@
 /*
-Package list implements an intrusive doubly-linked list.
+An intrusive doubly-linked list.

-An intrusive container requires a `Node` to be embedded in your own structure, like this.
+The intrusive container requires a `Node` to be embedded in your own structure, like this.
 Example:
 	My_String :: struct {
 		node:  list.Node,
@@ -46,4 +46,4 @@ Output:
 	Hello
 	World
 */
-package container_intrusive_list
+package container_intrusive_list
--- a/core/container/lru/lru_cache.odin
+++ b/core/container/lru/lru_cache.odin
@@ -1,3 +1,4 @@
+// A least-recently-used (`LRU`) cache. It automatically removes older entries if its capacity is reached.
 package container_lru

 import "base:runtime"
--- a/core/container/priority_queue/priority_queue.odin
+++ b/core/container/priority_queue/priority_queue.odin
@@ -1,3 +1,4 @@
+// A priority queue data structure.
 package container_priority_queue

 import "base:builtin"
--- a/core/container/queue/queue.odin
+++ b/core/container/queue/queue.odin
@@ -1,3 +1,4 @@
+// A dynamically resizable double-ended queue/ring-buffer.
 package container_queue

 import "base:builtin"
--- a/core/container/rbtree/rbtree.odin
+++ b/core/container/rbtree/rbtree.odin
@@ -1,4 +1,4 @@
-// This package implements a red-black tree
+// A red-black tree with the same API as our AVL tree.
 package container_rbtree

@(require) import "base:intrinsics"
@@ -128,9 +128,9 @@ find_value :: proc(t: ^$T/Tree($Key, $Value), key: Key) -> (value: Value, ok: bo
 	return
 }

-// find_or_insert attempts to insert the value into the tree, and returns
-// the node, a boolean indicating if the value was inserted, and the
-// node allocator error if relevant.  If the value is already present, the existing node is updated.
+// find_or_insert attempts to insert the key-value pair into the tree, and returns
+// the node, a boolean indicating if a new node was inserted, and the
+// node allocator error if relevant. If the key is already present, the existing node is updated and returned.
 find_or_insert :: proc(t: ^$T/Tree($Key, $Value), key: Key, value: Value) -> (n: ^Node(Key, Value), inserted: bool, err: runtime.Allocator_Error) {
 	n_ptr := &t._root
 	for n_ptr^ != nil {
@@ -141,6 +141,7 @@ find_or_insert :: proc(t: ^$T/Tree($Key, $Value), key: Key, value: Value) -> (n:
 		case .Greater:
 			n_ptr = &n._right
 		case .Equal:
+			n.value = value
 			return
 		}
 	}
--- a/core/container/small_array/doc.odin
+++ b/core/container/small_array/doc.odin
@@ -1,8 +1,7 @@
 /*
-Package small_array implements a dynamic array like
-interface on a stack-allocated, fixed-size array.
+A dynamic array-like interface on a stack-allocated, fixed-size array.

-The Small_Array type is optimal for scenarios where you need
+The `Small_Array` type is optimal for scenarios where you need
 a container for a fixed number of elements of a specific type,
 with the total number known at compile time but the exact
 number to be used determined at runtime.
@@ -33,7 +32,7 @@ Example:
 		return
 	}

-	// the Small_Array can be an ordinary parameter 'generic' over
+	// the `Small_Array` can be an ordinary parameter 'generic' over
 	// the actual length to be usable with different sizes
 	print_elements :: proc(arr: ^small_array.Small_Array($N, rune)) {
 		for r in small_array.slice(arr) {
@@ -52,4 +51,4 @@ Output:
 	Hellope

 */
-package container_small_array
+package container_small_array
--- a/core/container/small_array/small_array.odin
+++ b/core/container/small_array/small_array.odin
@@ -1,8 +1,8 @@
 package container_small_array

 import "base:builtin"
-import "base:runtime"
-_ :: runtime
+@require import "base:intrinsics"
+@require import "base:runtime"

 /*
 A fixed-size stack-allocated array operated on in a dynamic fashion.
@@ -105,8 +105,13 @@ This operation assumes that the small-array is large enough.

 This will result in:
 	- the value if 0 <= index < len
-	- the zero value of the type if len < index < capacity
-	- 'crash' if capacity < index or index < 0
+	- raise a bounds check error if capacity <= index
+	- the previous value if len < index < capacity, which defaults to T's zero value.
+
+	e.g. if you call `small_array.push(&a, 0, 1, 2)`, and `i := pop_back(&a)`,
+	then `get(a, 2)` will return the earlier value `2` at that location.
+
+	See also `get_safe`, which returns T's zero value and `false` if `index` is out of bounds.

 **Inputs**
 - `a`: The small-array
@@ -125,8 +130,13 @@ This operation assumes that the small-array is large enough.

 This will result in:
 	- the pointer if 0 <= index < len
-	- the pointer to the zero value if len < index < capacity
-	- 'crash' if capacity < index or index < 0
+	- raise a bounds check error if capacity <= index
+	- a pointer to the previous value if len < index < capacity, which defaults to T's zero value.
+
+	e.g. if you call `small_array.push(&a, 0, 1, 2)`, and `i := pop_back(&a)`,
+	then `get_ptr(a, 2)` will return a pointer to the slot containing the earlier value `2` at that location.
+
+	See also `get_ptr_safe`, which returns a nil pointer, and `false` if `index` is out of bounds.

 **Inputs**
 - `a`: A pointer to the small-array
@@ -169,7 +179,7 @@ Output:
 	x

 */
-get_safe :: proc(a: $A/Small_Array($N, $T), index: int) -> (T, bool) #no_bounds_check {
+get_safe :: proc "contextless" (a: $A/Small_Array($N, $T), index: int) -> (T, bool) #no_bounds_check {
 	if index < 0 || index >= a.len {
 		return {}, false
 	}
@@ -183,11 +193,11 @@ Get a pointer to the item at the specified position.
 - `a`: A pointer to the small-array
 - `index`: The position of the item to get

-**Returns** 
+**Returns**
 - the pointer to the element at the specified position
 - true if element exists, false otherwise
 */
-get_ptr_safe :: proc(a: ^$A/Small_Array($N, $T), index: int) -> (^T, bool) #no_bounds_check {
+get_ptr_safe :: proc "contextless" (a: ^$A/Small_Array($N, $T), index: int) -> (^T, bool) #no_bounds_check {
 	if index < 0 || index >= a.len {
 		return {}, false
 	}
@@ -231,7 +241,7 @@ Example:
 		fmt.println(small_array.slice(&a))

 		// resizing makes the change visible
-		small_array.resize(&a, 100)
+		small_array.non_zero_resize(&a, 100)
 		fmt.println(small_array.slice(&a))
 	}

@@ -250,6 +260,8 @@ set :: proc "contextless" (a: ^$A/Small_Array($N, $T), index: int, item: T) {
 /*
 Tries to resize the small-array to the specified length.

+The memory of added elements will be zeroed out.
+
 The new length will be:
 	- `length` if `length` <= capacity
 	- capacity if length > capacity
@@ -259,7 +271,7 @@ The new length will be:
 - `length`: The new desired length

 Example:
-	
+
 	import "core:container/small_array"
 	import "core:fmt"

@@ -269,7 +281,7 @@ Example:
 		small_array.push_back(&a, 1)
 		small_array.push_back(&a, 2)
 		fmt.println(small_array.slice(&a))
-		
+
 		small_array.resize(&a, 1)
 		fmt.println(small_array.slice(&a))

@@ -278,12 +290,56 @@ Example:
 	}

 Output:
-	
+
+	[1, 2]
+	[1]
+	[1, 0, 0, 0, 0]
+*/
+resize :: proc "contextless" (a: ^$A/Small_Array($N, $T), length: int) {
+	prev_len := a.len
+	a.len = min(length, builtin.len(a.data))
+	if prev_len < a.len {
+		intrinsics.mem_zero(&a.data[prev_len], size_of(T)*(a.len-prev_len))
+	}
+}
+
+/*
+Tries to resize the small-array to the specified length, without zeroing the memory of added elements.
+
+The new length will be:
+	- `length` if `length` <= capacity
+	- capacity if length > capacity
+
+**Inputs**
+- `a`: A pointer to the small-array
+- `length`: The new desired length
+
+Example:
+
+	import "core:container/small_array"
+	import "core:fmt"
+
+	non_zero_resize_example :: proc() {
+		a: small_array.Small_Array(5, int)
+
+		small_array.push_back(&a, 1)
+		small_array.push_back(&a, 2)
+		fmt.println(small_array.slice(&a))
+
+		small_array.non_zero_resize(&a, 1)
+		fmt.println(small_array.slice(&a))
+
+		small_array.non_zero_resize(&a, 100)
+		fmt.println(small_array.slice(&a))
+	}
+
+Output:
+
 	[1, 2]
 	[1]
 	[1, 2, 0, 0, 0]
 */
-resize :: proc "contextless" (a: ^$A/Small_Array, length: int) {
+non_zero_resize :: proc "contextless" (a: ^$A/Small_Array, length: int) {
 	a.len = min(length, builtin.len(a.data))
 }

--- a/core/container/topological_sort/topological_sort.odin
+++ b/core/container/topological_sort/topological_sort.odin
@@ -1,6 +1,5 @@
-// The following is a generic O(V+E) topological sorter implementation.
-// This is the fastest known method for topological sorting and Odin's
-// map type is being used to accelerate lookups.
+// A generic `O(V+E)` topological sorter implementation. This is the fastest known method for topological sorting.
+// Odin's map type is being used to accelerate lookups.
 package container_topological_sort

 import "base:intrinsics"
--- a/core/container/xar/xar.odin
+++ b/core/container/xar/xar.odin
@@ -0,0 +1,483 @@
+/*
+	Exponential Array (Xar).
+
+	A dynamically growing array using exponentially-sized chunks, providing stable
+	memory addresses for all elements. Unlike `[dynamic]T`, elements are never
+	moved once allocated, making it safe to hold pointers to elements.
+
+	For more information: https://azmr.uk/dyn/#exponential-arrayxar
+
+	Example:
+
+		import "core:container/xar"
+
+		example :: proc() {
+			x: xar.Xar(int, 4)
+			defer xar.destroy(&x)
+
+			xar.push_back(&x, 10)
+			xar.push_back(&x, 20)
+			xar.push_back(&x, 30)
+
+			ptr := xar.get_ptr(&x, 1)  // ptr remains valid after more push_backs
+			xar.push_back(&x, 40)
+			fmt.println(ptr^)  // prints 20
+		}
+*/
+package container_xar
+
+@(require) import "core:mem"
+@(require) import "base:intrinsics"
+@(require) import "base:runtime"
+
+PLATFORM_BITS :: 8*size_of(uint)
+_LOG2_PLATFORM_BITS :: intrinsics.constant_log2(PLATFORM_BITS)
+
+MAX_SHIFT :: PLATFORM_BITS>>1
+
+/*
+	An Exponential Array with stable element addresses.
+
+	Unlike `[dynamic]T` which reallocates and moves elements when growing, `Xar`
+	allocates separate chunks of exponentially increasing size. This guarantees
+	that pointers to elements remain valid for the lifetime of the container.
+
+	Fields:
+	- `chunks`: Fixed array of multi-pointers to allocated chunks
+	- `len`: Number of elements currently stored
+	- `allocator`: Allocator used for chunk allocations
+
+	Type Parameters:
+	- `T`: The element type
+	- `SHIFT`: Controls initial chunk size (1 << SHIFT). Must be in range (0, MAX_SHIFT].
+	          Larger values mean fewer, bigger chunks. Recommended: 4-8.
+
+	Chunk sizes grow as:
+	- `chunks[0]`: 1 << SHIFT elements
+	- `chunks[1]`: 1 << SHIFT elements
+	- `chunks[2]`: 1 << (SHIFT + 1) elements
+	- `chunks[3]`: 1 << (SHIFT + 2) elements
+	- `chunks[4]`: 1 << (SHIFT + 3) elements
+	- ...and so on
+
+	Example:
+
+		import "core:container/xar"
+
+		example :: proc() {
+			// Xar with initial chunk size of 16 (1 << 4)
+			x: xar.Xar(My_Struct, 4)
+			defer xar.destroy(&x)
+		}
+*/
+Xar :: struct($T: typeid, $SHIFT: uint) where 0 < SHIFT, SHIFT <= MAX_SHIFT {
+	chunks:    [(1 << (_LOG2_PLATFORM_BITS - intrinsics.constant_log2(SHIFT))) + 1][^]T,
+	len:       int,
+	allocator: mem.Allocator,
+}
+
+
+/*
+Initializes an exponential array with the given allocator.
+
+**Inputs**
+- `x`: Pointer to the exponential array to initialize
+- `allocator`: Allocator to use for chunk allocations (defaults to context.allocator)
+*/
+init :: proc(x: ^$X/Xar($T, $SHIFT), allocator := context.allocator) {
+	x^ = {allocator = allocator}
+}
+
+/*
+Frees all allocated chunks and resets the exponential array.
+
+**Inputs**
+- `x`: Pointer to the exponential array to destroy
+*/
+destroy :: proc(x: ^$X/Xar($T, $SHIFT)) {
+	#reverse for c, i in x.chunks {
+		if c != nil {
+			n := 1 << (SHIFT + uint(i if i > 0 else 1) - 1)
+			size_in_bytes := n * size_of(T)
+			mem.free_with_size(c, size_in_bytes, x.allocator)
+		}
+	}
+	x^ = {}
+}
+
+/*
+Resets the array's length to zero without freeing memory.
+Allocated chunks are retained for reuse.
+*/
+clear :: proc(x: ^$X/Xar($T, $SHIFT)) {
+	x.len = 0
+}
+
+// Returns the length of the exponential-array
+@(require_results)
+len :: proc(x: $X/Xar($T, $SHIFT)) -> int {
+	return x.len
+}
+
+// Returns the total number of element slots, counting every chunk up to the highest allocated one (0 if none is allocated).
+@(require_results)
+cap :: proc(x: $X/Xar($T, $SHIFT)) -> int {
+	#reverse for c, i in x.chunks {
+		if c != nil {
+			return 1 << (SHIFT + uint(i)) // chunks 0..=i hold `1 << (SHIFT + i)` slots in total; chunk 0 alone is `1 << SHIFT`
+		}
+	}
+	return 0
+}
+
+// Internal: computes chunk index, element index within chunk, and chunk capacity for a given index.
+@(require_results)
+_meta_get :: #force_inline proc($SHIFT: uint, index: uint) -> (chunk_idx, elem_idx, chunk_cap: uint) {
+	elem_idx = index
+	chunk_cap = uint(1) << SHIFT
+	chunk_idx = 0
+
+	index_shift := index >> SHIFT
+	if index_shift > 0 {
+		N :: 8*size_of(uint)-1
+		CLZ :: intrinsics.count_leading_zeros
+		chunk_idx = N-CLZ(index_shift) // MSB(index_shift)
+
+		chunk_cap  = 1 << (chunk_idx + SHIFT)
+		elem_idx   -= chunk_cap
+		chunk_idx += 1
+	}
+
+	return
+}
+/*
+Get a copy of the element at the specified index.
+
+**Inputs**
+- `x`: Pointer to the exponential array
+- `index`: Position of the element (0-indexed)
+
+**Returns**
+- a copy of the element
+*/
+@(require_results)
+get :: proc(x: ^$X/Xar($T, $SHIFT), #any_int index: int, loc := #caller_location) -> (val: T) #no_bounds_check {
+	runtime.bounds_check_error_loc(loc, index, x.len)
+	chunk_idx, elem_idx, _ := _meta_get(SHIFT, uint(index))
+	return x.chunks[chunk_idx][elem_idx]
+}
+
+/*
+Get a pointer to the element at the specified index.
+
+The returned pointer remains valid even after additional elements are added,
+as long as the element is not removed and the array is not destroyed.
+
+**Inputs**
+- `x`: Pointer to the exponential array
+- `index`: Position of the element (0-indexed)
+
+**Returns**
+- a stable pointer to the element
+
+Example:
+
+	import "core:container/xar"
+
+	get_ptr_example :: proc() {
+		x: xar.Xar(int, 4)
+		defer xar.destroy(&x)
+
+		xar.push_back(&x, 100)
+		ptr := xar.get_ptr(&x, 0)
+
+		// Pointer remains valid after growing
+		for i in 0..<1000 {
+			xar.push_back(&x, i)
+		}
+
+		fmt.println(ptr^)  // Still prints 100
+	}
+*/
+@(require_results)
+get_ptr :: proc(x: ^$X/Xar($T, $SHIFT), #any_int index: int, loc := #caller_location) -> (val: ^T) #no_bounds_check {
+	runtime.bounds_check_error_loc(loc, index, x.len)
+	chunk_idx, elem_idx, _ := _meta_get(SHIFT, uint(index))
+	return &x.chunks[chunk_idx][elem_idx]
+}
+
+/*
+Set the element at the specified index to the given value.
+
+**Inputs**
+- `x`: Pointer to the exponential array
+- `index`: Position of the element (0-indexed)
+- `value`: The value to set
+*/
+set :: proc(x: ^$X/Xar($T, $SHIFT), #any_int index: int, value: T, loc := #caller_location) #no_bounds_check {
+	runtime.bounds_check_error_loc(loc, index, x.len)
+	chunk_idx, elem_idx, _ := _meta_get(SHIFT, uint(index))
+	x.chunks[chunk_idx][elem_idx] = value
+}
+
+append    :: proc{push_back_elem, push_back_elems}
+push_back :: proc{push_back_elem, push_back_elems}
+
+/*
+Append an element to the end of the exponential array.
+Allocates a new chunk if necessary. Existing elements aren't moved, and their pointers remain stable.
+
+**Inputs**
+- `x`: Pointer to the exponential array
+- `value`: The element to append
+
+**Returns**
+- number of elements added (always 1 on success)
+- allocation error if chunk allocation failed
+
+Example:
+
+	import "core:container/xar"
+
+	push_back_example :: proc() {
+		x: xar.Xar(string, 4)
+		defer xar.destroy(&x)
+
+		xar.push_back(&x, "hello")
+		xar.push_back(&x, "world")
+
+		fmt.println(xar.get(&x, 0))  // hello
+		fmt.println(xar.get(&x, 1))  // world
+	}
+*/
+push_back_elem :: proc(x: ^$X/Xar($T, $SHIFT), value: T, loc := #caller_location) -> (n: int, err: mem.Allocator_Error) {
+	if x.allocator.procedure == nil {
+		// Mimic `[dynamic]T` behaviour: fall back to `context.allocator` when no allocator has been set.
+		x.allocator = context.allocator
+	}
+
+	chunk_idx, elem_idx, chunk_cap := _meta_get(SHIFT, uint(x.len))
+	if x.chunks[chunk_idx] == nil {
+		x.chunks[chunk_idx] = make([^]T, chunk_cap, x.allocator) or_return
+	}
+	x.chunks[chunk_idx][elem_idx] = value
+	x.len += 1
+	n = 1
+	return
+}
+
+/*
+Append multiple elements to the end of the exponential array.
+
+**Inputs**
+- `x`: Pointer to the exponential array
+- `values`: The elements to append
+
+**Returns**
+- number of elements successfully added
+- allocation error if chunk allocation failed (partial append possible)
+*/
+push_back_elems :: proc(x: ^$X/Xar($T, $SHIFT), values: ..T, loc := #caller_location) -> (n: int, err: mem.Allocator_Error) {
+	for value in values {
+		n += push_back_elem(x, value, loc) or_return
+	}
+	return
+}
+
+append_and_get_ptr :: push_back_elem_and_get_ptr
+/*
+Append an element and return a stable pointer to it.
+This is useful when you need to initialize a complex struct in-place or
+retain a reference to the newly added element.
+
+**Inputs**
+- `x`: Pointer to the exponential array
+- `value`: The element to append
+
+**Returns**
+- a stable pointer to the newly added element
+- allocation error if chunk allocation failed
+
+Example:
+
+	import "core:container/xar"
+
+	push_back_and_get_ptr_example :: proc() {
+		x: xar.Xar(My_Struct, 4)
+		defer xar.destroy(&x)
+
+		ptr := xar.push_back_elem_and_get_ptr(&x, My_Struct{}) or_else panic("alloc failed")
+		ptr.field = 42  // Initialize in-place
+	}
+*/
+@(require_results)
+push_back_elem_and_get_ptr :: proc(x: ^$X/Xar($T, $SHIFT), value: T, loc := #caller_location) -> (ptr: ^T, err: mem.Allocator_Error) {
+	if x.allocator.procedure == nil {
+		// Mimic `[dynamic]T` behaviour: fall back to `context.allocator` when no allocator has been set.
+		x.allocator = context.allocator
+	}
+
+	chunk_idx, elem_idx, chunk_cap := _meta_get(SHIFT, uint(x.len))
+	if x.chunks[chunk_idx] == nil {
+		x.chunks[chunk_idx] = make([^]T, chunk_cap, x.allocator) or_return
+	}
+	x.chunks[chunk_idx][elem_idx] = value
+	x.len += 1
+	ptr = &x.chunks[chunk_idx][elem_idx]
+	return
+}
+
+// `pop` will remove and return the end value of an exponential array `x` and reduces the length of the array by 1.
+//
+// Note: If the exponential array has no elements (`xar.len(x) == 0`), this procedure will panic.
+pop :: proc(x: ^$X/Xar($T, $SHIFT), loc := #caller_location) -> (val: T) {
+	assert(x.len > 0, loc=loc)
+	index := uint(x.len-1)
+	chunk_idx, elem_idx, _ := _meta_get(SHIFT, index)
+	x.len -= 1
+	return x.chunks[chunk_idx][elem_idx]
+}
+
+// `pop_safe` tries to remove and return the end value of the exponential array `x` and reduces the length of the array by 1.
+// If the array has no elements, it returns the zero value of `T` and `false` and leaves the array unchanged.
+@(require_results)
+pop_safe :: proc(x: ^$X/Xar($T, $SHIFT)) -> (val: T, ok: bool) {
+	if x.len == 0 {
+		return
+	}
+	index := uint(x.len-1)
+	chunk_idx, elem_idx, _ := _meta_get(SHIFT, index)
+	x.len -= 1
+
+	val = x.chunks[chunk_idx][elem_idx]
+	ok = true
+	return
+}
+
+/*
+	`unordered_remove` removes the element at the specified `index`. It does so by overwriting that element
+	with the current end value, and reducing the length of the exponential array by 1.
+
+	Note: This is an O(1) operation.
+	Note: There is currently no procedure that is the equivalent of an "ordered_remove"
+	Note: If the index is out of bounds, this procedure will panic.
+
+	Note: Pointers to the last element become invalid (it gets moved). Pointers to other elements remain valid.
+
+	Example:
+
+		import "core:container/xar"
+
+		unordered_remove_example :: proc() {
+			x: xar.Xar(int, 4)
+			defer xar.destroy(&x)
+
+			xar.push_back(&x, 10)
+			xar.push_back(&x, 20)
+			xar.push_back(&x, 30)
+
+			xar.unordered_remove(&x, 0)  // Removes 10, replaces with 30
+
+			// Array now contains [30, 20]
+			fmt.println(xar.get(&x, 0))  // 30
+			fmt.println(xar.get(&x, 1))  // 20
+		}
+*/
+unordered_remove :: proc(x: ^$X/Xar($T, $SHIFT), #any_int index: int, loc := #caller_location) {
+	runtime.bounds_check_error_loc(loc, index, x.len)
+	n := x.len-1
+	if index != n {
+		end := get(x, n)
+		set(x, index, end)
+	}
+	x.len -= 1
+}
+
+
+/*
+Iterator state for traversing a `Xar`.
+
+Fields:
+- `xar`: Pointer to the exponential array being iterated
+- `idx`: Current iteration index
+*/
+Iterator :: struct($T: typeid, $SHIFT: uint) {
+	xar: ^Xar(T, SHIFT),
+	idx: int,
+}
+
+/*
+Create an iterator for traversing the exponential array.
+
+**Inputs**
+- `xar`: Pointer to the exponential array
+
+**Returns**
+- an iterator positioned at the start
+
+Example:
+
+	import "core:container/xar"
+
+	iteration_example :: proc() {
+		x: xar.Xar(int, 4)
+		defer xar.destroy(&x)
+
+		xar.push_back(&x, 10)
+		xar.push_back(&x, 20)
+		xar.push_back(&x, 30)
+
+		it := xar.iterator(&x)
+		for val in xar.iterate_by_ptr(&it) {
+			fmt.println(val^)
+		}
+	}
+
+Output:
+
+	10
+	20
+	30
+*/
+iterator :: proc(xar: ^$X/Xar($T, $SHIFT)) -> Iterator(T, SHIFT) {
+	return {xar = auto_cast xar, idx = 0}
+}
+
+/*
+Returns the element at the iterator's current position and advances the iterator.
+
+**Inputs**
+- `it`: Pointer to the iterator
+
+**Returns**
+- current element
+- `true` if an element was returned, `false` if iteration is complete
+*/
+iterate_by_val :: proc(it: ^Iterator($T, $SHIFT)) -> (val: T, ok: bool) {
+	if it.idx >= it.xar.len {
+		return
+	}
+	val = get(it.xar, it.idx)
+	it.idx += 1
+	return val, true
+}
+
+
+/*
+Returns a pointer to the element at the iterator's current position and advances the iterator.
+
+**Inputs**
+- `it`: Pointer to the iterator
+
+**Returns**
+- pointer to the current element
+- `true` if an element was returned, `false` if iteration is complete
+*/
+iterate_by_ptr :: proc(it: ^Iterator($T, $SHIFT)) -> (val: ^T, ok: bool) {
+	if it.idx >= it.xar.len {
+		return
+	}
+	val = get_ptr(it.xar, it.idx)
+	it.idx += 1
+	return val, true
+}
--- a/core/crypto/README.md
+++ b/core/crypto/README.md
@@ -29,4 +29,4 @@ constant-time byte comparison.

 ## License

-This library is made available under the BSD-3 license.
+This library is made available under the zlib license.
--- a/core/crypto/_blake2/blake2.odin
+++ b/core/crypto/_blake2/blake2.odin
@@ -2,7 +2,7 @@ package _blake2

 /*
    Copyright 2021 zhibog
-    Made available under the BSD-3 license.
+    Made available under Odin's license.

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
--- a/core/crypto/_sha3/sha3.odin
+++ b/core/crypto/_sha3/sha3.odin
@@ -2,7 +2,7 @@ package _sha3

 /*
    Copyright 2021 zhibog
-    Made available under the BSD-3 license.
+    Made available under Odin's license.

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
--- a/core/crypto/aead/doc.odin
+++ b/core/crypto/aead/doc.odin
@@ -1,6 +1,5 @@
 /*
-package aead provides a generic interface to the supported Authenticated
-Encryption with Associated Data algorithms.
+A generic interface to Authenticated Encryption with Associated Data (`AEAD`) algorithms.

 Both a one-shot and context based interface are provided, with similar
 usage.  If multiple messages are to be sealed/opened via the same key,
@@ -54,4 +53,4 @@ Example:
 		assert(bytes.equal(opened_pt, plaintext))
 	}
 */
-package aead
+package aead
--- a/core/crypto/aegis/aegis.odin
+++ b/core/crypto/aegis/aegis.odin
@@ -1,6 +1,7 @@
 /*
-package aegis implements the AEGIS-128L and AEGIS-256 Authenticated
-Encryption with Additional Data algorithms.
+`AEGIS-128L` and `AEGIS-256` AEAD algorithms.
+
+Where AEAD stands for Authenticated Encryption with Additional Data.

 See:
 - [[ https://www.ietf.org/archive/id/draft-irtf-cfrg-aegis-aead-12.txt ]]
--- a/core/crypto/aes/aes.odin
+++ b/core/crypto/aes/aes.odin
@@ -1,5 +1,5 @@
 /*
-package aes implements the AES block cipher and some common modes.
+The `AES` block cipher and some common modes.

 See:
 - [[ https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197-upd1.pdf ]]
--- a/core/crypto/blake2b/blake2b.odin
+++ b/core/crypto/blake2b/blake2b.odin
@@ -1,5 +1,5 @@
 /*
-package blake2b implements the BLAKE2b hash algorithm.
+`BLAKE2b` hash algorithm.

 See:
 - [[ https://datatracker.ietf.org/doc/html/rfc7693 ]]
@@ -9,7 +9,7 @@ package blake2b

 /*
    Copyright 2021 zhibog
-    Made available under the BSD-3 license.
+    Made available under Odin's license.

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
--- a/core/crypto/blake2s/blake2s.odin
+++ b/core/crypto/blake2s/blake2s.odin
@@ -1,5 +1,5 @@
 /*
-package blake2s implements the BLAKE2s hash algorithm.
+`BLAKE2s` hash algorithm.

 See:
 - [[ https://datatracker.ietf.org/doc/html/rfc7693 ]]
@@ -9,7 +9,7 @@ package blake2s

 /*
    Copyright 2021 zhibog
-    Made available under the BSD-3 license.
+    Made available under Odin's license.

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
--- a/core/crypto/chacha20/chacha20.odin
+++ b/core/crypto/chacha20/chacha20.odin
@@ -1,5 +1,5 @@
 /*
-package chacha20 implements the ChaCha20 and XChaCha20 stream ciphers.
+`ChaCha20` and `XChaCha20` stream ciphers.

 See:
 - [[ https://datatracker.ietf.org/doc/html/rfc8439 ]]
--- a/core/crypto/chacha20poly1305/chacha20poly1305.odin
+++ b/core/crypto/chacha20poly1305/chacha20poly1305.odin
@@ -1,7 +1,7 @@
 /*
-package chacha20poly1305 implements the AEAD_CHACHA20_POLY1305 and
-AEAD_XChaCha20_Poly1305 Authenticated Encryption with Additional Data
-algorithms.
+`AEAD_CHACHA20_POLY1305` and `AEAD_XChaCha20_Poly1305` algorithms.
+
+Where AEAD stands for Authenticated Encryption with Additional Data.

 See:
 - [[ https://www.rfc-editor.org/rfc/rfc8439 ]]
--- a/core/crypto/crypto.odin
+++ b/core/crypto/crypto.odin
@@ -1,12 +1,13 @@
-/*
-package crypto implements a selection of cryptography algorithms and useful
-helper routines.
-*/
+// A selection of cryptography algorithms and useful helper routines.
 package crypto

 import "base:runtime"
 import "core:mem"

+// HAS_RAND_BYTES is true iff the runtime provides a cryptographic
+// entropy source.
+HAS_RAND_BYTES :: runtime.HAS_RAND_BYTES
+
 // compare_constant_time returns 1 iff a and b are equal, 0 otherwise.
 //
 // The execution time of this routine is constant regardless of the contents
@@ -57,7 +58,7 @@ rand_bytes :: proc (dst: []byte) {
 	// zero-fill the buffer first
 	mem.zero_explicit(raw_data(dst), len(dst))

-	_rand_bytes(dst)
+	runtime.rand_bytes(dst)
 }

 // random_generator returns a `runtime.Random_Generator` backed by the
--- a/core/crypto/deoxysii/deoxysii.odin
+++ b/core/crypto/deoxysii/deoxysii.odin
@@ -1,6 +1,5 @@
 /*
-package deoxysii implements the Deoxys-II-256 Authenticated Encryption
-with Additional Data algorithm.
+`Deoxys-II-256` Authenticated Encryption with Additional Data (`AEAD`) algorithm.

 - [[ https://sites.google.com/view/deoxyscipher ]]
 - [[ https://thomaspeyrin.github.io/web/assets/docs/papers/Jean-etal-JoC2021.pdf ]]
--- a/core/crypto/ed25519/ed25519.odin
+++ b/core/crypto/ed25519/ed25519.odin
@@ -1,5 +1,5 @@
 /*
-package ed25519 implements the Ed25519 EdDSA signature algorithm.
+`Ed25519` EdDSA signature algorithm.

 See:
 - [[ https://datatracker.ietf.org/doc/html/rfc8032 ]]
--- a/core/crypto/hash/doc.odin
+++ b/core/crypto/hash/doc.odin
@@ -1,5 +1,5 @@
 /*
-package hash provides a generic interface to the supported hash algorithms.
+A generic interface to the supported hash algorithms.

 A high-level convenience procedure group `hash` is provided to easily
 accomplish common tasks.
--- a/core/crypto/hash/hash.odin
+++ b/core/crypto/hash/hash.odin
@@ -2,7 +2,7 @@ package crypto_hash

 /*
 	Copyright 2021 zhibog
-	Made available under the BSD-3 license.
+	Made available under Odin's license.

 	List of contributors:
 		zhibog, dotbmp:  Initial implementation.
--- a/core/crypto/hkdf/hkdf.odin
+++ b/core/crypto/hkdf/hkdf.odin
@@ -1,6 +1,5 @@
 /*
-package hkdf implements the HKDF HMAC-based Extract-and-Expand Key
-Derivation Function.
+`HKDF` HMAC-based Extract-and-Expand Key Derivation Function.

 See: [[ https://www.rfc-editor.org/rfc/rfc5869 ]]
 */
--- a/core/crypto/hmac/hmac.odin
+++ b/core/crypto/hmac/hmac.odin
@@ -1,5 +1,5 @@
 /*
-package hmac implements the HMAC MAC algorithm.
+`HMAC` message authentication code (`MAC`) algorithm.

 See:
 - [[ https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.198-1.pdf ]]
--- a/core/crypto/kmac/kmac.odin
+++ b/core/crypto/kmac/kmac.odin
@@ -1,5 +1,5 @@
 /*
-package kmac implements the KMAC MAC algorithm.
+`KMAC` message authentication code (`MAC`) algorithm.

 See:
 - [[ https://nvlpubs.nist.gov/nistpubs/specialpublications/nist.sp.800-185.pdf ]]
--- a/core/crypto/legacy/keccak/keccak.odin
+++ b/core/crypto/legacy/keccak/keccak.odin
@@ -1,5 +1,5 @@
 /*
-package keccak implements the Keccak hash algorithm family.
+`Keccak` hash algorithm family.

 During the SHA-3 standardization process, the padding scheme was changed
 thus Keccac and SHA-3 produce different outputs.  Most users should use
@@ -10,7 +10,7 @@ package keccak

 /*
    Copyright 2021 zhibog
-    Made available under the BSD-3 license.
+    Made available under Odin's license.

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
--- a/core/crypto/legacy/md5/md5.odin
+++ b/core/crypto/legacy/md5/md5.odin
@@ -1,5 +1,5 @@
 /*
-package md5 implements the MD5 hash algorithm.
+`MD5` hash algorithm.

 WARNING: The MD5 algorithm is known to be insecure and should only be
 used for interoperating with legacy applications.
@@ -12,7 +12,7 @@ package md5

 /*
    Copyright 2021 zhibog
-    Made available under the BSD-3 license.
+    Made available under Odin's license.

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
--- a/core/crypto/legacy/sha1/sha1.odin
+++ b/core/crypto/legacy/sha1/sha1.odin
@@ -1,5 +1,5 @@
 /*
-package sha1 implements the SHA1 hash algorithm.
+`SHA1` hash algorithm.

 WARNING: The SHA1 algorithm is known to be insecure and should only be
 used for interoperating with legacy applications.
@@ -13,7 +13,7 @@ package sha1

 /*
    Copyright 2021 zhibog
-    Made available under the BSD-3 license.
+    Made available under Odin's license.

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
--- a/core/crypto/pbkdf2/pbkdf2.odin
+++ b/core/crypto/pbkdf2/pbkdf2.odin
@@ -1,5 +1,5 @@
 /*
-package pbkdf2 implements the PBKDF2 password-based key derivation function.
+`PBKDF2` password-based key derivation function.

 See: [[ https://www.rfc-editor.org/rfc/rfc2898 ]]
 */
--- a/core/crypto/poly1305/poly1305.odin
+++ b/core/crypto/poly1305/poly1305.odin
@@ -1,5 +1,5 @@
 /*
-package poly1305 implements the Poly1305 one-time MAC algorithm.
+`Poly1305` one-time MAC algorithm.

 See:
 - [[ https://datatracker.ietf.org/doc/html/rfc8439 ]]
--- a/core/crypto/rand_bsd.odin
+++ b/core/crypto/rand_bsd.odin
@@ -1,15 +0,0 @@
-#+build freebsd, openbsd, netbsd
-package crypto
-
-foreign import libc "system:c"
-
-HAS_RAND_BYTES :: true
-
-foreign libc {
-	arc4random_buf :: proc(buf: [^]byte, nbytes: uint) ---
-}
-
-@(private)
-_rand_bytes :: proc(dst: []byte) {
-	arc4random_buf(raw_data(dst), len(dst))
-}
--- a/core/crypto/rand_darwin.odin
+++ b/core/crypto/rand_darwin.odin
@@ -1,17 +0,0 @@
-package crypto
-
-import "core:fmt"
-
-import CF "core:sys/darwin/CoreFoundation"
-import Sec "core:sys/darwin/Security"
-
-HAS_RAND_BYTES :: true
-
-@(private)
-_rand_bytes :: proc(dst: []byte) {
-	err := Sec.RandomCopyBytes(count=len(dst), bytes=raw_data(dst))
-	if err != .Success {
-		msg := CF.StringCopyToOdinString(Sec.CopyErrorMessageString(err))
-		fmt.panicf("crypto/rand_bytes: SecRandomCopyBytes returned non-zero result: %v %s", err, msg)
-	}
-}
--- a/core/crypto/rand_generic.odin
+++ b/core/crypto/rand_generic.odin
@@ -1,16 +0,0 @@
-#+build !linux
-#+build !windows
-#+build !openbsd
-#+build !freebsd
-#+build !netbsd
-#+build !darwin
-#+build !js
-#+build !wasi
-package crypto
-
-HAS_RAND_BYTES :: false
-
-@(private)
-_rand_bytes :: proc(dst: []byte) {
-	unimplemented("crypto: rand_bytes not supported on this OS")
-}
--- a/core/crypto/rand_js.odin
+++ b/core/crypto/rand_js.odin
@@ -1,24 +0,0 @@
-package crypto
-
-foreign import "odin_env"
-foreign odin_env {
-	@(link_name = "rand_bytes")
-	env_rand_bytes :: proc "contextless" (buf: []byte) ---
-}
-
-HAS_RAND_BYTES :: true
-
-@(private)
-_MAX_PER_CALL_BYTES :: 65536 // 64kiB
-
-@(private)
-_rand_bytes :: proc(dst: []byte) {
-	dst := dst
-
-	for len(dst) > 0 {
-		to_read := min(len(dst), _MAX_PER_CALL_BYTES)
-		env_rand_bytes(dst[:to_read])
-
-		dst = dst[to_read:]
-	}
-}
--- a/core/crypto/rand_linux.odin
+++ b/core/crypto/rand_linux.odin
@@ -1,40 +0,0 @@
-package crypto
-
-import "core:fmt"
-
-import "core:sys/linux"
-
-HAS_RAND_BYTES :: true
-
-@(private)
-_MAX_PER_CALL_BYTES :: 33554431 // 2^25 - 1
-
-@(private)
-_rand_bytes :: proc (dst: []byte) {
-	dst := dst
-	l := len(dst)
-
-	for l > 0 {
-		to_read := min(l, _MAX_PER_CALL_BYTES)
-		n_read, errno := linux.getrandom(dst[:to_read], {})
-		#partial switch errno {
-		case .NONE:
-			// Do nothing
-		case .EINTR:
-			// Call interupted by a signal handler, just retry the
-			// request.
-			continue
-		case .ENOSYS:
-			// The kernel is apparently prehistoric (< 3.17 circa 2014)
-			// and does not support getrandom.
-			panic("crypto: getrandom not available in kernel")
-		case:
-			// All other failures are things that should NEVER happen
-			// unless the kernel interface changes (ie: the Linux
-			// developers break userland).
-			fmt.panicf("crypto: getrandom failed: %v", errno)
-		}
-		l -= n_read
-		dst = dst[n_read:]
-	}
-}
--- a/core/crypto/rand_wasi.odin
+++ b/core/crypto/rand_wasi.odin
@@ -1,13 +0,0 @@
-package crypto
-
-import "core:fmt"
-import "core:sys/wasm/wasi"
-
-HAS_RAND_BYTES :: true
-
-@(private)
-_rand_bytes :: proc(dst: []byte) {
-	if err := wasi.random_get(dst); err != nil {
-		fmt.panicf("crypto: wasi.random_get failed: %v", err)
-	}
-}
--- a/core/crypto/rand_windows.odin
+++ b/core/crypto/rand_windows.odin
@@ -1,26 +0,0 @@
-package crypto
-
-import win32 "core:sys/windows"
-import "core:os"
-import "core:fmt"
-
-HAS_RAND_BYTES :: true
-
-@(private)
-_rand_bytes :: proc(dst: []byte) {
-	ret := os.Platform_Error(win32.BCryptGenRandom(nil, raw_data(dst), u32(len(dst)), win32.BCRYPT_USE_SYSTEM_PREFERRED_RNG))
-	if ret != nil {
-		#partial switch ret {
-		case os.ERROR_INVALID_HANDLE:
-			// The handle to the first parameter is invalid.
-			// This should not happen here, since we explicitly pass nil to it
-			panic("crypto: BCryptGenRandom Invalid handle for hAlgorithm")
-		case os.ERROR_INVALID_PARAMETER:
-			// One of the parameters was invalid
-			panic("crypto: BCryptGenRandom Invalid parameter")
-		case:
-			// Unknown error
-			fmt.panicf("crypto: BCryptGenRandom failed: %d\n", ret)
-		}
-	}
-}
--- a/core/crypto/ristretto255/ristretto255.odin
+++ b/core/crypto/ristretto255/ristretto255.odin
@@ -1,5 +1,5 @@
 /*
-package ristretto255 implement the ristretto255 prime-order group.
+`Ristretto255` prime-order group.

 See:
 - [[ https://www.rfc-editor.org/rfc/rfc9496 ]]
--- a/core/crypto/sha2/sha2.odin
+++ b/core/crypto/sha2/sha2.odin
@@ -1,5 +1,5 @@
 /*
-package sha2 implements the SHA2 hash algorithm family.
+`SHA2` hash algorithm family.

 See:
 - [[ https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf ]]
@@ -9,7 +9,7 @@ package sha2

 /*
    Copyright 2021 zhibog
-    Made available under the BSD-3 license.
+    Made available under Odin's license.

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
--- a/core/crypto/sha3/sha3.odin
+++ b/core/crypto/sha3/sha3.odin
@@ -1,5 +1,5 @@
 /*
-package sha3 implements the SHA3 hash algorithm family.
+`SHA3` hash algorithm family.

 The SHAKE XOF can be found in crypto/shake.  While discouraged if the
 pre-standardization Keccak algorithm is required, it can be found in
@@ -12,7 +12,7 @@ package sha3

 /*
    Copyright 2021 zhibog
-    Made available under the BSD-3 license.
+    Made available under Odin's license.

    List of contributors:
        zhibog, dotbmp:  Initial implementation.
--- a/Show More
+++ b/Show More