Merge branch 'master' into fix/field-first-writes-into-dynamic-soa

2026-05-26 21:58:14 +00:00 · 2026-04-09 15:19:11 -03:00
parent f82d41bc9a c87d1a3cf6
commit 2c6c646342
263 changed files with 64048 additions and 17378 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -8,3 +8,5 @@ vendor/box2d/lib/box2d_windows_amd64_sse2.lib filter=lfs diff=lfs merge=lfs -tex
 vendor/miniaudio/lib/miniaudio.lib filter=lfs diff=lfs merge=lfs -text
 vendor/sdl3/SDL3.dll filter=lfs diff=lfs merge=lfs -text
 vendor/sdl3/SDL3.lib filter=lfs diff=lfs merge=lfs -text
+vendor/sdl3/mixer/*.dll filter=lfs diff=lfs merge=lfs -text
+vendor/sdl3/mixer/*.lib filter=lfs diff=lfs merge=lfs -text
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -93,8 +93,8 @@ jobs:
        if: matrix.os == 'macos-latest'
        run: |
          brew update
-          brew install llvm@20 wasmtime lua@5.4 lld
-          echo "$(brew --prefix llvm@20)/bin" >> $GITHUB_PATH
+          brew install llvm@22 wasmtime lua@5.4 lld
+          echo "$(brew --prefix llvm@22)/bin" >> $GITHUB_PATH

      - name: Download LLVM (Ubuntu)
        if: matrix.os == 'ubuntu-latest' || matrix.os == 'ubuntu-24.04-arm'
@@ -140,9 +140,18 @@ jobs:
      - name: Optimized Core library tests
        run: ./odin test tests/core/speed.odin -o:speed -file -all-packages -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -sanitize:address
      - name: Wycheproof tests
-        run: ./odin test tests/core/crypto/wycheproof -vet -vet-tabs -strict-style -vet-style -vet-cast -warnings-as-errors -disallow-do -o:speed -microarch:native
+
+        run: ./odin test tests/core/crypto/wycheproof -vet -vet-tabs -strict-style -vet-style -vet-cast -warnings-as-errors -disallow-do -o:speed
      - name: Vendor library tests
        run: ./odin test tests/vendor -all-packages -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -sanitize:address
+        if: matrix.os != 'macos-15-intel' && matrix.os != 'macos-latest'
+      - name: Vendor library tests (MacOS ARM)
+        run: ./odin test tests/vendor -all-packages -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -sanitize:address -extra-linker-flags:"-L/opt/homebrew/opt/lua@5.4/lib"
+        if: matrix.os == 'macos-latest'
+      - name: Vendor library tests (MacOS Intel)
+        run: ./odin test tests/vendor -all-packages -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -sanitize:address -extra-linker-flags:"-L/usr/local/opt/lua@5.4/lib"
+        if: matrix.os == 'macos-15-intel'
+
      - name: Internals tests
        run: ./odin test tests/internal -all-packages -vet -vet-tabs -strict-style -vet-style -warnings-as-errors -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -sanitize:address
      - name: GitHub Issue tests
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -58,6 +58,7 @@ jobs:
              musl-dev llvm20-dev clang20 git mold lz4 \
              libxml2-static llvm20-static zlib-static zstd-static \
              make &&
+            git config --global --add safe.directory /src &&
            ./ci/build_linux_static.sh
          '
      - name: Odin run
--- a/base/intrinsics/intrinsics.odin
+++ b/base/intrinsics/intrinsics.odin
@@ -1,4 +1,4 @@
-// This is purely for documentation
+// This is purely for documentation
 #+build ignore
 package intrinsics

@@ -77,7 +77,9 @@ prefetch_write_instruction :: proc(address: rawptr, #const locality: i32 /* 0..=
 prefetch_write_data        :: proc(address: rawptr, #const locality: i32 /* 0..=3 */) ---

 // Compiler Hints
-expect :: proc(val, expected_val: $T) -> T ---
+expect   :: proc(val, expected_val: $T) -> T ---
+likely   :: proc(val: $T) -> T where type_is_boolean(T) ---
+unlikely :: proc(val: $T) -> T where type_is_boolean(T) ---

 // Linux and Darwin Only
 syscall :: proc(id: uintptr, args: ..uintptr) -> uintptr ---
@@ -180,6 +182,7 @@ type_is_bit_set          :: proc($T: typeid) -> bool ---
 type_is_bit_field        :: proc($T: typeid) -> bool ---
 type_is_simd_vector      :: proc($T: typeid) -> bool ---
 type_is_matrix           :: proc($T: typeid) -> bool ---
+type_is_fixed_capacity_dynamic_array :: proc($T: typeid) -> bool ---

 type_has_nil :: proc($T: typeid) -> bool ---

@@ -202,6 +205,9 @@ type_bit_set_underlying_type :: proc($T: typeid) -> typeid where type_is_bit_set
 type_has_field  :: proc($T: typeid, $name: string) -> bool ---
 type_field_type :: proc($T: typeid, $name: string) -> typeid ---

+type_field_bit_size :: proc($T: typeid, $name: string) -> int where type_is_bit_field(T) ---
+type_field_bit_offset :: proc($T: typeid, $name: string) -> int where type_is_bit_field(T) ---
+
 type_proc_parameter_count :: proc($T: typeid) -> int where type_is_proc(T) ---
 type_proc_return_count    :: proc($T: typeid) -> int where type_is_proc(T) ---

@@ -222,6 +228,8 @@ type_is_superset_of :: proc($Super, $Sub: typeid) -> bool ---

 type_field_index_of :: proc($T: typeid, $name: string) -> uintptr ---

+type_fixed_capacity_dynamic_array_len_offset :: proc($T: typeid/[dynamic; $N]$E) -> uintptr ---
+
 // "Contiguous" means that the set of enum constants, when sorted, have a difference of either 0 or 1 between consecutive values.
 // This is the exact opposite of "sparse".
 type_enum_is_contiguous :: proc($T: typeid) -> bool where type_is_enum(T) ---
@@ -340,7 +348,11 @@ simd_trunc   :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
 // rounding to the nearest integral value; if two values are equally near, rounds to the even one
 simd_nearest :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---

-simd_to_bits :: proc(v: #simd[N]T) -> #simd[N]Integer where size_of(T) == size_of(Integer), type_is_unsigned(Integer) ---
+simd_approx_recip      :: proc(x: #simd[N]T) -> #simd[N]T where type_is_float(T) ---
+simd_approx_recip_sqrt :: proc(x: #simd[N]T) -> #simd[N]T where type_is_float(T) ---
+
+simd_to_bits        :: proc(v: #simd[N]T) -> #simd[N]Integer where size_of(T) == size_of(Integer), type_is_unsigned(Integer) ---
+simd_to_bits_signed :: proc(v: #simd[N]T) -> #simd[N]Integer where size_of(T) == size_of(Integer), !type_is_unsigned(Integer) ---

 // equivalent to a swizzle with descending indices, e.g. reserve(a, 3, 2, 1, 0)
 simd_lanes_reverse :: proc(a: #simd[N]T) -> #simd[N]T ---
@@ -348,6 +360,16 @@ simd_lanes_reverse :: proc(a: #simd[N]T) -> #simd[N]T ---
 simd_lanes_rotate_left  :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
 simd_lanes_rotate_right :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---

+// return {b[0], a[1], b[2], a[3], ...}
+simd_odd_even :: proc(a, b: #simd[N]T) -> #simd[N]T ---
+
+// Returns the sums of N consecutive lanes
+simd_sums_of_n :: proc(a: #simd[LANES]T, $N: uint) -> #simd[LANES/N]T where is_power_of_two(N) ---
+
+simd_pairwise_add :: proc(a, b: #simd[LANES]T) -> #simd[LANES]T ---
+simd_pairwise_sub :: proc(a, b: #simd[LANES]T) -> #simd[LANES]T ---
+
+
 // Checks if the current target supports the given target features.
 //
 // Takes a constant comma-seperated string (eg: "sha512,sse4.1"), or a procedure type which has either
--- a/base/runtime/core.odin
+++ b/base/runtime/core.odin
@@ -39,6 +39,10 @@ Calling_Convention :: enum u8 {

 	Win64       = 9,
 	SysV        = 10,
+
+	Preserve_None = 11,
+	Preserve_Most = 12,
+	Preserve_All  = 13,
 }

 Type_Info_Enum_Value :: distinct i64
@@ -137,7 +141,7 @@ Type_Info_Struct :: struct {

 	flags: Type_Info_Struct_Flags,

-	// These are only set iff this structure is an SOA structure
+	// These are set if and only if (⟺) this structure is an SOA structure
 	soa_kind:      Type_Info_Struct_Soa_Kind,
 	soa_len:       i32,
 	soa_base_type: ^Type_Info,
@@ -166,10 +170,11 @@ Type_Info_Map :: struct {
 	map_info: ^Map_Info,
 }
 Type_Info_Bit_Set :: struct {
-	elem:       ^Type_Info,
-	underlying: ^Type_Info, // Possibly nil
-	lower:      i64,
-	upper:      i64,
+	elem:                ^Type_Info,
+	underlying:          ^Type_Info,
+	explicit_underlying: bool, // false = bit_set[T], true = bit_set[T, U]
+	lower:               i64,
+	upper:               i64,
 }
 Type_Info_Simd_Vector :: struct {
 	elem:       ^Type_Info,
@@ -201,6 +206,14 @@ Type_Info_Bit_Field :: struct {
 	field_count:  int,
 }

+Type_Info_Fixed_Capacity_Dynamic_Array :: struct {
+	elem: ^Type_Info,
+	elem_size:  int,
+	capacity:   int,
+	len_offset: uintptr,
+}
+
+
 Type_Info_Flag :: enum u8 {
 	Comparable     = 0,
 	Simple_Compare = 1,
@@ -241,6 +254,7 @@ Type_Info :: struct {
 		Type_Info_Matrix,
 		Type_Info_Soa_Pointer,
 		Type_Info_Bit_Field,
+		Type_Info_Fixed_Capacity_Dynamic_Array,
 	},
 }

@@ -420,6 +434,11 @@ Raw_Dynamic_Array :: struct {
 	allocator: Allocator,
 }

+Raw_Fixed_Capacity_Dynamic_Array :: struct($Capacity: uint, $T: typeid) {
+	data: [Capacity]T,
+	len:  int,
+}
+
 // The raw, type-erased representation of a map.
 //
 // 32-bytes on 64-bit
@@ -654,9 +673,8 @@ type_info_base :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
 	return base
 }

-
 // type_info_core returns the core-type of a `^Type_Info` stripping the `distinct`ness from the first level AND/OR
-// returns the backing integer type of an enum or bit_set `^Type_Info`.
+// returns the backing integer type of an enum `^Type_Info`.
 // This is also aliased as `type_info_base_without_enum`
@(require_results)
 type_info_core :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
@@ -676,11 +694,35 @@ type_info_core :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
 	return base
 }

-// type_info_base_without_enum returns the core-type of a `^Type_Info` stripping the `distinct`ness from the first level AND/OR
+
+
+// type_info_underlying returns the underlying (backing) type of a `^Type_Info` stripping the `distinct`ness from the first level AND/OR
+// returns the backing integer type of an enum `^Type_Info` AND/OR the underlying integer type of a bit_set or bit_field.
+@(require_results)
+type_info_underlying :: proc "contextless" (info: ^Type_Info) -> ^Type_Info {
+	if info == nil {
+		return nil
+	}
+
+	base := info
+	loop: for {
+		#partial switch i in base.variant {
+		case Type_Info_Named:     base = i.base
+		case Type_Info_Enum:      base = i.base
+		case Type_Info_Bit_Set:   base = i.underlying
+		case Type_Info_Bit_Field: base = i.backing_type
+		case: break loop
+		}
+	}
+	return base
+}
+
+// `type_info_base_without_enum` returns the core-type of a `^Type_Info` stripping the `distinct`ness from the first level AND/OR
 // returns the backing integer type of an enum or bit_set `^Type_Info`.
 // This is also aliased as `type_info_core`
 type_info_base_without_enum :: type_info_core

+@(require_results)
 __type_info_of :: proc "contextless" (id: typeid) -> ^Type_Info #no_bounds_check {
 	n := u64(len(type_table))
 	i := transmute(u64)id % n
@@ -696,14 +738,16 @@ __type_info_of :: proc "contextless" (id: typeid) -> ^Type_Info #no_bounds_check

 when !ODIN_NO_RTTI {
 	// typeid_base returns the base-type of a `typeid` stripping the `distinct`ness from the first level
+	@(require_results)
 	typeid_base :: proc "contextless" (id: typeid) -> typeid {
 		ti := type_info_of(id)
 		ti = type_info_base(ti)
 		return ti.id
 	}
 	// typeid_core returns the core-type of a `typeid` stripping the `distinct`ness from the first level AND/OR
-	// returns the backing integer type of an enum or bit_set `typeid`.
+	// returns the backing integer type of an enum `typeid`.
 	// This is also aliased as `typeid_base_without_enum`
+	@(require_results)
 	typeid_core :: proc "contextless" (id: typeid) -> typeid {
 		ti := type_info_core(type_info_of(id))
 		return ti.id
@@ -713,6 +757,12 @@ when !ODIN_NO_RTTI {
 	// returns the backing integer type of an enum or bit_set `typeid`.
 	// This is also aliased as `typeid_core`
 	typeid_base_without_enum :: typeid_core
+
+	@(require_results)
+	typeid_underlying :: proc "contextless" (id: typeid) -> typeid {
+		ti := type_info_underlying(type_info_of(id))
+		return ti.id
+	}
 }


--- a/base/runtime/core_builtin.odin
+++ b/base/runtime/core_builtin.odin
@@ -119,14 +119,14 @@ copy :: proc{copy_slice, copy_from_string, copy_from_string16}



-// `unordered_remove` removed the element at the specified `index`. It does so by replacing the current end value
+// `unordered_remove_dynamic_array` removed the element at the specified `index`. It does so by replacing the current end value
 // with the old value, and reducing the length of the dynamic array by 1.
 //
 // Note: This is an O(1) operation.
 // Note: If you want the elements to remain in their order, use `ordered_remove`.
 // Note: If the index is out of bounds, this procedure will panic.
@builtin
-unordered_remove :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #caller_location) #no_bounds_check {
+unordered_remove_dynamic_array :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #caller_location) #no_bounds_check {
 	bounds_check_error_loc(loc, index, len(array))
 	n := len(array)-1
 	if index != n {
@@ -134,13 +134,13 @@ unordered_remove :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #ca
 	}
 	(^Raw_Dynamic_Array)(array).len -= 1
 }
-// `ordered_remove` removed the element at the specified `index` whilst keeping the order of the other elements.
+// `ordered_remove_dynamic_array` removed the element at the specified `index` whilst keeping the order of the other elements.
 //
 // Note: This is an O(N) operation.
 // Note: If the elements do not have to remain in their order, prefer `unordered_remove`.
 // Note: If the index is out of bounds, this procedure will panic.
@builtin
-ordered_remove :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #caller_location) #no_bounds_check {
+ordered_remove_dynamic_array :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #caller_location) #no_bounds_check {
 	bounds_check_error_loc(loc, index, len(array))
 	if index+1 < len(array) {
 		copy(array[index:], array[index+1:])
@@ -148,12 +148,12 @@ ordered_remove :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #call
 	(^Raw_Dynamic_Array)(array).len -= 1
 }

-// `remove_range` removes a range of elements specified by the range `lo` and `hi`, whilst keeping the order of the other elements.
+// `remove_range_dynamic_array` removes a range of elements specified by the range `lo` and `hi`, whilst keeping the order of the other elements.
 //
 // Note: This is an O(N) operation.
 // Note: If the range is out of bounds, this procedure will panic.
@builtin
-remove_range :: proc(array: ^$D/[dynamic]$T, #any_int lo, hi: int, loc := #caller_location) #no_bounds_check {
+remove_range_dynamic_array :: proc(array: ^$D/[dynamic]$T, #any_int lo, hi: int, loc := #caller_location) #no_bounds_check {
 	slice_expr_error_lo_hi_loc(loc, lo, hi, len(array))
 	n := max(hi-lo, 0)
 	if n > 0 {
@@ -164,29 +164,117 @@ remove_range :: proc(array: ^$D/[dynamic]$T, #any_int lo, hi: int, loc := #calle
 	}
 }

+// `unordered_remove_fixed_capacity_dynamic_array` removes the element at the specified `index`. It does so by overwriting that element
+// with the current end value, and reducing the length of the fixed capacity dynamic array by 1.
+//
+// Note: This is an O(1) operation.
+// Note: If you want the elements to remain in their order, use `ordered_remove`.
+// Note: If the index is out of bounds, this procedure will panic.
+@builtin
+unordered_remove_fixed_capacity_dynamic_array :: proc(array: ^$D/[dynamic; $N]$E, #any_int index: int, loc := #caller_location) #no_bounds_check {
+	bounds_check_error_loc(loc, index, len(array))
+	n := len(array)-1
+	if index != n {
+		array[index] = array[n]
+	}
+	(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len -= 1
+}
+// `ordered_remove_fixed_capacity_dynamic_array` removes the element at the specified `index` whilst keeping the order of the other elements.
+//
+// Note: This is an O(N) operation.
+// Note: If the elements do not have to remain in their order, prefer `unordered_remove`.
+// Note: If the index is out of bounds, this procedure will panic.
+@builtin
+ordered_remove_fixed_capacity_dynamic_array :: proc(array: ^$D/[dynamic; $N]$E, #any_int index: int, loc := #caller_location) #no_bounds_check {
+	bounds_check_error_loc(loc, index, len(array))
+	if index+1 < len(array) {
+		copy(array[index:], array[index+1:])
+	}
+	(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len -= 1
+}

-// `pop` will remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
+// `remove_range_fixed_capacity_dynamic_array` removes a range of elements specified by the range `lo` and `hi`, whilst keeping the order of the other elements.
+//
+// Note: This is an O(N) operation.
+// Note: If the range is out of bounds, this procedure will panic.
+@builtin
+remove_range_fixed_capacity_dynamic_array :: proc(array: ^$D/[dynamic; $N]$E, #any_int lo, hi: int, loc := #caller_location) #no_bounds_check {
+	slice_expr_error_lo_hi_loc(loc, lo, hi, len(array))
+	n := max(hi-lo, 0)
+	if n > 0 {
+		if hi != len(array) {
+			copy(array[lo:], array[hi:])
+		}
+		(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len -= n
+	}
+}
+
+@builtin
+unordered_remove :: proc{
+	unordered_remove_dynamic_array,
+	unordered_remove_fixed_capacity_dynamic_array,
+}
+
+
+@builtin
+ordered_remove :: proc{
+	ordered_remove_dynamic_array,
+	ordered_remove_fixed_capacity_dynamic_array,
+}
+
+@builtin
+remove_range :: proc{
+	remove_range_dynamic_array,
+	remove_range_fixed_capacity_dynamic_array,
+}
+
+
+
+// `pop_dynamic_array` will remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
 //
 // Note: If the dynamic array has no elements (`len(array) == 0`), this procedure will panic.
@builtin
-pop :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
+pop_dynamic_array :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
 	assert(len(array) > 0, loc=loc)
-	_pop_type_erased(&res, (^Raw_Dynamic_Array)(array), size_of(E))
+	_pop_dynamic_array_type_erased(&res, (^Raw_Dynamic_Array)(array), size_of(E))
 	return res
 }

-_pop_type_erased :: proc(res: rawptr, array: ^Raw_Dynamic_Array, elem_size: int, loc := #caller_location) {
+_pop_dynamic_array_type_erased :: proc(res: rawptr, array: ^Raw_Dynamic_Array, elem_size: int) {
 	end := rawptr(uintptr(array.data) + uintptr(elem_size*(array.len-1)))
 	intrinsics.mem_copy_non_overlapping(res, end, elem_size)
 	array.len -= 1
 }


+// `pop_fixed_capacity_dynamic_array` will remove and return the end value of fixed capacity dynamic array `array` and reduces the length of `array` by 1.
+//
+// Note: If the fixed capacity dynamic array has no elements (`len(array) == 0`), this procedure will panic.
+@builtin
+pop_fixed_capacity_dynamic_array :: proc(array: ^$T/[dynamic; $N]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
+	assert(len(array) > 0, loc=loc)

-// `pop_safe` trys to remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
+	elem_size :: size_of(E)
+	end := rawptr(uintptr(array) + uintptr(elem_size*(len(array)-1)))
+	intrinsics.mem_copy_non_overlapping(&res, end, elem_size)
+	(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len -= 1
+	return res
+}
+
+
+// `pop` will remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
+//
+// Note: If the dynamic array has no elements (`len(array) == 0`), this procedure will panic.
+@builtin
+pop :: proc{
+	pop_dynamic_array,
+	pop_fixed_capacity_dynamic_array,
+}
+
+// `pop_safe_dynamic_array` tries to remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
 // If the operation is not possible, it will return false.
@builtin
-pop_safe :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #no_bounds_check {
+pop_safe_dynamic_array :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #no_bounds_check {
 	if len(array) == 0 {
 		return
 	}
@@ -195,11 +283,32 @@ pop_safe :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #n
 	return
 }

-// `pop_front` will remove and return the first value of dynamic array `array` and reduces the length of `array` by 1.
+// `pop_safe_fixed_capacity_dynamic_array` tries to remove and return the end value of fixed capacity dynamic array `array` and reduces the length of `array` by 1.
+// If the operation is not possible, it will return false.
+@builtin
+pop_safe_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E) -> (res: E, ok: bool) #no_bounds_check {
+	if len(array) == 0 {
+		return
+	}
+	res, ok = array[len(array)-1], true
+	(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len -= 1
+	return
+}
+
+// `pop_safe` tries to remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
+// If the operation is not possible, it will return false.
+@builtin
+pop_safe :: proc{
+	pop_safe_dynamic_array,
+	pop_safe_fixed_capacity_dynamic_array,
+}
+
+
+// `pop_front_dynamic_array` will remove and return the first value of dynamic array `array` and reduces the length of `array` by 1.
 //
 // Note: If the dynamic array as no elements (`len(array) == 0`), this procedure will panic.
@builtin
-pop_front :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
+pop_front_dynamic_array :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
 	assert(len(array) > 0, loc=loc)
 	res = array[0]
 	if len(array) > 1 {
@@ -209,10 +318,35 @@ pop_front :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #
 	return res
 }

-// `pop_front_safe` trys to return and remove the first value of dynamic array `array` and reduces the length of `array` by 1.
+// `pop_front_fixed_capacity_dynamic_array` will remove and return the first value of fixed capacity dynamic array `array` and reduces the length of `array` by 1.
+//
+// Note: If the fixed capacity dynamic array has no elements (`len(array) == 0`), this procedure will panic.
+@builtin
+pop_front_fixed_capacity_dynamic_array :: proc(array: ^$T/[dynamic; $N]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
+	assert(len(array) > 0, loc=loc)
+	res = array[0]
+	if len(array) > 1 {
+		copy(array[0:], array[1:])
+	}
+	(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len -= 1
+	return res
+}
+
+
+// `pop_front` will remove and return the first value of dynamic array `array` and reduces the length of `array` by 1.
+//
+// Note: If the dynamic array has no elements (`len(array) == 0`), this procedure will panic.
+@builtin
+pop_front :: proc{
+	pop_front_dynamic_array,
+	pop_front_fixed_capacity_dynamic_array,
+}
+
+
+// `pop_front_safe_dynamic_array` tries to return and remove the first value of dynamic array `array` and reduces the length of `array` by 1.
 // If the operation is not possible, it will return false.
@builtin
-pop_front_safe :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #no_bounds_check {
+pop_front_safe_dynamic_array :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bool) #no_bounds_check {
 	if len(array) == 0 {
 		return
 	}
@@ -224,12 +358,37 @@ pop_front_safe :: proc "contextless" (array: ^$T/[dynamic]$E) -> (res: E, ok: bo
 	return
 }

+// `pop_front_safe_fixed_capacity_dynamic_array` tries to return and remove the first value of fixed capacity dynamic array `array` and reduces the length of `array` by 1.
+// If the operation is not possible, it will return false.
+@builtin
+pop_front_safe_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E) -> (res: E, ok: bool) #no_bounds_check {
+	if len(array) == 0 {
+		return
+	}
+	res, ok = array[0], true
+	if len(array) > 1 {
+		copy(array[0:], array[1:])
+	}
+	(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len -= 1
+	return
+}
+
+// `pop_front_safe` tries to return and remove the first value of dynamic array `array` and reduces the length of `array` by 1.
+// If the operation is not possible, it will return false.
+@builtin
+pop_front_safe :: proc {
+	pop_front_safe_dynamic_array,
+	pop_front_safe_fixed_capacity_dynamic_array,
+}
+
+

 // `clear` will set the length of a passed dynamic array or map to `0`
@builtin
 clear :: proc{
 	clear_dynamic_array,
 	clear_map,
+	clear_fixed_capacity_dynamic_array,

 	clear_soa_dynamic_array,
 }
@@ -254,6 +413,7 @@ non_zero_reserve :: proc{
@builtin
 resize :: proc{
 	resize_dynamic_array,
+	resize_fixed_capacity_dynamic_array,

 	resize_soa,
 }
@@ -261,6 +421,7 @@ resize :: proc{
@builtin
 non_zero_resize :: proc{
 	non_zero_resize_dynamic_array,
+	non_zero_resize_fixed_capacity_dynamic_array,

 	non_zero_resize_soa,
 }
@@ -532,6 +693,8 @@ shrink_map :: proc(m: ^$T/map[$K]$V, loc := #caller_location) -> (did_shrink: bo

 // The delete_key built-in procedure deletes the element with the specified key (m[key]) from the map.
 // If m is nil, or there is no such element, this procedure is a no-op
+// It is safe to use `delete_key` while iterating a map.
+// However, inserting a new key while iterating across a map could resize it, which means the iteration may not visit all of the elements.
@builtin
 delete_key :: proc(m: ^$T/map[$K]$V, key: K) -> (deleted_key: K, deleted_value: V) {
 	if m != nil {
@@ -669,6 +832,15 @@ non_zero_append_elem_string :: proc(array: ^$T/[dynamic]$E/u8, arg: $A/string, l
 	return _append_elem_string(array, arg, false, loc)
 }

+// `non_zero_append_elem_fixed_capacity_string` appends the bytes of a string to the end of a fixed capacity dynamic array of bytes.
+// Returns the number of bytes appended, which is less than `len(arg)` when the remaining capacity is too small.
+// Note: Prefer using the procedure group `non_zero_append`.
+@builtin
+non_zero_append_elem_fixed_capacity_string :: proc "contextless" (array: ^$T/[dynamic; $N]$E/u8, arg: $A/string) -> (n: int) {
+	return append_fixed_capacity_elems(array, ..transmute([]E)(arg))
+}
+
+

 // The append_string built-in procedure appends multiple strings to the end of a [dynamic]u8 like type
 //
@@ -686,6 +858,57 @@ append_string :: proc(array: ^$T/[dynamic]$E/u8, args: ..string, loc := #caller_
 	return
 }

+
+// `append_fixed_capacity_elem` appends an element to the end of a fixed capacity dynamic array. Returns 0 on failure
+@builtin
+append_fixed_capacity_elem :: proc "contextless" (array: ^$T/[dynamic; $N]$E, #no_broadcast arg: E) -> (n: int) {
+	Raw :: Raw_Fixed_Capacity_Dynamic_Array(N, E)
+
+	if (^Raw)(array).len >= N {
+		return 0
+	}
+
+	when size_of(E) != 0 {
+		#no_bounds_check (^Raw)(array).data[(^Raw)(array).len] = arg
+	}
+	(^Raw)(array).len += 1
+	return 1
+}
+
+
+// `append_fixed_capacity_elems` appends multiple elements to the end of a fixed capacity dynamic array. Returns the number of elements appended, which is limited by the remaining capacity
+@builtin
+append_fixed_capacity_elems :: proc "contextless" (array: ^$T/[dynamic; $N]$E, #no_broadcast args: ..E) -> (n: int) {
+	Raw :: Raw_Fixed_Capacity_Dynamic_Array(N, E)
+	raw := (^Raw)(array)
+
+	n = min(N - len(array), len(args))
+
+	#no_bounds_check when size_of(E) != 0 {
+		intrinsics.mem_copy(&raw.data[raw.len], raw_data(args), n*size_of(E))
+	}
+
+	raw.len += n
+	return n
+}
+
+// The append_fixed_capacity_string built-in procedure appends multiple strings to the end of a [dynamic; N]u8 like type
+//
+// Note: Prefer using the procedure group `append`.
+@builtin
+append_fixed_capacity_string :: proc "contextless" (array: ^$T/[dynamic; $N]$E/u8, args: ..string) -> (n: int) {
+	n_arg: int
+	for arg in args {
+		n_arg = append_fixed_capacity_elems(array, ..transmute([]E)(arg))
+		n += n_arg
+		if n_arg < len(arg) {
+			return
+		}
+	}
+	return
+}
+
+
 // The append built-in procedure appends elements to the end of a dynamic array
@builtin
 append :: proc{
@@ -693,6 +916,10 @@ append :: proc{
 	append_elems,
 	append_elem_string,

+	append_fixed_capacity_elem,
+	append_fixed_capacity_elems,
+	append_fixed_capacity_string,
+
 	append_soa_elem,
 	append_soa_elems,
 }
@@ -703,6 +930,10 @@ non_zero_append :: proc{
 	non_zero_append_elems,
 	non_zero_append_elem_string,

+	append_fixed_capacity_elem,
+	append_fixed_capacity_elems,
+	non_zero_append_elem_fixed_capacity_string,
+
 	non_zero_append_soa_elem,
 	non_zero_append_soa_elems,
 }
@@ -711,7 +942,7 @@ non_zero_append :: proc{
 // `append_nothing` appends an empty value to a dynamic array. It returns `1, nil` if successful, and `0, err` when it was not possible,
 // whatever `err` happens to be.
@builtin
-append_nothing :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+append_nothing_dynamic_array :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	if array == nil {
 		return 0, nil
 	}
@@ -720,6 +951,27 @@ append_nothing :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (n: i
 	return len(array)-prev_len, nil
 }

+// `append_nothing_fixed_capacity_dynamic_array` appends an empty value to a fixed capacity dynamic array. It returns `1, true` if successful,
+// and `ok = false` when the resize was not possible. A nil `array` yields `0, true`.
+@builtin
+append_nothing_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E) -> (n: int, ok: bool) {
+	if array == nil {
+		return 0, true
+	}
+	prev_len := len(array)
+	resize_fixed_capacity_dynamic_array(array, len(array)+1) or_return
+	return len(array)-prev_len, true
+}
+
+
+// `append_nothing` appends an empty value to a dynamic array. It returns `1, nil` if successful, and `0, err` when it was not possible,
+// whatever `err` happens to be.
+@builtin
+append_nothing :: proc{
+	append_nothing_dynamic_array,
+	append_nothing_fixed_capacity_dynamic_array,
+}
+

 // `inject_at_elem` injects an element in a dynamic array at a specified index and moves the previous elements after that index "across"
@builtin
@@ -795,16 +1047,92 @@ inject_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, #any_int index: int, ar
 	return
 }

+
+// `inject_at_elem_fixed_capacity_dynamic_array` injects an element in a fixed capacity dynamic array at a specified index and moves the previous elements after that index "across"
+@builtin
+inject_at_elem_fixed_capacity_dynamic_array :: proc(array: ^$T/[dynamic; $N]$E, #any_int index: int, #no_broadcast arg: E, loc := #caller_location) -> (ok: bool) #no_bounds_check {
+	when !ODIN_NO_BOUNDS_CHECK {
+		ensure(index >= 0, "Index must be positive.", loc)
+	}
+	if array == nil {
+		return false
+	}
+	n := max(len(array), index)
+	m :: 1
+	new_size := n + m
+
+	resize(array, new_size) or_return
+	when size_of(E) != 0 {
+		copy(array[index + m:], array[index:])
+		array[index] = arg
+	}
+	return true
+}
+
+// `inject_at_elems_fixed_capacity_dynamic_array` injects multiple elements in a fixed capacity dynamic array at a specified index and moves the previous elements after that index "across"
+@builtin
+inject_at_elems_fixed_capacity_dynamic_array :: proc(array: ^$T/[dynamic; $N]$E, #any_int index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool) #no_bounds_check {
+	when !ODIN_NO_BOUNDS_CHECK {
+		ensure(index >= 0, "Index must be positive.", loc)
+	}
+	if array == nil {
+		return false
+	}
+	if len(args) == 0 {
+		return true
+	}
+
+	n := max(len(array), index)
+	m := len(args)
+	new_size := n + m
+
+	resize(array, new_size) or_return
+	when size_of(E) != 0 {
+		copy(array[index + m:], array[index:])
+		copy(array[index:], args)
+	}
+	return true
+}
+
+// `inject_at_elem_string_fixed_capacity_dynamic_array` injects a string into a fixed capacity dynamic array at a specified index and moves the previous elements after that index "across"
+@builtin
+inject_at_elem_string_fixed_capacity_dynamic_array :: proc(array: ^$T/[dynamic; $N]$E/u8, #any_int index: int, arg: string, loc := #caller_location) -> (ok: bool) #no_bounds_check {
+	when !ODIN_NO_BOUNDS_CHECK {
+		ensure(index >= 0, "Index must be positive.", loc)
+	}
+	if array == nil {
+		return false
+	}
+	if len(arg) == 0 {
+		return true
+	}
+
+	n := max(len(array), index)
+	m := len(arg)
+	new_size := n + m
+
+	resize(array, new_size) or_return
+	copy(array[index+m:], array[index:])
+	copy(array[index:], arg)
+	return true
+}
+
+
 // `inject_at` injects something into a dynamic array at a specified index and moves the previous elements after that index "across"
-@builtin inject_at :: proc{
+@builtin
+inject_at :: proc{
 	inject_at_elem,
 	inject_at_elems,
 	inject_at_elem_string,
+
+	inject_at_elem_fixed_capacity_dynamic_array,
+	inject_at_elems_fixed_capacity_dynamic_array,
+	inject_at_elem_string_fixed_capacity_dynamic_array,
 }



-// `assign_at_elem` assigns a value at a given index. If the requested index is smaller than the current
+// `assign_at_elem` assigns a value at a given index. If the requested index is past the end of the current
 // size of the dynamic array, it will attempt to `resize` the a new length of `index+1` and then assign as `index`.
@builtin
 assign_at_elem :: proc(array: ^$T/[dynamic]$E, #any_int index: int, arg: E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
@@ -820,7 +1148,7 @@ assign_at_elem :: proc(array: ^$T/[dynamic]$E, #any_int index: int, arg: E, loc
 }


-// `assign_at_elems` assigns a values at a given index. If the requested index is smaller than the current
+// `assign_at_elems` assigns a values at a given index. If the requested index is past the end of the current
 // size of the dynamic array, it will attempt to `resize` the a new length of `index+len(args)` and then assign as `index`.
@builtin
 assign_at_elems :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
@@ -838,7 +1166,7 @@ assign_at_elems :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadca
 	return
 }

-// `assign_at_elem_string` assigns a string at a given index. If the requested index is smaller than the current
+// `assign_at_elem_string` assigns a string at a given index. If the requested index is past the end of the current
 // size of the dynamic array, it will attempt to `resize` the a new length of `index+len(arg)` and then assign as `index`.
@builtin
 assign_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, #any_int index: int, arg: string, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
@@ -856,13 +1184,71 @@ assign_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, #any_int index: int, ar
 	return
 }

-// `assign_at` assigns a value at a given index. If the requested index is smaller than the current
+
+// `assign_at_elem_fixed_capacity_dynamic_array` assigns a value at a given index. If the requested index is past the end of the current
+// size of the array, it is first resized to a new length of `index+1`. Returns `false` when `array` is nil or `index` is outside `0..<N`.
+@builtin
+assign_at_elem_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E, #any_int index: int, arg: E) -> (ok: bool) #no_bounds_check {
+	// Bounds checks are off and `resize` clamps to `N` while still reporting success, so validate the index here.
+	if array == nil || index < 0 || index >= N {
+		return false
+	}
+	if index >= len(array) {
+		resize(array, index+1) or_return
+	}
+	array[index] = arg
+	return true
+}
+
+
+// `assign_at_elems_fixed_capacity_dynamic_array` assigns values starting at a given index. If they extend past the end of the current size of the
+// array, it is first resized to `index+len(args)`. Returns `false` when `array` is nil, `index` is negative, or the values would not fit in `N`.
+@builtin
+assign_at_elems_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E, #any_int index: int, #no_broadcast args: ..E) -> (ok: bool) #no_bounds_check {
+	if len(args) == 0 {
+		return true
+	}
+	new_size := index + len(args)
+	if array == nil || index < 0 || new_size > N {
+		return false
+	}
+	if new_size > len(array) {
+		resize(array, new_size) or_return
+	}
+	copy(array[index:], args)
+	return true
+}
+
+// `assign_at_elem_string_fixed_capacity_dynamic_array` assigns a string starting at a given index. If it extends past the end of the current size of the
+// array, it is first resized to `index+len(arg)`. Returns `false` when `array` is nil, `index` is negative, or the string would not fit in `N`.
+@builtin
+assign_at_elem_string_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E/u8, #any_int index: int, arg: string) -> (ok: bool) #no_bounds_check {
+	if len(arg) == 0 {
+		return true
+	}
+	new_size := index + len(arg)
+	if array == nil || index < 0 || new_size > N {
+		return false
+	}
+	if new_size > len(array) {
+		resize(array, new_size) or_return
+	}
+	copy(array[index:], arg)
+	return true
+}
+
+
+// `assign_at` assigns a value at a given index. If the requested index is past the end of the current
 // size of the dynamic array, it will attempt to `resize` the a new length of `index+size_needed` and then assign as `index`.
@builtin
 assign_at :: proc{
 	assign_at_elem,
 	assign_at_elems,
 	assign_at_elem_string,
+
+	assign_at_elem_fixed_capacity_dynamic_array,
+	assign_at_elems_fixed_capacity_dynamic_array,
+	assign_at_elem_string_fixed_capacity_dynamic_array,
 }


@@ -877,6 +1263,16 @@ clear_dynamic_array :: proc "contextless" (array: ^$T/[dynamic]$E) {
 	}
 }

+// `clear_fixed_capacity_dynamic_array` resets the length of the passed fixed capacity dynamic array to `0`; a nil pointer is ignored.
+// Note: Prefer the procedure group `clear`.
+@builtin
+clear_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E) {
+	raw := (^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array)
+	if raw != nil {
+		raw.len = 0
+	}
+}
+
 // `reserve_dynamic_array` will try to reserve memory of a passed dynamic array or map to the requested element count (setting the `cap`).
 //
 // When a memory resize allocation is required, the memory will be asked to be zeroed (i.e. it calls `mem_resize`).
@@ -996,6 +1392,43 @@ non_zero_resize_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int length: i
 	return _resize_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), length, false, loc=loc)
 }

+
+
+// `resize_fixed_capacity_dynamic_array` sets the `len` of a fixed capacity dynamic array to `length`, clamped to the range `0..=N`.
+// Bytes of elements that become part of the array when it grows are zeroed. Returns `false` only when `array` is nil.
+//
+// Note: Prefer the procedure group `resize`
+@builtin
+resize_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E, #any_int length: int) -> bool {
+	if array == nil {
+		return false
+	}
+	old_length := len(array)
+	new_length := clamp(length, 0, N)
+
+	if new_length > old_length {
+		// Zero everything between the old and the new length.
+		elem_size :: size_of(E)
+		intrinsics.mem_zero(([^]byte)(array)[old_length*elem_size:], (new_length-old_length)*elem_size)
+	}
+	(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len = new_length
+	return true
+}
+
+// `non_zero_resize_fixed_capacity_dynamic_array` sets the `len` of a fixed capacity dynamic array to `length`, clamped to the range `0..=N`.
+// Unlike `resize_fixed_capacity_dynamic_array`, no element memory is written. Returns `false` only when `array` is nil.
+//
+// Note: Prefer the procedure group `resize`
+@builtin
+non_zero_resize_fixed_capacity_dynamic_array :: proc "contextless" (array: ^$T/[dynamic; $N]$E, #any_int length: int) -> bool {
+	if array != nil {
+		// Only the stored length changes.
+		(^Raw_Fixed_Capacity_Dynamic_Array(N, E))(array).len = clamp(length, 0, N)
+		return true
+	}
+	return false
+}
+
 // Shrinks the capacity of a dynamic array down to the current length, or the given capacity.
 //
 // If `new_cap` is negative, then `len(array)` is used.
@@ -1094,7 +1527,7 @@ card :: proc "contextless" (s: $S/bit_set[$E; $U]) -> int {



-// Evaluates the condition and panics the program iff the condition is false.
+// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
 // This uses the `context.assertion_failure_procedure` to assert.
 //
 // This routine will be ignored when `ODIN_DISABLE_ASSERT` is true.
@@ -1118,7 +1551,7 @@ assert :: proc(condition: bool, message := #caller_expression(condition), loc :=
 	}
 }

-// Evaluates the condition and panics the program iff the condition is false.
+// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
 // This uses the `context.assertion_failure_procedure` to assert.
 // This routine ignores `ODIN_DISABLE_ASSERT`, and will always execute.
@builtin
@@ -1158,7 +1591,7 @@ unimplemented :: proc(message := "", loc := #caller_location) -> ! {
 	p("not yet implemented", message, loc)
 }

-// Evaluates the condition and panics the program iff the condition is false.
+// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
 // This uses the `default_assertion_contextless_failure_proc` to assert.
 //
 // This routine will be ignored when `ODIN_DISABLE_ASSERT` is true.
@@ -1178,7 +1611,7 @@ assert_contextless :: proc "contextless" (condition: bool, message := #caller_ex
 	}
 }

-// Evaluates the condition and panics the program iff the condition is false.
+// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
 // This uses the `default_assertion_contextless_failure_proc` to assert.
@builtin
 ensure_contextless :: proc "contextless" (condition: bool, message := #caller_expression(condition), loc := #caller_location) {
--- a/base/runtime/default_temp_allocator_arena.odin
+++ b/base/runtime/default_temp_allocator_arena.odin
@@ -97,15 +97,6 @@ alloc_from_memory_block :: proc(block: ^Memory_Block, min_size, alignment: uint)

@(require_results)
 arena_alloc :: proc(arena: ^Arena, size, alignment: uint, loc := #caller_location) -> (data: []byte, err: Allocator_Error) {
-	align_forward_uint :: proc "contextless" (ptr, align: uint) -> uint {
-		p := ptr
-		modulo := p & (align-1)
-		if modulo != 0 {
-			p += align - modulo
-		}
-		return p
-	}
-
 	assert(alignment & (alignment-1) == 0, "non-power of two alignment", loc)

 	size := size
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -29,6 +29,30 @@ byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byt
 	return ([^]byte)(data)[:max(len, 0)]
 }

+@(require_results)
+align_forward_uint :: #force_inline proc "odin" (ptr, align: uint) -> uint {
+	assert(is_power_of_two_uint(align)) // the mask arithmetic below relies on this
+	return (ptr + (align-1)) &~ (align-1) // add `align-1`, then clear the low bits
+}
+
+@(require_results)
+align_forward_int :: #force_inline proc "odin" (ptr, align: int) -> int {
+	assert(is_power_of_two_int(align)) // checked on the signed value, before the conversion to `uint` below
+	return int(align_forward_uint(uint(ptr), uint(align))) // the computation itself is done by the `uint` variant
+}
+
+@(require_results)
+align_forward_uintptr :: #force_inline proc "odin" (ptr, align: uintptr) -> uintptr {
+	return uintptr(align_forward_uint(uint(ptr), uint(align))) // the power-of-two assert lives in `align_forward_uint`
+}
+
+align_forward :: proc {
+	align_forward_int,
+	align_forward_uint,
+	align_forward_uintptr,
+}
+
+@(require_results)
 is_power_of_two_int :: #force_inline proc "contextless" (x: int) -> bool {
 	if x <= 0 {
 		return false
@@ -36,51 +60,17 @@ is_power_of_two_int :: #force_inline proc "contextless" (x: int) -> bool {
 	return (x & (x-1)) == 0
 }

-align_forward_int :: #force_inline proc "odin" (ptr, align: int) -> int {
-	assert(is_power_of_two_int(align))
-
-	p := ptr
-	modulo := p & (align-1)
-	if modulo != 0 {
-		p += align - modulo
-	}
-	return p
-}
-
+@(require_results)
 is_power_of_two_uint :: #force_inline proc "contextless" (x: uint) -> bool {
-	if x <= 0 {
+	if x == 0 {
 		return false
 	}
 	return (x & (x-1)) == 0
 }

-align_forward_uint :: #force_inline proc "odin" (ptr, align: uint) -> uint {
-	assert(is_power_of_two_uint(align))
-
-	p := ptr
-	modulo := p & (align-1)
-	if modulo != 0 {
-		p += align - modulo
-	}
-	return p
-}
-
+@(require_results)
 is_power_of_two_uintptr :: #force_inline proc "contextless" (x: uintptr) -> bool {
-	if x <= 0 {
-		return false
-	}
-	return (x & (x-1)) == 0
-}
-
-align_forward_uintptr :: #force_inline proc "odin" (ptr, align: uintptr) -> uintptr {
-	assert(is_power_of_two_uintptr(align))
-
-	p := ptr
-	modulo := p & (align-1)
-	if modulo != 0 {
-		p += align - modulo
-	}
-	return p
+	return is_power_of_two_uint(uint(x))
 }

 is_power_of_two :: proc {
@@ -89,12 +79,6 @@ is_power_of_two :: proc {
 	is_power_of_two_uintptr,
 }

-align_forward :: proc {
-	align_forward_int,
-	align_forward_uint,
-	align_forward_uintptr,
-}
-
 mem_zero :: proc "contextless" (data: rawptr, len: int) -> rawptr {
 	if data == nil {
 		return nil
@@ -718,7 +702,7 @@ quaternion256_eq :: #force_inline proc "contextless" (a, b: quaternion256) -> bo
 quaternion256_ne :: #force_inline proc "contextless" (a, b: quaternion256) -> bool { return real(a) != real(b) || imag(a) != imag(b) || jmag(a) != jmag(b) || kmag(a) != kmag(b) }


-string_decode_rune :: proc "contextless" (s: string) -> (rune, int) {
+string_decode_rune :: proc "contextless" (s: string) -> (rune, int) #no_bounds_check {
 	// NOTE(bill): Duplicated here to remove dependency on package unicode/utf8

 	@(static, rodata) accept_sizes := [256]u8{
@@ -797,7 +781,7 @@ string_decode_rune :: proc "contextless" (s: string) -> (rune, int) {
 	return rune(s0&MASK4)<<18 | rune(b1&MASKX)<<12 | rune(b2&MASKX)<<6 | rune(b3&MASKX), 4
 }

-string_decode_last_rune :: proc "contextless" (s: string) -> (rune, int) {
+string_decode_last_rune :: proc "contextless" (s: string) -> (rune, int) #no_bounds_check {
 	RUNE_ERROR :: '\ufffd'
 	RUNE_SELF  :: 0x80
 	UTF_MAX    :: 4
@@ -833,7 +817,7 @@ string_decode_last_rune :: proc "contextless" (s: string) -> (rune, int) {
 }


-string16_decode_rune :: proc "contextless" (s: string16) -> (rune, int) {
+string16_decode_rune :: proc "contextless" (s: string16) -> (rune, int) #no_bounds_check {
 	REPLACEMENT_CHAR :: '\ufffd'
 	_surr1           :: 0xd800
 	_surr2           :: 0xdc00
@@ -861,7 +845,7 @@ string16_decode_rune :: proc "contextless" (s: string16) -> (rune, int) {
 	return r, w
 }

-string16_decode_last_rune :: proc "contextless" (s: string16) -> (rune, int) {
+string16_decode_last_rune :: proc "contextless" (s: string16) -> (rune, int) #no_bounds_check {
 	REPLACEMENT_CHAR :: '\ufffd'
 	_surr1           :: 0xd800
 	_surr2           :: 0xdc00
--- a/base/runtime/print.odin
+++ b/base/runtime/print.odin
@@ -392,6 +392,12 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
 		print_string("[]")
 		print_type(info.elem)

+	case Type_Info_Fixed_Capacity_Dynamic_Array:
+		print_string("[dynamic; ")
+		print_u64(u64(info.capacity))
+		print_string("]")
+		print_type(info.elem)
+
 	case Type_Info_Map:
 		print_string("map[")
 		print_type(info.key)
@@ -478,7 +484,7 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
 			print_string("..")
 			print_i64(info.upper)
 		}
-		if info.underlying != nil {
+		if info.explicit_underlying {
 			print_string("; ")
 			print_type(info.underlying)
 		}
@@ -807,6 +813,12 @@ write_write_type :: #force_no_inline proc "contextless" (i: ^int, buf: []byte, t
 		write_string    (i, buf, "[]")      or_return
 		write_write_type(i, buf, info.elem) or_return

+	case Type_Info_Fixed_Capacity_Dynamic_Array:
+		write_string    (i, buf, "[dynamic; ")       or_return
+		write_u64       (i, buf, u64(info.capacity)) or_return
+		write_string    (i, buf, "]")                or_return
+		write_write_type(i, buf, info.elem)          or_return
+
 	case Type_Info_Map:
 		write_string    (i, buf, "map[")     or_return
 		write_write_type(i, buf, info.key)   or_return
@@ -893,7 +905,7 @@ write_write_type :: #force_no_inline proc "contextless" (i: ^int, buf: []byte, t
 			write_string(i, buf, "..")       or_return
 			write_i64   (i, buf, info.upper) or_return
 		}
-		if info.underlying != nil {
+		if info.explicit_underlying {
 			write_string    (i, buf, "; ")            or_return
 			write_write_type(i, buf, info.underlying) or_return
 		}
--- a/base/runtime/random_generator_chacha8_simd256.odin
+++ b/base/runtime/random_generator_chacha8_simd256.odin
@@ -136,7 +136,7 @@ chacha8rand_refill_simd256 :: proc(r: ^Default_Random_State) {
 		//
 		// LLVM appears not to consider "this instruction is totally
 		// awful on the given microarchitcture", which leads to
-		// `VPCOMPRESSED` being generated iff AVX512 support is
+		// `VPCOMPRESSED` being generated if and only if (⟺) AVX512 support is
 		// enabled for `intrinsics.simd_masked_compress_store`.
 		// On Zen 4, this leads to a 50% performance regression vs
 		// the 128-bit SIMD code.
--- a/build.bat
+++ b/build.bat
@@ -94,6 +94,7 @@ if %release_mode% EQU 0 ( rem Debug
 set compiler_warnings= ^
 	-W4 -WX ^
 	-wd4100 -wd4101 -wd4127 -wd4146 ^
+	-wd4324 ^
 	-wd4505 ^
 	-wd4456 -wd4457

@@ -106,16 +107,6 @@ set libs= ^
 set odin_res=misc\odin.res
 set odin_rc=misc\odin.rc

-rem DO NOT TOUCH!
-rem THIS TILDE STUFF IS FOR DEVELOPMENT ONLY!
-set tilde_backend=0
-if %tilde_backend% EQU 1 (
-	set libs=%libs% src\tilde\tb.lib
-	set compiler_defines=%compiler_defines% -DODIN_TILDE_BACKEND
-)
-rem DO NOT TOUCH!
-
-
 set linker_flags= -incremental:no -opt:ref -subsystem:console -MANIFEST:EMBED

 if %release_mode% EQU 0 ( rem Debug
--- a/build_odin.sh
+++ b/build_odin.sh
@@ -26,7 +26,7 @@ error() {
 	exit 1
 }

-SUPPORTED_LLVM_VERSIONS="21 20 19 18 17 14"
+SUPPORTED_LLVM_VERSIONS="22 21 20 19 18 17 14"

 # Brew advises people not to add llvm to their $PATH, so try and use brew to find it.
 if [ -z "$LLVM_CONFIG" ] &&  [ -n "$(command -v brew)" ]; then
@@ -78,8 +78,8 @@ LLVM_VERSION_MAJOR="$(echo $LLVM_VERSION | awk -F. '{print $1}')"
 LLVM_VERSION_MINOR="$(echo $LLVM_VERSION | awk -F. '{print $2}')"
 LLVM_VERSION_PATCH="$(echo $LLVM_VERSION | awk -F. '{print $3}')"

-if [ $LLVM_VERSION_MAJOR -lt 14 ] || ([ $LLVM_VERSION_MAJOR -gt 14 ] && [ $LLVM_VERSION_MAJOR -lt 17 ]) || [ $LLVM_VERSION_MAJOR -gt 21 ]; then
-	error "Invalid LLVM version $LLVM_VERSION: must be 14, 17, 18, 19, 20, or 21"
+if [ $LLVM_VERSION_MAJOR -lt 14 ] || ([ $LLVM_VERSION_MAJOR -gt 14 ] && [ $LLVM_VERSION_MAJOR -lt 17 ]) || [ $LLVM_VERSION_MAJOR -gt 22 ]; then
+	error "Invalid LLVM version $LLVM_VERSION: must be 14, 17, 18, 19, 20, 21 or 22"
 fi

 case "$OS_NAME" in
--- a/ci/build_linux_static.sh
+++ b/ci/build_linux_static.sh
@@ -6,7 +6,14 @@ LLVM_CONFIG="llvm-config-20"

 DISABLED_WARNINGS="-Wno-switch -Wno-macro-redefined -Wno-unused-value"

-CPPFLAGS="-DODIN_VERSION_RAW=\"dev-$(date +"%Y-%m")\""
+if [ -d ".git" ] && [ -n "$(command -v git)" ]; then
+	GIT_SHA=$(git show --pretty='%h' --no-patch --no-notes HEAD)
+	GIT_DATE=$(git show "--pretty=%cd" "--date=format:%Y-%m" --no-patch --no-notes HEAD)
+	CPPFLAGS="$CPPFLAGS -DGIT_SHA=\"$GIT_SHA\""
+else
+	GIT_DATE=$(date +"%Y-%m")
+fi
+CPPFLAGS="$CPPFLAGS -DODIN_VERSION_RAW=\"dev-$GIT_DATE\""
 CXXFLAGS="-std=c++14 $($LLVM_CONFIG --cxxflags --ldflags)"

 LDFLAGS="-static -lm -lzstd -lz -lffi -pthread -ldl -fuse-ld=mold"
--- a/core/bufio/reader.odin
+++ b/core/bufio/reader.odin
@@ -45,7 +45,7 @@ reader_init_with_buf :: proc(b: ^Reader, rd: io.Reader, buf: []byte) {
 	b.buf = buf
 }

-// reader_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set
+// reader_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set
 reader_destroy :: proc(b: ^Reader) {
 	delete(b.buf, b.buf_allocator)
 	b^ = {}
--- a/core/bufio/writer.odin
+++ b/core/bufio/writer.odin
@@ -35,7 +35,7 @@ writer_init_with_buf :: proc(b: ^Writer, wr: io.Writer, buf: []byte) {
 	b.buf = buf
 }

-// writer_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set
+// writer_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set
 writer_destroy :: proc(b: ^Writer) {
 	delete(b.buf, b.buf_allocator)
 	b^ = {}
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -1460,7 +1460,7 @@ fields_proc :: proc(s: []byte, f: proc(rune) -> bool, allocator := context.alloc
 	return subslices[:]
 }

-// alias returns true iff a and b have a non-zero length, and any part of
+// alias returns true if and only if (⟺) a and b have a non-zero length, and any part of
 // a overlaps with b.
 alias :: proc "contextless" (a, b: []byte) -> bool {
 	a_len, b_len := len(a), len(b)
@@ -1474,7 +1474,7 @@ alias :: proc "contextless" (a, b: []byte) -> bool {
 	return a_start <= b_end && b_start <= a_end
 }

-// alias_inexactly returns true iff a and b have a non-zero length,
+// alias_inexactly returns true if and only if (⟺) a and b have a non-zero length,
 // the base pointer of a and b are NOT equal, and any part of a overlaps
 // with b (ie: `alias(a, b)` with an exception that returns false for
 // `a == b`, `b = a[:len(a)-69]` and similar conditions).
--- a/core/c/libc/math.odin
+++ b/core/c/libc/math.odin
@@ -154,12 +154,12 @@ _nan_bit_pattern := ~u64(0)

 // On amd64 Windows and Linux, float_t and double_t are respectively both
 // their usual types. On x86 it's not possible to define these types correctly
-// since they would be long double which Odin does have support for.
+// since they would be long double which Odin does NOT have support for.
 float_t          :: float
 double_t         :: double

 NAN              := transmute(double)(_nan_bit_pattern)
-INFINITY         :: 1e5000
+INFINITY         :: 0h7ff00000_00000000 // +Inf

 HUGE_VALF        :: INFINITY
 HUGE_VAL         :: double(INFINITY)
--- a/core/compress/common.odin
+++ b/core/compress/common.odin
@@ -360,23 +360,26 @@ refill_lsb_from_memory :: #force_inline proc(z: ^Context_Memory_Input, width :=
 	refill := u64(width)
 	b      := u64(0)

-	if z.num_bits > refill {
-		return
-	}
-
 	for {
+		if z.num_bits > refill {
+			break
+		}
+		if z.code_buffer == 0 && z.num_bits > 63 {
+			z.num_bits = 0
+		}
+		if z.code_buffer >= 1 << uint(z.num_bits) {
+			// Code buffer is malformed.
+			z.num_bits = max(u64)
+			return
+		}
 		if len(z.input_data) != 0 {
 			b = u64(z.input_data[0])
 			z.input_data = z.input_data[1:]
 		} else {
-			b = 0
+			return
 		}
-
 		z.code_buffer |= b << u8(z.num_bits)
 		z.num_bits += 8
-		if z.num_bits > refill {
-			break
-		}
 	}
 }

--- a/core/compress/zlib/zlib.odin
+++ b/core/compress/zlib/zlib.odin
@@ -322,9 +322,6 @@ decode_huffman_slowpath :: proc(z: ^$C, t: ^Huffman_Table) -> (r: u16, err: Erro
@(optimization_mode="favor_size")
 decode_huffman :: proc(z: ^$C, t: ^Huffman_Table) -> (r: u16, err: Error) #no_bounds_check {
 	if z.num_bits < 16 {
-		if z.num_bits > 63 {
-			return 0, .Code_Buffer_Malformed
-		}
 		compress.refill_lsb(z)
 		if z.num_bits > 63 {
 			return 0, .Stream_Too_Short
--- a/core/container/avl/avl.odin
+++ b/core/container/avl/avl.odin
@@ -100,20 +100,20 @@ len :: proc "contextless" (t: ^$T/Tree($Value)) -> int {
 	return t._size
 }

-// first returns the first node in the tree (in-order) or nil iff
+// first returns the first node in the tree (in-order) or nil if and only if (⟺)
 // the tree is empty.
 first :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) {
 	return tree_first_or_last_in_order(t, Direction.Backward)
 }

-// last returns the last element in the tree (in-order) or nil iff
+// last returns the last element in the tree (in-order) or nil if and only if (⟺)
 // the tree is empty.
 last :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) {
 	return tree_first_or_last_in_order(t, Direction.Forward)
 }

 // find finds the value in the tree, and returns the corresponding
-// node or nil iff the value is not present.
+// node or nil if and only if (⟺) the value is not present.
 find :: proc(t: ^$T/Tree($Value), value: Value) -> ^Node(Value) {
 	cur := t._root
 	descend_loop: for cur != nil {
@@ -168,7 +168,7 @@ find_or_insert :: proc(
 	return
 }

-// remove removes a node or value from the tree, and returns true iff the
+// remove removes a node or value from the tree, and returns true if and only if (⟺) the
 // removal was successful.  While the node's value will be left intact,
 // the node itself will be freed via the tree's node allocator.
 remove :: proc {
@@ -176,7 +176,7 @@ remove :: proc {
 	remove_node,
 }

-// remove_value removes a value from the tree, and returns true iff the
+// remove_value removes a value from the tree, and returns true if and only if (⟺) the
 // removal was successful.  While the node's value will be left intact,
 // the node itself will be freed via the tree's node allocator.
 remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = true) -> bool {
@@ -187,7 +187,7 @@ remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = t
 	return remove_node(t, n, call_on_remove)
 }

-// remove_node removes a node from the tree, and returns true iff the
+// remove_node removes a node from the tree, and returns true if and only if (⟺) the
 // removal was successful.  While the node's value will be left intact,
 // the node itself will be freed via the tree's node allocator.
 remove_node :: proc(t: ^$T/Tree($Value), node: ^Node(Value), call_on_remove: bool = true) -> bool {
@@ -281,14 +281,14 @@ iterator_from_pos :: proc "contextless" (
 }

 // iterator_get returns the node currently pointed to by the iterator,
-// or nil iff the node has been removed, the tree is empty, or the end
+// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end
 // of the tree has been reached.
 iterator_get :: proc "contextless" (it: ^$I/Iterator($Value)) -> ^Node(Value) {
 	return it._cur
 }

 // iterator_remove removes the node currently pointed to by the iterator,
-// and returns true iff the removal was successful.  Semantics are the
+// and returns true if and only if (⟺) the removal was successful.  Semantics are the
 // same as the Tree remove.
 iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -> bool {
 	if it._cur == nil {
@@ -304,7 +304,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -
 }

 // iterator_next advances the iterator and returns the (node, true) or
-// or (nil, false) iff the end of the tree has been reached.
+// or (nil, false) if and only if (⟺) the end of the tree has been reached.
 //
 // Note: The first call to iterator_next will return the first node instead
 // of advancing the iterator.
--- a/core/container/queue/queue.odin
+++ b/core/container/queue/queue.odin
@@ -236,16 +236,6 @@ back_ptr :: proc(q: ^$Q/Queue($T), loc := #caller_location) -> ^T {
 }


-@(deprecated="Use `front_ptr` instead")
-peek_front :: proc(q: ^$Q/Queue($T), loc := #caller_location) -> ^T {
-	return front_ptr(q, loc)
-}
-
-@(deprecated="Use `back_ptr` instead")
-peek_back :: proc(q: ^$Q/Queue($T), loc := #caller_location) -> ^T {
-	return back_ptr(q, loc)
-}
-
 /*
 Push an element to the back of the queue.

--- a/core/container/rbtree/rbtree.odin
+++ b/core/container/rbtree/rbtree.odin
@@ -95,19 +95,19 @@ len :: proc "contextless" (t: $T/Tree($Key, $Value)) -> (node_count: int) {
 	return t._size
 }

-// first returns the first node in the tree (in-order) or nil iff
+// first returns the first node in the tree (in-order) or nil if and only if (⟺)
 // the tree is empty.
 first :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) {
 	return tree_first_or_last_in_order(t, Direction.Backward)
 }

-// last returns the last element in the tree (in-order) or nil iff
+// last returns the last element in the tree (in-order) or nil if and only if (⟺)
 // the tree is empty.
 last :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) {
 	return tree_first_or_last_in_order(t, Direction.Forward)
 }

-// find finds the key in the tree, and returns the corresponding node, or nil iff the value is not present.
+// find finds the key in the tree, and returns the corresponding node, or nil if and only if (⟺) the value is not present.
 find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
 	node = t._root
 	for node != nil {
@@ -120,7 +120,7 @@ find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
 	return node
 }

-// find_value finds the key in the tree, and returns the corresponding value, or nil iff the value is not present.
+// find_value finds the key in the tree, and returns the corresponding value, or nil if and only if (⟺) the value is not present.
 find_value :: proc(t: $T/Tree($Key, $Value), key: Key) -> (value: Value, ok: bool) #optional_ok {
 	if n := find(t, key); n != nil {
 		return n.value, true
@@ -154,7 +154,7 @@ find_or_insert :: proc(t: ^$T/Tree($Key, $Value), key: Key, value: Value) -> (n:
 	return n, true, nil
 }

-// remove removes a node or value from the tree, and returns true iff the
+// remove removes a node or value from the tree, and returns true if and only if (⟺) the
 // removal was successful.  While the node's value will be left intact,
 // the node itself will be freed via the tree's node allocator.
 remove :: proc {
@@ -162,7 +162,7 @@ remove :: proc {
 	remove_node,
 }

-// remove_value removes a value from the tree, and returns true iff the
+// remove_value removes a value from the tree, and returns true if and only if (⟺) the
 // removal was successful.  While the node's key + value will be left intact,
 // the node itself will be freed via the tree's node allocator.
 remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true) -> bool {
@@ -173,7 +173,7 @@ remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true)
 	return remove_node(t, n, call_on_remove)
 }

-// remove_node removes a node from the tree, and returns true iff the
+// remove_node removes a node from the tree, and returns true if and only if (⟺) the
 // removal was successful.  While the node's key + value will be left intact,
 // the node itself will be freed via the tree's node allocator.
 remove_node :: proc(t: ^$T/Tree($Key, $Value), node: ^$N/Node(Key, Value), call_on_remove := true) -> (found: bool) {
@@ -235,14 +235,14 @@ iterator_from_pos :: proc "contextless" (t: ^$T/Tree($Key, $Value), pos: ^Node(K
 }

 // iterator_get returns the node currently pointed to by the iterator,
-// or nil iff the node has been removed, the tree is empty, or the end
+// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end
 // of the tree has been reached.
 iterator_get :: proc "contextless" (it: ^$I/Iterator($Key, $Value)) -> ^Node(Key, Value) {
 	return it._cur
 }

 // iterator_remove removes the node currently pointed to by the iterator,
-// and returns true iff the removal was successful.  Semantics are the
+// and returns true if and only if (⟺) the removal was successful.  Semantics are the
 // same as the Tree remove.
 iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = true) -> bool {
 	if it._cur == nil {
@@ -258,7 +258,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = t
 }

 // iterator_next advances the iterator and returns the (node, true) or
-// or (nil, false) iff the end of the tree has been reached.
+// or (nil, false) if and only if (⟺) the end of the tree has been reached.
 //
 // Note: The first call to iterator_next will return the first node instead
 // of advancing the iterator.
--- a/core/container/small_array/doc.odin
+++ b/core/container/small_array/doc.odin
@@ -1,4 +1,6 @@
 /*
+Deprecation Notice: Prefer using `[dynamic; N]T` (fixed capacity dynamic arrays).
+
 A dynamic array-like interface on a stack-allocated, fixed-size array.

 The `Small_Array` type is optimal for scenarios where you need
--- a/core/container/small_array/small_array.odin
+++ b/core/container/small_array/small_array.odin
@@ -21,7 +21,7 @@ Example:
 	}
 */
 Small_Array :: struct($N: int, $T: typeid) where N >= 0 {
-	data: [N]T,
+	data: [N]T `fmt:",len"`,
 	len:  int,
 }

--- a/core/container/xar/freelist.odin
+++ b/core/container/xar/freelist.odin
@@ -29,7 +29,8 @@ freelist_clear :: proc(x: ^$X/Freelist_Array($T, $SHIFT)) {
 freelist_push_with_index :: proc(x: ^$X/Freelist_Array($T, $SHIFT), value: T, loc := #caller_location) -> (ptr: ^T, index: int, err: runtime.Allocator_Error) {
 	if x.freelist != nil {
 		slot := x.freelist
-		idx, _ := freelist_linear_search(x, slot)
+		idx, found := freelist_linear_search(x, slot)
+		assert(found)
 		x.freelist = (^^T)(slot)^
 		slot^ = value
 		return slot, idx, nil
--- a/core/crypto/_aes/hw/api.odin
+++ b/core/crypto/_aes/hw/api.odin
@@ -0,0 +1,69 @@
+package aes_hw
+
+@(require) import "core:sys/info"
+
+// is_supported returns true if and only if (⟺) hardware accelerated AES
+// is supported.
+is_supported :: proc "contextless" () -> bool {
+	when ODIN_ARCH == .amd64 {
+		// Note: Everything with AES-NI has support for
+		// the required SSE extensions.
+		req_features :: info.CPU_Features{
+			.sse2,
+			.ssse3,
+			.sse41,
+			.aes,
+		}
+		return info.cpu_features() >= req_features
+	} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
+		req_features :: info.CPU_Features{
+			.asimd,
+			.aes,
+		}
+		return info.cpu_features() >= req_features
+	} else {
+		return false
+	}
+}
+
+// is_ghash_supported returns true if and only if (⟺) hardware accelerated
+// GHASH is supported.
+is_ghash_supported :: proc "contextless" () -> bool {
+	// Hardware GHASH without hardware AES is pointless, so require AES first.
+	if !is_supported() {
+		return false
+	}
+
+	when ODIN_ARCH == .amd64 {
+		return info.cpu_features() >= info.CPU_Features{
+			.pclmulqdq,
+		}
+	} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32{
+		// Once we can actually use this, we can re-enable this.
+		//
+		// return info.cpu_features() >= info.CPU_Features{
+		// 	.pmull,
+		// }
+		return false
+	} else {
+		return false
+	}
+}
+
+// Context is a keyed AES (ECB) instance.
+Context :: struct {
+	// Note: The ideal thing to do is for the expanded round keys to be
+	// arrays of `u8x16`, however that implies alignment (or using AVX).
+	//
+	// All the people using e-waste processors that don't support an
+	// instruction set that has been around for over 10 years are why
+	// we can't have nice things.
+	_sk_exp_enc: [15][16]byte,
+	_sk_exp_dec: [15][16]byte,
+	_num_rounds: int,
+}
+
+// init initializes a context for AES with the provided key.
+init :: proc(ctx: ^Context, key: []byte) {
+	keysched(ctx, key)
+}
--- a/core/crypto/_aes/hw/ghash_intel.odin
+++ b/core/crypto/_aes/hw/ghash_intel.odin
@@ -21,7 +21,7 @@
 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #+build amd64
-package aes_hw_intel
+package aes_hw

 import "base:intrinsics"
 import "core:crypto/_aes"
--- a/core/crypto/_aes/hw/intrinsics_arm.odin
+++ b/core/crypto/_aes/hw/intrinsics_arm.odin
@@ -0,0 +1,115 @@
+#+build arm64,arm32
+package aes_hw
+
+import "core:simd"
+import "core:simd/arm"
+
+// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/
+
+TARGET_FEATURES :: "neon,aes"
+HAS_GHASH :: false // Temporary
+
+@(require_results, enable_target_feature = "aes")
+aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
+	return simd.bit_xor(arm.vaesimcq_u8(arm.vaesdq_u8(data, simd.u8x16{})), key)
+}
+
+@(require_results, enable_target_feature = "aes")
+aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
+	return simd.bit_xor(arm.vaesdq_u8(data, simd.u8x16{}), key)
+}
+
+@(require_results, enable_target_feature = "aes")
+aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
+	return simd.bit_xor(arm.vaesmcq_u8(arm.vaeseq_u8(data, simd.u8x16{})), key)
+}
+
+@(require_results, enable_target_feature = "aes")
+aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
+	return simd.bit_xor(arm.vaeseq_u8(data, simd.u8x16{}), key)
+}
+
+aesimc :: arm.vaesimcq_u8
+
+@(require_results, enable_target_feature = "aes")
+aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
+	a := arm.vaeseq_u8(data, simd.u8x16{}) // AESE does ShiftRows and SubBytes on A
+
+	// Undo ShiftRows step from AESE and extract X1 and X3
+	dest := simd.swizzle(
+		a,
+		0x04, 0x01, 0x0e, 0x0b, // SubBytes(X1)
+		0x01, 0x0e, 0x0b, 0x04, // ROT(SubBytes(X1))
+		0x0c, 0x09, 0x06, 0x03, // SubBytes(X3)
+		0x09, 0x06, 0x03, 0x0c, // ROT(SubBytes(X3))
+	)
+
+	rcons := simd.u8x16{
+		0, 0, 0, 0,
+		IMM8, 0, 0, 0,
+		0, 0, 0, 0,
+		IMM8, 0, 0, 0,
+	}
+
+	return simd.bit_xor(dest, rcons)
+}
+
+// The keyschedule implementation is easier to read with some extra
+// Intel intrinsics that are emulated by built-in LLVM ops anyway.
+
+@(private, require_results, enable_target_feature = TARGET_FEATURES)
+_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
+	shift :: IMM8 & 0xff
+
+	// This needs to emit behavior identical to PSLLDQ which is as follows:
+	//
+	// TEMP := COUNT
+	// IF (TEMP > 15) THEN TEMP := 16; FI
+	// DEST := DEST << (TEMP * 8)
+	// DEST[MAXVL-1:128] (Unmodified)
+
+	return simd.shuffle(
+		simd.u8x16{},
+		a,
+		0 when shift > 15 else (16 - shift + 0),
+		1 when shift > 15 else (16 - shift + 1),
+		2 when shift > 15 else (16 - shift + 2),
+		3 when shift > 15 else (16 - shift + 3),
+		4 when shift > 15 else (16 - shift + 4),
+		5 when shift > 15 else (16 - shift + 5),
+		6 when shift > 15 else (16 - shift + 6),
+		7 when shift > 15 else (16 - shift + 7),
+		8 when shift > 15 else (16 - shift + 8),
+		9 when shift > 15 else (16 - shift + 9),
+		10 when shift > 15 else (16 - shift + 10),
+		11 when shift > 15 else (16 - shift + 11),
+		12 when shift > 15 else (16 - shift + 12),
+		13 when shift > 15 else (16 - shift + 13),
+		14 when shift > 15 else (16 - shift + 14),
+		15 when shift > 15 else (16 - shift + 15),
+	)
+}
+
+@(private, require_results, enable_target_feature = TARGET_FEATURES)
+_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
+	v := transmute(simd.i32x4)a
+	return transmute(simd.u8x16)simd.shuffle(
+		v,
+		v,
+		IMM8 & 0b11,
+		(IMM8 >> 2) & 0b11,
+		(IMM8 >> 4) & 0b11,
+		(IMM8 >> 6) & 0b11,
+	)
+}
+
+@(private, require_results, enable_target_feature = TARGET_FEATURES)
+_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
+	return transmute(simd.u8x16)simd.shuffle(
+		transmute(simd.u32x4)(a),
+		transmute(simd.u32x4)(b),
+		u32(MASK) & 0b11,
+		(u32(MASK)>>2) & 0b11,
+		((u32(MASK)>>4) & 0b11)+4,
+		((u32(MASK)>>6) & 0b11)+4)
+}
--- a/core/crypto/_aes/hw/intrinsics_intel.odin
+++ b/core/crypto/_aes/hw/intrinsics_intel.odin
@@ -0,0 +1,55 @@
+#+build amd64
+package aes_hw
+
+import "core:simd"
+import "core:simd/x86"
+
+// Intel/RISC-V semantics.
+
+TARGET_FEATURES :: "sse,sse2,ssse3,sse4.1,aes"
+HAS_GHASH :: true
+
+@(require_results, enable_target_feature = "aes")
+aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
+	return transmute(simd.u8x16)(x86._mm_aesdec_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
+}
+
+@(require_results, enable_target_feature = "aes")
+aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
+	return transmute(simd.u8x16)(x86._mm_aesdeclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
+}
+
+@(require_results, enable_target_feature = "aes")
+aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
+	return transmute(simd.u8x16)(x86._mm_aesenc_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
+}
+
+@(require_results, enable_target_feature = "aes")
+aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
+	return transmute(simd.u8x16)(x86._mm_aesenclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
+}
+
+@(require_results, enable_target_feature = "aes")
+aesimc :: #force_inline proc "c" (data: simd.u8x16) -> simd.u8x16 {
+	return transmute(simd.u8x16)(x86._mm_aesimc_si128(transmute(x86.__m128i)(data)))
+}
+
+@(require_results, enable_target_feature = "aes")
+aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
+	return transmute(simd.u8x16)(x86._mm_aeskeygenassist_si128(transmute(x86.__m128i)(data), IMM8))
+}
+
+@(private, require_results, enable_target_feature = TARGET_FEATURES)
+_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
+	return transmute(simd.u8x16)(x86._mm_slli_si128(transmute(x86.__m128i)(a), IMM8))
+}
+
+@(private, require_results, enable_target_feature = TARGET_FEATURES)
+_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
+	return transmute(simd.u8x16)(x86._mm_shuffle_epi32(transmute(x86.__m128i)(a), IMM8))
+}
+
+@(private, require_results, enable_target_feature = TARGET_FEATURES)
+_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
+	return transmute(simd.u8x16)(x86._mm_shuffle_ps(transmute(x86.__m128)(a), transmute(x86.__m128)(b), MASK))
+}
--- a/core/crypto/_aes/hw/keysched_hw.odin
+++ b/core/crypto/_aes/hw/keysched_hw.odin
@@ -0,0 +1,181 @@
+// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//   1. Redistributions of source code must retain the above copyright
+//      notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
+// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#+build amd64,arm64,arm32
+package aes_hw
+
+import "base:intrinsics"
+import "core:crypto"
+import "core:crypto/_aes"
+import "core:simd"
+
+// Inspiration taken from BearSSL's AES-NI implementation.
+//
+// Note: This assumes that the SROA optimization pass is enabled to be
+// anything resembling performant; otherwise, LLVM will not elide a massive
+// number of redundant loads/stores it generates for every intrinsic call.
+
+@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
+expand_step128 :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
+	k1, k2 := k1, k2
+
+	k2 = _mm_shuffle_epi32(k2, 0xff)
+	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
+	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
+	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
+	return simd.bit_xor(k1, k2)
+}
+
+@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
+expand_step192a :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> (simd.u8x16, simd.u8x16) {
+	k1, k2, k3 := k1_^, k2_^, k3
+
+	k3 = _mm_shuffle_epi32(k3, 0x55)
+	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
+	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
+	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
+	k1 = simd.bit_xor(k1, k3)
+
+	tmp := k2
+	k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04))
+	k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff))
+
+	k1_, k2_ := k1_, k2_
+	k1_^, k2_^ = k1, k2
+
+	r1 := _mm_shuffle_ps(tmp, k1, 0x44)
+	r2 := _mm_shuffle_ps(k1, k2, 0x4e)
+
+	return r1, r2
+}
+
+@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
+expand_step192b :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> simd.u8x16 {
+	k1, k2, k3 := k1_^, k2_^, k3
+
+	k3 = _mm_shuffle_epi32(k3, 0x55)
+	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
+	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
+	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
+	k1 = simd.bit_xor(k1, k3)
+
+	k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04))
+	k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff))
+
+	k1_, k2_ := k1_, k2_
+	k1_^, k2_^ = k1, k2
+
+	return k1
+}
+
+@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
+expand_step256b :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
+	k1, k2 := k1, k2
+
+	k2 = _mm_shuffle_epi32(k2, 0xaa)
+	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
+	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
+	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
+	return simd.bit_xor(k1, k2)
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES)
+derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]simd.u8x16, num_rounds: int) {
+	intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[0]), sks[num_rounds])
+	for i in 1 ..< num_rounds {
+		tmp := aesimc(sks[i])
+		intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
+	}
+	intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds]), sks[0])
+}
+
+@(private, enable_target_feature = TARGET_FEATURES)
+keysched :: proc(ctx: ^Context, key: []byte) {
+	sks: [15]simd.u8x16 = ---
+
+	// Compute the encryption keys.
+	num_rounds, key_len := 0, len(key)
+	switch key_len {
+	case _aes.KEY_SIZE_128:
+		sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
+		sks[1] = expand_step128(sks[0], aeskeygenassist(sks[0], 0x01))
+		sks[2] = expand_step128(sks[1], aeskeygenassist(sks[1], 0x02))
+		sks[3] = expand_step128(sks[2], aeskeygenassist(sks[2], 0x04))
+		sks[4] = expand_step128(sks[3], aeskeygenassist(sks[3], 0x08))
+		sks[5] = expand_step128(sks[4], aeskeygenassist(sks[4], 0x10))
+		sks[6] = expand_step128(sks[5], aeskeygenassist(sks[5], 0x20))
+		sks[7] = expand_step128(sks[6], aeskeygenassist(sks[6], 0x40))
+		sks[8] = expand_step128(sks[7], aeskeygenassist(sks[7], 0x80))
+		sks[9] = expand_step128(sks[8], aeskeygenassist(sks[8], 0x1b))
+		sks[10] = expand_step128(sks[9], aeskeygenassist(sks[9], 0x36))
+		num_rounds = _aes.ROUNDS_128
+	case _aes.KEY_SIZE_192:
+		k0 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
+
+		k1_tmp: [16]byte
+		copy(k1_tmp[:], key[16:24])
+		k1 := intrinsics.unaligned_load((^simd.u8x16)(&k1_tmp))
+		crypto.zero_explicit(&k1_tmp, size_of(k1_tmp))
+
+		sks[0] = k0
+		sks[1], sks[2] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x01))
+		sks[3] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x02))
+		sks[4], sks[5] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x04))
+		sks[6] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x08))
+		sks[7], sks[8] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x10))
+		sks[9] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x20))
+		sks[10], sks[11] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x40))
+		sks[12] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x80))
+		num_rounds = _aes.ROUNDS_192
+
+	case _aes.KEY_SIZE_256:
+		sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
+		sks[1] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:])))
+		sks[2] = expand_step128(sks[0], aeskeygenassist(sks[1], 0x01))
+		sks[3] = expand_step256b(sks[1], aeskeygenassist(sks[2], 0x01))
+		sks[4] = expand_step128(sks[2], aeskeygenassist(sks[3], 0x02))
+		sks[5] = expand_step256b(sks[3], aeskeygenassist(sks[4], 0x02))
+		sks[6] = expand_step128(sks[4], aeskeygenassist(sks[5], 0x04))
+		sks[7] = expand_step256b(sks[5], aeskeygenassist(sks[6], 0x04))
+		sks[8] = expand_step128(sks[6], aeskeygenassist(sks[7], 0x08))
+		sks[9] = expand_step256b(sks[7], aeskeygenassist(sks[8], 0x08))
+		sks[10] = expand_step128(sks[8], aeskeygenassist(sks[9], 0x10))
+		sks[11] = expand_step256b(sks[9], aeskeygenassist(sks[10], 0x10))
+		sks[12] = expand_step128(sks[10], aeskeygenassist(sks[11], 0x20))
+		sks[13] = expand_step256b(sks[11], aeskeygenassist(sks[12], 0x20))
+		sks[14] = expand_step128(sks[12], aeskeygenassist(sks[13], 0x40))
+		num_rounds = _aes.ROUNDS_256
+	case:
+		panic("crypto/aes: invalid AES key size")
+	}
+	for i in 0 ..= num_rounds {
+		intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_enc[i]), sks[i])
+	}
+
+	// Compute the decryption keys.  GCM and CTR do not need this, however
+	// ECB, CBC, OCB3, etc do.
+	derive_dec_keys(ctx, &sks, num_rounds)
+
+	ctx._num_rounds = num_rounds
+
+	crypto.zero_explicit(&sks, size_of(sks))
+}
--- a/core/crypto/_aes/hw/unsupported.odin
+++ b/core/crypto/_aes/hw/unsupported.odin
@@ -0,0 +1,11 @@
+#+build !amd64
+#+build !arm64
+#+build !arm32
+package aes_hw
+
+HAS_GHASH :: false
+
+@(private)
+keysched :: proc(ctx: ^Context, key: []byte) {
+	panic("crypto/aes: hardware implementation unsupported")
+}
--- a/core/crypto/_aes/hw_intel/api.odin
+++ b/core/crypto/_aes/hw_intel/api.odin
@@ -1,38 +0,0 @@
-#+build amd64
-package aes_hw_intel
-
-import "core:sys/info"
-
-// is_supported returns true iff hardware accelerated AES
-// is supported.
-is_supported :: proc "contextless" () -> bool {
-	// Note: Everything with AES-NI and PCLMULQDQ has support for
-	// the required SSE extxtensions.
-	req_features :: info.CPU_Features{
-		.sse2,
-		.ssse3,
-		.sse41,
-		.aes,
-		.pclmulqdq,
-	}
-	return info.cpu_features() >= req_features
-}
-
-// Context is a keyed AES (ECB) instance.
-Context :: struct {
-	// Note: The ideal thing to do is for the expanded round keys to be
-	// arrays of `__m128i`, however that implies alignment (or using AVX).
-	//
-	// All the people using e-waste processors that don't support an
-	// insturction set that has been around for over 10 years are why
-	// we can't have nice things.
-	_sk_exp_enc: [15][16]byte,
-	_sk_exp_dec: [15][16]byte,
-	_num_rounds: int,
-}
-
-// init initializes a context for AES with the provided key.
-init :: proc(ctx: ^Context, key: []byte) {
-	keysched(ctx, key)
-}
-
--- a/core/crypto/_aes/hw_intel/hw_intel_keysched.odin
+++ b/core/crypto/_aes/hw_intel/hw_intel_keysched.odin
@@ -1,200 +0,0 @@
-// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions
-// are met:
-//
-//   1. Redistributions of source code must retain the above copyright
-//      notice, this list of conditions and the following disclaimer.
-//
-// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
-// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
-// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
-// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
-// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#+build amd64
-package aes_hw_intel
-
-import "base:intrinsics"
-import "core:crypto/_aes"
-import "core:simd/x86"
-
-// Intel AES-NI based implementation.  Inspiration taken from BearSSL.
-//
-// Note: This assumes that the SROA optimization pass is enabled to be
-// anything resembling performat otherwise, LLVM will not elide a massive
-// number of redundant loads/stores it generates for every intrinsic call.
-
-@(private = "file", require_results, enable_target_feature = "sse2")
-expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
-	k1, k2 := k1, k2
-
-	k2 = x86._mm_shuffle_epi32(k2, 0xff)
-	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
-	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
-	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
-	return x86._mm_xor_si128(k1, k2)
-}
-
-@(private = "file", require_results, enable_target_feature = "sse,sse2")
-expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
-	k1, k2, k3 := k1_^, k2_^, k3
-
-	k3 = x86._mm_shuffle_epi32(k3, 0x55)
-	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
-	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
-	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
-	k1 = x86._mm_xor_si128(k1, k3)
-
-	tmp := k2
-	k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
-	k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
-
-	k1_, k2_ := k1_, k2_
-	k1_^, k2_^ = k1, k2
-
-	r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44))
-	r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e))
-
-	return r1, r2
-}
-
-@(private = "file", require_results, enable_target_feature = "sse2")
-expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i {
-	k1, k2, k3 := k1_^, k2_^, k3
-
-	k3 = x86._mm_shuffle_epi32(k3, 0x55)
-	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
-	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
-	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
-	k1 = x86._mm_xor_si128(k1, k3)
-
-	k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
-	k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
-
-	k1_, k2_ := k1_, k2_
-	k1_^, k2_^ = k1, k2
-
-	return k1
-}
-
-@(private = "file", require_results, enable_target_feature = "sse2")
-expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
-	k1, k2 := k1, k2
-
-	k2 = x86._mm_shuffle_epi32(k2, 0xaa)
-	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
-	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
-	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
-	return x86._mm_xor_si128(k1, k2)
-}
-
-@(private = "file", enable_target_feature = "aes")
-derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) {
-	intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds])
-	for i in 1 ..< num_rounds {
-		tmp := x86._mm_aesimc_si128(sks[i])
-		intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
-	}
-	intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0])
-}
-
-@(private, enable_target_feature = "sse,sse2,aes")
-keysched :: proc(ctx: ^Context, key: []byte) {
-	sks: [15]x86.__m128i = ---
-
-	// Compute the encryption keys.
-	num_rounds, key_len := 0, len(key)
-	switch key_len {
-	case _aes.KEY_SIZE_128:
-		sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
-		sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01))
-		sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02))
-		sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04))
-		sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08))
-		sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10))
-		sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20))
-		sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40))
-		sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80))
-		sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b))
-		sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36))
-		num_rounds = _aes.ROUNDS_128
-	case _aes.KEY_SIZE_192:
-		k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
-		k1 := x86.__m128i{
-			intrinsics.unaligned_load((^i64)(raw_data(key[16:]))),
-			0,
-		}
-		sks[0] = k0
-		sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01))
-		sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02))
-		sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04))
-		sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08))
-		sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10))
-		sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20))
-		sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40))
-		sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80))
-		num_rounds = _aes.ROUNDS_192
-	case _aes.KEY_SIZE_256:
-		sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
-		sks[1] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:])))
-		sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01))
-		sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01))
-		sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02))
-		sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02))
-		sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04))
-		sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04))
-		sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08))
-		sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08))
-		sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10))
-		sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10))
-		sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20))
-		sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20))
-		sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40))
-		num_rounds = _aes.ROUNDS_256
-	case:
-		panic("crypto/aes: invalid AES key size")
-	}
-	for i in 0 ..= num_rounds {
-		intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i])
-	}
-
-	// Compute the decryption keys.  GCM and CTR do not need this, however
-	// ECB, CBC, OCB3, etc do.
-	derive_dec_keys(ctx, &sks, num_rounds)
-
-	ctx._num_rounds = num_rounds
-
-	zero_explicit(&sks, size_of(sks))
-}
-
-/*
-Set each byte of a memory range to zero.
-
-This procedure copies the value `0` into the `len` bytes of a memory range,
-starting at address `data`.
-
-This procedure returns the pointer to `data`.
-
-Unlike the `zero()` procedure, which can be optimized away or reordered by the
-compiler under certain circumstances, `zero_explicit()` procedure can not be
-optimized away or reordered with other memory access operations, and the
-compiler assumes volatile semantics of the memory.
-*/
-zero_explicit :: proc "contextless" (data: rawptr, len: int) -> rawptr {
-	// This routine tries to avoid the compiler optimizing away the call,
-	// so that it is always executed.  It is intended to provide
-	// equivalent semantics to those provided by the C11 Annex K 3.7.4.1
-	// memset_s call.
-	intrinsics.mem_zero_volatile(data, len) // Use the volatile mem_zero
-	intrinsics.atomic_thread_fence(.Seq_Cst) // Prevent reordering
-	return data
-}
--- a/core/crypto/_blake2/blake2.odin
+++ b/core/crypto/_blake2/blake2.odin
@@ -11,6 +11,7 @@ package _blake2
 */

 import "base:intrinsics"
+import "core:crypto"
 import "core:encoding/endian"

 BLAKE2S_BLOCK_SIZE :: 64
@@ -18,17 +19,12 @@ BLAKE2S_SIZE :: 32
 BLAKE2B_BLOCK_SIZE :: 128
 BLAKE2B_SIZE :: 64

-MAX_SIZE :: 255
-
 Blake2s_Context :: struct {
 	h:            [8]u32,
 	t:            [2]u32,
 	f:            [2]u32,
 	x:            [BLAKE2S_BLOCK_SIZE]byte,
 	nx:           int,
-	ih:           [8]u32,
-	padded_key:   [BLAKE2S_BLOCK_SIZE]byte,
-	is_keyed:     bool,
 	size:         byte,
 	is_last_node: bool,

@@ -41,9 +37,6 @@ Blake2b_Context :: struct {
 	f:            [2]u64,
 	x:            [BLAKE2B_BLOCK_SIZE]byte,
 	nx:           int,
-	ih:           [8]u64,
-	padded_key:   [BLAKE2B_BLOCK_SIZE]byte,
-	is_keyed:     bool,
 	size:         byte,
 	is_last_node: bool,

@@ -86,11 +79,12 @@ BLAKE2B_IV := [8]u64 {

 init :: proc "contextless" (ctx: ^$T, cfg: ^Blake2_Config) {
 	when T == Blake2s_Context {
-		max_size :: BLAKE2S_SIZE
+		MAX_SIZE :: BLAKE2S_SIZE
 	} else when T == Blake2b_Context {
-		max_size :: BLAKE2B_SIZE
+		MAX_SIZE :: BLAKE2B_SIZE
 	}
-	ensure_contextless(cfg.size <= max_size, "blake2: requested output size exceeeds algorithm max")
+	ensure_contextless(cfg.size <= MAX_SIZE, "blake2: requested output size exceeeds algorithm max")
+	ensure_contextless(len(cfg.key) <= MAX_SIZE, "blake2: requested key size exceeeds algorithm max")

 	// To save having to allocate a scratch buffer, use the internal
 	// data buffer (`ctx.x`), as it is exactly the correct size.
@@ -133,7 +127,7 @@ init :: proc "contextless" (ctx: ^$T, cfg: ^Blake2_Config) {
 			p[17] = cfg.tree.(Blake2_Tree).inner_hash_size
 		}
 	} else {
-		p[2], p[3] = 1, 1
+		p[2], p[3], p[4], p[5], p[6], p[7] = 1, 1, 0, 0, 0, 0
 	}
 	ctx.size = cfg.size
 	for i := 0; i < 8; i += 1 {
@@ -151,17 +145,11 @@ init :: proc "contextless" (ctx: ^$T, cfg: ^Blake2_Config) {
 		ctx.is_last_node = true
 	}
 	if len(cfg.key) > 0 {
-		copy(ctx.padded_key[:], cfg.key)
-		update(ctx, ctx.padded_key[:])
-		ctx.is_keyed = true
+		copy(ctx.x[:], cfg.key)
+		ctx.nx = len(ctx.x)
+	} else {
+		ctx.nx = 0
 	}
-	copy(ctx.ih[:], ctx.h[:])
-	copy(ctx.h[:], ctx.ih[:])
-	if ctx.is_keyed {
-		update(ctx, ctx.padded_key[:])
-	}
-
-	ctx.nx = 0

 	ctx.is_initialized = true
 }
@@ -171,22 +159,22 @@ update :: proc "contextless" (ctx: ^$T, p: []byte) {

 	p := p
 	when T == Blake2s_Context {
-		block_size :: BLAKE2S_BLOCK_SIZE
+		BLOCK_SIZE :: BLAKE2S_BLOCK_SIZE
 	} else when T == Blake2b_Context {
-		block_size :: BLAKE2B_BLOCK_SIZE
+		BLOCK_SIZE :: BLAKE2B_BLOCK_SIZE
 	}

-	left := block_size - ctx.nx
+	left := BLOCK_SIZE - ctx.nx
 	if len(p) > left {
 		copy(ctx.x[ctx.nx:], p[:left])
 		p = p[left:]
 		blocks(ctx, ctx.x[:])
 		ctx.nx = 0
 	}
-	if len(p) > block_size {
-		n := len(p) &~ (block_size - 1)
+	if len(p) > BLOCK_SIZE {
+		n := len(p) &~ (BLOCK_SIZE - 1)
 		if n == len(p) {
-			n -= block_size
+			n -= BLOCK_SIZE
 		}
 		blocks(ctx, p[:n])
 		p = p[n:]
@@ -222,17 +210,11 @@ reset :: proc "contextless" (ctx: ^$T) {
 		return
 	}

-	zero_explicit(ctx, size_of(ctx^))
+	crypto.zero_explicit(ctx, size_of(ctx^))
 }

@(private)
 blake2s_final :: proc "contextless" (ctx: ^Blake2s_Context, hash: []byte) {
-	if ctx.is_keyed {
-		for i := 0; i < len(ctx.padded_key); i += 1 {
-			ctx.padded_key[i] = 0
-		}
-	}
-
 	dec := BLAKE2S_BLOCK_SIZE - u32(ctx.nx)
 	if ctx.t[0] < dec {
 		ctx.t[1] -= 1
@@ -244,23 +226,20 @@ blake2s_final :: proc "contextless" (ctx: ^Blake2s_Context, hash: []byte) {
 		ctx.f[1] = 0xffffffff
 	}

+	for i := ctx.nx; i < BLAKE2S_BLOCK_SIZE; i+= 1 {
+		ctx.x[i] = 0
+	}
 	blocks(ctx, ctx.x[:])

 	dst: [BLAKE2S_SIZE]byte
 	for i := 0; i < BLAKE2S_SIZE / 4; i += 1 {
 		endian.unchecked_put_u32le(dst[i * 4:], ctx.h[i])
 	}
-	copy(hash, dst[:])
+	copy(hash, dst[:ctx.size])
 }

@(private)
 blake2b_final :: proc "contextless" (ctx: ^Blake2b_Context, hash: []byte) {
-	if ctx.is_keyed {
-		for i := 0; i < len(ctx.padded_key); i += 1 {
-			ctx.padded_key[i] = 0
-		}
-	}
-
 	dec := BLAKE2B_BLOCK_SIZE - u64(ctx.nx)
 	if ctx.t[0] < dec {
 		ctx.t[1] -= 1
@@ -272,6 +251,9 @@ blake2b_final :: proc "contextless" (ctx: ^Blake2b_Context, hash: []byte) {
 		ctx.f[1] = 0xffffffffffffffff
 	}

+	for i := ctx.nx; i < BLAKE2B_BLOCK_SIZE; i+= 1 {
+		ctx.x[i] = 0
+	}
 	blocks(ctx, ctx.x[:])

 	dst: [BLAKE2B_SIZE]byte
@@ -2877,27 +2859,3 @@ blake2b_blocks :: #force_inline proc "contextless" (ctx: ^Blake2b_Context, p: []
 	ctx.h[0], ctx.h[1], ctx.h[2], ctx.h[3], ctx.h[4], ctx.h[5], ctx.h[6], ctx.h[7] =
 		h0, h1, h2, h3, h4, h5, h6, h7
 }
-
-/*
-Set each byte of a memory range to zero.
-
-This procedure copies the value `0` into the `len` bytes of a memory range,
-starting at address `data`.
-
-This procedure returns the pointer to `data`.
-
-Unlike the `zero()` procedure, which can be optimized away or reordered by the
-compiler under certain circumstances, `zero_explicit()` procedure can not be
-optimized away or reordered with other memory access operations, and the
-compiler assumes volatile semantics of the memory.
-*/
-@(private)
-zero_explicit :: proc "contextless" (data: rawptr, len: int) -> rawptr {
-	// This routine tries to avoid the compiler optimizing away the call,
-	// so that it is always executed.  It is intended to provide
-	// equivalent semantics to those provided by the C11 Annex K 3.7.4.1
-	// memset_s call.
-	intrinsics.mem_zero_volatile(data, len) // Use the volatile mem_zero
-	intrinsics.atomic_thread_fence(.Seq_Cst) // Prevent reordering
-	return data
-}
--- a/core/crypto/_chacha20/simd128/chacha20_simd128.odin
+++ b/core/crypto/_chacha20/simd128/chacha20_simd128.odin
@@ -215,7 +215,7 @@ _store_simd128 :: #force_inline proc "contextless" (
 	intrinsics.unaligned_store((^simd.u32x4)(dst[3:]), v3)
 }

-// is_performant returns true iff the target and current host both support
+// is_performant returns true if and only if (⟺) the target and current host both support
 // "enough" 128-bit SIMD to make this implementation performant.
 is_performant :: proc "contextless" () -> bool {
 	when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 || ODIN_ARCH == .riscv64 {
--- a/core/crypto/_chacha20/simd256/chacha20_simd256.odin
+++ b/core/crypto/_chacha20/simd256/chacha20_simd256.odin
@@ -36,7 +36,7 @@ _VEC_ZERO_ONE: simd.u64x4 : {0, 0, 1, 0}
@(private = "file")
 _VEC_TWO: simd.u64x4 : {2, 0, 2, 0}

-// is_performant returns true iff the target and current host both support
+// is_performant returns true if and only if (⟺) the target and current host both support
 // "enough" SIMD to make this implementation performant.
 is_performant :: proc "contextless" () -> bool {
 	req_features :: info.CPU_Features{.avx, .avx2}
--- a/core/crypto/_fiat/field_p256r1/field.odin
+++ b/core/crypto/_fiat/field_p256r1/field.odin
@@ -69,7 +69,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) ->
 	tmp: Montgomery_Domain_Field_Element = ---
 	fe_sub(&tmp, arg1, arg2)

-	// This will only underflow iff arg1 == arg2, and we return the borrow,
+	// This will underflow if and only if (⟺) arg1 == arg2, and we return the borrow,
 	// which will be 1.
 	is_eq := subtle.u64_is_zero(fe_non_zero(&tmp))

--- a/core/crypto/_fiat/field_p384r1/field.odin
+++ b/core/crypto/_fiat/field_p384r1/field.odin
@@ -75,7 +75,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) ->
 	tmp: Montgomery_Domain_Field_Element = ---
 	fe_sub(&tmp, arg1, arg2)

-	// This will only underflow iff arg1 == arg2, and we return the borrow,
+	// This will underflow if and only if (⟺) arg1 == arg2, and we return the borrow,
 	// which will be 1.
 	is_eq := subtle.u64_is_zero(fe_non_zero(&tmp))

--- a/core/crypto/_subtle/subtle.odin
+++ b/core/crypto/_subtle/subtle.odin
@@ -5,17 +5,17 @@ package _subtle

 import "core:math/bits"

-// byte_eq returns 1 iff a == b, 0 otherwise.
+// byte_eq returns 1 if and only if (⟺) a == b, 0 otherwise.
@(optimization_mode="none")
 byte_eq :: proc "contextless" (a, b: byte) -> int {
 	v := a ~ b

-	// v == 0 iff a == b.  The subtraction will underflow, setting the
+	// v == 0 if and only if (⟺) a == b.  The subtraction will underflow, setting the
 	// sign bit, which will get returned.
 	return int((u32(v)-1) >> 31)
 }

-// u64_eq returns 1 iff a == b, 0 otherwise.
+// u64_eq returns 1 if and only if (⟺) a == b, 0 otherwise.
@(optimization_mode="none")
 u64_eq :: proc "contextless" (a, b: u64) -> u64 {
 	_, borrow := bits.sub_u64(0, a ~ b, 0)
@@ -27,14 +27,14 @@ eq :: proc {
 	u64_eq,
 }

-// u64_is_zero returns 1 iff a == 0, 0 otherwise.
+// u64_is_zero returns 1 if and only if (⟺) a == 0, 0 otherwise.
@(optimization_mode="none")
 u64_is_zero :: proc "contextless" (a: u64) -> u64 {
 	_, borrow := bits.sub_u64(a, 1, 0)
 	return borrow
 }

-// u64_is_non_zero returns 1 iff a != 0, 0 otherwise.
+// u64_is_non_zero returns 1 if and only if (⟺) a != 0, 0 otherwise.
@(optimization_mode="none")
 u64_is_non_zero :: proc "contextless" (a: u64) -> u64 {
 	is_zero := u64_is_zero(a)
--- a/core/crypto/aead/aead.odin
+++ b/core/crypto/aead/aead.odin
@@ -13,7 +13,7 @@ seal_oneshot :: proc(algo: Algorithm, dst, tag, key, iv, aad, plaintext: []byte,

 // open authenticates the aad and ciphertext, and decrypts the ciphertext,
 // with the provided algorithm, key, iv, and tag, and stores the output in dst,
-// returning true iff the authentication was successful.  If authentication
+// returning true if and only if (⟺) the authentication was successful.  If authentication
 // fails, the destination buffer will be zeroed.
 //
 // dst and ciphertext MUST alias exactly or not at all.
--- a/core/crypto/aead/low_level.odin
+++ b/core/crypto/aead/low_level.odin
@@ -183,7 +183,7 @@ seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {

 // open_ctx authenticates the aad and ciphertext, and decrypts the ciphertext,
 // with the provided Context, iv, and tag, and stores the output in dst,
-// returning true iff the authentication was successful.  If authentication
+// returning true if and only if (⟺) the authentication was successful.  If authentication
 // fails, the destination buffer will be zeroed.
 //
 // dst and plaintext MUST alias exactly or not at all.
--- a/core/crypto/aegis/aegis.odin
+++ b/core/crypto/aegis/aegis.odin
@@ -144,7 +144,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {

 // open authenticates the aad and ciphertext, and decrypts the ciphertext,
 // with the provided Context, iv, and tag, and stores the output in dst,
-// returning true iff the authentication was successful.  If authentication
+// returning true if and only if (⟺) the authentication was successful.  If authentication
 // fails, the destination buffer will be zeroed.
 //
 // dst and plaintext MUST alias exactly or not at all.
--- a/core/crypto/aegis/aegis_impl_hw.odin
+++ b/core/crypto/aegis/aegis_impl_hw.odin
@@ -0,0 +1,397 @@
+#+build amd64,arm64,arm32
+package aegis
+
+import "base:intrinsics"
+import "core:crypto"
+import aes_hw "core:crypto/_aes/hw"
+import "core:encoding/endian"
+import "core:simd"
+
+@(private)
+State_HW :: struct {
+	s0:   simd.u8x16,
+	s1:   simd.u8x16,
+	s2:   simd.u8x16,
+	s3:   simd.u8x16,
+	s4:   simd.u8x16,
+	s5:   simd.u8x16,
+	s6:   simd.u8x16,
+	s7:   simd.u8x16,
+	rate: int,
+}
+
+when ODIN_ARCH == .amd64 {
+	@(private="file")
+	TARGET_FEATURES :: "sse2,aes"
+} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
+	@(private="file")
+	TARGET_FEATURES :: "neon,aes"
+}
+
+// is_hardware_accelerated returns true if and only if (⟺) hardware
+// accelerated AEGIS is supported.
+is_hardware_accelerated :: proc "contextless" () -> bool {
+	return aes_hw.is_supported()
+}
+
+@(private, enable_target_feature = TARGET_FEATURES)
+init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) {
+	switch ctx._key_len {
+	case KEY_SIZE_128L:
+		key := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0]))
+		iv := intrinsics.unaligned_load((^simd.u8x16)(raw_data(iv)))
+
+		st.s0 = simd.bit_xor(key, iv)
+		st.s1 = intrinsics.unaligned_load((^simd.u8x16)(&_C1[0]))
+		st.s2 = intrinsics.unaligned_load((^simd.u8x16)(&_C0[0]))
+		st.s3 = st.s1
+		st.s4 = st.s0
+		st.s5 = simd.bit_xor(key, st.s2) // key ^ C0
+		st.s6 = simd.bit_xor(key, st.s1) // key ^ C1
+		st.s7 = st.s5
+		st.rate = _RATE_128L
+
+		for _ in 0 ..< 10 {
+			update_hw_128l(st, iv, key)
+		}
+	case KEY_SIZE_256:
+		k0 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0]))
+		k1 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[16]))
+		n0 := intrinsics.unaligned_load((^simd.u8x16)(&iv[0]))
+		n1 := intrinsics.unaligned_load((^simd.u8x16)(&iv[16]))
+
+		st.s0 = simd.bit_xor(k0, n0)
+		st.s1 = simd.bit_xor(k1, n1)
+		st.s2 = intrinsics.unaligned_load((^simd.u8x16)(&_C1[0]))
+		st.s3 = intrinsics.unaligned_load((^simd.u8x16)(&_C0[0]))
+		st.s4 = simd.bit_xor(k0, st.s3) // k0 ^ C0
+		st.s5 = simd.bit_xor(k1, st.s2) // k1 ^ C1
+		st.rate = _RATE_256
+
+		u0, u1 := st.s0, st.s1
+		for _ in 0 ..< 4 {
+			update_hw_256(st, k0)
+			update_hw_256(st, k1)
+			update_hw_256(st, u0)
+			update_hw_256(st, u1)
+		}
+	}
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES)
+update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: simd.u8x16) {
+	s0_ := aes_hw.aesenc(st.s7, simd.bit_xor(st.s0, m0))
+	s1_ := aes_hw.aesenc(st.s0, st.s1)
+	s2_ := aes_hw.aesenc(st.s1, st.s2)
+	s3_ := aes_hw.aesenc(st.s2, st.s3)
+	s4_ := aes_hw.aesenc(st.s3, simd.bit_xor(st.s4, m1))
+	s5_ := aes_hw.aesenc(st.s4, st.s5)
+	s6_ := aes_hw.aesenc(st.s5, st.s6)
+	s7_ := aes_hw.aesenc(st.s6, st.s7)
+	st.s0, st.s1, st.s2, st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES)
+update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: simd.u8x16) {
+	s0_ := aes_hw.aesenc(st.s5, simd.bit_xor(st.s0, m))
+	s1_ := aes_hw.aesenc(st.s0, st.s1)
+	s2_ := aes_hw.aesenc(st.s1, st.s2)
+	s3_ := aes_hw.aesenc(st.s2, st.s3)
+	s4_ := aes_hw.aesenc(st.s3, st.s4)
+	s5_ := aes_hw.aesenc(st.s4, st.s5)
+	st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES)
+absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
+	t0 := intrinsics.unaligned_load((^simd.u8x16)(&ai[0]))
+	t1 := intrinsics.unaligned_load((^simd.u8x16)(&ai[16]))
+	update_hw_128l(st, t0, t1)
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES)
+absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
+	m := intrinsics.unaligned_load((^simd.u8x16)(&ai[0]))
+	update_hw_256(st, m)
+}
+
+@(private, enable_target_feature = TARGET_FEATURES)
+absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check {
+	ai, l := aad, len(aad)
+
+	switch st.rate {
+	case _RATE_128L:
+		for l >= _RATE_128L {
+			absorb_hw_128l(st, ai)
+			ai = ai[_RATE_128L:]
+			l -= _RATE_128L
+		}
+	case _RATE_256:
+		for l >= _RATE_256 {
+			absorb_hw_256(st, ai)
+
+			ai = ai[_RATE_256:]
+			l -= _RATE_256
+		}
+	}
+
+	// Pad out the remainder with `0`s till it is rate sized.
+	if l > 0 {
+		tmp: [_RATE_MAX]byte // AAD is not confidential.
+		copy(tmp[:], ai)
+		switch st.rate {
+		case _RATE_128L:
+			absorb_hw_128l(st, tmp[:])
+		case _RATE_256:
+			absorb_hw_256(st, tmp[:])
+		}
+	}
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
+z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (simd.u8x16, simd.u8x16) {
+	z0 := simd.bit_xor(
+		st.s6,
+		simd.bit_xor(
+			st.s1,
+			simd.bit_and(st.s2, st.s3),
+		),
+	)
+	z1 := simd.bit_xor(
+		st.s2,
+		simd.bit_xor(
+			st.s5,
+			simd.bit_and(st.s6, st.s7),
+		),
+	)
+	return z0, z1
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
+z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> simd.u8x16 {
+	return simd.bit_xor(
+		st.s1,
+		simd.bit_xor(
+			st.s4,
+			simd.bit_xor(
+				st.s5,
+				simd.bit_and(st.s2, st.s3),
+			),
+		),
+	)
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES)
+enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
+	z0, z1 := z_hw_128l(st)
+
+	t0 := intrinsics.unaligned_load((^simd.u8x16)(&xi[0]))
+	t1 := intrinsics.unaligned_load((^simd.u8x16)(&xi[16]))
+	update_hw_128l(st, t0, t1)
+
+	out0 := simd.bit_xor(t0, z0)
+	out1 := simd.bit_xor(t1, z1)
+	intrinsics.unaligned_store((^simd.u8x16)(&ci[0]), out0)
+	intrinsics.unaligned_store((^simd.u8x16)(&ci[16]), out1)
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES)
+enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
+	z := z_hw_256(st)
+
+	xi_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(xi)))
+	update_hw_256(st, xi_)
+
+	ci_ := simd.bit_xor(xi_, z)
+	intrinsics.unaligned_store((^simd.u8x16)(raw_data(ci)), ci_)
+}
+
+@(private, enable_target_feature = TARGET_FEATURES)
+enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
+	ci, xi, l := dst, src, len(src)
+
+	switch st.rate {
+	case _RATE_128L:
+		for l >= _RATE_128L {
+			enc_hw_128l(st, ci, xi)
+			ci = ci[_RATE_128L:]
+			xi = xi[_RATE_128L:]
+			l -= _RATE_128L
+		}
+	case _RATE_256:
+		for l >= _RATE_256 {
+			enc_hw_256(st, ci, xi)
+			ci = ci[_RATE_256:]
+			xi = xi[_RATE_256:]
+			l -= _RATE_256
+		}
+	}
+
+	// Pad out the remainder with `0`s till it is rate sized.
+	if l > 0 {
+		tmp: [_RATE_MAX]byte // Ciphertext is not confidential.
+		copy(tmp[:], xi)
+		switch st.rate {
+		case _RATE_128L:
+			enc_hw_128l(st, tmp[:], tmp[:])
+		case _RATE_256:
+			enc_hw_256(st, tmp[:], tmp[:])
+		}
+		copy(ci, tmp[:l])
+	}
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES)
+dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
+	z0, z1 := z_hw_128l(st)
+
+	t0 := intrinsics.unaligned_load((^simd.u8x16)(&ci[0]))
+	t1 := intrinsics.unaligned_load((^simd.u8x16)(&ci[16]))
+	out0 := simd.bit_xor(t0, z0)
+	out1 := simd.bit_xor(t1, z1)
+
+	update_hw_128l(st, out0, out1)
+	intrinsics.unaligned_store((^simd.u8x16)(&xi[0]), out0)
+	intrinsics.unaligned_store((^simd.u8x16)(&xi[16]), out1)
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES)
+dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
+	z := z_hw_256(st)
+
+	ci_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(ci)))
+	xi_ := simd.bit_xor(ci_, z)
+
+	update_hw_256(st, xi_)
+	intrinsics.unaligned_store((^simd.u8x16)(raw_data(xi)), xi_)
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES)
+dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
+	tmp: [_RATE_128L]byte
+	defer crypto.zero_explicit(&tmp, size_of(tmp))
+
+	z0, z1 := z_hw_128l(st)
+	copy(tmp[:], cn)
+
+	t0 := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
+	t1 := intrinsics.unaligned_load((^simd.u8x16)(&tmp[16]))
+	out0 := simd.bit_xor(t0, z0)
+	out1 := simd.bit_xor(t1, z1)
+
+	intrinsics.unaligned_store((^simd.u8x16)(&tmp[0]), out0)
+	intrinsics.unaligned_store((^simd.u8x16)(&tmp[16]), out1)
+	copy(xn, tmp[:])
+
+	for off := len(xn); off < _RATE_128L; off += 1 {
+		tmp[off] = 0
+	}
+	out0 = intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) // v0
+	out1 = intrinsics.unaligned_load((^simd.u8x16)(&tmp[16])) // v1
+	update_hw_128l(st, out0, out1)
+}
+
+@(private = "file", enable_target_feature = TARGET_FEATURES)
+dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
+	tmp: [_RATE_256]byte
+	defer crypto.zero_explicit(&tmp, size_of(tmp))
+
+	z := z_hw_256(st)
+	copy(tmp[:], cn)
+
+	cn_ := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
+	xn_ := simd.bit_xor(cn_, z)
+
+	intrinsics.unaligned_store((^simd.u8x16)(&tmp[0]), xn_)
+	copy(xn, tmp[:])
+
+	for off := len(xn); off < _RATE_256; off += 1 {
+		tmp[off] = 0
+	}
+	xn_ = intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
+	update_hw_256(st, xn_)
+}
+
+@(private, enable_target_feature = TARGET_FEATURES)
+dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
+	xi, ci, l := dst, src, len(src)
+
+	switch st.rate {
+	case _RATE_128L:
+		for l >= _RATE_128L {
+			dec_hw_128l(st, xi, ci)
+			xi = xi[_RATE_128L:]
+			ci = ci[_RATE_128L:]
+			l -= _RATE_128L
+		}
+	case _RATE_256:
+		for l >= _RATE_256 {
+			dec_hw_256(st, xi, ci)
+			xi = xi[_RATE_256:]
+			ci = ci[_RATE_256:]
+			l -= _RATE_256
+		}
+	}
+
+	// Process the remainder.
+	if l > 0 {
+		switch st.rate {
+		case _RATE_128L:
+			dec_partial_hw_128l(st, xi, ci)
+		case _RATE_256:
+			dec_partial_hw_256(st, xi, ci)
+		}
+	}
+}
+
+@(private, enable_target_feature = TARGET_FEATURES)
+finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) {
+	tmp: [16]byte
+	endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8)
+	endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8)
+
+	t := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
+
+	t0, t1: simd.u8x16 = ---, ---
+	switch st.rate {
+	case _RATE_128L:
+		t = simd.bit_xor(st.s2, t)
+		for _ in 0 ..< 7 {
+			update_hw_128l(st, t, t)
+		}
+
+		t0 = simd.bit_xor(st.s0, st.s1)
+		t0 = simd.bit_xor(t0, st.s2)
+		t0 = simd.bit_xor(t0, st.s3)
+
+		t1 = simd.bit_xor(st.s4, st.s5)
+		t1 = simd.bit_xor(t1, st.s6)
+		if len(tag) == TAG_SIZE_256 {
+			t1 = simd.bit_xor(t1, st.s7)
+		}
+	case _RATE_256:
+		t = simd.bit_xor(st.s3, t)
+		for _ in 0 ..< 7 {
+			update_hw_256(st, t)
+		}
+
+		t0 = simd.bit_xor(st.s0, st.s1)
+		t0 = simd.bit_xor(t0, st.s2)
+
+		t1 = simd.bit_xor(st.s3, st.s4)
+		t1 = simd.bit_xor(t1, st.s5)
+	}
+	switch len(tag) {
+	case TAG_SIZE_128:
+		t0 = simd.bit_xor(t0, t1)
+		intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), t0)
+	case TAG_SIZE_256:
+		intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), t0)
+		intrinsics.unaligned_store((^simd.u8x16)(&tag[16]), t1)
+	}
+}
+
+@(private)
+reset_state_hw :: proc "contextless" (st: ^State_HW) {
+	crypto.zero_explicit(st, size_of(st^))
+}
--- a/core/crypto/aegis/aegis_impl_hw_gen.odin
+++ b/core/crypto/aegis/aegis_impl_hw_gen.odin
@@ -1,4 +1,6 @@
 #+build !amd64
+#+build !arm64
+#+build !arm32
 package aegis

@(private = "file")
@@ -7,7 +9,7 @@ ERR_HW_NOT_SUPPORTED :: "crypto/aegis: hardware implementation unsupported"
@(private)
 State_HW :: struct {}

-// is_hardware_accelerated returns true iff hardware accelerated AEGIS
+// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AEGIS
 // is supported.
 is_hardware_accelerated :: proc "contextless" () -> bool {
 	return false
--- a/core/crypto/aegis/aegis_impl_hw_intel.odin
+++ b/core/crypto/aegis/aegis_impl_hw_intel.odin
@@ -1,389 +0,0 @@
-#+build amd64
-package aegis
-
-import "base:intrinsics"
-import "core:crypto"
-import "core:crypto/aes"
-import "core:encoding/endian"
-import "core:simd/x86"
-
-@(private)
-State_HW :: struct {
-	s0:   x86.__m128i,
-	s1:   x86.__m128i,
-	s2:   x86.__m128i,
-	s3:   x86.__m128i,
-	s4:   x86.__m128i,
-	s5:   x86.__m128i,
-	s6:   x86.__m128i,
-	s7:   x86.__m128i,
-	rate: int,
-}
-
-// is_hardware_accelerated returns true iff hardware accelerated AEGIS
-// is supported.
-is_hardware_accelerated :: proc "contextless" () -> bool {
-	return aes.is_hardware_accelerated()
-}
-
-@(private, enable_target_feature = "sse2,aes")
-init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) {
-	switch ctx._key_len {
-	case KEY_SIZE_128L:
-		key := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0]))
-		iv := intrinsics.unaligned_load((^x86.__m128i)(raw_data(iv)))
-
-		st.s0 = x86._mm_xor_si128(key, iv)
-		st.s1 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0]))
-		st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0]))
-		st.s3 = st.s1
-		st.s4 = st.s0
-		st.s5 = x86._mm_xor_si128(key, st.s2) // key ^ C0
-		st.s6 = x86._mm_xor_si128(key, st.s1) // key ^ C1
-		st.s7 = st.s5
-		st.rate = _RATE_128L
-
-		for _ in 0 ..< 10 {
-			update_hw_128l(st, iv, key)
-		}
-	case KEY_SIZE_256:
-		k0 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0]))
-		k1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[16]))
-		n0 := intrinsics.unaligned_load((^x86.__m128i)(&iv[0]))
-		n1 := intrinsics.unaligned_load((^x86.__m128i)(&iv[16]))
-
-		st.s0 = x86._mm_xor_si128(k0, n0)
-		st.s1 = x86._mm_xor_si128(k1, n1)
-		st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0]))
-		st.s3 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0]))
-		st.s4 = x86._mm_xor_si128(k0, st.s3) // k0 ^ C0
-		st.s5 = x86._mm_xor_si128(k1, st.s2) // k1 ^ C1
-		st.rate = _RATE_256
-
-		u0, u1 := st.s0, st.s1
-		for _ in 0 ..< 4 {
-			update_hw_256(st, k0)
-			update_hw_256(st, k1)
-			update_hw_256(st, u0)
-			update_hw_256(st, u1)
-		}
-	}
-}
-
-@(private = "file", enable_target_feature = "sse2,aes")
-update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: x86.__m128i) {
-	s0_ := x86._mm_aesenc_si128(st.s7, x86._mm_xor_si128(st.s0, m0))
-	s1_ := x86._mm_aesenc_si128(st.s0, st.s1)
-	s2_ := x86._mm_aesenc_si128(st.s1, st.s2)
-	s3_ := x86._mm_aesenc_si128(st.s2, st.s3)
-	s4_ := x86._mm_aesenc_si128(st.s3, x86._mm_xor_si128(st.s4, m1))
-	s5_ := x86._mm_aesenc_si128(st.s4, st.s5)
-	s6_ := x86._mm_aesenc_si128(st.s5, st.s6)
-	s7_ := x86._mm_aesenc_si128(st.s6, st.s7)
-	st.s0, st.s1, st.s2, st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_
-}
-
-@(private = "file", enable_target_feature = "sse2,aes")
-update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: x86.__m128i) {
-	s0_ := x86._mm_aesenc_si128(st.s5, x86._mm_xor_si128(st.s0, m))
-	s1_ := x86._mm_aesenc_si128(st.s0, st.s1)
-	s2_ := x86._mm_aesenc_si128(st.s1, st.s2)
-	s3_ := x86._mm_aesenc_si128(st.s2, st.s3)
-	s4_ := x86._mm_aesenc_si128(st.s3, st.s4)
-	s5_ := x86._mm_aesenc_si128(st.s4, st.s5)
-	st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_
-}
-
-@(private = "file", enable_target_feature = "sse2,aes")
-absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
-	t0 := intrinsics.unaligned_load((^x86.__m128i)(&ai[0]))
-	t1 := intrinsics.unaligned_load((^x86.__m128i)(&ai[16]))
-	update_hw_128l(st, t0, t1)
-}
-
-@(private = "file", enable_target_feature = "sse2,aes")
-absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
-	m := intrinsics.unaligned_load((^x86.__m128i)(&ai[0]))
-	update_hw_256(st, m)
-}
-
-@(private, enable_target_feature = "sse2,aes")
-absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check {
-	ai, l := aad, len(aad)
-
-	switch st.rate {
-	case _RATE_128L:
-		for l >= _RATE_128L {
-			absorb_hw_128l(st, ai)
-			ai = ai[_RATE_128L:]
-			l -= _RATE_128L
-		}
-	case _RATE_256:
-		for l >= _RATE_256 {
-			absorb_hw_256(st, ai)
-
-			ai = ai[_RATE_256:]
-			l -= _RATE_256
-		}
-	}
-
-	// Pad out the remainder with `0`s till it is rate sized.
-	if l > 0 {
-		tmp: [_RATE_MAX]byte // AAD is not confidential.
-		copy(tmp[:], ai)
-		switch st.rate {
-		case _RATE_128L:
-			absorb_hw_128l(st, tmp[:])
-		case _RATE_256:
-			absorb_hw_256(st, tmp[:])
-		}
-	}
-}
-
-@(private = "file", enable_target_feature = "sse2", require_results)
-z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (x86.__m128i, x86.__m128i) {
-	z0 := x86._mm_xor_si128(
-		st.s6,
-		x86._mm_xor_si128(
-			st.s1,
-			x86._mm_and_si128(st.s2, st.s3),
-		),
-	)
-	z1 := x86._mm_xor_si128(
-		st.s2,
-		x86._mm_xor_si128(
-			st.s5,
-			x86._mm_and_si128(st.s6, st.s7),
-		),
-	)
-	return z0, z1
-}
-
-@(private = "file", enable_target_feature = "sse2", require_results)
-z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> x86.__m128i {
-	return x86._mm_xor_si128(
-		st.s1,
-		x86._mm_xor_si128(
-			st.s4,
-			x86._mm_xor_si128(
-				st.s5,
-				x86._mm_and_si128(st.s2, st.s3),
-			),
-		),
-	)
-}
-
-@(private = "file", enable_target_feature = "sse2,aes")
-enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
-	z0, z1 := z_hw_128l(st)
-
-	t0 := intrinsics.unaligned_load((^x86.__m128i)(&xi[0]))
-	t1 := intrinsics.unaligned_load((^x86.__m128i)(&xi[16]))
-	update_hw_128l(st, t0, t1)
-
-	out0 := x86._mm_xor_si128(t0, z0)
-	out1 := x86._mm_xor_si128(t1, z1)
-	intrinsics.unaligned_store((^x86.__m128i)(&ci[0]), out0)
-	intrinsics.unaligned_store((^x86.__m128i)(&ci[16]), out1)
-}
-
-@(private = "file", enable_target_feature = "sse2,aes")
-enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
-	z := z_hw_256(st)
-
-	xi_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(xi)))
-	update_hw_256(st, xi_)
-
-	ci_ := x86._mm_xor_si128(xi_, z)
-	intrinsics.unaligned_store((^x86.__m128i)(raw_data(ci)), ci_)
-}
-
-@(private, enable_target_feature = "sse2,aes")
-enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
-	ci, xi, l := dst, src, len(src)
-
-	switch st.rate {
-	case _RATE_128L:
-		for l >= _RATE_128L {
-			enc_hw_128l(st, ci, xi)
-			ci = ci[_RATE_128L:]
-			xi = xi[_RATE_128L:]
-			l -= _RATE_128L
-		}
-	case _RATE_256:
-		for l >= _RATE_256 {
-			enc_hw_256(st, ci, xi)
-			ci = ci[_RATE_256:]
-			xi = xi[_RATE_256:]
-			l -= _RATE_256
-		}
-	}
-
-	// Pad out the remainder with `0`s till it is rate sized.
-	if l > 0 {
-		tmp: [_RATE_MAX]byte // Ciphertext is not confidential.
-		copy(tmp[:], xi)
-		switch st.rate {
-		case _RATE_128L:
-			enc_hw_128l(st, tmp[:], tmp[:])
-		case _RATE_256:
-			enc_hw_256(st, tmp[:], tmp[:])
-		}
-		copy(ci, tmp[:l])
-	}
-}
-
-@(private = "file", enable_target_feature = "sse2,aes")
-dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
-	z0, z1 := z_hw_128l(st)
-
-	t0 := intrinsics.unaligned_load((^x86.__m128i)(&ci[0]))
-	t1 := intrinsics.unaligned_load((^x86.__m128i)(&ci[16]))
-	out0 := x86._mm_xor_si128(t0, z0)
-	out1 := x86._mm_xor_si128(t1, z1)
-
-	update_hw_128l(st, out0, out1)
-	intrinsics.unaligned_store((^x86.__m128i)(&xi[0]), out0)
-	intrinsics.unaligned_store((^x86.__m128i)(&xi[16]), out1)
-}
-
-@(private = "file", enable_target_feature = "sse2,aes")
-dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
-	z := z_hw_256(st)
-
-	ci_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(ci)))
-	xi_ := x86._mm_xor_si128(ci_, z)
-
-	update_hw_256(st, xi_)
-	intrinsics.unaligned_store((^x86.__m128i)(raw_data(xi)), xi_)
-}
-
-@(private = "file", enable_target_feature = "sse2,aes")
-dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
-	tmp: [_RATE_128L]byte
-	defer crypto.zero_explicit(&tmp, size_of(tmp))
-
-	z0, z1 := z_hw_128l(st)
-	copy(tmp[:], cn)
-
-	t0 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
-	t1 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[16]))
-	out0 := x86._mm_xor_si128(t0, z0)
-	out1 := x86._mm_xor_si128(t1, z1)
-
-	intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), out0)
-	intrinsics.unaligned_store((^x86.__m128i)(&tmp[16]), out1)
-	copy(xn, tmp[:])
-
-	for off := len(xn); off < _RATE_128L; off += 1 {
-		tmp[off] = 0
-	}
-	out0 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) // v0
-	out1 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[16])) // v1
-	update_hw_128l(st, out0, out1)
-}
-
-@(private = "file", enable_target_feature = "sse2,aes")
-dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
-	tmp: [_RATE_256]byte
-	defer crypto.zero_explicit(&tmp, size_of(tmp))
-
-	z := z_hw_256(st)
-	copy(tmp[:], cn)
-
-	cn_ := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
-	xn_ := x86._mm_xor_si128(cn_, z)
-
-	intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), xn_)
-	copy(xn, tmp[:])
-
-	for off := len(xn); off < _RATE_256; off += 1 {
-		tmp[off] = 0
-	}
-	xn_ = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
-	update_hw_256(st, xn_)
-}
-
-@(private, enable_target_feature = "sse2,aes")
-dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
-	xi, ci, l := dst, src, len(src)
-
-	switch st.rate {
-	case _RATE_128L:
-		for l >= _RATE_128L {
-			dec_hw_128l(st, xi, ci)
-			xi = xi[_RATE_128L:]
-			ci = ci[_RATE_128L:]
-			l -= _RATE_128L
-		}
-	case _RATE_256:
-		for l >= _RATE_256 {
-			dec_hw_256(st, xi, ci)
-			xi = xi[_RATE_256:]
-			ci = ci[_RATE_256:]
-			l -= _RATE_256
-		}
-	}
-
-	// Process the remainder.
-	if l > 0 {
-		switch st.rate {
-		case _RATE_128L:
-			dec_partial_hw_128l(st, xi, ci)
-		case _RATE_256:
-			dec_partial_hw_256(st, xi, ci)
-		}
-	}
-}
-
-@(private, enable_target_feature = "sse2,aes")
-finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) {
-	tmp: [16]byte
-	endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8)
-	endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8)
-
-	t := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
-
-	t0, t1: x86.__m128i = ---, ---
-	switch st.rate {
-	case _RATE_128L:
-		t = x86._mm_xor_si128(st.s2, t)
-		for _ in 0 ..< 7 {
-			update_hw_128l(st, t, t)
-		}
-
-		t0 = x86._mm_xor_si128(st.s0, st.s1)
-		t0 = x86._mm_xor_si128(t0, st.s2)
-		t0 = x86._mm_xor_si128(t0, st.s3)
-
-		t1 = x86._mm_xor_si128(st.s4, st.s5)
-		t1 = x86._mm_xor_si128(t1, st.s6)
-		if len(tag) == TAG_SIZE_256 {
-			t1 = x86._mm_xor_si128(t1, st.s7)
-		}
-	case _RATE_256:
-		t = x86._mm_xor_si128(st.s3, t)
-		for _ in 0 ..< 7 {
-			update_hw_256(st, t)
-		}
-
-		t0 = x86._mm_xor_si128(st.s0, st.s1)
-		t0 = x86._mm_xor_si128(t0, st.s2)
-
-		t1 = x86._mm_xor_si128(st.s3, st.s4)
-		t1 = x86._mm_xor_si128(t1, st.s5)
-	}
-	switch len(tag) {
-	case TAG_SIZE_128:
-		t0 = x86._mm_xor_si128(t0, t1)
-		intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0)
-	case TAG_SIZE_256:
-		intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0)
-		intrinsics.unaligned_store((^x86.__m128i)(&tag[16]), t1)
-	}
-}
-
-@(private)
-reset_state_hw :: proc "contextless" (st: ^State_HW) {
-	crypto.zero_explicit(st, size_of(st^))
-}
--- a/core/crypto/aes/aes_ctr_hw_intel.odin
+++ b/core/crypto/aes/aes_ctr_hw_intel.odin
@@ -1,30 +1,32 @@
-#+build amd64
+#+build amd64,arm64,arm32
 package aes

 import "base:intrinsics"
 import "core:crypto/_aes"
+import aes_hw "core:crypto/_aes/hw"
+import "core:encoding/endian"
 import "core:math/bits"
-import "core:simd/x86"
+import "core:simd"

@(private)
 CTR_STRIDE_HW :: 4
@(private)
 CTR_STRIDE_BYTES_HW :: CTR_STRIDE_HW * BLOCK_SIZE

-@(private, enable_target_feature = "sse2,aes")
+@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
 ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
 	hw_ctx := ctx._impl.(Context_Impl_Hardware)

-	sks: [15]x86.__m128i = ---
+	sks: [15]simd.u8x16 = ---
 	for i in 0 ..= hw_ctx._num_rounds {
-		sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&hw_ctx._sk_exp_enc[i]))
+		sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&hw_ctx._sk_exp_enc[i]))
 	}

-	hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (x86.__m128i, u64, u64) {
-		ret := x86.__m128i{
-			i64(intrinsics.byte_swap(hi)),
-			i64(intrinsics.byte_swap(lo)),
-		}
+	hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (simd.u8x16, u64, u64) {
+		buf: [BLOCK_SIZE]byte = ---
+		endian.unchecked_put_u64be(buf[0:], hi)
+		endian.unchecked_put_u64be(buf[8:], lo)
+		ret := intrinsics.unaligned_load((^simd.u8x16)(&buf))

 		hi, lo := hi, lo
 		carry: u64
@@ -46,42 +48,42 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
 	nr_blocks := nr_blocks
 	ctr_hi, ctr_lo := ctx._ctr_hi, ctx._ctr_lo

-	blks: [CTR_STRIDE_HW]x86.__m128i = ---
+	blks: [CTR_STRIDE_HW]simd.u8x16 = ---
 	for nr_blocks >= CTR_STRIDE_HW {
 		#unroll for i in 0..< CTR_STRIDE_HW {
 			blks[i], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
 		}

 		#unroll for i in 0 ..< CTR_STRIDE_HW {
-			blks[i] = x86._mm_xor_si128(blks[i], sks[0])
+			blks[i] = simd.bit_xor(blks[i], sks[0])
 		}
 		#unroll for i in 1 ..= 9 {
 			#unroll for j in 0 ..< CTR_STRIDE_HW {
-				blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				blks[j] = aes_hw.aesenc(blks[j], sks[i])
 			}
 		}
 		switch hw_ctx._num_rounds {
 		case _aes.ROUNDS_128:
 			#unroll for i in 0 ..< CTR_STRIDE_HW {
-				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
+				blks[i] = aes_hw.aesenclast(blks[i], sks[10])
 			}
 		case _aes.ROUNDS_192:
 			#unroll for i in 10 ..= 11 {
 				#unroll for j in 0 ..< CTR_STRIDE_HW {
-					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+					blks[j] = aes_hw.aesenc(blks[j], sks[i])
 				}
 			}
 			#unroll for i in 0 ..< CTR_STRIDE_HW {
-				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
+				blks[i] = aes_hw.aesenclast(blks[i], sks[12])
 			}
 		case _aes.ROUNDS_256:
 			#unroll for i in 10 ..= 13 {
 				#unroll for j in 0 ..< CTR_STRIDE_HW {
-					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+					blks[j] = aes_hw.aesenc(blks[j], sks[i])
 				}
 			}
 			#unroll for i in 0 ..< CTR_STRIDE_HW {
-				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
+				blks[i] = aes_hw.aesenclast(blks[i], sks[14])
 			}
 		}

@@ -98,23 +100,23 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
 	for nr_blocks > 0 {
 		blks[0], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)

-		blks[0] = x86._mm_xor_si128(blks[0], sks[0])
+		blks[0] = simd.bit_xor(blks[0], sks[0])
 		#unroll for i in 1 ..= 9 {
-			blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			blks[0] = aes_hw.aesenc(blks[0], sks[i])
 		}
 		switch hw_ctx._num_rounds {
 		case _aes.ROUNDS_128:
-			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
+			blks[0] = aes_hw.aesenclast(blks[0], sks[10])
 		case _aes.ROUNDS_192:
 			#unroll for i in 10 ..= 11 {
-				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+				blks[0] = aes_hw.aesenc(blks[0], sks[i])
 			}
-			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
+			blks[0] = aes_hw.aesenclast(blks[0], sks[12])
 		case _aes.ROUNDS_256:
 			#unroll for i in 10 ..= 13 {
-				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+				blks[0] = aes_hw.aesenc(blks[0], sks[i])
 			}
-			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
+			blks[0] = aes_hw.aesenclast(blks[0], sks[14])
 		}

 		xor_blocks_hw(dst, src, blks[:1])
@@ -133,18 +135,18 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
 	zero_explicit(&sks, size_of(sks))
 }

-@(private, enable_target_feature = "sse2")
-xor_blocks_hw :: proc(dst, src: []byte, blocks: []x86.__m128i) {
+@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
+xor_blocks_hw :: proc(dst, src: []byte, blocks: []simd.u8x16) {
 	#no_bounds_check {
 		if src != nil {
 				for i in 0 ..< len(blocks) {
 					off := i * BLOCK_SIZE
-					tmp := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[off:])))
-					blocks[i] = x86._mm_xor_si128(blocks[i], tmp)
+					tmp := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[off:])))
+					blocks[i] = simd.bit_xor(blocks[i], tmp)
 				}
 		}
 		for i in 0 ..< len(blocks) {
-			intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
+			intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
 		}
 	}
 }
--- a/core/crypto/aes/aes_ecb.odin
+++ b/core/crypto/aes/aes_ecb.odin
@@ -21,7 +21,7 @@ init_ecb :: proc(ctx: ^Context_ECB, key: []byte, impl := DEFAULT_IMPLEMENTATION)
 encrypt_ecb :: proc(ctx: ^Context_ECB, dst, src: []byte) {
 	ensure(ctx._is_initialized)
 	ensure(len(dst) == BLOCK_SIZE, "crypto/aes: invalid dst size")
-	ensure(len(dst) == BLOCK_SIZE, "crypto/aes: invalid src size")
+	ensure(len(src) == BLOCK_SIZE, "crypto/aes: invalid src size")

 	switch &impl in ctx._impl {
 	case ct64.Context:
@@ -35,7 +35,7 @@ encrypt_ecb :: proc(ctx: ^Context_ECB, dst, src: []byte) {
 decrypt_ecb :: proc(ctx: ^Context_ECB, dst, src: []byte) {
 	ensure(ctx._is_initialized)
 	ensure(len(dst) == BLOCK_SIZE, "crypto/aes: invalid dst size")
-	ensure(len(dst) == BLOCK_SIZE, "crypto/aes: invalid src size")
+	ensure(len(src) == BLOCK_SIZE, "crypto/aes: invalid src size")

 	switch &impl in ctx._impl {
 	case ct64.Context:
--- a/core/crypto/aes/aes_ecb_hw.odin
+++ b/core/crypto/aes/aes_ecb_hw.odin
@@ -0,0 +1,71 @@
+#+build amd64,arm64,arm32
+package aes
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import aes_hw "core:crypto/_aes/hw"
+import "core:simd"
+
+// encrypt_block_hw encrypts the single block at the start of src using the
+// expanded encryption round keys in ctx, and stores the result at the start
+// of dst.  Both buffers are accessed as one unaligned 16-byte vector with no
+// length check here, so callers must supply at least one full block.
+@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
+encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
+	blk := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src)))
+
+	// Round key 0 is only XORed in; rounds 1 through 9 run for every key size.
+	blk = simd.bit_xor(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[0])))
+	#unroll for i in 1 ..= 9 {
+		blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])))
+	}
+	// The remaining rounds depend on the round count recorded in the context.
+	switch ctx._num_rounds {
+	case _aes.ROUNDS_128:
+		blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[10])))
+	case _aes.ROUNDS_192:
+		#unroll for i in 10 ..= 11 {
+			blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])))
+		}
+		blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[12])))
+	case _aes.ROUNDS_256:
+		#unroll for i in 10 ..= 13 {
+			blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])))
+		}
+		blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[14])))
+	}
+
+	intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), blk)
+}
+
+// decrypt_block_hw decrypts the single block at the start of src using the
+// expanded decryption round keys in ctx, and stores the result at the start
+// of dst.  Both buffers are accessed as one unaligned 16-byte vector with no
+// length check here, so callers must supply at least one full block.
+@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
+decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
+	blk := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src)))
+
+	// Round key 0 is only XORed in; rounds 1 through 9 run for every key size.
+	blk = simd.bit_xor(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[0])))
+	#unroll for i in 1 ..= 9 {
+		blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i])))
+	}
+	// The remaining rounds depend on the round count recorded in the context.
+	switch ctx._num_rounds {
+	case _aes.ROUNDS_128:
+		blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[10])))
+	case _aes.ROUNDS_192:
+		#unroll for i in 10 ..= 11 {
+			blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i])))
+		}
+		blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[12])))
+	case _aes.ROUNDS_256:
+		#unroll for i in 10 ..= 13 {
+			blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i])))
+		}
+		blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[14])))
+	}
+
+	intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), blk)
+}
--- a/core/crypto/aes/aes_ecb_hw_intel.odin
+++ b/core/crypto/aes/aes_ecb_hw_intel.odin
@@ -1,58 +0,0 @@
-#+build amd64
-package aes
-
-import "base:intrinsics"
-import "core:crypto/_aes"
-import "core:simd/x86"
-
-@(private, enable_target_feature = "sse2,aes")
-encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
-	blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
-
-	blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[0])))
-	#unroll for i in 1 ..= 9 {
-		blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
-	}
-	switch ctx._num_rounds {
-	case _aes.ROUNDS_128:
-		blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[10])))
-	case _aes.ROUNDS_192:
-		#unroll for i in 10 ..= 11 {
-			blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
-		}
-		blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[12])))
-	case _aes.ROUNDS_256:
-		#unroll for i in 10 ..= 13 {
-			blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
-		}
-		blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[14])))
-	}
-
-	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
-}
-
-@(private, enable_target_feature = "sse2,aes")
-decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
-	blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
-
-	blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[0])))
-	#unroll for i in 1 ..= 9 {
-		blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
-	}
-	switch ctx._num_rounds {
-	case _aes.ROUNDS_128:
-		blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[10])))
-	case _aes.ROUNDS_192:
-		#unroll for i in 10 ..= 11 {
-			blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
-		}
-		blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[12])))
-	case _aes.ROUNDS_256:
-		#unroll for i in 10 ..= 13 {
-			blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
-		}
-		blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[14])))
-	}
-
-	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
-}
--- a/core/crypto/aes/aes_gcm.odin
+++ b/core/crypto/aes/aes_gcm.odin
@@ -4,6 +4,7 @@ import "core:bytes"
 import "core:crypto"
 import "core:crypto/_aes"
 import "core:crypto/_aes/ct64"
+import aes_hw "core:crypto/_aes/hw"
 import "core:encoding/endian"

 // GCM_IV_SIZE is the default size of the GCM IV in bytes.
@@ -26,6 +27,10 @@ Context_GCM :: struct {

 // init_gcm initializes a Context_GCM with the provided key.
 init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := DEFAULT_IMPLEMENTATION) {
+	when aes_hw.HAS_GHASH {
+		impl := aes_hw.is_ghash_supported() ? impl : .Portable
+
+	}
 	init_impl(&ctx._impl, key, impl)
 	ctx._is_initialized = true
 }
@@ -65,7 +70,7 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) {

 // open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext,
 // with the provided Context_GCM, iv, and tag, and stores the output in dst,
-// returning true iff the authentication was successful.  If authentication
+// returning true if and only if (⟺) the authentication was successful.  If authentication
 // fails, the destination buffer will be zeroed.
 //
 // dst and plaintext MUST alias exactly or not at all.
--- a/core/crypto/aes/aes_gcm_hw_intel.odin
+++ b/core/crypto/aes/aes_gcm_hw_intel.odin
@@ -1,12 +1,13 @@
-#+build amd64
+#+build amd64,arm64,arm32
 package aes

 import "base:intrinsics"
 import "core:crypto"
 import "core:crypto/_aes"
-import "core:crypto/_aes/hw_intel"
+@(require) import "core:crypto/_aes/ct64"
+import aes_hw "core:crypto/_aes/hw"
 import "core:encoding/endian"
-import "core:simd/x86"
+import "core:simd"

@(private)
 gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) {
@@ -17,7 +18,11 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: [
 	init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)

 	// Note: Our GHASH implementation handles appending padding.
-	hw_intel.ghash(s[:], h[:], aad)
+	when aes_hw.HAS_GHASH {
+		aes_hw.ghash(s[:], h[:], aad)
+	} else {
+		ct64.ghash(s[:], h[:], aad)
+	}
 	gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true)
 	final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext))
 	copy(tag, s[:])
@@ -35,7 +40,11 @@ gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag:
 	s: [_aes.GHASH_TAG_SIZE]byte
 	init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)

-	hw_intel.ghash(s[:], h[:], aad)
+	when aes_hw.HAS_GHASH {
+		aes_hw.ghash(s[:], h[:], aad)
+	} else {
+		ct64.ghash(s[:], h[:], aad)
+	}
 	gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false)
 	final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext))

@@ -71,18 +80,26 @@ init_ghash_hw :: proc(
 	} else {
 		// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
 		// and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64).
-		hw_intel.ghash(j0[:], h[:], iv)
+		when aes_hw.HAS_GHASH {
+			aes_hw.ghash(j0[:], h[:], iv)
+		} else {
+			ct64.ghash(j0[:], h[:], iv)
+		}

 		tmp: [_aes.GHASH_BLOCK_SIZE]byte
 		endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
-		hw_intel.ghash(j0[:], h[:], tmp[:])
+		when aes_hw.HAS_GHASH {
+			aes_hw.ghash(j0[:], h[:], tmp[:])
+		} else {
+			ct64.ghash(j0[:], h[:], tmp[:])
+		}
 	}

 	// ECB encrypt j0, so that we can just XOR with the tag.
 	encrypt_block_hw(ctx, j0_enc[:], j0[:])
 }

-@(private = "file", enable_target_feature = "sse2")
+@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
 final_ghash_hw :: proc(
 	s: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	h: ^[_aes.GHASH_KEY_SIZE]byte,
@@ -94,14 +111,18 @@ final_ghash_hw :: proc(
 	endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8)
 	endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8)

-	hw_intel.ghash(s[:], h[:], blk[:])
-	j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0))
-	s_vec := intrinsics.unaligned_load((^x86.__m128i)(s))
-	s_vec = x86._mm_xor_si128(s_vec, j0_vec)
-	intrinsics.unaligned_store((^x86.__m128i)(s), s_vec)
+	when aes_hw.HAS_GHASH {
+		aes_hw.ghash(s[:], h[:], blk[:])
+	} else {
+		ct64.ghash(s[:], h[:], blk[:])
+	}
+	j0_vec := intrinsics.unaligned_load((^simd.u8x16)(j0))
+	s_vec := intrinsics.unaligned_load((^simd.u8x16)(s))
+	s_vec = simd.bit_xor(s_vec, j0_vec)
+	intrinsics.unaligned_store((^simd.u8x16)(s), s_vec)
 }

-@(private = "file", enable_target_feature = "sse2,sse4.1,aes")
+@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
 gctr_hw :: proc(
 	ctx: ^Context_Impl_Hardware,
 	dst: []byte,
@@ -111,13 +132,13 @@ gctr_hw :: proc(
 	iv: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	is_seal: bool,
 ) #no_bounds_check {
-	sks: [15]x86.__m128i = ---
+	sks: [15]simd.u8x16 = ---
 	for i in 0 ..= ctx._num_rounds {
-		sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))
+		sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))
 	}

 	// Setup the counter block
-	ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(iv))
+	ctr_blk := intrinsics.unaligned_load((^simd.u8x16)(iv))
 	ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1

 	src, dst := src, dst
@@ -127,11 +148,15 @@ gctr_hw :: proc(
 	// This results in an unreadable mess, so we opt for simplicity
 	// as performance is adequate.

-	blks: [CTR_STRIDE_HW]x86.__m128i = ---
+	blks: [CTR_STRIDE_HW]simd.u8x16 = ---
 	nr_blocks := len(src) / BLOCK_SIZE
 	for nr_blocks >= CTR_STRIDE_HW {
 		if !is_seal {
-			hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
+			when aes_hw.HAS_GHASH {
+				aes_hw.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
+			} else {
+				ct64.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
+			}
 		}

 		#unroll for i in 0 ..< CTR_STRIDE_HW {
@@ -139,42 +164,46 @@ gctr_hw :: proc(
 		}

 		#unroll for i in 0 ..< CTR_STRIDE_HW {
-			blks[i] = x86._mm_xor_si128(blks[i], sks[0])
+			blks[i] = simd.bit_xor(blks[i], sks[0])
 		}
 		#unroll for i in 1 ..= 9 {
 			#unroll for j in 0 ..< CTR_STRIDE_HW {
-				blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				blks[j] = aes_hw.aesenc(blks[j], sks[i])
 			}
 		}
 		switch ctx._num_rounds {
 		case _aes.ROUNDS_128:
 			#unroll for i in 0 ..< CTR_STRIDE_HW {
-				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
+				blks[i] = aes_hw.aesenclast(blks[i], sks[10])
 			}
 		case _aes.ROUNDS_192:
 			#unroll for i in 10 ..= 11 {
 				#unroll for j in 0 ..< CTR_STRIDE_HW {
-					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+					blks[j] = aes_hw.aesenc(blks[j], sks[i])
 				}
 			}
 			#unroll for i in 0 ..< CTR_STRIDE_HW {
-				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
+				blks[i] = aes_hw.aesenclast(blks[i], sks[12])
 			}
 		case _aes.ROUNDS_256:
 			#unroll for i in 10 ..= 13 {
 				#unroll for j in 0 ..< CTR_STRIDE_HW {
-					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+					blks[j] = aes_hw.aesenc(blks[j], sks[i])
 				}
 			}
 			#unroll for i in 0 ..< CTR_STRIDE_HW {
-				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
+				blks[i] = aes_hw.aesenclast(blks[i], sks[14])
 			}
 		}

 		xor_blocks_hw(dst, src, blks[:])

 		if is_seal {
-			hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
+			when aes_hw.HAS_GHASH {
+				aes_hw.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
+			} else {
+				ct64.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
+			}
 		}

 		src = src[CTR_STRIDE_BYTES_HW:]
@@ -186,28 +215,32 @@ gctr_hw :: proc(
 	for n := len(src); n > 0; {
 		l := min(n, BLOCK_SIZE)
 		if !is_seal {
-			hw_intel.ghash(s[:], h[:], src[:l])
+			when aes_hw.HAS_GHASH {
+				aes_hw.ghash(s[:], h[:], src[:l])
+			} else {
+				ct64.ghash(s[:], h[:], src[:l])
+			}
 		}

 		blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr)

-		blks[0] = x86._mm_xor_si128(blks[0], sks[0])
+		blks[0] = simd.bit_xor(blks[0], sks[0])
 		#unroll for i in 1 ..= 9 {
-			blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			blks[0] = aes_hw.aesenc(blks[0], sks[i])
 		}
 		switch ctx._num_rounds {
 		case _aes.ROUNDS_128:
-			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
+			blks[0] = aes_hw.aesenclast(blks[0], sks[10])
 		case _aes.ROUNDS_192:
 			#unroll for i in 10 ..= 11 {
-				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+				blks[0] = aes_hw.aesenc(blks[0], sks[i])
 			}
-			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
+			blks[0] = aes_hw.aesenclast(blks[0], sks[12])
 		case _aes.ROUNDS_256:
 			#unroll for i in 10 ..= 13 {
-				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+				blks[0] = aes_hw.aesenc(blks[0], sks[i])
 			}
-			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
+			blks[0] = aes_hw.aesenclast(blks[0], sks[14])
 		}

 		if l == BLOCK_SIZE {
@@ -219,7 +252,11 @@ gctr_hw :: proc(
 			copy(dst, blk[:l])
 		}
 		if is_seal {
-			hw_intel.ghash(s[:], h[:], dst[:l])
+			when aes_hw.HAS_GHASH {
+				aes_hw.ghash(s[:], h[:], dst[:l])
+			} else {
+				ct64.ghash(s[:], h[:], dst[:l])
+			}
 		}

 		dst = dst[l:]
@@ -235,8 +272,17 @@ gctr_hw :: proc(
 // the compiler.
 //
 // src/check_expr.cpp(8104): Assertion Failure: `c->curr_proc_decl->entity`
-@(private = "file", enable_target_feature = "sse4.1")
-hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) {
-	ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3)
+@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
+hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^simd.u8x16, ctr: u32) -> (simd.u8x16, u32) {
+	when ODIN_ENDIAN == .Little {
+		ctr_be := intrinsics.byte_swap(ctr)
+	} else {
+		ctr_be := ctr
+	}
+
+	ret := transmute(simd.u8x16)(
+		simd.replace(transmute(simd.u32x4)(src^), 3, ctr_be)
+	)
+
 	return ret, ctr + 1
 }
--- a/core/crypto/aes/aes_impl_hw.odin
+++ b/core/crypto/aes/aes_impl_hw.odin
@@ -0,0 +1,21 @@
+#+build amd64,arm64,arm32
+package aes
+
+import aes_hw "core:crypto/_aes/hw"
+
+// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES
+// is supported.
+is_hardware_accelerated :: proc "contextless" () -> bool {
+	return aes_hw.is_supported()
+}
+
+// Context_Impl_Hardware is the package-private alias for the hardware
+// backend's context type.
+@(private)
+Context_Impl_Hardware :: aes_hw.Context
+
+// init_impl_hw initializes ctx with key by delegating to aes_hw.init.
+@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
+init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
+	aes_hw.init(ctx, key)
+}
--- a/core/crypto/aes/aes_impl_hw_gen.odin
+++ b/core/crypto/aes/aes_impl_hw_gen.odin
@@ -1,10 +1,12 @@
 #+build !amd64
+#+build !arm64
+#+build !arm32
 package aes

@(private = "file")
 ERR_HW_NOT_SUPPORTED :: "crypto/aes: hardware implementation unsupported"

-// is_hardware_accelerated returns true iff hardware accelerated AES
+// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES
 // is supported.
 is_hardware_accelerated :: proc "contextless" () -> bool {
 	return false
--- a/core/crypto/aes/aes_impl_hw_intel.odin
+++ b/core/crypto/aes/aes_impl_hw_intel.odin
@@ -1,18 +0,0 @@
-#+build amd64
-package aes
-
-import "core:crypto/_aes/hw_intel"
-
-// is_hardware_accelerated returns true iff hardware accelerated AES
-// is supported.
-is_hardware_accelerated :: proc "contextless" () -> bool {
-	return hw_intel.is_supported()
-}
-
-@(private)
-Context_Impl_Hardware :: hw_intel.Context
-
-@(private, enable_target_feature = "sse2,aes")
-init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
-	hw_intel.init(ctx, key)
-}
--- a/core/crypto/argon2id/argon2id.odin
+++ b/core/crypto/argon2id/argon2id.odin
@@ -0,0 +1,622 @@
+/*
+package argon2id implements the Argon2id password hashing algorithm.
+
+See: [[ https://datatracker.ietf.org/doc/rfc9106/ ]]
+*/
+package argon2id
+
+import "core:crypto/blake2b"
+import "core:encoding/endian"
+import "core:math/bits"
+import "core:mem"
+
+// Implementation based on the RFC, Monocypher (CC0-1.0), and the reference
+// code (CC0-1.0).
+
+// MAX_INPUT_SIZE is the maximum size of the various inputs (password,
+// salt, secret, ad) in bytes.
+MAX_INPUT_SIZE :: (1 << 32) - 1
+
+// MIN_PARALLELISM is the minimum allowed parallelism.
+MIN_PARALLELISM :: 1
+// MAX_PARALLELISM is the maximum allowed parallelism.
+MAX_PARALLELISM :: (1 << 24) - 1
+
+// MIN_TAG_SIZE is the minimum digest size in bytes.
+MIN_TAG_SIZE :: 4
+// MAX_TAG_SIZE is the maximum digest size in bytes.
+MAX_TAG_SIZE :: (1 << 32) - 1
+
+// RECOMMENDED_TAG_SIZE is the recommended tag size in bytes.
+RECOMMENTED_TAG_SIZE :: 32 // 256-bits
+// RECOMMENDED_SALT_SIZE is the recommended salt size in bytes.
+RECOMMENDED_SALT_SIZE :: 16 // 128-bits
+
+@(private)
+V_RFC9106 :: 0x13
+@(private)
+Y_ID :: 0x02
+@(private)
+BLOCK_SIZE_BYTES :: 1024
+@(private)
+BLOCK_SIZE_U64 :: 128
+
+// PARAMS_RFC9106 is the first recommended "uniformly safe" parameter set
+// per RFC 9106.
+@(rodata)
+PARAMS_RFC9106 := Parameters{
+	memory_size = 2 * 1024 * 1024, // 2 GiB
+	passes      = 1,
+	parallelism = 4,
+}
+
+// PARAMS_RFC9106_SMALL is the second recommended "uniformly safe" parameter
+// set per RFC 9106 tailored for memory constrained environments.
+@(rodata)
+PARAMS_RFC9106_SMALL := Parameters{
+	memory_size = 64 * 1024, // 64 MiB
+	passes      = 3,
+	parallelism = 4,
+}
+
+// PARAMS_OWASP is one of the recommended parameter sets from the OWASP
+// Password Storage Cheat Sheet (as of 2026/02).  The cheat sheet contains
+// additional variations to this parameter set with various trade-offs
+// between `memory_size` and `passes` that are intended to provide
+// equivalent security.
+//
+// See: [[ https://cheatsheetseries.owasp.org/cheatsheets/Password_Storage_Cheat_Sheet.html ]]
+@(rodata)
+PARAMS_OWASP := Parameters{
+	memory_size = 19 * 1024, // 19 MiB
+	passes      = 2,
+	parallelism = 1,
+}
+
+// PARAMS_OWASP_SMALL is equivalent in strength to PARAMS_OWASP, but
+// trades off less memory use for more CPU usage.
+@(rodata)
+PARAMS_OWASP_SMALL := Parameters{
+	memory_size = 7 * 1024, // 7 MiB
+	passes      = 5,
+	parallelism = 1,
+}
+
+// Parameters is an Argon2id parameter set.
+Parameters :: struct {
+	memory_size: u32,  // m (KiB)
+	passes:      u32,  // t
+	parallelism: u32,  // p
+}
+
+@(private)
+Block :: [BLOCK_SIZE_U64]u64
+
+// derive invokes Argon2id with the specified parameter set and inputs,
+// and outputs the derived key to dst.
+@(require_results)
+derive :: proc(
+	parameters: ^Parameters,
+	password:   []byte, // P
+	salt:       []byte, // S
+	dst:        []byte,
+	secret:     []byte = nil, // K (aka `pepper`)
+	ad:         []byte = nil, // X
+	sanitize  := true,
+	allocator := context.allocator, // Not temp as this can be large.
+) -> mem.Allocator_Error #no_bounds_check {
+	if u64(len(password)) > MAX_INPUT_SIZE {
+		panic("crypto/argon2id: invalid password size")
+	}
+	if u64(len(salt)) > MAX_INPUT_SIZE {
+		panic("crypto/argon2id: invalid salt size")
+	}
+	if u64(len(secret)) > MAX_INPUT_SIZE {
+		panic("crypto/argon2id: invalid secret size")
+	}
+	if u64(len(ad)) > MAX_INPUT_SIZE {
+		panic("crypto/argon2id: invalid ad size")
+	}
+	if l := u64(len(dst)); l > MAX_TAG_SIZE || l < MIN_TAG_SIZE {
+		panic("crypto/argon2id: invalid dst size")
+	}
+
+	p, t, m := parameters.parallelism, parameters.passes, u64(parameters.memory_size)
+	if p < MIN_PARALLELISM || p > MAX_PARALLELISM {
+		panic("crypto/argon2id: invalid parallelism")
+	}
+	if t < 1 {
+		panic("crypto/argon2id: invalid passes")
+	}
+	if m < 8 * u64(p) {
+		panic("crypto/argon2id: insufficient memory size")
+	}
+	if m * BLOCK_SIZE_BYTES > u64(max(int)) {
+		panic("crypto/argon2id: excessive memory size")
+	}
+
+	// Allocate the memory as m' 1024-byte blocks, where m' is derived as:
+	// m' = 4 * p * floor (m / 4p)
+	//
+	// For p lanes, the memory is organized in a matrix B[i][j] of
+	// blocks with p rows (lanes) and q = m' / p columns.
+	m_ := 4 * u64(p) * (m / u64(4 * p))
+	b := mem.alloc_bytes_non_zeroed(
+		int(m_) * BLOCK_SIZE_BYTES,
+		alignment = mem.DEFAULT_PAGE_SIZE,
+		allocator = allocator,
+	) or_return
+	defer delete(b, allocator)
+
+	block_buf: [BLOCK_SIZE_BYTES]byte = ---
+
+	blocks := ([^]Block)(raw_data(b))[:m_]
+	segment_size := u32(m_ / u64(p) / 4)
+	lane_size := segment_size * 4
+
+	// Establish H_0 as the 64-byte value as shown below.  If K, X, or S
+	// has zero length, it is just absent, but its length field remains.
+	//
+	// H_0 = H^(64)(LE32(p) || LE32(T) || LE32(m) || LE32(t) ||
+	//     LE32(v) || LE32(y) || LE32(length(P)) || P ||
+	//     LE32(length(S)) || S ||  LE32(length(K)) || K ||
+	//     LE32(length(X)) || X)
+	{
+		ctx: blake2b.Context
+		blake2b.init(&ctx)
+
+		blake2b_update_u32le(&ctx, u32(p))
+		blake2b_update_u32le(&ctx, u32(len(dst)))
+		blake2b_update_u32le(&ctx, parameters.memory_size)
+		blake2b_update_u32le(&ctx, t)
+		blake2b_update_u32le(&ctx, V_RFC9106)
+		blake2b_update_u32le(&ctx, Y_ID)
+		blake2b_update_u32le(&ctx, u32(len(password)))
+		blake2b.update(&ctx, password)
+		blake2b_update_u32le(&ctx, u32(len(salt)))
+		blake2b.update(&ctx, salt)
+		blake2b_update_u32le(&ctx, u32(len(secret)))
+		blake2b.update(&ctx, secret)
+		blake2b_update_u32le(&ctx, u32(len(ad)))
+		blake2b.update(&ctx, ad)
+
+		h_0: [blake2b.DIGEST_SIZE+8]byte
+		blake2b.final(&ctx, h_0[:blake2b.DIGEST_SIZE])
+
+		// Compute B[i][0] for all i ranging from (and including) 0 to (not
+		// including) p.
+		//
+		// B[i][0] = H'^(1024)(H_0 || LE32(0) || LE32(i))
+		//
+		// Compute B[i][1] for all i ranging from (and including) 0 to (not
+		// including) p.
+		//
+		// B[i][1] = H'^(1024)(H_0 || LE32(1) || LE32(i))
+		for l in u32(0) ..< p {
+			for i in u32(0) ..< 2 {
+				endian.unchecked_put_u32le(h_0[blake2b.DIGEST_SIZE:], i)   // LE32({0,1})
+				endian.unchecked_put_u32le(h_0[blake2b.DIGEST_SIZE+4:], l) // LE32(i)
+				h_prime(block_buf[:], h_0[:])
+				blk := &blocks[l * lane_size + i]
+				for j in 0 ..< BLOCK_SIZE_U64 {
+					blk[j] = endian.unchecked_get_u64le(block_buf[j*8:])
+				}
+			}
+		}
+
+		mem.zero_explicit(&h_0, size_of(h_0)) // No longer needed.
+	}
+
+	// Compute B[i][j] for all i ranging from (and including) 0 to (not
+	// including) p and for all j ranging from (and including) 2 to (not
+	// including) q.  The computation MUST proceed slicewise
+	// (Section 3.4): first, blocks from slice 0 are computed for all
+	// lanes (in an arbitrary order of lanes), then blocks from slice 1
+	// are computed, etc.  The block indices l and z are determined for
+	// each i, j differently for Argon2d, Argon2i, and Argon2id.
+	//
+	// B[i][j] = G(B[i][j-1], B[l][z])
+	//
+	// If the number of passes t is larger than 1, we repeat step 5.  We
+	// compute B[i][0] and B[i][j] for all i ranging from (and including)
+	// 0 to (not including) p and for all j ranging from (and including)
+	// 1 to (not including) q.  However, blocks are computed differently
+	// as the old value is XORed with the new one:
+	//
+	// B[i][0] = G(B[i][q-1], B[l][z]) XOR B[i][0];
+	// B[i][j] = G(B[i][j-1], B[l][z]) XOR B[i][j].
+	constant_time := true // Start with constant time indexing.
+	tmp, index_block: Block = ---, ---
+	for pass in u32(0) ..< t {
+		for slice in u32(0) ..< 4 {
+			// The first slice of the first pass has blocks 0 and 1
+			// pre-filled.
+			pass_offset: u32 = pass == 0 && slice == 0 ? 2 : 0
+			slice_offset := slice * segment_size
+
+			// 3.4.1.3.  Argon2id
+			//
+			//    If the pass number is 0 and the slice number is 0 or 1, then compute
+			//    J_1 and J_2 as for Argon2i, else compute J_1 and J_2 as for Argon2d.
+			if slice == 2 {
+				constant_time = false
+			}
+
+			// Each segment can be processed in parallel, as long as
+			// each iteration of the loop completes before proceeding
+			// to the next.  For simplicity we do this in serial
+			// instead of using threads.
+			for segment in u32(0) ..< u32(p) {
+				index_ctr: u64 = 1
+				for block in pass_offset ..< segment_size {
+					// Current and previous blocks (indexes, not pointers)
+					lane_offset := segment * lane_size
+					segment_start := lane_offset + slice_offset
+					current := segment_start + block
+					previous := segment_start - 1
+					switch {
+					case block == 0 && slice_offset == 0:
+						previous += lane_size
+					case:
+						previous += block
+					}
+
+					index_seed: u64
+					if constant_time {
+						// 3.4.1.2.  Argon2i
+						//
+						//    For each segment, we do the following.  First, we compute the value Z
+						//    as:
+						//
+						//    Z= ( LE64(r) || LE64(l) || LE64(sl) || LE64(m') ||
+						//         LE64(t) || LE64(y) )
+						//
+						//                 Figure 11: Input to Compute J1,J2 in Argon2i
+						//
+						//    where
+						//
+						//    r:   the pass number
+						//    l:   the lane number
+						//    sl:  the slice number
+						//    m':  the total number of memory blocks
+						//    t:   the total number of passes
+						//    y:   the Argon2 type (0 for Argon2d, 1 for Argon2i, 2 for Argon2id)
+						//
+						//    Then we compute:
+						//
+						//    q/(128*SL) 1024-byte values
+						//    G(ZERO(1024),G(ZERO(1024),
+						//    Z || LE64(1) || ZERO(968) )),
+						//    G(ZERO(1024),G(ZERO(1024),
+						//    Z || LE64(2) || ZERO(968) )),... ,
+						//    G(ZERO(1024),G(ZERO(1024),
+						//    Z || LE64(q/(128*SL)) || ZERO(968) )),
+						//
+						//    which are partitioned into q/(SL) 8-byte values X, which are viewed
+						//    as X1||X2 and converted to J_1=int32(X1) and J_2=int32(X2).
+						//
+						//    The values r, l, sl, m', t, y, and i are represented as 8 bytes in
+						//    little endian.
+						if block == pass_offset || (block % 128) == 0 {
+							mem.zero(&index_block, size_of(index_block))
+							index_block[0] = u64(pass)
+							index_block[1] = u64(segment)
+							index_block[2] = u64(slice)
+							index_block[3] = u64(lane_size * p)
+							index_block[4] = u64(t) // passes
+							index_block[5] = Y_ID
+							index_block[6] = index_ctr
+							index_ctr += 1
+
+							copy(tmp[:], index_block[:])
+							g_rounds(&index_block)
+							xor_block(&index_block, &tmp)
+							copy(tmp[:], index_block[:])
+							g_rounds(&index_block)
+							xor_block(&index_block, &tmp)
+						}
+						index_seed = index_block[block % 128]
+					} else {
+						// 3.4.1.1.  Argon2d
+						//
+						//    J_1 is given by the first 32 bits of block B[i][j-1], while J_2 is
+						//    given by the next 32 bits of block B[i][j-1]:
+						//
+						//    J_1 = int32(extract(B[i][j-1], 0))
+						//    J_2 = int32(extract(B[i][j-1], 1))
+						//
+						//                   Figure 10: Deriving J1,J2 in Argon2d
+						index_seed = blocks[previous][0]
+					}
+
+					// 3.4.2.  Mapping J_1 and J_2 to Reference Block Index [l][z]
+					//
+					//    The value of l = J_2 mod p gives the index of the lane from which the
+					//    block will be taken.  For the first pass (r=0) and the first slice
+					//    (sl=0), the block is taken from the current lane.
+					//
+					//    The set W contains the indices that are referenced according to the
+					//    following rules:
+					//
+					//    1.  If l is the current lane, then W includes the indices of all
+					//        blocks in the last SL - 1 = 3 segments computed and finished, as
+					//        well as the blocks computed in the current segment in the current
+					//        pass excluding B[i][j-1].
+					//
+					//    2.  If l is not the current lane, then W includes the indices of all
+					//        blocks in the last SL - 1 = 3 segments computed and finished in
+					//        lane l.  If B[i][j] is the first block of a segment, then the
+					//        very last index from W is excluded.
+					//
+					//    Then take a block from W with a nonuniform distribution over [0, |W|)
+					//    using the following mapping:
+					//
+					//    J_1 -> |W|(1 - J_1^2 / 2^(64))
+					//
+					//                           Figure 12: Computing J1
+					//
+					//    To avoid floating point computation, the following approximation is
+					//    used:
+					//
+					//    x = J_1^2 / 2^(32)
+					//    y = (|W| * x) / 2^(32)
+					//    zz = |W| - 1 - y
+					//
+					//                       Figure 13: Computing J1, Part 2
+					//
+					//    Then take the zz-th index from W; it will be the z value for the
+					//    reference block index [l][z].
+					next_slice: u32 = ((slice + 1) % 4) * segment_size
+					window_start, nb_segments: u32
+					lane := u32(index_seed >> 32) % p
+					switch {
+					case pass == 0:
+						nb_segments = slice
+						if slice == 0 {
+							lane = segment
+						}
+					case:
+						window_start = next_slice
+						nb_segments = 3
+					}
+					window_size := nb_segments * segment_size
+					if lane == segment {
+						window_size += block - 1
+					} else if block == 0 {
+						window_size += ~u32(0)
+					}
+
+					j1 := index_seed & 0xffffffff
+					x := (j1 * j1) >> 32
+					y := (u64(window_size) * x) >> 32
+					z := (u64(window_size) - 1) - y
+					ref := u32((u64(window_start) + z) % u64(lane_size))
+					reference: u32 = lane * lane_size + ref
+
+					copy(tmp[:], blocks[previous][:])
+					xor_block(&tmp, &blocks[reference])
+					if pass == 0 {
+						copy(blocks[current][:], tmp[:])
+					} else {
+						xor_block(&blocks[current], &tmp)
+					}
+					g_rounds(&tmp)
+					xor_block(&blocks[current], &tmp)
+				}
+			}
+		}
+	}
+	mem.zero_explicit(&tmp, size_of(tmp))
+	mem.zero_explicit(&index_block, size_of(index_block))
+
+	// After t steps have been iterated, the final block C is computed
+	// as the XOR of the last column:
+	//
+	// C = B[0][q-1] XOR B[1][q-1] XOR ... XOR B[p-1][q-1]
+	idx := lane_size - 1
+	last_block := &blocks[idx]
+	for _ in 1 ..< p {
+		idx += lane_size
+		next_block := &blocks[idx]
+		xor_block(next_block, last_block)
+		last_block = next_block
+	}
+
+	for v, i in last_block {
+		endian.unchecked_put_u64le(block_buf[i*8:], v)
+	}
+
+	// The output tag is computed as H'^T(C).
+	h_prime(dst, block_buf[:])
+	mem.zero_explicit(&block_buf, size_of(block_buf))
+
+	// Sanitize the working memory.  While the RFC implies that this is
+	// optional ("enable the memory-wiping option in the library call"),
+	// the reference code defaults to enabling it.
+	//
+	// An opt-out is provided, as this can get somewhat expensive when
+	// m gets large.
+	if sanitize {
+		mem.zero_explicit(raw_data(b), len(b))
+	}
+
+	return nil
+}
+
+@(private)
+xor_block :: #force_inline proc(dst, src: ^Block) { // dst[i] ~= src[i] for every element
+	for i in 0 ..< len(src^) {
+		dst[i] ~= src[i]
+	}
+}
+
+@(private)
+blake2b_update_u32le :: #force_inline proc(ctx: ^blake2b.Context, i: u32) {
+	le_bytes: [4]byte = --- // Left uninitialized: fully overwritten by the store below.
+	endian.unchecked_put_u32le(le_bytes[:], i) // LE32(i)
+	blake2b.update(ctx, le_bytes[:])
+	mem.zero_explicit(&le_bytes, size_of(le_bytes)) // Probably overkill.
+}
+
+// 3.3.  Variable-Length Hash Function H'
+//
+//    Let V_i be a 64-byte block and W_i be its first 32 bytes.  Then we
+//    define function H' as follows:
+//
+//            if T <= 64
+//                H'^T(A) = H^T(LE32(T)||A)
+//            else
+//                r = ceil(T/32)-2
+//                V_1 = H^(64)(LE32(T)||A)
+//                V_2 = H^(64)(V_1)
+//                ...
+//                V_r = H^(64)(V_{r-1})
+//                V_{r+1} = H^(T-32*r)(V_{r})
+//                H'^T(X) = W_1 || W_2 || ... || W_r || V_{r+1}
+//
+//         Figure 8: Function H' for Tag and Initial Block Computations
+@(private)
+h_prime :: proc(dst, src: []byte) {
+	tag_len := len(dst)
+	ctx: blake2b.Context
+	// H^(min(T,64))(LE32(T) || A): all of H'^T when T <= 64, otherwise V_1.
+	blake2b.init(&ctx, min(tag_len, blake2b.DIGEST_SIZE))
+	blake2b_update_u32le(&ctx, u32(tag_len))
+	blake2b.update(&ctx, src)
+	blake2b.final(&ctx, dst)
+	if tag_len <= 64 {
+		return
+	}
+
+	// V_{i+1} is written 32 bytes after V_i, so only W_i of each V_i survives.
+	r := u32((u64(tag_len) + 31) >> 5) - 2
+	v_off := 0
+	for _ in 1 ..< r {
+		blake2b.init(&ctx, blake2b.DIGEST_SIZE)
+		blake2b.update(&ctx, dst[v_off:v_off+64])
+		blake2b.final(&ctx, dst[v_off+32:])
+		v_off += 32
+	}
+	// V_{r+1} = H^(T-32*r)(V_r), written directly after W_r.
+	blake2b.init(&ctx, tag_len - int(32 * r))
+	blake2b.update(&ctx, dst[v_off:v_off+64])
+	blake2b.final(&ctx, dst[v_off+32:])
+}
+
+// GB(a, b, c, d) is defined as follows:
+//
+//         a = (a + b + 2 * trunc(a) * trunc(b)) mod 2^(64)
+//         d = (d XOR a) >>> 32
+//         c = (c + d + 2 * trunc(c) * trunc(d)) mod 2^(64)
+//         b = (b XOR c) >>> 24
+//
+//         a = (a + b + 2 * trunc(a) * trunc(b)) mod 2^(64)
+//         d = (d XOR a) >>> 16
+//         c = (c + d + 2 * trunc(c) * trunc(d)) mod 2^(64)
+//         b = (b XOR c) >>> 63
+//
+//                        Figure 19: Details of GB
+//
+// The modular additions in GB are combined with 64-bit multiplications.
+// Multiplications are the only difference from the original BLAKE2b
+// design.  This choice is done to increase the circuit depth and thus
+// the running time of ASIC implementations, while having roughly the
+// same running time on CPUs thanks to parallelism and pipelining.
+@(private,require_results)
+gb :: #force_inline proc(a, b, c, d: u64) -> (u64, u64, u64, u64) {
+	a, b, c, d := a, b, c, d
+
+	mix := #force_inline proc(x, y: u64) -> u64 { // (x + y + 2 * trunc(x) * trunc(y)) mod 2^(64)
+		return x + y + ((u64(u32(x)) * u64(u32(y))) << 1)
+	}
+
+	a = mix(a, b)
+	d = bits.rotate_left64(d ~ a, 64 - 32) // >>> 32
+	c = mix(c, d)
+	b = bits.rotate_left64(b ~ c, 64 - 24) // >>> 24
+
+	a = mix(a, b)
+	d = bits.rotate_left64(d ~ a, 64 - 16) // >>> 16
+	c = mix(c, d)
+	b = bits.rotate_left64(b ~ c, 64 - 63) // >>> 63
+
+	return a, b, c, d
+}
+
+// 3.6.  Permutation P
+//
+//    Permutation P is based on the round function of BLAKE2b.  The eight
+//    16-byte inputs S_0, S_1, ... , S_7 are viewed as a 4x4 matrix of
+//    64-bit words, where S_i = (v_{2*i+1} || v_{2*i}):
+//
+//             v_0  v_1  v_2  v_3
+//             v_4  v_5  v_6  v_7
+//             v_8  v_9 v_10 v_11
+//            v_12 v_13 v_14 v_15
+//
+//                      Figure 17: Matrix Element Labeling
+//
+//    It works as follows:
+//
+//            GB(v_0, v_4,  v_8, v_12)
+//            GB(v_1, v_5,  v_9, v_13)
+//            GB(v_2, v_6, v_10, v_14)
+//            GB(v_3, v_7, v_11, v_15)
+//
+//            GB(v_0, v_5, v_10, v_15)
+//            GB(v_1, v_6, v_11, v_12)
+//            GB(v_2, v_7,  v_8, v_13)
+//            GB(v_3, v_4,  v_9, v_14)
+//
+//                   Figure 18: Feeding Matrix Elements to GB
+@(private,require_results)
+perm_p :: #force_inline proc(v_0, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9, v_10, v_11, v_12, v_13, v_14, v_15: u64) -> (u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64, u64) {
+	v_0, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9, v_10, v_11, v_12, v_13, v_14, v_15 := v_0, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9, v_10, v_11, v_12, v_13, v_14, v_15 // Shadow the parameters so they can be reassigned.
+	// First half of Figure 18: GB over each column of the 4x4 matrix.
+	v_0, v_4, v_8, v_12 = gb(v_0, v_4, v_8, v_12)
+	v_1, v_5, v_9, v_13 = gb(v_1, v_5, v_9, v_13)
+	v_2, v_6, v_10, v_14 = gb(v_2, v_6, v_10, v_14)
+	v_3, v_7, v_11, v_15 = gb(v_3, v_7, v_11, v_15)
+	// Second half of Figure 18: GB over each (wrapping) diagonal, using the column results.
+	v_0, v_5, v_10, v_15 = gb(v_0, v_5, v_10, v_15)
+	v_1, v_6, v_11, v_12 = gb(v_1, v_6, v_11, v_12)
+	v_2, v_7, v_8, v_13 = gb(v_2, v_7, v_8, v_13)
+	v_3, v_4, v_9, v_14 = gb(v_3, v_4, v_9, v_14)
+
+	return v_0, v_1, v_2, v_3, v_4, v_5, v_6, v_7, v_8, v_9, v_10, v_11, v_12, v_13, v_14, v_15
+}
+
+// 3.5.  Compression Function G
+//
+//    The compression function G is built upon the BLAKE2b-based
+//    transformation P.  P operates on the 128-byte input, which can be
+//    viewed as eight 16-byte registers:
+//
+//    P(A_0, A_1, ... ,A_7) = (B_0, B_1, ... ,B_7)
+//
+//                      Figure 14: Blake Round Function P
+//
+//    The compression function G(X, Y) operates on two 1024-byte blocks X
+//    and Y.  It first computes R = X XOR Y.  Then R is viewed as an 8x8
+//    matrix of 16-byte registers R_0, R_1, ... , R_63.  Then P is first
+//    applied to each row, and then to each column to get Z:
+//
+//    ( Q_0,  Q_1,  Q_2, ... ,  Q_7) <- P( R_0,  R_1,  R_2, ... ,  R_7)
+//    ( Q_8,  Q_9, Q_10, ... , Q_15) <- P( R_8,  R_9, R_10, ... , R_15)
+//                                  ...
+//    (Q_56, Q_57, Q_58, ... , Q_63) <- P(R_56, R_57, R_58, ... , R_63)
+//    ( Z_0,  Z_8, Z_16, ... , Z_56) <- P( Q_0,  Q_8, Q_16, ... , Q_56)
+//    ( Z_1,  Z_9, Z_17, ... , Z_57) <- P( Q_1,  Q_9, Q_17, ... , Q_57)
+//                                  ...
+//    ( Z_7, Z_15, Z 23, ... , Z_63) <- P( Q_7, Q_15, Q_23, ... , Q_63)
+//
+//                  Figure 15: Core of Compression Function G
+@(private)
+g_rounds :: proc(b: ^Block) { // Applies P to b in place, rows then columns; no XOR is done here.
+	for i := 0; i < 128; i += 16 { // Rows: 8 runs of 16 consecutive words (8 registers of 2 words each).
+		b[i], b[i+1], b[i+2], b[i+3], b[i+4], b[i+5], b[i+6], b[i+7], b[i+8], b[i+9], b[i+10], b[i+11], b[i+12], b[i+13], b[i+14], b[i+15] = perm_p(b[i], b[i+1], b[i+2], b[i+3], b[i+4], b[i+5], b[i+6], b[i+7], b[i+8], b[i+9], b[i+10], b[i+11], b[i+12], b[i+13], b[i+14], b[i+15])
+	}
+	for i := 0; i < 16; i += 2 { // Columns: word pair (i, i+1) taken from each row, i.e. a stride of 16 words.
+		b[i], b[i+1], b[i+16], b[i+17], b[i+32], b[i+33], b[i+48], b[i+49], b[i+64], b[i+65], b[i+80], b[i+81], b[i+96], b[i+97], b[i+112], b[i+113] = perm_p(b[i], b[i+1], b[i+16], b[i+17], b[i+32], b[i+33], b[i+48], b[i+49], b[i+64], b[i+65], b[i+80], b[i+81], b[i+96], b[i+97], b[i+112], b[i+113])
+	}
+}
--- a/core/crypto/blake2b/blake2b.odin
+++ b/core/crypto/blake2b/blake2b.odin
@@ -28,13 +28,24 @@ Context :: _blake2.Blake2b_Context

 // init initializes a Context with the default BLAKE2b config.
 init :: proc(ctx: ^Context, digest_size := DIGEST_SIZE) {
-	ensure(digest_size <= _blake2.MAX_SIZE, "crypto/blake2b: invalid digest size")
+	ensure(digest_size <= DIGEST_SIZE, "crypto/blake2b: invalid digest size")

 	cfg: _blake2.Blake2_Config
 	cfg.size = u8(digest_size)
 	_blake2.init(ctx, &cfg)
 }

+// init_mac initializes a Context with a user provided key (at most DIGEST_SIZE bytes long).
+init_mac :: proc(ctx: ^Context, key: []byte, digest_size := DIGEST_SIZE) {
+	ensure(digest_size <= DIGEST_SIZE, "crypto/blake2b: invalid digest size")
+	ensure(len(key) <= DIGEST_SIZE, "crypto/blake2b: invalid key size")
+
+	cfg: _blake2.Blake2_Config
+	cfg.size = u8(digest_size)
+	cfg.key = key // Apart from the key length check, the only difference from init.
+	_blake2.init(ctx, &cfg)
+}
+
 // update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
 	_blake2.update(ctx, data)
@@ -43,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) {
 // final finalizes the Context, writes the digest to hash, and calls
 // reset on the Context.
 //
-// Iff finalize_clone is set, final will work on a copy of the Context,
+// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	_blake2.final(ctx, hash, finalize_clone)
--- a/core/crypto/blake2s/blake2s.odin
+++ b/core/crypto/blake2s/blake2s.odin
@@ -28,13 +28,24 @@ Context :: _blake2.Blake2s_Context

 // init initializes a Context with the default BLAKE2s config.
 init :: proc(ctx: ^Context, digest_size := DIGEST_SIZE) {
-	ensure(digest_size <= _blake2.MAX_SIZE, "crypto/blake2s: invalid digest size")
+	ensure(digest_size <= DIGEST_SIZE, "crypto/blake2s: invalid digest size")

 	cfg: _blake2.Blake2_Config
 	cfg.size = u8(digest_size)
 	_blake2.init(ctx, &cfg)
 }

+// init_mac initializes a Context with a user provided key (at most DIGEST_SIZE bytes long).
+init_mac :: proc(ctx: ^Context, key: []byte, digest_size := DIGEST_SIZE) {
+	ensure(digest_size <= DIGEST_SIZE, "crypto/blake2s: invalid digest size")
+	ensure(len(key) <= DIGEST_SIZE, "crypto/blake2s: invalid key size")
+
+	cfg: _blake2.Blake2_Config
+	cfg.size = u8(digest_size)
+	cfg.key = key // Apart from the key length check, the only difference from init.
+	_blake2.init(ctx, &cfg)
+}
+
 // update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
 	_blake2.update(ctx, data)
@@ -43,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) {
 // final finalizes the Context, writes the digest to hash, and calls
 // reset on the Context.
 //
-// Iff finalize_clone is set, final will work on a copy of the Context,
+// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	_blake2.final(ctx, hash, finalize_clone)
--- a/core/crypto/chacha20poly1305/chacha20poly1305.odin
+++ b/core/crypto/chacha20poly1305/chacha20poly1305.odin
@@ -136,7 +136,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {

 // open authenticates the aad and ciphertext, and decrypts the ciphertext,
 // with the provided Context, iv, and tag, and stores the output in dst,
-// returning true iff the authentication was successful.  If authentication
+// returning true if and only if (⟺) the authentication was successful.  If authentication
 // fails, the destination buffer will be zeroed.
 //
 // dst and plaintext MUST alias exactly or not at all.
--- a/core/crypto/crypto.odin
+++ b/core/crypto/crypto.odin
@@ -8,15 +8,15 @@ import subtle "core:crypto/_subtle"
 // Omit large precomputed tables, trading off performance for size.
 COMPACT_IMPLS: bool : #config(ODIN_CRYPTO_COMPACT, false)

-// HAS_RAND_BYTES is true iff the runtime provides a cryptographic
+// HAS_RAND_BYTES is true if and only if (⟺) the runtime provides a cryptographic
 // entropy source.
 HAS_RAND_BYTES :: runtime.HAS_RAND_BYTES

-// compare_constant_time returns 1 iff a and b are equal, 0 otherwise.
+// compare_constant_time returns 1 if and only if (⟺) a and b are equal, 0 otherwise.
 //
 // The execution time of this routine is constant regardless of the contents
 // of the slices being compared, as long as the length of the slices is equal.
-// If the length of the two slices is different, it will early-return 0.
+// If the length of the two slices is different, it will early-return 0.
 compare_constant_time :: proc "contextless" (a, b: []byte) -> int {
 	// If the length of the slices is different, early return.
 	//
@@ -31,7 +31,7 @@ compare_constant_time :: proc "contextless" (a, b: []byte) -> int {
 	return compare_byte_ptrs_constant_time(raw_data(a), raw_data(b), n)
 }

-// compare_byte_ptrs_constant_time returns 1 iff the bytes pointed to by
+// compare_byte_ptrs_constant_time returns 1 if and only if (⟺) the bytes pointed to by
 // a and b are equal, 0 otherwise.
 //
 // The execution time of this routine is constant regardless of the
@@ -46,12 +46,12 @@ compare_byte_ptrs_constant_time :: proc "contextless" (a, b: ^byte, n: int) -> i
 		v |= x[i] ~ y[i]
 	}

-	// After the loop, v == 0 iff a == b.  The subtraction will underflow
-	// iff v == 0, setting the sign-bit, which gets returned.
+	// After the loop, v == 0 if and only if (⟺) a == b.  The subtraction will underflow
+	// if and only if (⟺) v == 0, setting the sign-bit, which gets returned.
 	return subtle.eq(0, v)
 }

-// is_zero_constant_time returns 1 iff b is all 0s, 0 otherwise.
+// is_zero_constant_time returns 1 if and only if (⟺) b is all 0s, 0 otherwise.
 is_zero_constant_time :: proc "contextless" (b: []byte) -> int {
 	v: byte
 	for b_ in b {
--- a/core/crypto/deoxysii/deoxysii.odin
+++ b/core/crypto/deoxysii/deoxysii.odin
@@ -122,7 +122,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {

 // open authenticates the aad and ciphertext, and decrypts the ciphertext,
 // with the provided Context, iv, and tag, and stores the output in dst,
-// returning true iff the authentication was successful.  If authentication
+// returning true if and only if (⟺) the authentication was successful.  If authentication
 // fails, the destination buffer will be zeroed.
 //
 // dst and plaintext MUST alias exactly or not at all.
--- a/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin
+++ b/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin
@@ -1,152 +1,183 @@
-#+build amd64
+#+build amd64,arm64,arm32
 package deoxysii

 import "base:intrinsics"
 import "core:crypto"
-import "core:crypto/aes"
+import aes_hw "core:crypto/_aes/hw"
 import "core:simd"
-import "core:simd/x86"

 // This processes a maximum of 4 blocks at a time, as that is suitable
 // for most current hardware that doesn't say "Xeon".
+//
+// TODO/perf: ARM should be able to do 8 at a time.
+
+when ODIN_ARCH == .amd64 { // Picks the string passed to enable_target_feature by the procedures in this file.
+	@(private="file")
+	TARGET_FEATURES :: "sse2,ssse3,aes"
+} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
+	@(private="file")
+	TARGET_FEATURES :: "neon,aes"
+} // No fallback branch: TARGET_FEATURES is left undeclared on any other architecture.

@(private = "file")
-_BIT_ENC :: x86.__m128i{0x80, 0}
+_BIT_ENC :: simd.u8x16{0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
@(private = "file")
-_PREFIX_AD_BLOCK :: x86.__m128i{PREFIX_AD_BLOCK << PREFIX_SHIFT, 0}
+_PREFIX_AD_BLOCK :: simd.u8x16{
+	PREFIX_AD_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+}
@(private = "file")
-_PREFIX_AD_FINAL :: x86.__m128i{PREFIX_AD_FINAL << PREFIX_SHIFT, 0}
+_PREFIX_AD_FINAL :: simd.u8x16{
+	PREFIX_AD_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+}
@(private = "file")
-_PREFIX_MSG_BLOCK :: x86.__m128i{PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0}
+_PREFIX_MSG_BLOCK :: simd.u8x16{
+	PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+}
@(private = "file")
-_PREFIX_MSG_FINAL :: x86.__m128i{PREFIX_MSG_FINAL << PREFIX_SHIFT, 0}
+_PREFIX_MSG_FINAL :: simd.u8x16{
+	PREFIX_MSG_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+}

-// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
+// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II
 // is supported.
 is_hardware_accelerated :: proc "contextless" () -> bool {
-	return aes.is_hardware_accelerated()
+	return aes_hw.is_supported()
 }

-@(private = "file", enable_target_feature = "sse4.1", require_results)
+@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
 auth_tweak :: #force_inline proc "contextless" (
-	prefix:   x86.__m128i,
+	prefix:   simd.u8x16,
 	block_nr: int,
-) -> x86.__m128i {
-	return x86._mm_insert_epi64(prefix, i64(intrinsics.byte_swap(u64(block_nr))), 1)
-}
+) -> simd.u8x16 {
+	when ODIN_ENDIAN == .Little {
+		block_nr_u64 := intrinsics.byte_swap(u64(block_nr))
+	} else {
+		block_nr_u64 := u64(block_nr)
+	}

-@(private = "file", enable_target_feature = "sse2", require_results)
-enc_tweak :: #force_inline proc "contextless" (
-	tag:      x86.__m128i,
-	block_nr: int,
-) -> x86.__m128i {
-	return x86._mm_xor_si128(
-		x86._mm_or_si128(tag, _BIT_ENC),
-		x86.__m128i{0, i64(intrinsics.byte_swap(u64(block_nr)))},
+	return simd.bit_or(
+		prefix,
+		transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}),
 	)
 }

-@(private = "file", enable_target_feature = "ssse3", require_results)
-h_ :: #force_inline proc "contextless" (tk1: x86.__m128i) -> x86.__m128i {
-	return transmute(x86.__m128i)h(transmute(simd.u8x16)tk1)
+@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
+enc_tweak :: #force_inline proc "contextless" (
+	tag:      simd.u8x16,
+	block_nr: int,
+) -> simd.u8x16 {
+	when ODIN_ENDIAN == .Little {
+		block_nr_u64 := intrinsics.byte_swap(u64(block_nr))
+	} else {
+		block_nr_u64 := u64(block_nr)
+	}
+
+	return simd.bit_xor(
+		simd.bit_or(tag, _BIT_ENC),
+		transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}),
+	)
 }

-@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
+@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
 bc_x4 :: #force_inline proc "contextless" (
 	ctx: ^Context,
-	s_0, s_1, s_2, s_3:                 x86.__m128i,
-	tweak_0, tweak_1, tweak_2, tweak_3: x86.__m128i,
-) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) #no_bounds_check {
+	s_0, s_1, s_2, s_3:                 simd.u8x16,
+	tweak_0, tweak_1, tweak_2, tweak_3: simd.u8x16,
+) -> (simd.u8x16, simd.u8x16, simd.u8x16, simd.u8x16) #no_bounds_check {
 	s_0, s_1, s_2, s_3 := s_0, s_1, s_2, s_3
 	tk1_0, tk1_1, tk1_2, tk1_3 := tweak_0, tweak_1, tweak_2, tweak_3

-	sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
-	stk_0 := x86._mm_xor_si128(tk1_0, sk)
-	stk_1 := x86._mm_xor_si128(tk1_1, sk)
-	stk_2 := x86._mm_xor_si128(tk1_2, sk)
-	stk_3 := x86._mm_xor_si128(tk1_3, sk)
+	sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0]))
+	stk_0 := simd.bit_xor(tk1_0, sk)
+	stk_1 := simd.bit_xor(tk1_1, sk)
+	stk_2 := simd.bit_xor(tk1_2, sk)
+	stk_3 := simd.bit_xor(tk1_3, sk)

-	s_0 = x86._mm_xor_si128(s_0, stk_0)
-	s_1 = x86._mm_xor_si128(s_1, stk_1)
-	s_2 = x86._mm_xor_si128(s_2, stk_2)
-	s_3 = x86._mm_xor_si128(s_3, stk_3)
+	s_0 = simd.bit_xor(s_0, stk_0)
+	s_1 = simd.bit_xor(s_1, stk_1)
+	s_2 = simd.bit_xor(s_2, stk_2)
+	s_3 = simd.bit_xor(s_3, stk_3)

 	for i in 1 ..= BC_ROUNDS {
-		sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
+		sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i]))

-		tk1_0 = h_(tk1_0)
-		tk1_1 = h_(tk1_1)
-		tk1_2 = h_(tk1_2)
-		tk1_3 = h_(tk1_3)
+		tk1_0 = h(tk1_0)
+		tk1_1 = h(tk1_1)
+		tk1_2 = h(tk1_2)
+		tk1_3 = h(tk1_3)

-		stk_0 = x86._mm_xor_si128(tk1_0, sk)
-		stk_1 = x86._mm_xor_si128(tk1_1, sk)
-		stk_2 = x86._mm_xor_si128(tk1_2, sk)
-		stk_3 = x86._mm_xor_si128(tk1_3, sk)
+		stk_0 = simd.bit_xor(tk1_0, sk)
+		stk_1 = simd.bit_xor(tk1_1, sk)
+		stk_2 = simd.bit_xor(tk1_2, sk)
+		stk_3 = simd.bit_xor(tk1_3, sk)

-		s_0 = x86._mm_aesenc_si128(s_0, stk_0)
-		s_1 = x86._mm_aesenc_si128(s_1, stk_1)
-		s_2 = x86._mm_aesenc_si128(s_2, stk_2)
-		s_3 = x86._mm_aesenc_si128(s_3, stk_3)
+		s_0 = aes_hw.aesenc(s_0, stk_0)
+		s_1 = aes_hw.aesenc(s_1, stk_1)
+		s_2 = aes_hw.aesenc(s_2, stk_2)
+		s_3 = aes_hw.aesenc(s_3, stk_3)
 	}

 	return s_0, s_1, s_2, s_3
 }

-@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
+@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
 bc_x1 :: #force_inline proc "contextless" (
 	ctx:   ^Context,
-	s:     x86.__m128i,
-	tweak: x86.__m128i,
-) -> x86.__m128i #no_bounds_check {
+	s:     simd.u8x16,
+	tweak: simd.u8x16,
+) -> simd.u8x16 #no_bounds_check {
 	s, tk1 := s, tweak

-	sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
-	stk := x86._mm_xor_si128(tk1, sk)
+	sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0]))
+	stk := simd.bit_xor(tk1, sk)

-	s = x86._mm_xor_si128(s, stk)
+	s = simd.bit_xor(s, stk)

 	for i in 1 ..= BC_ROUNDS {
-		sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
+		sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i]))

-		tk1 = h_(tk1)
+		tk1 = h(tk1)

-		stk = x86._mm_xor_si128(tk1, sk)
+		stk = simd.bit_xor(tk1, sk)

-		s = x86._mm_aesenc_si128(s, stk)
+		s = aes_hw.aesenc(s, stk)
 	}

 	return s
 }

-@(private = "file", enable_target_feature = "sse2,ssse3,sse4.1,aes", require_results)
+@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
 bc_absorb :: proc "contextless" (
 	ctx:          ^Context,
-	tag:          x86.__m128i,
+	tag:          simd.u8x16,
 	src:          []byte,
-	tweak_prefix: x86.__m128i,
+	tweak_prefix: simd.u8x16,
 	stk_block_nr: int,
-) -> (x86.__m128i, int) #no_bounds_check {
+) -> (simd.u8x16, int) #no_bounds_check {
 	src, stk_block_nr, tag := src, stk_block_nr, tag

 	nr_blocks := len(src) / BLOCK_SIZE
 	for nr_blocks >= 4 {
 		d_0, d_1, d_2, d_3 := bc_x4(
 			ctx,
-			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
-			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
-			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
-			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
+			intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
+			intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))),
+			intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))),
+			intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))),
 			auth_tweak(tweak_prefix, stk_block_nr),
 			auth_tweak(tweak_prefix, stk_block_nr + 1),
 			auth_tweak(tweak_prefix, stk_block_nr + 2),
 			auth_tweak(tweak_prefix, stk_block_nr + 3),
 		)

-		tag = x86._mm_xor_si128(tag, d_0)
-		tag = x86._mm_xor_si128(tag, d_1)
-		tag = x86._mm_xor_si128(tag, d_2)
-		tag = x86._mm_xor_si128(tag, d_3)
+		tag = simd.bit_xor(tag, d_0)
+		tag = simd.bit_xor(tag, d_1)
+		tag = simd.bit_xor(tag, d_2)
+		tag = simd.bit_xor(tag, d_3)

 		src = src[4*BLOCK_SIZE:]
 		stk_block_nr += 4
@@ -156,11 +187,11 @@ bc_absorb :: proc "contextless" (
 	for nr_blocks > 0 {
 		d := bc_x1(
 			ctx,
-			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
+			intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
 			auth_tweak(tweak_prefix, stk_block_nr),
 		)

-		tag = x86._mm_xor_si128(tag, d)
+		tag = simd.bit_xor(tag, d)

 		src = src[BLOCK_SIZE:]
 		stk_block_nr += 1
@@ -170,29 +201,29 @@ bc_absorb :: proc "contextless" (
 	return tag, stk_block_nr
 }

-@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
+@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
 bc_final :: proc "contextless" (
 	ctx: ^Context,
-	tag: x86.__m128i,
+	tag: simd.u8x16,
 	iv:  []byte,
-) -> x86.__m128i {
+) -> simd.u8x16 {
 	tmp: [BLOCK_SIZE]byte

 	tmp[0] = PREFIX_TAG << PREFIX_SHIFT
 	copy(tmp[1:], iv)

-	tweak := intrinsics.unaligned_load((^x86.__m128i)(&tmp))
+	tweak := intrinsics.unaligned_load((^simd.u8x16)(&tmp))

 	return bc_x1(ctx, tag, tweak)
 }

-@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
+@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
 bc_encrypt :: proc "contextless" (
 	ctx:          ^Context,
 	dst:          []byte,
 	src:          []byte,
-	iv:           x86.__m128i,
-	tweak_tag:    x86.__m128i,
+	iv:           simd.u8x16,
+	tweak_tag:    simd.u8x16,
 	stk_block_nr: int,
 ) -> int {
 	dst, src, stk_block_nr := dst, src, stk_block_nr
@@ -209,31 +240,31 @@ bc_encrypt :: proc "contextless" (
 		)

 		intrinsics.unaligned_store(
-			(^x86.__m128i)(raw_data(dst)),
-			x86._mm_xor_si128(
+			(^simd.u8x16)(raw_data(dst)),
+			simd.bit_xor(
 				d_0,
-				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
+				intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
 			),
 		)
 		intrinsics.unaligned_store(
-			(^x86.__m128i)(raw_data(dst[BLOCK_SIZE:])),
-			x86._mm_xor_si128(
+			(^simd.u8x16)(raw_data(dst[BLOCK_SIZE:])),
+			simd.bit_xor(
 				d_1,
-				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
+				intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))),
 			),
 		)
 		intrinsics.unaligned_store(
-			(^x86.__m128i)(raw_data(dst[2*BLOCK_SIZE:])),
-			x86._mm_xor_si128(
+			(^simd.u8x16)(raw_data(dst[2*BLOCK_SIZE:])),
+			simd.bit_xor(
 				d_2,
-				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
+				intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))),
 			),
 		)
 		intrinsics.unaligned_store(
-			(^x86.__m128i)(raw_data(dst[3*BLOCK_SIZE:])),
-			x86._mm_xor_si128(
+			(^simd.u8x16)(raw_data(dst[3*BLOCK_SIZE:])),
+			simd.bit_xor(
 				d_3,
-				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
+				intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))),
 			),
 		)

@@ -250,10 +281,10 @@ bc_encrypt :: proc "contextless" (
 		)

 		intrinsics.unaligned_store(
-			(^x86.__m128i)(raw_data(dst)),
-			x86._mm_xor_si128(
+			(^simd.u8x16)(raw_data(dst)),
+			simd.bit_xor(
 				d,
-				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
+				intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
 			),
 		)

@@ -269,7 +300,7 @@ bc_encrypt :: proc "contextless" (
 e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check {
 	tmp: [BLOCK_SIZE]byte
 	copy(tmp[1:], iv)
-	iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
+	iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp)))

 	// Algorithm 3
 	//
@@ -282,7 +313,7 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte)
 	// if A_∗ != nil then
 	//   Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗))
 	// end
-	auth: x86.__m128i
+	auth: simd.u8x16
 	n: int

 	aad := aad
@@ -341,14 +372,14 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte)
 		copy(dst[n*BLOCK_SIZE:], m_star[:])
 	}

-	intrinsics.unaligned_store((^x86.__m128i)(raw_data(tag)), auth)
+	intrinsics.unaligned_store((^simd.u8x16)(raw_data(tag)), auth)
 }

@(private, require_results)
 d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
 	tmp: [BLOCK_SIZE]byte
 	copy(tmp[1:], iv)
-	iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
+	iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp)))

 	// Algorithm 4
 	//
@@ -360,7 +391,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
 	// if C_∗ != nil then
 	//   M_∗ <- C_∗ ^ EK(1 || tag ^ l, 0^8 || N)
 	// end
-	auth := intrinsics.unaligned_load((^x86.__m128i)(raw_data(tag)))
+	auth := intrinsics.unaligned_load((^simd.u8x16)(raw_data(tag)))

 	m := ciphertext
 	n := bc_encrypt(ctx, dst, m, iv_, auth, 0)
@@ -385,7 +416,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
 	// if A∗ != nil then
 	//   Auth <- Auth ^ EK(0110| | l_a, pad10∗(A_∗))
 	// end
-	auth = x86.__m128i{0, 0}
+	auth = simd.u8x16{}
 	aad := aad
 	auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0)
 	aad = aad[BLOCK_SIZE*n:]
@@ -424,7 +455,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
 	// Tag verification
 	// if tag0 = tag then return (M_1 || ... || M_l || M_∗)
 	// else return false
-	intrinsics.unaligned_store((^x86.__m128i)(raw_data(&tmp)), auth)
+	intrinsics.unaligned_store((^simd.u8x16)(raw_data(&tmp)), auth)
 	ok := crypto.compare_constant_time(tmp[:], tag) == 1

 	crypto.zero_explicit(&tmp, size_of(tmp))
--- a/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin
+++ b/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin
@@ -1,10 +1,12 @@
 #+build !amd64
+#+build !arm64
+#+build !arm32
 package deoxysii

@(private = "file")
 ERR_HW_NOT_SUPPORTED :: "crypto/deoxysii: hardware implementation unsupported"

-// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
+// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II
 // is supported.
 is_hardware_accelerated :: proc "contextless" () -> bool {
 	return false
--- a/core/crypto/ecdh/ecdh.odin
+++ b/core/crypto/ecdh/ecdh.odin
@@ -104,7 +104,7 @@ Public_Key :: struct {
 }

 // private_key_generate uses the system entropy source to generate a new
-// Private_Key.  This will only fail iff the system entropy source is
+// Private_Key.  This will fail if and only if (⟺) the system entropy source is
 // missing or broken.
 private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
 	private_key_clear(priv_key)
@@ -142,7 +142,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
 }

 // private_key_set_bytes decodes a byte-encoded private key, and returns
-// true iff the operation was successful.
+// true if and only if (⟺) the operation was successful.
 private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool {
 	private_key_clear(priv_key)

@@ -245,7 +245,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
 	}
 }

-// private_key_equal returns true iff the private keys are equal,
+// private_key_equal returns true if and only if (⟺) the private keys are equal,
 // in constant time.
 private_key_equal :: proc(p, q: ^Private_Key) -> bool {
 	if p._curve != q._curve {
@@ -276,7 +276,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {
 }

 // public_key_set_bytes decodes a byte-encoded public key, and returns
-// true iff the operation was successful.
+// true if and only if (⟺) the operation was successful.
 public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool {
 	public_key_clear(pub_key)

@@ -365,7 +365,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
 	}
 }

-// public_key_equal returns true iff the public keys are equal,
+// public_key_equal returns true if and only if (⟺) the public keys are equal,
 // in constant time.
 public_key_equal :: proc(p, q: ^Public_Key) -> bool {
 	if p._curve != q._curve {
--- a/core/crypto/ecdsa/ecdsa.odin
+++ b/core/crypto/ecdsa/ecdsa.odin
@@ -79,7 +79,7 @@ Public_Key :: struct {
 }

 // private_key_generate uses the system entropy source to generate a new
-// Private_Key.  This will only fail iff the system entropy source is
+// Private_Key.  This will fail if and only if (⟺) the system entropy source is
 // missing or broken.
 private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
 	private_key_clear(priv_key)
@@ -111,7 +111,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
 }

 // private_key_set_bytes decodes a byte-encoded private key, and returns
-// true iff the operation was successful.
+// true if and only if (⟺) the operation was successful.
 private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool {
 	private_key_clear(priv_key)

@@ -194,7 +194,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
 	}
 }

-// private_key_equal returns true iff the private keys are equal,
+// private_key_equal returns true if and only if (⟺) the private keys are equal,
 // in constant time.
 private_key_equal :: proc(p, q: ^Private_Key) -> bool {
 	if p._curve != q._curve {
@@ -219,7 +219,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {
 }

 // public_key_set_bytes decodes a byte-encoded public key, and returns
-// true iff the operation was successful.
+// true if and only if (⟺) the operation was successful.
 public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool {
 	public_key_clear(pub_key)

@@ -296,7 +296,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
 	}
 }

-// public_key_equal returns true iff the public keys are equal,
+// public_key_equal returns true if and only if (⟺) the public keys are equal,
 // in constant time.
 public_key_equal :: proc(p, q: ^Public_Key) -> bool {
 	if p._curve != q._curve {
--- a/core/crypto/ecdsa/ecdsa_asn1.odin
+++ b/core/crypto/ecdsa/ecdsa_asn1.odin
@@ -141,7 +141,7 @@ parse_asn1_sig :: proc(sig: []byte) -> (r, s: []byte, ok: bool) {
 		return nil, nil, false
 	}

-	// DER requires a leading 0 iff the sign bit of the leading byte
+	// DER requires a leading 0 if and only if (⟺) the sign bit of the leading byte
 	// is set to distinguish between positive and negative integers,
 	// and the minimal length representation.  `r` and `s` are always
 	// going to be unsigned, so we validate malformed DER and strip
--- a/core/crypto/ecdsa/ecdsa_verify.odin
+++ b/core/crypto/ecdsa/ecdsa_verify.odin
@@ -3,7 +3,7 @@ package ecdsa
 import "core:crypto/hash"
 import secec "core:crypto/_weierstrass"

-// verify_raw returns true iff sig is a valid signature by pub_key over
+// verify_raw returns true if and only if (⟺) sig is a valid signature by pub_key over
 // msg, hased using hash_algo, per the verification procedure specifed
 // in SEC 1, Version 2.0, Section 4.1.4.
 //
@@ -33,7 +33,7 @@ verify_raw :: proc(pub_key: ^Public_Key, hash_algo: hash.Algorithm, msg, sig: []
 	panic("crypto/ecdsa: invalid curve")
 }

-// verify_asn1 returns true iff sig is a valid signature by pub_key over
+// verify_asn1 returns true if and only if (⟺) sig is a valid signature by pub_key over
 // msg, hased using hash_algo, per the verification procedure specifed
 // in SEC 1, Version 2.0, Section 4.1.4.
 //
--- a/core/crypto/ed25519/ed25519.odin
+++ b/core/crypto/ed25519/ed25519.odin
@@ -48,7 +48,7 @@ Public_Key :: struct {
 }

 // private_key_generate uses the system entropy source to generate a new
-// Private_Key.  This will only fail iff the system entropy source is
+// Private_Key.  This will fail if and only if (⟺) the system entropy source is
 // missing or broken.
 private_key_generate :: proc(priv_key: ^Private_Key) -> bool {
 	private_key_clear(priv_key)
@@ -67,7 +67,7 @@ private_key_generate :: proc(priv_key: ^Private_Key) -> bool {
 }

 // private_key_set_bytes decodes a byte-encoded private key, and returns
-// true iff the operation was successful.
+// true if and only if (⟺) the operation was successful.
 private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool {
 	if len(b) != PRIVATE_KEY_SIZE {
 		return false
@@ -167,7 +167,7 @@ sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) {
 }

 // public_key_set_bytes decodes a byte-encoded public key, and returns
-// true iff the operation was successful.
+// true if and only if (⟺) the operation was successful.
 public_key_set_bytes :: proc "contextless" (pub_key: ^Public_Key, b: []byte) -> bool {
 	if len(b) != PUBLIC_KEY_SIZE {
 		return false
@@ -205,14 +205,14 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
 	copy(dst, pub_key._b[:])
 }

-// public_key_equal returns true iff pub_key is equal to other.
+// public_key_equal returns true if and only if (⟺) pub_key is equal to other.
 public_key_equal :: proc(pub_key, other: ^Public_Key) -> bool {
 	ensure(pub_key._is_initialized && other._is_initialized, "crypto/ed25519: uninitialized public key")

 	return crypto.compare_constant_time(pub_key._b[:], other._b[:]) == 1
 }

-// verify returns true iff sig is a valid signature by pub_key over msg.
+// verify returns true if and only if (⟺) sig is a valid signature by pub_key over msg.
 //
 // The optional `allow_small_order_A` parameter will make this
 // implementation strictly compatible with FIPS 186-5, at the expense of
--- a/core/crypto/hash/low_level.odin
+++ b/core/crypto/hash/low_level.odin
@@ -235,7 +235,7 @@ update :: proc(ctx: ^Context, data: []byte) {
 // final finalizes the Context, writes the digest to hash, and calls
 // reset on the Context.
 //
-// Iff finalize_clone is set, final will work on a copy of the Context,
+// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	switch &impl in ctx._impl {
--- a/core/crypto/hmac/hmac.odin
+++ b/core/crypto/hmac/hmac.odin
@@ -21,7 +21,7 @@ sum :: proc(algorithm: hash.Algorithm, dst, msg, key: []byte) {
 }

 // verify will verify the HMAC tag computed with the specified algorithm
-// and key over msg and return true iff the tag is valid.  It requires
+// and key over msg and return true if and only if (⟺) the tag is valid.  It requires
 // that the tag is correctly sized.
 verify :: proc(algorithm: hash.Algorithm, tag, msg, key: []byte) -> bool {
 	tag_buf: [hash.MAX_DIGEST_SIZE]byte
--- a/core/crypto/kmac/kmac.odin
+++ b/core/crypto/kmac/kmac.odin
@@ -32,7 +32,7 @@ sum :: proc(sec_strength: int, dst, msg, key, domain_sep: []byte) {
 }

 // verify will verify the KMAC tag computed with the specified security
-// strength, key and domain separator over msg and return true iff the
+// strength, key and domain separator over msg and return true if and only if (⟺) the
 // tag is valid.
 verify :: proc(sec_strength: int, tag, msg, key, domain_sep: []byte, allocator := context.temp_allocator) -> bool {
 	derived_tag := make([]byte, len(tag), allocator)
--- a/core/crypto/legacy/keccak/keccak.odin
+++ b/core/crypto/legacy/keccak/keccak.odin
@@ -77,7 +77,7 @@ update :: proc "contextless" (ctx: ^Context, data: []byte) {
 // final finalizes the Context, writes the digest to hash, and calls
 // reset on the Context.
 //
-// Iff finalize_clone is set, final will work on a copy of the Context,
+// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc "contextless" (ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	_sha3.final((^_sha3.Context)(ctx), hash, finalize_clone)
--- a/core/crypto/legacy/md5/md5.odin
+++ b/core/crypto/legacy/md5/md5.odin
@@ -69,7 +69,7 @@ update :: proc(ctx: ^Context, data: []byte) {
 // final finalizes the Context, writes the digest to hash, and calls
 // reset on the Context.
 //
-// Iff finalize_clone is set, final will work on a copy of the Context,
+// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	ensure(ctx.is_initialized)
--- a/core/crypto/legacy/sha1/sha1.odin
+++ b/core/crypto/legacy/sha1/sha1.odin
@@ -76,7 +76,7 @@ update :: proc(ctx: ^Context, data: []byte) {
 // final finalizes the Context, writes the digest to hash, and calls
 // reset on the Context.
 //
-// Iff finalize_clone is set, final will work on a copy of the Context,
+// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	ensure(ctx.is_initialized)
--- a/core/crypto/pbkdf2/pbkdf2.odin
+++ b/core/crypto/pbkdf2/pbkdf2.odin
@@ -66,7 +66,7 @@ derive :: proc(
 		dst_blk = dst_blk[h_len:]
 	}

-	// Instead of rounding l up, just proceass the one extra block iff
+	// Instead of rounding l up, just process the one extra block if and only if (⟺)
 	// r != 0.
 	if r > 0 {
 		tmp: [hash.MAX_DIGEST_SIZE]byte
--- a/core/crypto/poly1305/poly1305.odin
+++ b/core/crypto/poly1305/poly1305.odin
@@ -33,7 +33,7 @@ sum :: proc(dst, msg, key: []byte) {
 }

 // verify will verify the Poly1305 tag computed with the key over msg and
-// return true iff the tag is valid.  It requires that the tag is correctly
+// return true if and only if (⟺) the tag is valid.  It requires that the tag is correctly
 // sized.
 verify :: proc(tag, msg, key: []byte) -> bool {
 	ctx: Context = ---
--- a/core/crypto/ristretto255/ristretto255.odin
+++ b/core/crypto/ristretto255/ristretto255.odin
@@ -360,7 +360,7 @@ ge_double_scalarmult_generator_vartime :: proc(
 	ge._is_initialized = true
 }

-// ge_cond_negate sets `ge = a` iff `ctrl == 0` and `ge = -a` iff `ctrl == 1`.
+// ge_cond_negate sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = -a` if and only if (⟺) `ctrl == 1`.
 // Behavior for all other values of ctrl are undefined,
 ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) {
 	_ge_ensure_initialized([]^Group_Element{a})
@@ -369,7 +369,7 @@ ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) {
 	ge._is_initialized = true
 }

-// ge_cond_assign sets `ge = ge` iff `ctrl == 0` and `ge = a` iff `ctrl == 1`.
+// ge_cond_assign sets `ge = ge` if and only if (⟺) `ctrl == 0` and `ge = a` if and only if (⟺) `ctrl == 1`.
 // Behavior for all other values of ctrl are undefined,
 ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) {
 	_ge_ensure_initialized([]^Group_Element{ge, a})
@@ -377,7 +377,7 @@ ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) {
 	grp.ge_cond_assign(&ge._p, &a._p, ctrl)
 }

-// ge_cond_select sets `ge = a` iff `ctrl == 0` and `ge = b` iff `ctrl == 1`.
+// ge_cond_select sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = b` if and only if (⟺) `ctrl == 1`.
 // Behavior for all other values of ctrl are undefined,
 ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) {
 	_ge_ensure_initialized([]^Group_Element{a, b})
@@ -386,7 +386,7 @@ ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) {
 	ge._is_initialized = true
 }

-// ge_equal returns 1 iff `a == b`, and 0 otherwise.
+// ge_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise.
@(require_results)
 ge_equal :: proc(a, b: ^Group_Element) -> int {
 	_ge_ensure_initialized([]^Group_Element{a, b})
@@ -405,7 +405,7 @@ ge_equal :: proc(a, b: ^Group_Element) -> int {
 	return ret
 }

-// ge_is_identity returns 1 iff `ge` is the identity element, and 0 otherwise.
+// ge_is_identity returns 1 if and only if (⟺) `ge` is the identity element, and 0 otherwise.
@(require_results)
 ge_is_identity :: proc(ge: ^Group_Element) -> int {
 	return ge_equal(ge, &GE_IDENTITY)
--- a/core/crypto/ristretto255/ristretto255_scalar.odin
+++ b/core/crypto/ristretto255/ristretto255_scalar.odin
@@ -80,13 +80,13 @@ sc_square :: proc "contextless" (sc, a: ^Scalar) {
 	grp.sc_square(sc, a)
 }

-// sc_cond_assign sets `sc = sc` iff `ctrl == 0` and `sc = a` iff `ctrl == 1`.
+// sc_cond_assign sets `sc = sc` if and only if (⟺) `ctrl == 0` and `sc = a` if and only if (⟺) `ctrl == 1`.
 // Behavior for all other values of ctrl are undefined,
 sc_cond_assign :: proc(sc, a: ^Scalar, ctrl: int) {
 	grp.sc_cond_assign(sc, a, ctrl)
 }

-// sc_equal returns 1 iff `a == b`, and 0 otherwise.
+// sc_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise.
@(require_results)
 sc_equal :: proc(a, b: ^Scalar) -> int {
 	return grp.sc_equal(a, b)
--- a/core/crypto/sha2/sha2.odin
+++ b/core/crypto/sha2/sha2.odin
@@ -44,7 +44,8 @@ Context_256 :: struct {
 	length:    u64,
 	md_bits:   int,

-	is_initialized: bool,
+	is_hw_accelerated: bool,
+	is_initialized:    bool,
 }

 // Context_512 is a SHA-384, SHA-512 or SHA-512/256 instance.
@@ -55,7 +56,8 @@ Context_512 :: struct {
 	length:    u64,
 	md_bits:   int,

-	is_initialized: bool,
+	is_hw_accelerated: bool,
+	is_initialized:    bool,
 }

 // init_224 initializes a Context_256 for SHA-224.
@@ -88,6 +90,9 @@ init_512_256 :: proc(ctx: ^Context_512) {
 	_init(ctx)
 }

+@(private)
+ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported"
+
@(private)
 _init :: proc(ctx: ^$T) {
 	when T == Context_256 {
@@ -113,6 +118,8 @@ _init :: proc(ctx: ^$T) {
 		case:
 			panic("crypto/sha2: invalid digest output length")
 		}
+
+		ctx.is_hw_accelerated = is_hardware_accelerated_256()
 	} else when T == Context_512 {
 		switch ctx.md_bits {
 		case 256:
@@ -148,6 +155,8 @@ _init :: proc(ctx: ^$T) {
 		case:
 			panic("crypto/sha2: invalid digest output length")
 		}
+
+		ctx.is_hw_accelerated = is_hardware_accelerated_512()
 	}

 	ctx.length = 0
@@ -191,7 +200,7 @@ update :: proc(ctx: ^$T, data: []byte) {
 // final finalizes the Context, writes the digest to hash, and calls
 // reset on the Context.
 //
-// Iff finalize_clone is set, final will work on a copy of the Context,
+// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) {
 	ensure(ctx.is_initialized)
@@ -267,7 +276,7 @@ reset :: proc(ctx: ^$T) {
    SHA2 implementation
 */

-@(private, rodata)
+@(private = "file", rodata)
 SHA256_K := [64]u32 {
 	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
@@ -287,7 +296,7 @@ SHA256_K := [64]u32 {
 	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 }

-@(private, rodata)
+@(private = "file", rodata)
 SHA512_K := [80]u64 {
 	0x428a2f98d728ae22, 0x7137449123ef65cd,
 	0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
@@ -336,70 +345,70 @@ SHA256_ROUNDS :: 64
@(private)
 SHA512_ROUNDS :: 80

-@(private)
+@(private = "file")
 SHA256_CH :: #force_inline proc "contextless" (x, y, z: u32) -> u32 {
 	return (x & y) ~ (~x & z)
 }

-@(private)
+@(private = "file")
 SHA256_MAJ :: #force_inline proc "contextless" (x, y, z: u32) -> u32 {
 	return (x & y) ~ (x & z) ~ (y & z)
 }

-@(private)
+@(private = "file")
 SHA512_CH :: #force_inline proc "contextless" (x, y, z: u64) -> u64 {
 	return (x & y) ~ (~x & z)
 }

-@(private)
+@(private = "file")
 SHA512_MAJ :: #force_inline proc "contextless" (x, y, z: u64) -> u64 {
 	return (x & y) ~ (x & z) ~ (y & z)
 }

-@(private)
+@(private = "file")
 SHA256_F1 :: #force_inline proc "contextless" (x: u32) -> u32 {
 	return bits.rotate_left32(x, 30) ~ bits.rotate_left32(x, 19) ~ bits.rotate_left32(x, 10)
 }

-@(private)
+@(private = "file")
 SHA256_F2 :: #force_inline proc "contextless" (x: u32) -> u32 {
 	return bits.rotate_left32(x, 26) ~ bits.rotate_left32(x, 21) ~ bits.rotate_left32(x, 7)
 }

-@(private)
+@(private = "file")
 SHA256_F3 :: #force_inline proc "contextless" (x: u32) -> u32 {
 	return bits.rotate_left32(x, 25) ~ bits.rotate_left32(x, 14) ~ (x >> 3)
 }

-@(private)
+@(private = "file")
 SHA256_F4 :: #force_inline proc "contextless" (x: u32) -> u32 {
 	return bits.rotate_left32(x, 15) ~ bits.rotate_left32(x, 13) ~ (x >> 10)
 }

-@(private)
+@(private = "file")
 SHA512_F1 :: #force_inline proc "contextless" (x: u64) -> u64 {
 	return bits.rotate_left64(x, 36) ~ bits.rotate_left64(x, 30) ~ bits.rotate_left64(x, 25)
 }

-@(private)
+@(private = "file")
 SHA512_F2 :: #force_inline proc "contextless" (x: u64) -> u64 {
 	return bits.rotate_left64(x, 50) ~ bits.rotate_left64(x, 46) ~ bits.rotate_left64(x, 23)
 }

-@(private)
+@(private = "file")
 SHA512_F3 :: #force_inline proc "contextless" (x: u64) -> u64 {
 	return bits.rotate_left64(x, 63) ~ bits.rotate_left64(x, 56) ~ (x >> 7)
 }

-@(private)
+@(private = "file")
 SHA512_F4 :: #force_inline proc "contextless" (x: u64) -> u64 {
 	return bits.rotate_left64(x, 45) ~ bits.rotate_left64(x, 3) ~ (x >> 6)
 }

-@(private)
+@(private = "file")
 sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) #no_bounds_check {
 	when T == Context_256 {
-		if is_hardware_accelerated_256() {
+		if ctx.is_hw_accelerated {
 			sha256_transf_hw(ctx, data)
 			return
 		}
@@ -410,6 +419,11 @@ sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) #no_bounds_check {

 		CURR_BLOCK_SIZE :: BLOCK_SIZE_256
 	} else when T == Context_512 {
+		if ctx.is_hw_accelerated {
+			sha512_transf_hw(ctx, data)
+			return
+		}
+
 		w: [SHA512_ROUNDS]u64
 		wv: [8]u64
 		t1, t2: u64
--- a/core/crypto/sha2/sha256_impl_hw_arm.odin
+++ b/core/crypto/sha2/sha256_impl_hw_arm.odin
@@ -0,0 +1,224 @@
+#+build arm64,arm32
+package sha2
+
+// Based on the public domain code by Jeffrey Walton, though
+// realistically, there is only one sensible way to write this.
+//
+// See: https://github.com/noloader/SHA-Intrinsics
+
+import "base:intrinsics"
+import "core:simd"
+import "core:simd/arm"
+import "core:sys/info"
+
+// is_hardware_accelerated_256 returns true if and only if (⟺) hardware
+// accelerated SHA-224/SHA-256 is supported.
+is_hardware_accelerated_256 :: proc "contextless" () -> bool {
+	req_features :: info.CPU_Features{
+		.asimd,
+		.sha256,
+	}
+	return info.cpu_features() >= req_features
+}
+
+@(private = "file")
+K_0 :: simd.u32x4{0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5}
+@(private = "file")
+K_1 :: simd.u32x4{0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5}
+@(private = "file")
+K_2 :: simd.u32x4{0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3}
+@(private = "file")
+K_3 :: simd.u32x4{0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174}
+@(private = "file")
+K_4 :: simd.u32x4{0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC}
+@(private = "file")
+K_5 :: simd.u32x4{0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA}
+@(private = "file")
+K_6 :: simd.u32x4{0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7}
+@(private = "file")
+K_7 :: simd.u32x4{0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967}
+@(private = "file")
+K_8 :: simd.u32x4{0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13}
+@(private = "file")
+K_9 :: simd.u32x4{0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85}
+@(private = "file")
+K_10 :: simd.u32x4{0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3}
+@(private = "file")
+K_11 :: simd.u32x4{0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070}
+@(private = "file")
+K_12 :: simd.u32x4{0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5}
+@(private = "file")
+K_13 :: simd.u32x4{0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3}
+@(private = "file")
+K_14 :: simd.u32x4{0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208}
+@(private = "file")
+K_15 :: simd.u32x4{0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2}
+
+@(private, enable_target_feature = "neon,sha2")
+sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check {
+	state_0 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[0]))
+	state_1 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[4]))
+
+	data := data
+	for len(data) >= BLOCK_SIZE_256 {
+		// Save state: state_0 carries h[0..3], state_1 carries h[4..7]
+		abef_save, cdgh_save := state_0, state_1
+
+		// Load message
+		msg_0 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data)))
+		msg_1 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[16:])))
+		msg_2 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[32:])))
+		msg_3 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[48:])))
+
+		// Reverse for little endian
+		when ODIN_ENDIAN == .Little {
+			msg_0 = byteswap_u32x4(msg_0)
+			msg_1 = byteswap_u32x4(msg_1)
+			msg_2 = byteswap_u32x4(msg_2)
+			msg_3 = byteswap_u32x4(msg_3)
+		}
+
+		tmp_0 := simd.add(msg_0, K_0)
+
+		// Rounds 0-3
+		msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
+		tmp_2 := state_0
+		tmp_1 := simd.add(msg_1, K_1)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
+		msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
+
+		// Rounds 4-7
+		msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
+		tmp_2 = state_0
+		tmp_0 = simd.add(msg_2, K_2)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
+		msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
+
+		// Rounds 8-11
+		msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
+		tmp_2 = state_0
+		tmp_1 = simd.add(msg_3, K_3)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
+		msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
+
+		// Rounds 12-15
+		msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
+		tmp_2 = state_0
+		tmp_0 = simd.add(msg_0, K_4)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
+		msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
+
+		// Rounds 16-19
+		msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
+		tmp_2 = state_0
+		tmp_1 = simd.add(msg_1, K_5)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
+		msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
+
+		// Rounds 20-23
+		msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
+		tmp_2 = state_0
+		tmp_0 = simd.add(msg_2, K_6)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
+		msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
+
+		// Rounds 24-27
+		msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
+		tmp_2 = state_0
+		tmp_1 = simd.add(msg_3, K_7)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
+		msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
+
+		// Rounds 28-31
+		msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
+		tmp_2 = state_0
+		tmp_0 = simd.add(msg_0, K_8)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
+		msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
+
+		// Rounds 32-35
+		msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
+		tmp_2 = state_0
+		tmp_1 = simd.add(msg_1, K_9)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
+		msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
+
+		// Rounds 36-39
+		msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
+		tmp_2 = state_0
+		tmp_0 = simd.add(msg_2, K_10)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
+		msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
+
+		// Rounds 40-43
+		msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
+		tmp_2 = state_0
+		tmp_1 = simd.add(msg_3, K_11)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
+		msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
+
+		// Rounds 44-47
+		msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
+		tmp_2 = state_0
+		tmp_0 = simd.add(msg_0, K_12)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
+		msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
+
+		// Rounds 48-51
+		tmp_2 = state_0
+		tmp_1 = simd.add(msg_1, K_13)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
+
+		// Rounds 52-55
+		tmp_2 = state_0
+		tmp_0 = simd.add(msg_2, K_14)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
+
+		// Rounds 56-59
+		tmp_2 = state_0
+		tmp_1 = simd.add(msg_3, K_15)
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
+
+		// Rounds 60-63
+		tmp_2 = state_0
+		state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
+		state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
+
+		// Combine state
+		state_0 = simd.add(state_0, abef_save)
+		state_1 = simd.add(state_1, cdgh_save)
+
+		data = data[BLOCK_SIZE_256:]
+	}
+
+	intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[0]), state_0)
+	intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[4]), state_1)
+}
+
+when ODIN_ENDIAN == .Little {
+	@(private = "file", enable_target_feature = "neon")
+	byteswap_u32x4 :: #force_inline proc "contextless" (a: simd.u32x4) -> simd.u32x4 {
+		return transmute(simd.u32x4)(
+			simd.shuffle(
+				transmute(simd.u8x16)(a),
+				transmute(simd.u8x16)(a),
+				3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+			)
+		)
+	}
+}
--- a/core/crypto/sha2/sha256_impl_hw_gen.odin
+++ b/core/crypto/sha2/sha256_impl_hw_gen.odin
@@ -1,15 +1,15 @@
 #+build !amd64
+#+build !arm64
+#+build !arm32
 package sha2

-@(private = "file")
-ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported"
-
-// is_hardware_accelerated_256 returns true iff hardware accelerated
-// SHA-224/SHA-256 is supported.
+// is_hardware_accelerated_256 returns true if and only if (⟺) hardware
+// accelerated SHA-224/SHA-256 is supported.
 is_hardware_accelerated_256 :: proc "contextless" () -> bool {
 	return false
 }

+@(private)
 sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) {
 	panic_contextless(ERR_HW_NOT_SUPPORTED)
 }
--- a/core/crypto/sha2/sha256_impl_hw_intel.odin
+++ b/core/crypto/sha2/sha256_impl_hw_intel.odin
@@ -49,7 +49,7 @@ K_14 :: simd.u64x2{0x78a5636f748f82ee, 0x8cc7020884c87814}
 K_15 :: simd.u64x2{0xa4506ceb90befffa, 0xc67178f2bef9a3f7}


-// is_hardware_accelerated_256 returns true iff hardware accelerated
+// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated
 // SHA-224/SHA-256 is supported.
 is_hardware_accelerated_256 :: proc "contextless" () -> bool {
 	req_features :: info.CPU_Features{
@@ -70,8 +70,7 @@ sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bou
 	tmp = x86._mm_shuffle_epi32(tmp, 0xb1)            // CDAB
 	state_1 = x86._mm_shuffle_epi32(state_1, 0x1b)    // EFGH
 	state_0 := x86._mm_alignr_epi8(tmp, state_1, 8)   // ABEF
-	// state_1 = x86._mm_blend_epi16(state_1, tmp, 0xf0) // CDGH
-	state_1 = kludge_mm_blend_epi16_0xf0(state_1, tmp)
+	state_1 = x86._mm_blend_epi16(state_1, tmp, 0xf0) // CDGH

 	data := data
 	for len(data) >= BLOCK_SIZE_256 {
@@ -238,18 +237,9 @@ sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bou
 	// Write back the updated state
 	tmp = x86._mm_shuffle_epi32(state_0, 0x1b)        // FEBA
 	state_1 = x86._mm_shuffle_epi32(state_1, 0xb1)    // DCHG
-	// state_0 = x86._mm_blend_epi16(tmp, state_1, 0xf0) // DCBA
-	state_0 = kludge_mm_blend_epi16_0xf0(tmp, state_1)
+	state_0 = x86._mm_blend_epi16(tmp, state_1, 0xf0) // DCBA
 	state_1 = x86._mm_alignr_epi8(state_1, tmp, 8)    // ABEF

 	intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[0]), state_0)
 	intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[4]), state_1)
 }
-
-@(private = "file")
-kludge_mm_blend_epi16_0xf0 :: #force_inline proc "contextless"(a, b: x86.__m128i) -> x86.__m128i {
-	// HACK HACK HACK: LLVM got rid of `llvm.x86.sse41.pblendw`.
-	a_ := simd.to_array(a)
-	b_ := simd.to_array(b)
-	return x86.__m128i{a_[0], b_[1]}
-}
--- a/core/crypto/sha2/sha512_impl_hw_arm.odin
+++ b/core/crypto/sha2/sha512_impl_hw_arm.odin
@@ -0,0 +1,498 @@
+// The round function's intrinsic calls are based on:
+// https://github.com/LostInCompilation/HashMe/blob/main/src/SHA512_Hardware.cpp
+//
+//     The zlib License
+//
+//     Copyright (C) 2024 Marc Schöndorf
+//
+// This software is provided 'as-is', without any express or implied warranty. In
+// no event will the authors be held liable for any damages arising from the use of
+// this software.
+//
+// Permission is granted to anyone to use this software for any purpose, including
+// commercial applications, and to alter it and redistribute it freely, subject to
+// the following restrictions:
+//
+// 1.  The origin of this software must not be misrepresented; you must not claim
+//     that you wrote the original software. If you use this software in a product,
+//     an acknowledgment in the product documentation would be appreciated but is
+//     not required.
+//
+// 2.  Altered source versions must be plainly marked as such, and must not be
+//     misrepresented as being the original software.
+//
+// 3.  This notice may not be removed or altered from any source distribution.
+
+#+build arm64
+package sha2
+
+import "base:intrinsics"
+import "core:simd"
+import "core:simd/arm"
+import "core:sys/info"
+
+// is_hardware_accelerated_512 reports whether the running CPU advertises every
+// feature the hardware SHA-384, SHA-512, and SHA-512/256 implementation needs.
+is_hardware_accelerated_512 :: proc "contextless" () -> bool {
+	required :: info.CPU_Features{
+		.asimd,
+		.sha512,
+		.sha3, // XXX: LLVM groups these under `sha3`.
+	}
+	return required <= info.cpu_features()
+}
+
+@(private = "file")
+K_0 :: simd.u64x2{0x428a2f98d728ae22, 0x7137449123ef65cd} // K_n holds the two constants added to the message words in rounds 2n and 2n+1
+@(private = "file")
+K_1 :: simd.u64x2{0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc}
+@(private = "file")
+K_2 :: simd.u64x2{0x3956c25bf348b538, 0x59f111f1b605d019}
+@(private = "file")
+K_3 :: simd.u64x2{0x923f82a4af194f9b, 0xab1c5ed5da6d8118}
+@(private = "file")
+K_4 :: simd.u64x2{0xd807aa98a3030242, 0x12835b0145706fbe}
+@(private = "file")
+K_5 :: simd.u64x2{0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2}
+@(private = "file")
+K_6 :: simd.u64x2{0x72be5d74f27b896f, 0x80deb1fe3b1696b1}
+@(private = "file")
+K_7 :: simd.u64x2{0x9bdc06a725c71235, 0xc19bf174cf692694}
+@(private = "file")
+K_8 :: simd.u64x2{0xe49b69c19ef14ad2, 0xefbe4786384f25e3}
+@(private = "file")
+K_9 :: simd.u64x2{0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65}
+@(private = "file")
+K_10 :: simd.u64x2{0x2de92c6f592b0275, 0x4a7484aa6ea6e483}
+@(private = "file")
+K_11 :: simd.u64x2{0x5cb0a9dcbd41fbd4, 0x76f988da831153b5}
+@(private = "file")
+K_12 :: simd.u64x2{0x983e5152ee66dfab, 0xa831c66d2db43210}
+@(private = "file")
+K_13 :: simd.u64x2{0xb00327c898fb213f, 0xbf597fc7beef0ee4}
+@(private = "file")
+K_14 :: simd.u64x2{0xc6e00bf33da88fc2, 0xd5a79147930aa725}
+@(private = "file")
+K_15 :: simd.u64x2{0x06ca6351e003826f, 0x142929670a0e6e70}
+@(private = "file")
+K_16 :: simd.u64x2{0x27b70a8546d22ffc, 0x2e1b21385c26c926}
+@(private = "file")
+K_17 :: simd.u64x2{0x4d2c6dfc5ac42aed, 0x53380d139d95b3df}
+@(private = "file")
+K_18 :: simd.u64x2{0x650a73548baf63de, 0x766a0abb3c77b2a8}
+@(private = "file")
+K_19 :: simd.u64x2{0x81c2c92e47edaee6, 0x92722c851482353b}
+@(private = "file")
+K_20 :: simd.u64x2{0xa2bfe8a14cf10364, 0xa81a664bbc423001}
+@(private = "file")
+K_21 :: simd.u64x2{0xc24b8b70d0f89791, 0xc76c51a30654be30}
+@(private = "file")
+K_22 :: simd.u64x2{0xd192e819d6ef5218, 0xd69906245565a910}
+@(private = "file")
+K_23 :: simd.u64x2{0xf40e35855771202a, 0x106aa07032bbd1b8}
+@(private = "file")
+K_24 :: simd.u64x2{0x19a4c116b8d2d0c8, 0x1e376c085141ab53}
+@(private = "file")
+K_25 :: simd.u64x2{0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8}
+@(private = "file")
+K_26 :: simd.u64x2{0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb}
+@(private = "file")
+K_27 :: simd.u64x2{0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3}
+@(private = "file")
+K_28 :: simd.u64x2{0x748f82ee5defb2fc, 0x78a5636f43172f60}
+@(private = "file")
+K_29 :: simd.u64x2{0x84c87814a1f0ab72, 0x8cc702081a6439ec}
+@(private = "file")
+K_30 :: simd.u64x2{0x90befffa23631e28, 0xa4506cebde82bde9}
+@(private = "file")
+K_31 :: simd.u64x2{0xbef9a3f7b2c67915, 0xc67178f2e372532b}
+@(private = "file")
+K_32 :: simd.u64x2{0xca273eceea26619c, 0xd186b8c721c0c207}
+@(private = "file")
+K_33 :: simd.u64x2{0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178}
+@(private = "file")
+K_34 :: simd.u64x2{0x06f067aa72176fba, 0x0a637dc5a2c898a6}
+@(private = "file")
+K_35 :: simd.u64x2{0x113f9804bef90dae, 0x1b710b35131c471b}
+@(private = "file")
+K_36 :: simd.u64x2{0x28db77f523047d84, 0x32caab7b40c72493}
+@(private = "file")
+K_37 :: simd.u64x2{0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c}
+@(private = "file")
+K_38 :: simd.u64x2{0x4cc5d4becb3e42b6, 0x597f299cfc657e2a}
+@(private = "file")
+K_39 :: simd.u64x2{0x5fcb6fab3ad6faec, 0x6c44198c4a475817}
+// sha512_transf_hw folds every complete BLOCK_SIZE_512-byte block of data into ctx.h using the arm SHA-512 intrinsics; a trailing partial block is not consumed.
+@(private, enable_target_feature = "neon,sha3")
+sha512_transf_hw :: proc "contextless" (ctx: ^Context_512, data: []byte) #no_bounds_check {
+	state_0 := intrinsics.unaligned_load((^simd.u64x2)(&ctx.h[0])) // ctx.h is held as four 2-lane vectors: h[0..1], h[2..3], h[4..5], h[6..7]
+	state_1 := intrinsics.unaligned_load((^simd.u64x2)(&ctx.h[2]))
+	state_2 := intrinsics.unaligned_load((^simd.u64x2)(&ctx.h[4]))
+	state_3 := intrinsics.unaligned_load((^simd.u64x2)(&ctx.h[6]))
+
+	data := data // local copy of the slice so it can be advanced one block at a time
+	for len(data) >= BLOCK_SIZE_512 {
+		ab_save, cd_save, ef_save, gh_save := state_0, state_1, state_2, state_3 // added back in at "Combine state"
+
+		// Load message: eight 16-byte vectors (two 64-bit words each) at byte offsets 0..112
+		msg_0 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data)))
+		msg_1 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[16:])))
+		msg_2 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[32:])))
+		msg_3 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[48:])))
+		msg_4 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[64:])))
+		msg_5 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[80:])))
+		msg_6 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[96:])))
+		msg_7 := intrinsics.unaligned_load((^simd.u64x2)(raw_data(data[112:])))
+
+		// Reverse the bytes within each 64-bit word on little-endian targets
+		when ODIN_ENDIAN == .Little {
+			msg_0 = byteswap_u64x2(msg_0)
+			msg_1 = byteswap_u64x2(msg_1)
+			msg_2 = byteswap_u64x2(msg_2)
+			msg_3 = byteswap_u64x2(msg_3)
+			msg_4 = byteswap_u64x2(msg_4)
+			msg_5 = byteswap_u64x2(msg_5)
+			msg_6 = byteswap_u64x2(msg_6)
+			msg_7 = byteswap_u64x2(msg_7)
+		}
+
+		// Rounds 0-1
+		msg_k := simd.add(msg_0, K_0) // message words plus this round pair's constants
+		tmp_0 := simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3) // NOTE(review): shuffle(x, y, 1, 2) presumably yields {x[1], y[0]} - confirm against core:simd
+		tmp_1 := arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
+		state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
+		state_1 = simd.add(state_1, tmp_1)
+		msg_0 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_0, msg_1), msg_7, simd.shuffle(msg_4, msg_5, 1, 2)) // schedule update: this msg_0 is next consumed in rounds 16-17
+
+		// Rounds 2-3
+		msg_k = simd.add(msg_1, K_1)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
+		state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
+		state_0 = simd.add(state_0, tmp_1)
+		msg_1 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_1, msg_2), msg_0, simd.shuffle(msg_5, msg_6, 1, 2))
+
+		// Rounds 4-5
+		msg_k = simd.add(msg_2, K_2)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
+		state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
+		state_3 = simd.add(state_3, tmp_1)
+		msg_2 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_2, msg_3), msg_1, simd.shuffle(msg_6, msg_7, 1, 2))
+
+		// Rounds 6-7
+		msg_k = simd.add(msg_3, K_3)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
+		state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
+		state_2 = simd.add(state_2, tmp_1)
+		msg_3 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_3, msg_4), msg_2, simd.shuffle(msg_7, msg_0, 1, 2))
+
+		// Rounds 8-9
+		msg_k = simd.add(msg_4, K_4)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
+		state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
+		state_1 = simd.add(state_1, tmp_1)
+		msg_4 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_4, msg_5), msg_3, simd.shuffle(msg_0, msg_1, 1, 2))
+
+		// Rounds 10-11
+		msg_k = simd.add(msg_5, K_5)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
+		state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
+		state_0 = simd.add(state_0, tmp_1)
+		msg_5 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_5, msg_6), msg_4, simd.shuffle(msg_1, msg_2, 1, 2))
+
+		// Rounds 12-13
+		msg_k = simd.add(msg_6, K_6)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
+		state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
+		state_3 = simd.add(state_3, tmp_1)
+		msg_6 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_6, msg_7), msg_5, simd.shuffle(msg_2, msg_3, 1, 2))
+
+		// Rounds 14-15
+		msg_k = simd.add(msg_7, K_7)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
+		state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
+		state_2 = simd.add(state_2, tmp_1)
+		msg_7 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_7, msg_0), msg_6, simd.shuffle(msg_3, msg_4, 1, 2))
+
+		// Rounds 16-17
+		msg_k = simd.add(msg_0, K_8)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
+		state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
+		state_1 = simd.add(state_1, tmp_1)
+		msg_0 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_0, msg_1), msg_7, simd.shuffle(msg_4, msg_5, 1, 2))
+
+		// Rounds 18-19
+		msg_k = simd.add(msg_1, K_9)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
+		state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
+		state_0 = simd.add(state_0, tmp_1)
+		msg_1 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_1, msg_2), msg_0, simd.shuffle(msg_5, msg_6, 1, 2))
+
+		// Rounds 20-21
+		msg_k = simd.add(msg_2, K_10)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
+		state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
+		state_3 = simd.add(state_3, tmp_1)
+		msg_2 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_2, msg_3), msg_1, simd.shuffle(msg_6, msg_7, 1, 2))
+
+		// Rounds 22-23
+		msg_k = simd.add(msg_3, K_11)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
+		state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
+		state_2 = simd.add(state_2, tmp_1)
+		msg_3 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_3, msg_4), msg_2, simd.shuffle(msg_7, msg_0, 1, 2))
+
+		// Rounds 24-25
+		msg_k = simd.add(msg_4, K_12)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
+		state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
+		state_1 = simd.add(state_1, tmp_1)
+		msg_4 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_4, msg_5), msg_3, simd.shuffle(msg_0, msg_1, 1, 2))
+
+		// Rounds 26-27
+		msg_k = simd.add(msg_5, K_13)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
+		state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
+		state_0 = simd.add(state_0, tmp_1)
+		msg_5 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_5, msg_6), msg_4, simd.shuffle(msg_1, msg_2, 1, 2))
+
+		// Rounds 28-29
+		msg_k = simd.add(msg_6, K_14)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
+		state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
+		state_3 = simd.add(state_3, tmp_1)
+		msg_6 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_6, msg_7), msg_5, simd.shuffle(msg_2, msg_3, 1, 2))
+
+		// Rounds 30-31
+		msg_k = simd.add(msg_7, K_15)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
+		state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
+		state_2 = simd.add(state_2, tmp_1)
+		msg_7 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_7, msg_0), msg_6, simd.shuffle(msg_3, msg_4, 1, 2))
+
+		// Rounds 32-33
+		msg_k = simd.add(msg_0, K_16)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
+		state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
+		state_1 = simd.add(state_1, tmp_1)
+		msg_0 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_0, msg_1), msg_7, simd.shuffle(msg_4, msg_5, 1, 2))
+
+		// Rounds 34-35
+		msg_k = simd.add(msg_1, K_17)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
+		state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
+		state_0 = simd.add(state_0, tmp_1)
+		msg_1 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_1, msg_2), msg_0, simd.shuffle(msg_5, msg_6, 1, 2))
+
+		// Rounds 36-37
+		msg_k = simd.add(msg_2, K_18)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
+		state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
+		state_3 = simd.add(state_3, tmp_1)
+		msg_2 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_2, msg_3), msg_1, simd.shuffle(msg_6, msg_7, 1, 2))
+
+		// Rounds 38-39
+		msg_k = simd.add(msg_3, K_19)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
+		state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
+		state_2 = simd.add(state_2, tmp_1)
+		msg_3 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_3, msg_4), msg_2, simd.shuffle(msg_7, msg_0, 1, 2))
+
+		// Rounds 40-41
+		msg_k = simd.add(msg_4, K_20)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
+		state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
+		state_1 = simd.add(state_1, tmp_1)
+		msg_4 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_4, msg_5), msg_3, simd.shuffle(msg_0, msg_1, 1, 2))
+
+		// Rounds 42-43
+		msg_k = simd.add(msg_5, K_21)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
+		state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
+		state_0 = simd.add(state_0, tmp_1)
+		msg_5 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_5, msg_6), msg_4, simd.shuffle(msg_1, msg_2, 1, 2))
+
+		// Rounds 44-45
+		msg_k = simd.add(msg_6, K_22)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
+		state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
+		state_3 = simd.add(state_3, tmp_1)
+		msg_6 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_6, msg_7), msg_5, simd.shuffle(msg_2, msg_3, 1, 2))
+
+		// Rounds 46-47
+		msg_k = simd.add(msg_7, K_23)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
+		state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
+		state_2 = simd.add(state_2, tmp_1)
+		msg_7 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_7, msg_0), msg_6, simd.shuffle(msg_3, msg_4, 1, 2))
+
+		// Rounds 48-49
+		msg_k = simd.add(msg_0, K_24)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
+		state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
+		state_1 = simd.add(state_1, tmp_1)
+		msg_0 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_0, msg_1), msg_7, simd.shuffle(msg_4, msg_5, 1, 2))
+
+		// Rounds 50-51
+		msg_k = simd.add(msg_1, K_25)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
+		state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
+		state_0 = simd.add(state_0, tmp_1)
+		msg_1 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_1, msg_2), msg_0, simd.shuffle(msg_5, msg_6, 1, 2))
+
+		// Rounds 52-53
+		msg_k = simd.add(msg_2, K_26)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
+		state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
+		state_3 = simd.add(state_3, tmp_1)
+		msg_2 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_2, msg_3), msg_1, simd.shuffle(msg_6, msg_7, 1, 2))
+
+		// Rounds 54-55
+		msg_k = simd.add(msg_3, K_27)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
+		state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
+		state_2 = simd.add(state_2, tmp_1)
+		msg_3 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_3, msg_4), msg_2, simd.shuffle(msg_7, msg_0, 1, 2))
+
+		// Rounds 56-57
+		msg_k = simd.add(msg_4, K_28)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
+		state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
+		state_1 = simd.add(state_1, tmp_1)
+		msg_4 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_4, msg_5), msg_3, simd.shuffle(msg_0, msg_1, 1, 2))
+
+		// Rounds 58-59
+		msg_k = simd.add(msg_5, K_29)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
+		state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
+		state_0 = simd.add(state_0, tmp_1)
+		msg_5 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_5, msg_6), msg_4, simd.shuffle(msg_1, msg_2, 1, 2))
+
+		// Rounds 60-61
+		msg_k = simd.add(msg_6, K_30)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
+		state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
+		state_3 = simd.add(state_3, tmp_1)
+		msg_6 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_6, msg_7), msg_5, simd.shuffle(msg_2, msg_3, 1, 2))
+
+		// Rounds 62-63
+		msg_k = simd.add(msg_7, K_31)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
+		state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
+		state_2 = simd.add(state_2, tmp_1)
+		msg_7 = arm.vsha512su1q_u64(arm.vsha512su0q_u64(msg_7, msg_0), msg_6, simd.shuffle(msg_3, msg_4, 1, 2))
+
+		// Rounds 64-65 (from here on the message vectors are only read, never updated)
+		msg_k = simd.add(msg_0, K_32)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
+		state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
+		state_1 = simd.add(state_1, tmp_1)
+
+		// Rounds 66-67
+		msg_k = simd.add(msg_1, K_33)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
+		state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
+		state_0 = simd.add(state_0, tmp_1)
+
+		// Rounds 68-69
+		msg_k = simd.add(msg_2, K_34)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
+		state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
+		state_3 = simd.add(state_3, tmp_1)
+
+		// Rounds 70-71
+		msg_k = simd.add(msg_3, K_35)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
+		state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
+		state_2 = simd.add(state_2, tmp_1)
+
+		// Rounds 72-73
+		msg_k = simd.add(msg_4, K_36)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_3)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_2, state_3, 1, 2), simd.shuffle(state_1, state_2, 1, 2))
+		state_3 = arm.vsha512h2q_u64(tmp_1, state_1, state_0)
+		state_1 = simd.add(state_1, tmp_1)
+
+		// Rounds 74-75
+		msg_k = simd.add(msg_5, K_37)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_2)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_1, state_2, 1, 2), simd.shuffle(state_0, state_1, 1, 2))
+		state_2 = arm.vsha512h2q_u64(tmp_1, state_0, state_3)
+		state_0 = simd.add(state_0, tmp_1)
+
+		// Rounds 76-77
+		msg_k = simd.add(msg_6, K_38)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_1)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_0, state_1, 1, 2), simd.shuffle(state_3, state_0, 1, 2))
+		state_1 = arm.vsha512h2q_u64(tmp_1, state_3, state_2)
+		state_3 = simd.add(state_3, tmp_1)
+
+		// Rounds 78-79
+		msg_k = simd.add(msg_7, K_39)
+		tmp_0 = simd.add(simd.shuffle(msg_k, msg_k, 1, 2), state_0)
+		tmp_1 = arm.vsha512hq_u64(tmp_0, simd.shuffle(state_3, state_0, 1, 2), simd.shuffle(state_2, state_3, 1, 2))
+		state_0 = arm.vsha512h2q_u64(tmp_1, state_2, state_1)
+		state_2 = simd.add(state_2, tmp_1)
+
+		// Combine state: add back the values saved at the start of this block
+		state_0 = simd.add(state_0, ab_save)
+		state_1 = simd.add(state_1, cd_save)
+		state_2 = simd.add(state_2, ef_save)
+		state_3 = simd.add(state_3, gh_save)
+
+		data = data[BLOCK_SIZE_512:]
+	}
+
+	intrinsics.unaligned_store((^simd.u64x2)(&ctx.h[0]), state_0) // write the updated state back in the same layout it was loaded
+	intrinsics.unaligned_store((^simd.u64x2)(&ctx.h[2]), state_1)
+	intrinsics.unaligned_store((^simd.u64x2)(&ctx.h[4]), state_2)
+	intrinsics.unaligned_store((^simd.u64x2)(&ctx.h[6]), state_3)
+}
+
+when ODIN_ENDIAN == .Little {
+	// byteswap_u64x2 reverses the byte order inside each of the two 64-bit lanes of a.
+	@(private = "file", enable_target_feature = "neon")
+	byteswap_u64x2 :: #force_inline proc "contextless" (a: simd.u64x2) -> simd.u64x2 {
+		lanes := transmute(simd.u8x16)(a)
+		swapped := simd.shuffle(
+			lanes, lanes,
+			7, 6, 5, 4, 3, 2, 1, 0,
+			15, 14, 13, 12, 11, 10, 9, 8,
+		)
+		return transmute(simd.u64x2)(swapped)
+	}
+}
--- a/core/crypto/sha2/sha512_impl_hw_gen.odin
+++ b/core/crypto/sha2/sha512_impl_hw_gen.odin
@@ -0,0 +1,13 @@
+#+build !arm64
+package sha2
+
+// is_hardware_accelerated_512 returns true if and only if (⟺) hardware accelerated SHA-384,
+// SHA-512, and SHA-512/256 are supported. This non-arm64 build has no such path, so it is always false.
+is_hardware_accelerated_512 :: proc "contextless" () -> bool {
+	return false
+}
+
+@(private)
+sha512_transf_hw :: proc "contextless" (ctx: ^Context_512, data: []byte) {
+	panic_contextless(ERR_HW_NOT_SUPPORTED) // stub for non-arm64 builds: there is no hardware SHA-512 path to run
+}
--- a/core/crypto/sha3/sha3.odin
+++ b/core/crypto/sha3/sha3.odin
@@ -79,7 +79,7 @@ update :: proc(ctx: ^Context, data: []byte) {
 // final finalizes the Context, writes the digest to hash, and calls
 // reset on the Context.
 //
-// Iff finalize_clone is set, final will work on a copy of the Context,
+// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	_sha3.final((^_sha3.Context)(ctx), hash, finalize_clone)
--- a/core/crypto/sm3/sm3.odin
+++ b/core/crypto/sm3/sm3.odin
@@ -80,7 +80,7 @@ update :: proc(ctx: ^Context, data: []byte) {
 // final finalizes the Context, writes the digest to hash, and calls
 // reset on the Context.
 //
-// Iff finalize_clone is set, final will work on a copy of the Context,
+// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	ensure(ctx.is_initialized)
--- a/core/crypto/tuplehash/tuplehash.odin
+++ b/core/crypto/tuplehash/tuplehash.odin
@@ -31,7 +31,7 @@ write_element :: proc(ctx: ^Context, data: []byte) {
 // final finalizes the Context, writes the digest to hash, and calls
 // reset on the Context.
 //
-// Iff finalize_clone is set, final will work on a copy of the Context,
+// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	_sha3.final_cshake((^_sha3.Context)(ctx), hash, finalize_clone)
--- a/core/encoding/cbor/marshal.odin
+++ b/core/encoding/cbor/marshal.odin
@@ -285,6 +285,32 @@ _marshal_into_encoder :: proc(e: Encoder, v: any, ti: ^runtime.Type_Info) -> (er
 		}
 		return

+
+	case runtime.Type_Info_Fixed_Capacity_Dynamic_Array:
+		start := uintptr(v.data)
+		count := (^int)(start + info.len_offset)^ // element count stored inline at len_offset
+		if info.elem.id == byte { // byte elements are handed to _encode_bytes as one slice
+			raw_bytes := runtime.Raw_Slice{v.data, count}
+			return err_conv(_encode_bytes(e, transmute([]byte)raw_bytes))
+		}
+
+		err_conv(_encode_u64(e, u64(count), .Array)) or_return
+
+		if impl, ok := _tag_implementations_type[info.elem.id]; ok {
+			for idx in 0..<count {
+				elem_ptr := rawptr(start + uintptr(idx*info.elem_size))
+				impl->marshal(e, any{elem_ptr, info.elem.id}) or_return
+			}
+			return
+		}
+
+		elem_ti := runtime.type_info_core(type_info_of(info.elem.id))
+		for idx in 0..<count {
+			elem_ptr := rawptr(start + uintptr(idx*info.elem_size))
+			_marshal_into_encoder(e, any{elem_ptr, info.elem.id}, elem_ti) or_return
+		}
+		return
+
 	case runtime.Type_Info_Slice:
 		if info.elem.id == byte {
 			raw := (^[]byte)(v.data)
--- a/core/encoding/cbor/unmarshal.odin
+++ b/core/encoding/cbor/unmarshal.odin
@@ -389,6 +389,23 @@ _unmarshal_bytes :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header
 		n := copy(slice, bytes)
 		assert(n == len(bytes))
 		return
+
+	case reflect.Type_Info_Fixed_Capacity_Dynamic_Array:
+		elem_base := reflect.type_info_base(t.elem)
+
+		if elem_base.id != byte { return _unsupported(v, hdr) }
+
+		bytes := err_conv(_decode_bytes(d, add, allocator=context.temp_allocator)) or_return
+		defer delete(bytes, context.temp_allocator)
+
+		if len(bytes) > t.capacity { return _unsupported(v, hdr) } // decoded bytes do not fit in the fixed capacity
+
+		// Copy into the array's storage and record the new length; the deferred delete frees the temporary buffer.
+		slice := ([^]byte)(v.data)[:len(bytes)]
+		n := copy(slice, bytes)
+		assert(n == len(bytes))
+		(^int)(uintptr(v.data) + t.len_offset)^ = n
+		return
 	}

 	return _unsupported(v, hdr)
@@ -553,6 +570,21 @@ _unmarshal_array :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header
 		if out_of_space { return _unsupported(v, hdr) }
 		return

+	case reflect.Type_Info_Fixed_Capacity_Dynamic_Array:
+		length, _ := err_conv(_decode_len_container(d, add)) or_return
+		if length > t.capacity {
+			return _unsupported(v, hdr)
+		}
+
+		da := mem.Raw_Dynamic_Array{rawptr(v.data), 0, length, allocator } // view over the array's own storage, handed to assign_array with growable=false
+
+		out_of_space := assign_array(d, &da, t.elem, length, growable=false) or_return
+		if out_of_space { return _unsupported(v, hdr) }
+
+		(^int)(uintptr(v.data) + t.len_offset)^ = length // NOTE(review): assumes length >= 0 here; confirm an indefinite-length (-1) array cannot reach this store
+
+		return
+
 	case reflect.Type_Info_Complex:
 		length, _ := err_conv(_decode_len_container(d, add)) or_return
 		if length > 2 {
@@ -661,8 +693,7 @@ _unmarshal_map :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header,
 		unknown := length == -1
 		fields := reflect.struct_fields_zipped(ti.id)

-		idx := 0
-		for ; idx < len(fields) && (unknown || idx < length); idx += 1 {
+		for idx := 0; unknown || idx < length; idx += 1 {
 			// Decode key, keys can only be strings.
 			key: string
 			if keyv, kerr := decode_key(d, v, context.temp_allocator); unknown && kerr == .Break {
@@ -710,16 +741,6 @@ _unmarshal_map :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header,
 			_unmarshal_value(d, fany, _decode_header(r) or_return) or_return
 		}

-		// If there are fields left in the map that did not get decoded into the struct, decode and discard them.
-		if !unknown {
-			for _ in idx..<length {
-				key := err_conv(_decode_from_decoder(d, allocator=context.temp_allocator)) or_return
-				destroy(key, context.temp_allocator)
-				val := err_conv(_decode_from_decoder(d, allocator=context.temp_allocator)) or_return
-				destroy(val, context.temp_allocator)
-			}
-		}
-
 		return

 	case reflect.Type_Info_Map:
--- a/core/encoding/json/marshal.odin
+++ b/core/encoding/json/marshal.odin
@@ -320,6 +320,16 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err:
 		}
 		opt_write_end(w, opt, ']') or_return

+	case runtime.Type_Info_Fixed_Capacity_Dynamic_Array:
+		opt_write_start(w, opt, '[') or_return
+		count := (^int)(uintptr(v.data) + info.len_offset)^ // element count stored inline at len_offset
+		for idx in 0..<count {
+			opt_write_iteration(w, opt, idx == 0) or_return
+			elem_ptr := rawptr(uintptr(v.data) + uintptr(idx*info.elem_size))
+			marshal_to_writer(w, any{elem_ptr, info.elem.id}, opt) or_return
+		}
+		opt_write_end(w, opt, ']') or_return
+
 	case runtime.Type_Info_Slice:
 		opt_write_start(w, opt, '[') or_return
 		slice := cast(^mem.Raw_Slice)v.data
--- a/core/encoding/json/unmarshal.odin
+++ b/core/encoding/json/unmarshal.odin
@@ -34,35 +34,6 @@ Register_User_Unmarshaler_Error :: enum {
 	Unmarshaler_Previously_Found,
 }

-// Example User Unmarshaler:
-// Custom Unmarshaler for `int`
-// Some_Unmarshaler :: proc(p: ^json.Parser, v: any) -> json.Unmarshal_Error {
-// 	token := p.curr_token.text
-// 	i, ok := strconv.parse_i64_of_base(token, 2)
-// 	if !ok {
-//		return .Invalid_Data
-//
-//	}
-//	(^int)(v.data)^ = int(i)
-//	return .None
-// }
-//
-// _main :: proc() {
-//	// Ensure the json._user_unmarshaler map is initialized
-//	json.set_user_unmarshalers(new(map[typeid]json.User_Unmarshaler))
-//	reg_err := json.register_user_unmarshaler(type_info_of(int).id, Some_Unmarshaler)
-//	assert(reg_err == .None)
-//
-//	data := `{"value":101010}`
-//	SomeType :: struct {
-//		value: int,
-//	}
-//	y: SomeType
-//
-//	unmarshal_err := json.unmarshal(transmute([]byte)data, &y)
-//	fmt.println(y, unmarshal_err)
-// }
-
 // NOTE(Jeroen): This is a pointer to prevent accidental additions
 // it is prefixed with `_` rather than marked with a private attribute so that users can access it if necessary
 _user_unmarshalers: ^map[typeid]User_Unmarshaler
@@ -72,23 +43,60 @@ _user_unmarshalers: ^map[typeid]User_Unmarshaler
 // Inputs:
 // - m: A pointer to a map of typeids to User_Unmarshaler procs.
 //
-// NOTE: Must be called before using register_user_unmarshaler.
+// NOTE: Must be called before using `register_user_unmarshaler`.
 //
 set_user_unmarshalers :: proc(m: ^map[typeid]User_Unmarshaler) {
 	assert(_user_unmarshalers == nil, "set_user_unmarshalers must not be called more than once.")
 	_user_unmarshalers = m
 }

-// Registers a user-defined unmarshaler for a specific typeid
-//
-// Inputs:
-// - id: The typeid of the custom type.
-// - unmarshaler: The User_Unmarshaler function for the custom type.
-//
-// Returns: A Register_User_Unmarshaler_Error value indicating the success or failure of the operation.
-//
-// WARNING: set_user_unmarshalers must be called before using this procedure.
-//
+/*
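+Registers a user-defined unmarshaler for a specific `typeid` and returns a `Register_User_Unmarshaler_Error` describing the outcome.
+
+WARNING: `set_user_unmarshalers` must be called before using this procedure; otherwise `.No_User_Unmarshaler` is returned.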
+
+Inputs:
+- id: The `typeid` of the custom type.
+- unmarshaler: The `User_Unmarshaler` function for the custom type.
+
+Example:
+	import "core:fmt"
+	import "core:encoding/json"
+	import "core:strconv"
+
+	// Custom Unmarshaler for `int`
+	some_unmarshaler :: proc(p: ^json.Parser, v: any) -> json.Unmarshal_Error {
+		token := p.curr_token.text
+		i, ok := strconv.parse_i64_of_base(token, 2)
+		if !ok {
+			return .Invalid_Data
+		}
+
+		(^int)(v.data)^ = int(i)
+
+		json.advance_token(p)
+		return nil
+	}
+
+	register_user_unmarshaler_example :: proc() {
+		// Ensure the `json._user_unmarshalers` map is initialized.
+		json.set_user_unmarshalers(new(map[typeid]json.User_Unmarshaler))
+		reg_err := json.register_user_unmarshaler(typeid_of(int), some_unmarshaler)
+		assert(reg_err == .None)
+
+		data := `{"value":101010}`
+		SomeType :: struct {
+			value: int,
+		}
+		y: SomeType
+
+		unmarshal_err := json.unmarshal(transmute([]byte)data, &y)
+		fmt.println(y, unmarshal_err)
+	}
+
+Output:
+	SomeType{value = 42} nil
+*/
 register_user_unmarshaler :: proc(id: typeid, unmarshaler: User_Unmarshaler) -> Register_User_Unmarshaler_Error {
 	if _user_unmarshalers == nil {
 		return .No_User_Unmarshaler
--- a/core/fmt/fmt.odin
+++ b/core/fmt/fmt.odin
@@ -2592,11 +2592,19 @@ fmt_named_buitlin_custom_formatters :: proc(fi: ^Info, v: any, verb: rune, info:
 				prec = 6
 				buf[w] = 'm'
 			}
+			if fi.space {
+				w -= 1
+				buf[w] = ' '
+			}
 			w, u = ffrac(buf[:w], u, prec)
 			w = fint(buf[:w], u)
 		} else {
 			w -= 1
 			buf[w] = 's'
+			if fi.space {
+				w -= 1
+				buf[w] = ' '
+			}
 			w, u = ffrac(buf[:w], u, 9)
 			w = fint(buf[:w], u%60)
 			u /= 60
@@ -3238,6 +3246,21 @@ fmt_value :: proc(fi: ^Info, v: any, verb: rune) {
 		}
 		fmt_array(fi, ptr, n, info.elem_size, info.elem, verb)

+
+	case runtime.Type_Info_Fixed_Capacity_Dynamic_Array:
+		n := (^int)(uintptr(v.data) + info.len_offset)^
+
+		ptr := v.data // data is stored at the start
+		if ol, ok := fi.optional_len.?; ok {
+			fi.optional_len = nil
+			n = min(n, ol)
+		} else if fi.use_nul_termination {
+			fi.use_nul_termination = false
+			fmt_array_nul_terminated(fi, ptr, n, info.elem_size, info.elem, verb)
+			return
+		}
+		fmt_array(fi, ptr, n, info.elem_size, info.elem, verb)
+
 	case runtime.Type_Info_Simd_Vector:
 		io.write_byte(fi.writer, '<', &fi.n)
 		defer io.write_byte(fi.writer, '>', &fi.n)
--- a/core/io/io.odin
+++ b/core/io/io.odin
@@ -436,7 +436,7 @@ copy_buffer :: proc(dst: Writer, src: Reader, buf: []byte) -> (written: i64, err

 // copy_n copies n bytes (or till an error) from src to dst.
 // It returns the number of bytes copied and the first error that occurred whilst copying, if any.
-// On return, written == n IFF err == nil
+// On return, written == n if and only if (⟺) err == nil
 copy_n :: proc(dst: Writer, src: Reader, n: i64) -> (written: i64, err: Error) {
 	nsrc := limited_reader_init(&Limited_Reader{}, src, n)
 	written, err = copy(dst, nsrc)
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -101,7 +101,7 @@ internal_int_power_modulo :: proc(res, G, X, P: ^Int, allocator := context.alloc
 		If the modulus is odd or dr != 0 use the montgomery method.
 	*/
 	if internal_int_is_odd(P) || dr != 0 {
-		return _private_int_exponent_mod(res, G, X, P, dr)
+		return _private_int_exponent_mod_fast(res, G, X, P, dr)
 	}

 	/*
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -439,8 +439,14 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex
 		return _private_int_mul_high_comba(dest, a, b, digits)
 	}

-	internal_grow(dest, a.used + b.used + 1) or_return
-	dest.used = a.used + b.used + 1
+	/*
+		Set up temporary output `Int`, which we'll swap for `dest` when done.
+	*/
+
+	t := &Int{}
+
+	internal_grow(t, a.used + b.used + 1) or_return
+	t.used = a.used + b.used + 1

 	pa := a.used
 	pb := b.used
@@ -451,20 +457,23 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex
 			/*
 				Calculate the double precision result.
 			*/
-			r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)
+			r := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)

 			/*
 				Get the lower part.
 			*/
-			dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
+			t.digit[ix + iy] = DIGIT(r & _WORD(_MASK))

 			/*
 				Carry the carry.
 			*/
 			carry = DIGIT(r >> _WORD(_DIGIT_BITS))
 		}
-		dest.digit[ix + pb] = carry
+		t.digit[ix + pb] = carry
 	}
+
+	internal_swap(dest, t)
+	internal_destroy(t)
 	return internal_clamp(dest)
 }

--- a/core/math/fixed/fixed.odin
+++ b/core/math/fixed/fixed.odin
@@ -29,7 +29,7 @@ Fixed32_32 :: distinct Fixed(i64, 32)
 Fixed52_12 :: distinct Fixed(i64, 12)


-init_from_f64 :: proc(x: ^$T/Fixed($Backing, $Fraction_Width), val: f64) {
+init_from_f64 :: proc "contextless" (x: ^$T/Fixed($Backing, $Fraction_Width), val: f64) {
 	i, f := math.modf(math.abs(val))
 	x.i  = Backing(f * (1<<Fraction_Width))
 	x.i &= 1<<Fraction_Width - 1
@@ -39,13 +39,13 @@ init_from_f64 :: proc(x: ^$T/Fixed($Backing, $Fraction_Width), val: f64) {
 	}
 }

-init_from_parts :: proc(x: ^$T/Fixed($Backing, $Fraction_Width), integer, fraction: Backing) {
+init_from_parts :: proc "contextless" (x: ^$T/Fixed($Backing, $Fraction_Width), integer, fraction: Backing) {
 	x.i  = fraction
 	x.i &= 1<<Fraction_Width - 1
 	x.i |= (integer << Fraction_Width)
 }

-to_f64 :: proc(x: $T/Fixed($Backing, $Fraction_Width)) -> f64 {
+to_f64 :: proc "contextless" (x: $T/Fixed($Backing, $Fraction_Width)) -> f64 {
 	sign := -1.0 if x.i < 0 else 1.0
 	num := math.abs(x.i)
 	res := f64(num >> Fraction_Width)
@@ -55,39 +55,39 @@ to_f64 :: proc(x: $T/Fixed($Backing, $Fraction_Width)) -> f64 {


@(require_results)
-add :: proc(x, y: $T/Fixed) -> T {
+add :: proc "contextless" (x, y: $T/Fixed) -> T {
 	return {x.i + y.i}
 }
@(require_results)
-sub :: proc(x, y: $T/Fixed) -> T {
+sub :: proc "contextless" (x, y: $T/Fixed) -> T {
 	return {x.i - y.i}
 }

@(require_results)
-mul :: proc(x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
+mul :: proc "contextless" (x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
 	z.i = intrinsics.fixed_point_mul(x.i, y.i, Fraction_Width)
 	return
 }
@(require_results)
-mul_sat :: proc(x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
+mul_sat :: proc "contextless" (x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
 	z.i = intrinsics.fixed_point_mul_sat(x.i, y.i, Fraction_Width)
 	return
 }

@(require_results)
-div :: proc(x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
+div :: proc "contextless" (x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
 	z.i = intrinsics.fixed_point_div(x.i, y.i, Fraction_Width)
 	return
 }
@(require_results)
-div_sat :: proc(x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
+div_sat :: proc "contextless" (x, y: $T/Fixed($Backing, $Fraction_Width)) -> (z: T) {
 	z.i = intrinsics.fixed_point_div_sat(x.i, y.i, Fraction_Width)
 	return
 }


@(require_results)
-floor :: proc(x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
+floor :: proc "contextless" (x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
 	if x.i >= 0 {
 		return x.i >> Fraction_Width
 	} else {
@@ -95,11 +95,11 @@ floor :: proc(x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
 	}
 }
@(require_results)
-ceil :: proc(x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
+ceil :: proc "contextless" (x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
 	return (x.i + (1 << Fraction_Width - 1)) >> Fraction_Width
 }
@(require_results)
-round :: proc(x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
+round :: proc "contextless" (x: $T/Fixed($Backing, $Fraction_Width)) -> Backing {
 	return (x.i + (1 << (Fraction_Width - 1))) >> Fraction_Width
 }

@@ -163,7 +163,7 @@ to_string :: proc(x: $T/Fixed($Backing, $Fraction_Width), allocator := context.a
 }


-@(private)
+@(rodata, private)
 _power_of_two_table := [129]string{
 	"0.5",
 	"1",
@@ -295,8 +295,3 @@ _power_of_two_table := [129]string{
 	"85070591730234615865843651857942052864",
 	"170141183460469231731687303715884105728",
 }
-
-@(deprecated="Use write instead")
-append :: proc(dst: []byte, x: $T/Fixed($Backing, $Fraction_Width)) -> string {
-	return write(dst, x)
-}
--- a/core/math/rand/rand_xoshiro256.odin
+++ b/core/math/rand/rand_xoshiro256.odin
@@ -3,6 +3,18 @@ package rand
 import "base:intrinsics"
 import "base:runtime"

+when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+	// LLVM thinks that using SIMD for read_u64 is good,
+	// when it causes a ~3x performance regression.  As
+	// far as I can tell, this behavior is limited to
+	// Intel.
+	@(private = "file")
+	TARGET_FEATURES :: "-sse,-avx,-avx2"
+} else {
+	@(private = "file")
+	TARGET_FEATURES :: ""
+}
+
 /*
 The state for a xoshiro256** pseudorandom generator.
 */
@@ -10,8 +22,9 @@ Xoshiro256_Random_State :: struct {
 	s: [4]u64,
 }

+@(enable_target_feature = TARGET_FEATURES)
 xoshiro256_random_generator_proc :: proc(data: rawptr, mode: runtime.Random_Generator_Mode, p: []byte) {
-	@(require_results)
+	@(require_results, enable_target_feature = TARGET_FEATURES)
 	read_u64 :: proc "contextless" (r: ^Xoshiro256_Random_State) -> u64 {
 		// xoshiro256** output function and state transition

@@ -27,7 +40,7 @@ xoshiro256_random_generator_proc :: proc(data: rawptr, mode: runtime.Random_Gene

 		return result

-		rotate_left64 :: proc "contextless" (x: u64, k: int) -> u64 {
+		rotate_left64 :: #force_inline proc "contextless" (x: u64, k: int) -> u64 {
 			n :: 64
 			s := uint(k) & (n-1)
 			return x << s | x >> (n-s)
--- a/core/mem/allocators.odin
+++ b/core/mem/allocators.odin
@@ -536,15 +536,8 @@ scratch_alloc_bytes_non_zeroed :: proc(
 		// we don't need to be so strict about every byte.
 		aligned_size += alignment - 1
 	}
-	if aligned_size <= len(s.data) {
-		offset := uintptr(0)
-		if s.curr_offset+aligned_size <= len(s.data) {
-			offset = uintptr(s.curr_offset)
-		} else {
-			// The allocation will cause an overflow past the boundary of the
-			// space available, so reset to the starting offset.
-			offset = 0
-		}
+	if s.curr_offset+aligned_size <= len(s.data) {
+		offset := uintptr(s.curr_offset)
 		start := uintptr(raw_data(s.data))
 		ptr := rawptr(offset+start)
 		// We keep track of the original base pointer without extra alignment
--- a/core/mem/mem.odin
+++ b/core/mem/mem.odin
@@ -467,13 +467,7 @@ Check whether a number is a power of two.
 This procedure checks whether a given pointer-sized unsigned integer contains
 a power-of-two value.
 */
-@(require_results)
-is_power_of_two :: proc "contextless" (x: uintptr) -> bool {
-	if x <= 0 {
-		return false
-	}
-	return (x & (x-1)) == 0
-}
+is_power_of_two :: runtime.is_power_of_two_uintptr

 /*
 Check if a pointer is aligned.
@@ -497,11 +491,7 @@ bytes, `ptr` is returned.

 The specified alignment must be a power of 2.
 */
-@(require_results)
-align_forward_uintptr :: proc(ptr, align: uintptr) -> uintptr {
-	assert(is_power_of_two(align))
-	return (ptr + align-1) & ~(align-1)
-}
+align_forward_uintptr :: runtime.align_forward_uintptr

 /*
 Align pointer forward.
@@ -526,10 +516,7 @@ bytes, `ptr` is returned.

 The specified alignment must be a power of 2.
 */
-@(require_results)
-align_forward_int :: proc(ptr, align: int) -> int {
-	return int(align_forward_uintptr(uintptr(ptr), uintptr(align)))
-}
+align_forward_int :: runtime.align_forward_int

 /*
 Align uint forward.
@@ -540,10 +527,7 @@ bytes, `ptr` is returned.

 The specified alignment must be a power of 2.
 */
-@(require_results)
-align_forward_uint :: proc(ptr, align: uint) -> uint {
-	return uint(align_forward_uintptr(uintptr(ptr), uintptr(align)))
-}
+align_forward_uint :: runtime.align_forward_uint

 /*
 Align uintptr backwards.
@@ -626,32 +610,6 @@ reinterpret_copy :: proc "contextless" ($T: typeid, ptr: rawptr) -> (value: T) {
 	return
 }

-/*
-Dynamic array with a fixed capacity buffer.
-
-This type represents dynamic arrays with a fixed-size backing buffer. Upon
-allocating memory beyond reaching the maximum capacity, allocations from fixed
-byte buffers return `nil` and no error.
-*/
-Fixed_Byte_Buffer :: distinct [dynamic]byte
-
-/*
-Create a fixed byte buffer from a slice.
-*/
-@(require_results)
-make_fixed_byte_buffer :: proc "contextless" (backing: []byte) -> Fixed_Byte_Buffer {
-	s := transmute(Raw_Slice)backing
-	d: Raw_Dynamic_Array
-	d.data = s.data
-	d.len = 0
-	d.cap = s.len
-	d.allocator = Allocator{
-		procedure = nil_allocator_proc,
-		data = nil,
-	}
-	return transmute(Fixed_Byte_Buffer)d
-}
-
 /*
 General-purpose align formula.

--- a/core/mem/virtual/arena.odin
+++ b/core/mem/virtual/arena.odin
@@ -141,9 +141,9 @@ arena_alloc_unguarded :: proc(arena: ^Arena, size: uint, alignment: uint, loc :=

 			needed := mem.align_forward_uint(size, alignment)
 			needed = max(needed, arena.default_commit_size)
-			block_size := max(needed, arena.minimum_block_size)
+			block_size := max(needed, arena.minimum_block_size) + alignment

-			new_block := memory_block_alloc(needed, block_size, alignment, {}) or_return
+			new_block := memory_block_alloc(needed, block_size) or_return
 			new_block.prev = arena.curr_block
 			arena.curr_block = new_block
 			arena.total_reserved += new_block.reserved
--- a/core/mem/virtual/file.odin
+++ b/core/mem/virtual/file.odin
@@ -10,7 +10,14 @@ map_file :: proc{
 }

 map_file_from_path :: proc(filename: string, flags: Map_File_Flags) -> (data: []byte, error: Map_File_Error) {
-	f, err := os.open(filename, os.O_RDWR)
+	open_flags : os.File_Flags
+	if .Read in flags {
+		open_flags += {.Read}
+	}
+	if .Write in flags {
+		open_flags += {.Write}
+	}
+	f, err := os.open(filename, open_flags)
 	if err != nil {
 		return nil, .Open_Failure
 	}
@@ -37,4 +44,4 @@ unmap_file :: proc(data: []byte) {
 	if raw_data(data) != nil {
 		_unmap_file(data)
 	}
-}
+}
--- a/Show More
+++ b/Show More