Merge branch 'master' into bill/fixed-capacity-dynamic-array

This commit is contained in:
gingerBill
2026-03-15 11:41:01 +00:00
85 changed files with 1762 additions and 1008 deletions

View File

@@ -141,7 +141,7 @@ Type_Info_Struct :: struct {
flags: Type_Info_Struct_Flags,
// These are only set iff this structure is an SOA structure
// These are only set if and only if (⟺) this structure is an SOA structure
soa_kind: Type_Info_Struct_Soa_Kind,
soa_len: i32,
soa_base_type: ^Type_Info,

View File

@@ -1525,7 +1525,7 @@ card :: proc "contextless" (s: $S/bit_set[$E; $U]) -> int {
// Evaluates the condition and panics the program iff the condition is false.
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
// This uses the `context.assertion_failure_procedure` to assert.
//
// This routine will be ignored when `ODIN_DISABLE_ASSERT` is true.
@@ -1549,7 +1549,7 @@ assert :: proc(condition: bool, message := #caller_expression(condition), loc :=
}
}
// Evaluates the condition and panics the program iff the condition is false.
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
// This uses the `context.assertion_failure_procedure` to assert.
// This routine ignores `ODIN_DISABLE_ASSERT`, and will always execute.
@builtin
@@ -1589,7 +1589,7 @@ unimplemented :: proc(message := "", loc := #caller_location) -> ! {
p("not yet implemented", message, loc)
}
// Evaluates the condition and panics the program iff the condition is false.
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
// This uses the `default_assertion_contextless_failure_proc` to assert.
//
// This routine will be ignored when `ODIN_DISABLE_ASSERT` is true.
@@ -1609,7 +1609,7 @@ assert_contextless :: proc "contextless" (condition: bool, message := #caller_ex
}
}
// Evaluates the condition and panics the program iff the condition is false.
// Evaluates the condition and panics the program if and only if (⟺) the condition is false.
// This uses the `default_assertion_contextless_failure_proc` to assert.
@builtin
ensure_contextless :: proc "contextless" (condition: bool, message := #caller_expression(condition), loc := #caller_location) {

View File

@@ -136,7 +136,7 @@ chacha8rand_refill_simd256 :: proc(r: ^Default_Random_State) {
//
// LLVM appears not to consider "this instruction is totally
// awful on the given microarchitecture", which leads to
// `VPCOMPRESSED` being generated iff AVX512 support is
// `VPCOMPRESSED` being generated if and only if (⟺) AVX512 support is
// enabled for `intrinsics.simd_masked_compress_store`.
// On Zen 4, this leads to a 50% performance regression vs
// the 128-bit SIMD code.

View File

@@ -45,7 +45,7 @@ reader_init_with_buf :: proc(b: ^Reader, rd: io.Reader, buf: []byte) {
b.buf = buf
}
// reader_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set
// reader_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set
reader_destroy :: proc(b: ^Reader) {
delete(b.buf, b.buf_allocator)
b^ = {}

View File

@@ -35,7 +35,7 @@ writer_init_with_buf :: proc(b: ^Writer, wr: io.Writer, buf: []byte) {
b.buf = buf
}
// writer_destroy destroys the underlying buffer with its associated allocator IFF that allocator has been set
// writer_destroy destroys the underlying buffer with its associated allocator if and only if (⟺) that allocator has been set
writer_destroy :: proc(b: ^Writer) {
delete(b.buf, b.buf_allocator)
b^ = {}

View File

@@ -1460,7 +1460,7 @@ fields_proc :: proc(s: []byte, f: proc(rune) -> bool, allocator := context.alloc
return subslices[:]
}
// alias returns true iff a and b have a non-zero length, and any part of
// alias returns true if and only if (⟺) a and b have a non-zero length, and any part of
// a overlaps with b.
alias :: proc "contextless" (a, b: []byte) -> bool {
a_len, b_len := len(a), len(b)
@@ -1474,7 +1474,7 @@ alias :: proc "contextless" (a, b: []byte) -> bool {
return a_start <= b_end && b_start <= a_end
}
// alias_inexactly returns true iff a and b have a non-zero length,
// alias_inexactly returns true if and only if (⟺) a and b have a non-zero length,
// the base pointer of a and b are NOT equal, and any part of a overlaps
// with b (ie: `alias(a, b)` with an exception that returns false for
// `a == b`, `b = a[:len(a)-69]` and similar conditions).

View File

@@ -100,20 +100,20 @@ len :: proc "contextless" (t: ^$T/Tree($Value)) -> int {
return t._size
}
// first returns the first node in the tree (in-order) or nil iff
// first returns the first node in the tree (in-order) or nil if and only if (⟺)
// the tree is empty.
first :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) {
return tree_first_or_last_in_order(t, Direction.Backward)
}
// last returns the last element in the tree (in-order) or nil iff
// last returns the last element in the tree (in-order) or nil if and only if (⟺)
// the tree is empty.
last :: proc "contextless" (t: ^$T/Tree($Value)) -> ^Node(Value) {
return tree_first_or_last_in_order(t, Direction.Forward)
}
// find finds the value in the tree, and returns the corresponding
// node or nil iff the value is not present.
// node or nil if and only if (⟺) the value is not present.
find :: proc(t: ^$T/Tree($Value), value: Value) -> ^Node(Value) {
cur := t._root
descend_loop: for cur != nil {
@@ -168,7 +168,7 @@ find_or_insert :: proc(
return
}
// remove removes a node or value from the tree, and returns true iff the
// remove removes a node or value from the tree, and returns true if and only if (⟺) the
// removal was successful. While the node's value will be left intact,
// the node itself will be freed via the tree's node allocator.
remove :: proc {
@@ -176,7 +176,7 @@ remove :: proc {
remove_node,
}
// remove_value removes a value from the tree, and returns true iff the
// remove_value removes a value from the tree, and returns true if and only if (⟺) the
// removal was successful. While the node's value will be left intact,
// the node itself will be freed via the tree's node allocator.
remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = true) -> bool {
@@ -187,7 +187,7 @@ remove_value :: proc(t: ^$T/Tree($Value), value: Value, call_on_remove: bool = t
return remove_node(t, n, call_on_remove)
}
// remove_node removes a node from the tree, and returns true iff the
// remove_node removes a node from the tree, and returns true if and only if (⟺) the
// removal was successful. While the node's value will be left intact,
// the node itself will be freed via the tree's node allocator.
remove_node :: proc(t: ^$T/Tree($Value), node: ^Node(Value), call_on_remove: bool = true) -> bool {
@@ -281,14 +281,14 @@ iterator_from_pos :: proc "contextless" (
}
// iterator_get returns the node currently pointed to by the iterator,
// or nil iff the node has been removed, the tree is empty, or the end
// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end
// of the tree has been reached.
iterator_get :: proc "contextless" (it: ^$I/Iterator($Value)) -> ^Node(Value) {
return it._cur
}
// iterator_remove removes the node currently pointed to by the iterator,
// and returns true iff the removal was successful. Semantics are the
// and returns true if and only if (⟺) the removal was successful. Semantics are the
// same as the Tree remove.
iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -> bool {
if it._cur == nil {
@@ -304,7 +304,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Value), call_on_remove: bool = true) -
}
// iterator_next advances the iterator and returns the (node, true) or
// or (nil, false) iff the end of the tree has been reached.
// or (nil, false) if and only if (⟺) the end of the tree has been reached.
//
// Note: The first call to iterator_next will return the first node instead
// of advancing the iterator.

View File

@@ -95,19 +95,19 @@ len :: proc "contextless" (t: $T/Tree($Key, $Value)) -> (node_count: int) {
return t._size
}
// first returns the first node in the tree (in-order) or nil iff
// first returns the first node in the tree (in-order) or nil if and only if (⟺)
// the tree is empty.
first :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) {
return tree_first_or_last_in_order(t, Direction.Backward)
}
// last returns the last element in the tree (in-order) or nil iff
// last returns the last element in the tree (in-order) or nil if and only if (⟺)
// the tree is empty.
last :: proc "contextless" (t: ^$T/Tree($Key, $Value)) -> ^Node(Key, Value) {
return tree_first_or_last_in_order(t, Direction.Forward)
}
// find finds the key in the tree, and returns the corresponding node, or nil iff the value is not present.
// find finds the key in the tree, and returns the corresponding node, or nil if and only if (⟺) the value is not present.
find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
node = t._root
for node != nil {
@@ -120,7 +120,7 @@ find :: proc(t: $T/Tree($Key, $Value), key: Key) -> (node: ^Node(Key, Value)) {
return node
}
// find_value finds the key in the tree, and returns the corresponding value, or nil iff the value is not present.
// find_value finds the key in the tree, and returns the corresponding value, or nil if and only if (⟺) the value is not present.
find_value :: proc(t: $T/Tree($Key, $Value), key: Key) -> (value: Value, ok: bool) #optional_ok {
if n := find(t, key); n != nil {
return n.value, true
@@ -154,7 +154,7 @@ find_or_insert :: proc(t: ^$T/Tree($Key, $Value), key: Key, value: Value) -> (n:
return n, true, nil
}
// remove removes a node or value from the tree, and returns true iff the
// remove removes a node or value from the tree, and returns true if and only if (⟺) the
// removal was successful. While the node's value will be left intact,
// the node itself will be freed via the tree's node allocator.
remove :: proc {
@@ -162,7 +162,7 @@ remove :: proc {
remove_node,
}
// remove_value removes a value from the tree, and returns true iff the
// remove_value removes a value from the tree, and returns true if and only if (⟺) the
// removal was successful. While the node's key + value will be left intact,
// the node itself will be freed via the tree's node allocator.
remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true) -> bool {
@@ -173,7 +173,7 @@ remove_key :: proc(t: ^$T/Tree($Key, $Value), key: Key, call_on_remove := true)
return remove_node(t, n, call_on_remove)
}
// remove_node removes a node from the tree, and returns true iff the
// remove_node removes a node from the tree, and returns true if and only if (⟺) the
// removal was successful. While the node's key + value will be left intact,
// the node itself will be freed via the tree's node allocator.
remove_node :: proc(t: ^$T/Tree($Key, $Value), node: ^$N/Node(Key, Value), call_on_remove := true) -> (found: bool) {
@@ -235,14 +235,14 @@ iterator_from_pos :: proc "contextless" (t: ^$T/Tree($Key, $Value), pos: ^Node(K
}
// iterator_get returns the node currently pointed to by the iterator,
// or nil iff the node has been removed, the tree is empty, or the end
// or nil if and only if (⟺) the node has been removed, the tree is empty, or the end
// of the tree has been reached.
iterator_get :: proc "contextless" (it: ^$I/Iterator($Key, $Value)) -> ^Node(Key, Value) {
return it._cur
}
// iterator_remove removes the node currently pointed to by the iterator,
// and returns true iff the removal was successful. Semantics are the
// and returns true if and only if (⟺) the removal was successful. Semantics are the
// same as the Tree remove.
iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = true) -> bool {
if it._cur == nil {
@@ -258,7 +258,7 @@ iterator_remove :: proc(it: ^$I/Iterator($Key, $Value), call_on_remove: bool = t
}
// iterator_next advances the iterator and returns the (node, true) or
// or (nil, false) iff the end of the tree has been reached.
// or (nil, false) if and only if (⟺) the end of the tree has been reached.
//
// Note: The first call to iterator_next will return the first node instead
// of advancing the iterator.

View File

@@ -0,0 +1,69 @@
package aes_hw
@(require) import "core:sys/info"
// is_supported returns true if and only if (⟺) hardware accelerated AES
// is supported.
is_supported :: proc "contextless" () -> bool {
when ODIN_ARCH == .amd64 {
// Note: Everything with AES-NI has support for
// the required SSE extensions.
req_features :: info.CPU_Features{
.sse2,
.ssse3,
.sse41,
.aes,
}
return info.cpu_features() >= req_features
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
req_features :: info.CPU_Features{
.asimd,
.aes,
}
return info.cpu_features() >= req_features
} else {
return false
}
}
// is_ghash_supported returns true if and only if (⟺) hardware accelerated
// GHASH is supported.
is_ghash_supported :: proc "contextless" () -> bool {
// Just having hardware GHASH is silly.
if !is_supported() {
return false
}
when ODIN_ARCH == .amd64 {
return info.cpu_features() >= info.CPU_Features{
.pclmulqdq,
}
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32{
// Once we can actually use this, we can re-enable this.
//
// return info.cpu_features() >= info.CPU_Features{
// .pmull,
// }
return false
} else {
return false
}
}
// Context is a keyed AES (ECB) instance.
Context :: struct {
// Note: The ideal thing to do is for the expanded round keys to be
// arrays of `u8x16`, however that implies alignment (or using AVX).
//
// All the people using e-waste processors that don't support an
// instruction set that has been around for over 10 years are why
// we can't have nice things.
_sk_exp_enc: [15][16]byte,
_sk_exp_dec: [15][16]byte,
_num_rounds: int,
}
// init initializes a context for AES with the provided key.
init :: proc(ctx: ^Context, key: []byte) {
keysched(ctx, key)
}

View File

@@ -21,7 +21,7 @@
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#+build amd64
package aes_hw_intel
package aes_hw
import "base:intrinsics"
import "core:crypto/_aes"

View File

@@ -0,0 +1,115 @@
#+build arm64,arm32
package aes_hw
import "core:simd"
import "core:simd/arm"
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/
TARGET_FEATURES :: "neon,aes"
HAS_GHASH :: false // Temporary
@(require_results, enable_target_feature = "aes")
aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return simd.bit_xor(arm.vaesimcq_u8(arm.vaesdq_u8(data, simd.u8x16{})), key)
}
@(require_results, enable_target_feature = "aes")
aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return simd.bit_xor(arm.vaesdq_u8(data, simd.u8x16{}), key)
}
@(require_results, enable_target_feature = "aes")
aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return simd.bit_xor(arm.vaesmcq_u8(arm.vaeseq_u8(data, simd.u8x16{})), key)
}
@(require_results, enable_target_feature = "aes")
aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return simd.bit_xor(arm.vaeseq_u8(data, simd.u8x16{}), key)
}
aesimc :: arm.vaesimcq_u8
@(require_results, enable_target_feature = "aes")
aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
a := arm.vaeseq_u8(data, simd.u8x16{}) // AESE does ShiftRows and SubBytes on A
// Undo ShiftRows step from AESE and extract X1 and X3
dest := simd.swizzle(
a,
0x04, 0x01, 0x0e, 0x0b, // SubBytes(X1)
0x01, 0x0e, 0x0b, 0x04, // ROT(SubBytes(X1))
0x0c, 0x09, 0x06, 0x03, // SubBytes(X3)
0x09, 0x06, 0x03, 0x0c, // ROT(SubBytes(X3))
)
rcons := simd.u8x16{
0, 0, 0, 0,
IMM8, 0, 0, 0,
0, 0, 0, 0,
IMM8, 0, 0, 0,
}
return simd.bit_xor(dest, rcons)
}
// The keyschedule implementation is easier to read with some extra
// Intel intrinsics that are emulated by built-in LLVM ops anyway.
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
shift :: IMM8 & 0xff
// This needs to emit behavior identical to PSLLDQ which is as follows:
//
// TEMP := COUNT
// IF (TEMP > 15) THEN TEMP := 16; FI
// DEST := DEST << (TEMP * 8)
// DEST[MAXVL-1:128] (Unmodified)
return simd.shuffle(
simd.u8x16{},
a,
0 when shift > 15 else (16 - shift + 0),
1 when shift > 15 else (16 - shift + 1),
2 when shift > 15 else (16 - shift + 2),
3 when shift > 15 else (16 - shift + 3),
4 when shift > 15 else (16 - shift + 4),
5 when shift > 15 else (16 - shift + 5),
6 when shift > 15 else (16 - shift + 6),
7 when shift > 15 else (16 - shift + 7),
8 when shift > 15 else (16 - shift + 8),
9 when shift > 15 else (16 - shift + 9),
10 when shift > 15 else (16 - shift + 10),
11 when shift > 15 else (16 - shift + 11),
12 when shift > 15 else (16 - shift + 12),
13 when shift > 15 else (16 - shift + 13),
14 when shift > 15 else (16 - shift + 14),
15 when shift > 15 else (16 - shift + 15),
)
}
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
v := transmute(simd.i32x4)a
return transmute(simd.u8x16)simd.shuffle(
v,
v,
IMM8 & 0b11,
(IMM8 >> 2) & 0b11,
(IMM8 >> 4) & 0b11,
(IMM8 >> 6) & 0b11,
)
}
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
return transmute(simd.u8x16)simd.shuffle(
transmute(simd.u32x4)(a),
transmute(simd.u32x4)(b),
u32(MASK) & 0b11,
(u32(MASK)>>2) & 0b11,
((u32(MASK)>>4) & 0b11)+4,
((u32(MASK)>>6) & 0b11)+4)
}

View File

@@ -0,0 +1,55 @@
#+build amd64
package aes_hw
import "core:simd"
import "core:simd/x86"
// Intel/RISC-V semantics.
TARGET_FEATURES :: "sse,sse2,ssse3,sse4.1,aes"
HAS_GHASH :: true
@(require_results, enable_target_feature = "aes")
aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_aesdec_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
}
@(require_results, enable_target_feature = "aes")
aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_aesdeclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
}
@(require_results, enable_target_feature = "aes")
aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_aesenc_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
}
@(require_results, enable_target_feature = "aes")
aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_aesenclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key)))
}
@(require_results, enable_target_feature = "aes")
aesimc :: #force_inline proc "c" (data: simd.u8x16) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_aesimc_si128(transmute(x86.__m128i)(data)))
}
@(require_results, enable_target_feature = "aes")
aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_aeskeygenassist_si128(transmute(x86.__m128i)(data), IMM8))
}
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_slli_si128(transmute(x86.__m128i)(a), IMM8))
}
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_shuffle_epi32(transmute(x86.__m128i)(a), IMM8))
}
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
return transmute(simd.u8x16)(x86._mm_shuffle_ps(transmute(x86.__m128)(a), transmute(x86.__m128)(b), MASK))
}

View File

@@ -0,0 +1,181 @@
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#+build amd64,arm32
package aes_hw
import "base:intrinsics"
import "core:crypto"
import "core:crypto/_aes"
import "core:simd"
// Inspiration taken from BearSSL's AES-NI implementation.
//
// Note: This assumes that the SROA optimization pass is enabled to be
// anything resembling performant; otherwise, LLVM will not elide a massive
// number of redundant loads/stores it generates for every intrinsic call.
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step128 :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
k1, k2 := k1, k2
k2 = _mm_shuffle_epi32(k2, 0xff)
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
return simd.bit_xor(k1, k2)
}
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step192a :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> (simd.u8x16, simd.u8x16) {
k1, k2, k3 := k1_^, k2_^, k3
k3 = _mm_shuffle_epi32(k3, 0x55)
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, k3)
tmp := k2
k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04))
k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff))
k1_, k2_ := k1_, k2_
k1_^, k2_^ = k1, k2
r1 := _mm_shuffle_ps(tmp, k1, 0x44)
r2 := _mm_shuffle_ps(k1, k2, 0x4e)
return r1, r2
}
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step192b :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> simd.u8x16 {
k1, k2, k3 := k1_^, k2_^, k3
k3 = _mm_shuffle_epi32(k3, 0x55)
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, k3)
k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04))
k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff))
k1_, k2_ := k1_, k2_
k1_^, k2_^ = k1, k2
return k1
}
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step256b :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
k1, k2 := k1, k2
k2 = _mm_shuffle_epi32(k2, 0xaa)
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
return simd.bit_xor(k1, k2)
}
@(private = "file", enable_target_feature = TARGET_FEATURES)
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]simd.u8x16, num_rounds: int) {
intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[0]), sks[num_rounds])
for i in 1 ..< num_rounds {
tmp := aesimc(sks[i])
intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
}
intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds]), sks[0])
}
@(private, enable_target_feature = TARGET_FEATURES)
keysched :: proc(ctx: ^Context, key: []byte) {
sks: [15]simd.u8x16 = ---
// Compute the encryption keys.
num_rounds, key_len := 0, len(key)
switch key_len {
case _aes.KEY_SIZE_128:
sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
sks[1] = expand_step128(sks[0], aeskeygenassist(sks[0], 0x01))
sks[2] = expand_step128(sks[1], aeskeygenassist(sks[1], 0x02))
sks[3] = expand_step128(sks[2], aeskeygenassist(sks[2], 0x04))
sks[4] = expand_step128(sks[3], aeskeygenassist(sks[3], 0x08))
sks[5] = expand_step128(sks[4], aeskeygenassist(sks[4], 0x10))
sks[6] = expand_step128(sks[5], aeskeygenassist(sks[5], 0x20))
sks[7] = expand_step128(sks[6], aeskeygenassist(sks[6], 0x40))
sks[8] = expand_step128(sks[7], aeskeygenassist(sks[7], 0x80))
sks[9] = expand_step128(sks[8], aeskeygenassist(sks[8], 0x1b))
sks[10] = expand_step128(sks[9], aeskeygenassist(sks[9], 0x36))
num_rounds = _aes.ROUNDS_128
case _aes.KEY_SIZE_192:
k0 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
k1_tmp: [16]byte
copy(k1_tmp[:], key[16:24])
k1 := intrinsics.unaligned_load((^simd.u8x16)(&k1_tmp))
crypto.zero_explicit(&k1_tmp, size_of(k1_tmp))
sks[0] = k0
sks[1], sks[2] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x01))
sks[3] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x02))
sks[4], sks[5] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x04))
sks[6] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x08))
sks[7], sks[8] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x10))
sks[9] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x20))
sks[10], sks[11] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x40))
sks[12] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x80))
num_rounds = _aes.ROUNDS_192
case _aes.KEY_SIZE_256:
sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
sks[1] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:])))
sks[2] = expand_step128(sks[0], aeskeygenassist(sks[1], 0x01))
sks[3] = expand_step256b(sks[1], aeskeygenassist(sks[2], 0x01))
sks[4] = expand_step128(sks[2], aeskeygenassist(sks[3], 0x02))
sks[5] = expand_step256b(sks[3], aeskeygenassist(sks[4], 0x02))
sks[6] = expand_step128(sks[4], aeskeygenassist(sks[5], 0x04))
sks[7] = expand_step256b(sks[5], aeskeygenassist(sks[6], 0x04))
sks[8] = expand_step128(sks[6], aeskeygenassist(sks[7], 0x08))
sks[9] = expand_step256b(sks[7], aeskeygenassist(sks[8], 0x08))
sks[10] = expand_step128(sks[8], aeskeygenassist(sks[9], 0x10))
sks[11] = expand_step256b(sks[9], aeskeygenassist(sks[10], 0x10))
sks[12] = expand_step128(sks[10], aeskeygenassist(sks[11], 0x20))
sks[13] = expand_step256b(sks[11], aeskeygenassist(sks[12], 0x20))
sks[14] = expand_step128(sks[12], aeskeygenassist(sks[13], 0x40))
num_rounds = _aes.ROUNDS_256
case:
panic("crypto/aes: invalid AES key size")
}
for i in 0 ..= num_rounds {
intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_enc[i]), sks[i])
}
// Compute the decryption keys. GCM and CTR do not need this, however
// ECB, CBC, OCB3, etc do.
derive_dec_keys(ctx, &sks, num_rounds)
ctx._num_rounds = num_rounds
crypto.zero_explicit(&sks, size_of(sks))
}

View File

@@ -0,0 +1,11 @@
#+build !amd64
#+build !arm64
#+build !arm32
package aes_hw
HAS_GHASH :: false
@(private)
keysched :: proc(ctx: ^Context, key: []byte) {
panic("crypto/aes: hardware implementation unsupported")
}

View File

@@ -1,38 +0,0 @@
#+build amd64
package aes_hw_intel
import "core:sys/info"
// is_supported returns true iff hardware accelerated AES
// is supported.
is_supported :: proc "contextless" () -> bool {
// Note: Everything with AES-NI and PCLMULQDQ has support for
// the required SSE extxtensions.
req_features :: info.CPU_Features{
.sse2,
.ssse3,
.sse41,
.aes,
.pclmulqdq,
}
return info.cpu_features() >= req_features
}
// Context is a keyed AES (ECB) instance.
Context :: struct {
// Note: The ideal thing to do is for the expanded round keys to be
// arrays of `__m128i`, however that implies alignment (or using AVX).
//
// All the people using e-waste processors that don't support an
// insturction set that has been around for over 10 years are why
// we can't have nice things.
_sk_exp_enc: [15][16]byte,
_sk_exp_dec: [15][16]byte,
_num_rounds: int,
}
// init initializes a context for AES with the provided key.
init :: proc(ctx: ^Context, key: []byte) {
keysched(ctx, key)
}

View File

@@ -1,200 +0,0 @@
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#+build amd64
package aes_hw_intel
import "base:intrinsics"
import "core:crypto/_aes"
import "core:simd/x86"
// Intel AES-NI based implementation. Inspiration taken from BearSSL.
//
// Note: This assumes that the SROA optimization pass is enabled to be
// anything resembling performat otherwise, LLVM will not elide a massive
// number of redundant loads/stores it generates for every intrinsic call.
@(private = "file", require_results, enable_target_feature = "sse2")
expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
k1, k2 := k1, k2
k2 = x86._mm_shuffle_epi32(k2, 0xff)
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
return x86._mm_xor_si128(k1, k2)
}
@(private = "file", require_results, enable_target_feature = "sse,sse2")
expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
k1, k2, k3 := k1_^, k2_^, k3
k3 = x86._mm_shuffle_epi32(k3, 0x55)
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, k3)
tmp := k2
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
k1_, k2_ := k1_, k2_
k1_^, k2_^ = k1, k2
r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44))
r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e))
return r1, r2
}
@(private = "file", require_results, enable_target_feature = "sse2")
expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i {
k1, k2, k3 := k1_^, k2_^, k3
k3 = x86._mm_shuffle_epi32(k3, 0x55)
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, k3)
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
k1_, k2_ := k1_, k2_
k1_^, k2_^ = k1, k2
return k1
}
@(private = "file", require_results, enable_target_feature = "sse2")
expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
k1, k2 := k1, k2
k2 = x86._mm_shuffle_epi32(k2, 0xaa)
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
return x86._mm_xor_si128(k1, k2)
}
// derive_dec_keys fills ctx._sk_exp_dec from the expanded encryption
// keys in sks, for the Equivalent Inverse Cipher: the encryption round
// keys in reverse order, with InvMixColumns (aesimc) applied to every
// key except the first and last.
@(private = "file", enable_target_feature = "aes")
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) {
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds])
for i in 1 ..< num_rounds {
tmp := x86._mm_aesimc_si128(sks[i])
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
}
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0])
}
// keysched expands `key` into the per-round encryption and decryption
// key schedules stored in ctx, using the AES-NI keygen-assist helpers
// above.  Panics if len(key) is not a valid AES key size (16/24/32).
@(private, enable_target_feature = "sse,sse2,aes")
keysched :: proc(ctx: ^Context, key: []byte) {
sks: [15]x86.__m128i = ---
// Compute the encryption keys.
num_rounds, key_len := 0, len(key)
switch key_len {
case _aes.KEY_SIZE_128:
// AES-128: one expand_step128 per round, rcon 0x01..0x36.
sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01))
sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02))
sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04))
sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08))
sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10))
sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20))
sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40))
sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80))
sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b))
sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36))
num_rounds = _aes.ROUNDS_128
case _aes.KEY_SIZE_192:
// AES-192: key is 128+64 bits, so round keys straddle 128-bit lanes;
// each a/b step pair yields three packed round keys.
k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
// Load the trailing 8 key bytes into the low lane, zero the high lane.
k1 := x86.__m128i{
intrinsics.unaligned_load((^i64)(raw_data(key[16:]))),
0,
}
sks[0] = k0
sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01))
sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02))
sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04))
sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08))
sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10))
sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20))
sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40))
sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80))
num_rounds = _aes.ROUNDS_192
case _aes.KEY_SIZE_256:
// AES-256: alternate the 0xff step (with rcon, expand_step128) and
// the 0xaa step (no rcon, expand_step256b).
sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
sks[1] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:])))
sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01))
sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01))
sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02))
sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02))
sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04))
sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04))
sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08))
sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08))
sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10))
sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10))
sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20))
sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20))
sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40))
num_rounds = _aes.ROUNDS_256
case:
panic("crypto/aes: invalid AES key size")
}
for i in 0 ..= num_rounds {
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i])
}
// Compute the decryption keys. GCM and CTR do not need this, however
// ECB, CBC, OCB3, etc do.
derive_dec_keys(ctx, &sks, num_rounds)
ctx._num_rounds = num_rounds
zero_explicit(&sks, size_of(sks)) // scrub key material from the stack
}
/*
Set each byte of a memory range to zero.
This procedure copies the value `0` into the `len` bytes of a memory range,
starting at address `data`.
This procedure returns the pointer to `data`.
Unlike the `zero()` procedure, which can be optimized away or reordered by the
compiler under certain circumstances, `zero_explicit()` procedure can not be
optimized away or reordered with other memory access operations, and the
compiler assumes volatile semantics of the memory.
*/
// zero_explicit is "contextless" so it is callable from procedures that
// have no Odin context (e.g. other "contextless" crypto primitives).
zero_explicit :: proc "contextless" (data: rawptr, len: int) -> rawptr {
// This routine tries to avoid the compiler optimizing away the call,
// so that it is always executed. It is intended to provide
// equivalent semantics to those provided by the C11 Annex K 3.7.4.1
// memset_s call.
intrinsics.mem_zero_volatile(data, len) // Use the volatile mem_zero
intrinsics.atomic_thread_fence(.Seq_Cst) // Prevent reordering
return data
}

View File

@@ -215,7 +215,7 @@ _store_simd128 :: #force_inline proc "contextless" (
intrinsics.unaligned_store((^simd.u32x4)(dst[3:]), v3)
}
// is_performant returns true iff the target and current host both support
// is_performant returns true if and only if (⟺) the target and current host both support
// "enough" 128-bit SIMD to make this implementation performant.
is_performant :: proc "contextless" () -> bool {
when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 || ODIN_ARCH == .riscv64 {

View File

@@ -36,7 +36,7 @@ _VEC_ZERO_ONE: simd.u64x4 : {0, 0, 1, 0}
@(private = "file")
_VEC_TWO: simd.u64x4 : {2, 0, 2, 0}
// is_performant returns true iff the target and current host both support
// is_performant returns true if and only if (⟺) the target and current host both support
// "enough" SIMD to make this implementation performant.
is_performant :: proc "contextless" () -> bool {
req_features :: info.CPU_Features{.avx, .avx2}

View File

@@ -69,7 +69,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) ->
tmp: Montgomery_Domain_Field_Element = ---
fe_sub(&tmp, arg1, arg2)
// This will only underflow iff arg1 == arg2, and we return the borrow,
// This will only underflow if and only if (⟺) arg1 == arg2, and we return the borrow,
// which will be 1.
is_eq := subtle.u64_is_zero(fe_non_zero(&tmp))

View File

@@ -75,7 +75,7 @@ fe_equal :: proc "contextless" (arg1, arg2: ^Montgomery_Domain_Field_Element) ->
tmp: Montgomery_Domain_Field_Element = ---
fe_sub(&tmp, arg1, arg2)
// This will only underflow iff arg1 == arg2, and we return the borrow,
// This will only underflow if and only if (⟺) arg1 == arg2, and we return the borrow,
// which will be 1.
is_eq := subtle.u64_is_zero(fe_non_zero(&tmp))

View File

@@ -5,17 +5,17 @@ package _subtle
import "core:math/bits"
// byte_eq returns 1 iff a == b, 0 otherwise.
// byte_eq returns 1 if and only if (⟺) a == b, 0 otherwise.
@(optimization_mode="none")
byte_eq :: proc "contextless" (a, b: byte) -> int {
v := a ~ b
// v == 0 iff a == b. The subtraction will underflow, setting the
// v == 0 if and only if (⟺) a == b. The subtraction will underflow, setting the
// sign bit, which will get returned.
return int((u32(v)-1) >> 31)
}
// u64_eq returns 1 iff a == b, 0 otherwise.
// u64_eq returns 1 if and only if (⟺) a == b, 0 otherwise.
@(optimization_mode="none")
u64_eq :: proc "contextless" (a, b: u64) -> u64 {
_, borrow := bits.sub_u64(0, a ~ b, 0)
@@ -27,14 +27,14 @@ eq :: proc {
u64_eq,
}
// u64_is_zero returns 1 iff a == 0, 0 otherwise.
// u64_is_zero returns 1 if and only if (⟺) a == 0, 0 otherwise.
@(optimization_mode="none")
u64_is_zero :: proc "contextless" (a: u64) -> u64 {
_, borrow := bits.sub_u64(a, 1, 0)
return borrow
}
// u64_is_non_zero returns 1 iff a != 0, 0 otherwise.
// u64_is_non_zero returns 1 if and only if (⟺) a != 0, 0 otherwise.
@(optimization_mode="none")
u64_is_non_zero :: proc "contextless" (a: u64) -> u64 {
is_zero := u64_is_zero(a)

View File

@@ -13,7 +13,7 @@ seal_oneshot :: proc(algo: Algorithm, dst, tag, key, iv, aad, plaintext: []byte,
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
// with the provided algorithm, key, iv, and tag, and stores the output in dst,
// returning true iff the authentication was successful. If authentication
// returning true if and only if (⟺) the authentication was successful. If authentication
// fails, the destination buffer will be zeroed.
//
// dst and ciphertext MUST alias exactly or not at all.

View File

@@ -183,7 +183,7 @@ seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
// open_ctx authenticates the aad and ciphertext, and decrypts the ciphertext,
// with the provided Context, iv, and tag, and stores the output in dst,
// returning true iff the authentication was successful. If authentication
// returning true if and only if (⟺) the authentication was successful. If authentication
// fails, the destination buffer will be zeroed.
//
// dst and plaintext MUST alias exactly or not at all.

View File

@@ -144,7 +144,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
// with the provided Context, iv, and tag, and stores the output in dst,
// returning true iff the authentication was successful. If authentication
// returning true if and only if (⟺) the authentication was successful. If authentication
// fails, the destination buffer will be zeroed.
//
// dst and plaintext MUST alias exactly or not at all.

View File

@@ -0,0 +1,397 @@
#+build amd64,arm32
package aegis
import "base:intrinsics"
import "core:crypto"
import aes_hw "core:crypto/_aes/hw"
import "core:encoding/endian"
import "core:simd"
// State_HW is the hardware-accelerated AEGIS state: eight 128-bit state
// blocks (AEGIS-128L uses s0..s7; AEGIS-256 uses only s0..s5), plus the
// absorb/squeeze rate in bytes for the selected variant.
@(private)
State_HW :: struct {
s0: simd.u8x16,
s1: simd.u8x16,
s2: simd.u8x16,
s3: simd.u8x16,
s4: simd.u8x16,
s5: simd.u8x16,
s6: simd.u8x16,
s7: simd.u8x16,
// Rate in bytes (_RATE_128L or _RATE_256), set by init_hw.
rate: int,
}
// TARGET_FEATURES selects the CPU features required by the hardware AES
// round helpers (aes_hw.aesenc et al) on each architecture.
// NOTE(review): the file's build tag reads `#+build amd64,arm32`, yet
// this branch also matches .arm64 — confirm whether the tag should be
// `amd64, arm64, arm32` (arm32 vs arm64 typo?).
when ODIN_ARCH == .amd64 {
@(private="file")
TARGET_FEATURES :: "sse2,aes"
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
@(private="file")
TARGET_FEATURES :: "neon,aes"
}
// is_hardware_accelerated returns true if and only if hardware
// accelerated AEGIS is supported on the current host.
is_hardware_accelerated :: proc "contextless" () -> bool {
return aes_hw.is_supported()
}
// init_hw initializes the AEGIS state from ctx._key and iv, following
// the AEGIS-128L or AEGIS-256 Init() procedure, selected by ctx._key_len.
// Callers are expected to pass an iv of the size matching the variant.
@(private, enable_target_feature = TARGET_FEATURES)
init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) {
switch ctx._key_len {
case KEY_SIZE_128L:
key := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0]))
iv := intrinsics.unaligned_load((^simd.u8x16)(raw_data(iv)))
st.s0 = simd.bit_xor(key, iv)
st.s1 = intrinsics.unaligned_load((^simd.u8x16)(&_C1[0]))
st.s2 = intrinsics.unaligned_load((^simd.u8x16)(&_C0[0]))
st.s3 = st.s1
st.s4 = st.s0
st.s5 = simd.bit_xor(key, st.s2) // key ^ C0
st.s6 = simd.bit_xor(key, st.s1) // key ^ C1
st.s7 = st.s5
st.rate = _RATE_128L
// 10 initialization rounds, absorbing (iv, key) each round.
for _ in 0 ..< 10 {
update_hw_128l(st, iv, key)
}
case KEY_SIZE_256:
k0 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[0]))
k1 := intrinsics.unaligned_load((^simd.u8x16)(&ctx._key[16]))
n0 := intrinsics.unaligned_load((^simd.u8x16)(&iv[0]))
n1 := intrinsics.unaligned_load((^simd.u8x16)(&iv[16]))
st.s0 = simd.bit_xor(k0, n0)
st.s1 = simd.bit_xor(k1, n1)
st.s2 = intrinsics.unaligned_load((^simd.u8x16)(&_C1[0]))
st.s3 = intrinsics.unaligned_load((^simd.u8x16)(&_C0[0]))
st.s4 = simd.bit_xor(k0, st.s3) // k0 ^ C0
st.s5 = simd.bit_xor(k1, st.s2) // k1 ^ C1
st.rate = _RATE_256
// u0/u1 capture k0^n0 and k1^n1 before the state is updated.
u0, u1 := st.s0, st.s1
// 4 x 4 = 16 initialization rounds, cycling k0, k1, k0^n0, k1^n1.
for _ in 0 ..< 4 {
update_hw_256(st, k0)
update_hw_256(st, k1)
update_hw_256(st, u0)
update_hw_256(st, u1)
}
}
}
// update_hw_128l is the AEGIS-128L Update() function: each state block
// is AES-round encrypted into its successor (wrapping s7 -> s0), with
// the message blocks m0/m1 mixed into the s0 and s4 inputs.  All new
// values are computed from the pre-update state before any is stored.
@(private = "file", enable_target_feature = TARGET_FEATURES)
update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: simd.u8x16) {
s0_ := aes_hw.aesenc(st.s7, simd.bit_xor(st.s0, m0))
s1_ := aes_hw.aesenc(st.s0, st.s1)
s2_ := aes_hw.aesenc(st.s1, st.s2)
s3_ := aes_hw.aesenc(st.s2, st.s3)
s4_ := aes_hw.aesenc(st.s3, simd.bit_xor(st.s4, m1))
s5_ := aes_hw.aesenc(st.s4, st.s5)
s6_ := aes_hw.aesenc(st.s5, st.s6)
s7_ := aes_hw.aesenc(st.s6, st.s7)
st.s0, st.s1, st.s2, st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_
}
// update_hw_256 is the AEGIS-256 Update() function: six state blocks,
// each AES-round encrypted into its successor (wrapping s5 -> s0), with
// the message block m mixed into the s0 input.  All new values are
// computed from the pre-update state before any is stored.
@(private = "file", enable_target_feature = TARGET_FEATURES)
update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: simd.u8x16) {
s0_ := aes_hw.aesenc(st.s5, simd.bit_xor(st.s0, m))
s1_ := aes_hw.aesenc(st.s0, st.s1)
s2_ := aes_hw.aesenc(st.s1, st.s2)
s3_ := aes_hw.aesenc(st.s2, st.s3)
s4_ := aes_hw.aesenc(st.s3, st.s4)
s5_ := aes_hw.aesenc(st.s4, st.s5)
st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_
}
// absorb_hw_128l absorbs one full rate-sized (32-byte) AAD block:
// the two 16-byte halves are fed to Update as (t0, t1).
@(private = "file", enable_target_feature = TARGET_FEATURES)
absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
t0 := intrinsics.unaligned_load((^simd.u8x16)(&ai[0]))
t1 := intrinsics.unaligned_load((^simd.u8x16)(&ai[16]))
update_hw_128l(st, t0, t1)
}
// absorb_hw_256 absorbs one full rate-sized (16-byte) AAD block.
@(private = "file", enable_target_feature = TARGET_FEATURES)
absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
m := intrinsics.unaligned_load((^simd.u8x16)(&ai[0]))
update_hw_256(st, m)
}
// absorb_hw absorbs the entire AAD into the state at the variant's rate,
// zero-padding the final partial block (if any) to rate size.
@(private, enable_target_feature = TARGET_FEATURES)
absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check {
ai, l := aad, len(aad)
// Absorb all full rate-sized blocks.
switch st.rate {
case _RATE_128L:
for l >= _RATE_128L {
absorb_hw_128l(st, ai)
ai = ai[_RATE_128L:]
l -= _RATE_128L
}
case _RATE_256:
for l >= _RATE_256 {
absorb_hw_256(st, ai)
ai = ai[_RATE_256:]
l -= _RATE_256
}
}
// Pad out the remainder with `0`s till it is rate sized.
if l > 0 {
tmp: [_RATE_MAX]byte // AAD is not confidential.
copy(tmp[:], ai)
switch st.rate {
case _RATE_128L:
absorb_hw_128l(st, tmp[:])
case _RATE_256:
absorb_hw_256(st, tmp[:])
}
}
}
// z_hw_128l computes the AEGIS-128L keystream blocks from the state:
// z0 = s6 ^ s1 ^ (s2 & s3), z1 = s2 ^ s5 ^ (s6 & s7).
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (simd.u8x16, simd.u8x16) {
z0 := simd.bit_xor(
st.s6,
simd.bit_xor(
st.s1,
simd.bit_and(st.s2, st.s3),
),
)
z1 := simd.bit_xor(
st.s2,
simd.bit_xor(
st.s5,
simd.bit_and(st.s6, st.s7),
),
)
return z0, z1
}
// z_hw_256 computes the AEGIS-256 keystream block from the state:
// z = s1 ^ s4 ^ s5 ^ (s2 & s3).
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> simd.u8x16 {
return simd.bit_xor(
st.s1,
simd.bit_xor(
st.s4,
simd.bit_xor(
st.s5,
simd.bit_and(st.s2, st.s3),
),
),
)
}
// enc_hw_128l encrypts one full rate-sized (32-byte) block: the state is
// updated with the plaintext (xi), and the ciphertext (ci) is the
// plaintext XORed with the keystream derived from the pre-update state.
@(private = "file", enable_target_feature = TARGET_FEATURES)
enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
z0, z1 := z_hw_128l(st) // keystream from the pre-update state
t0 := intrinsics.unaligned_load((^simd.u8x16)(&xi[0]))
t1 := intrinsics.unaligned_load((^simd.u8x16)(&xi[16]))
update_hw_128l(st, t0, t1)
out0 := simd.bit_xor(t0, z0)
out1 := simd.bit_xor(t1, z1)
intrinsics.unaligned_store((^simd.u8x16)(&ci[0]), out0)
intrinsics.unaligned_store((^simd.u8x16)(&ci[16]), out1)
}
// enc_hw_256 encrypts one full rate-sized (16-byte) block: the state is
// updated with the plaintext, and the ciphertext is the plaintext XORed
// with the keystream derived from the pre-update state.
@(private = "file", enable_target_feature = TARGET_FEATURES)
enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
z := z_hw_256(st) // keystream from the pre-update state
xi_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(xi)))
update_hw_256(st, xi_)
ci_ := simd.bit_xor(xi_, z)
intrinsics.unaligned_store((^simd.u8x16)(raw_data(ci)), ci_)
}
// enc_hw encrypts src into dst at the variant's rate.  The trailing
// partial block is zero-padded before being absorbed, and only the
// first l ciphertext bytes are copied out.  dst must be at least
// len(src) bytes.
@(private, enable_target_feature = TARGET_FEATURES)
enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
ci, xi, l := dst, src, len(src)
// Encrypt all full rate-sized blocks.
switch st.rate {
case _RATE_128L:
for l >= _RATE_128L {
enc_hw_128l(st, ci, xi)
ci = ci[_RATE_128L:]
xi = xi[_RATE_128L:]
l -= _RATE_128L
}
case _RATE_256:
for l >= _RATE_256 {
enc_hw_256(st, ci, xi)
ci = ci[_RATE_256:]
xi = xi[_RATE_256:]
l -= _RATE_256
}
}
// Pad out the remainder with `0`s till it is rate sized.
if l > 0 {
tmp: [_RATE_MAX]byte // Ciphertext is not confidential.
copy(tmp[:], xi)
switch st.rate {
case _RATE_128L:
enc_hw_128l(st, tmp[:], tmp[:])
case _RATE_256:
enc_hw_256(st, tmp[:], tmp[:])
}
copy(ci, tmp[:l]) // only the first l ciphertext bytes are output
}
}
// dec_hw_128l decrypts one full rate-sized (32-byte) block: the
// plaintext (xi) is the ciphertext (ci) XORed with the keystream, and
// the state is then updated with the recovered plaintext.
@(private = "file", enable_target_feature = TARGET_FEATURES)
dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
z0, z1 := z_hw_128l(st) // keystream from the pre-update state
t0 := intrinsics.unaligned_load((^simd.u8x16)(&ci[0]))
t1 := intrinsics.unaligned_load((^simd.u8x16)(&ci[16]))
out0 := simd.bit_xor(t0, z0)
out1 := simd.bit_xor(t1, z1)
update_hw_128l(st, out0, out1) // absorb the plaintext
intrinsics.unaligned_store((^simd.u8x16)(&xi[0]), out0)
intrinsics.unaligned_store((^simd.u8x16)(&xi[16]), out1)
}
// dec_hw_256 decrypts one full rate-sized (16-byte) block: the plaintext
// is the ciphertext XORed with the keystream, and the state is then
// updated with the recovered plaintext.
@(private = "file", enable_target_feature = TARGET_FEATURES)
dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
z := z_hw_256(st) // keystream from the pre-update state
ci_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(ci)))
xi_ := simd.bit_xor(ci_, z)
update_hw_256(st, xi_) // absorb the plaintext
intrinsics.unaligned_store((^simd.u8x16)(raw_data(xi)), xi_)
}
// dec_partial_hw_128l decrypts the final partial (< rate) block: the
// keystream is applied to the zero-padded ciphertext, the first len(xn)
// plaintext bytes are copied out, then the plaintext buffer is re-zeroed
// past len(xn) before being absorbed, so the state update matches the
// encryptor's zero-padded input.
@(private = "file", enable_target_feature = TARGET_FEATURES)
dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
tmp: [_RATE_128L]byte
defer crypto.zero_explicit(&tmp, size_of(tmp)) // plaintext is confidential
z0, z1 := z_hw_128l(st)
copy(tmp[:], cn) // zero-pad the ciphertext to rate size
t0 := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
t1 := intrinsics.unaligned_load((^simd.u8x16)(&tmp[16]))
out0 := simd.bit_xor(t0, z0)
out1 := simd.bit_xor(t1, z1)
intrinsics.unaligned_store((^simd.u8x16)(&tmp[0]), out0)
intrinsics.unaligned_store((^simd.u8x16)(&tmp[16]), out1)
copy(xn, tmp[:])
// Re-zero the tail so the absorbed plaintext is zero-padded.
for off := len(xn); off < _RATE_128L; off += 1 {
tmp[off] = 0
}
out0 = intrinsics.unaligned_load((^simd.u8x16)(&tmp[0])) // v0
out1 = intrinsics.unaligned_load((^simd.u8x16)(&tmp[16])) // v1
update_hw_128l(st, out0, out1)
}
// dec_partial_hw_256 decrypts the final partial (< rate) block, the
// AEGIS-256 analogue of dec_partial_hw_128l: decrypt the zero-padded
// ciphertext, emit len(xn) plaintext bytes, re-zero the tail, and
// absorb the zero-padded plaintext.
@(private = "file", enable_target_feature = TARGET_FEATURES)
dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
tmp: [_RATE_256]byte
defer crypto.zero_explicit(&tmp, size_of(tmp)) // plaintext is confidential
z := z_hw_256(st)
copy(tmp[:], cn) // zero-pad the ciphertext to rate size
cn_ := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
xn_ := simd.bit_xor(cn_, z)
intrinsics.unaligned_store((^simd.u8x16)(&tmp[0]), xn_)
copy(xn, tmp[:])
// Re-zero the tail so the absorbed plaintext is zero-padded.
for off := len(xn); off < _RATE_256; off += 1 {
tmp[off] = 0
}
xn_ = intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
update_hw_256(st, xn_)
}
// dec_hw decrypts src into dst at the variant's rate, handing the
// trailing partial block to the dec_partial_hw_* helpers.  dst must be
// at least len(src) bytes.
@(private, enable_target_feature = TARGET_FEATURES)
dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
xi, ci, l := dst, src, len(src)
// Decrypt all full rate-sized blocks.
switch st.rate {
case _RATE_128L:
for l >= _RATE_128L {
dec_hw_128l(st, xi, ci)
xi = xi[_RATE_128L:]
ci = ci[_RATE_128L:]
l -= _RATE_128L
}
case _RATE_256:
for l >= _RATE_256 {
dec_hw_256(st, xi, ci)
xi = xi[_RATE_256:]
ci = ci[_RATE_256:]
l -= _RATE_256
}
}
// Process the remainder.
if l > 0 {
switch st.rate {
case _RATE_128L:
dec_partial_hw_128l(st, xi, ci)
case _RATE_256:
dec_partial_hw_256(st, xi, ci)
}
}
}
// finalize_hw computes the authentication tag: the AAD and message bit
// lengths (as two little-endian u64s) are XORed into s2 (128L) or s3
// (256), absorbed for 7 update rounds, and the state is XOR-folded into
// a 16-byte (TAG_SIZE_128) or 32-byte (TAG_SIZE_256) tag.
@(private, enable_target_feature = TARGET_FEATURES)
finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) {
tmp: [16]byte
// Lengths are in bits, per the AEGIS specification.
endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8)
endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8)
t := intrinsics.unaligned_load((^simd.u8x16)(&tmp[0]))
t0, t1: simd.u8x16 = ---, ---
switch st.rate {
case _RATE_128L:
t = simd.bit_xor(st.s2, t)
for _ in 0 ..< 7 {
update_hw_128l(st, t, t)
}
// 128-bit tag folds s0..s6 (^ s7 only for the 256-bit tag).
t0 = simd.bit_xor(st.s0, st.s1)
t0 = simd.bit_xor(t0, st.s2)
t0 = simd.bit_xor(t0, st.s3)
t1 = simd.bit_xor(st.s4, st.s5)
t1 = simd.bit_xor(t1, st.s6)
if len(tag) == TAG_SIZE_256 {
t1 = simd.bit_xor(t1, st.s7)
}
case _RATE_256:
t = simd.bit_xor(st.s3, t)
for _ in 0 ..< 7 {
update_hw_256(st, t)
}
t0 = simd.bit_xor(st.s0, st.s1)
t0 = simd.bit_xor(t0, st.s2)
t1 = simd.bit_xor(st.s3, st.s4)
t1 = simd.bit_xor(t1, st.s5)
}
switch len(tag) {
case TAG_SIZE_128:
// 128-bit tag: fold both halves together.
t0 = simd.bit_xor(t0, t1)
intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), t0)
case TAG_SIZE_256:
intrinsics.unaligned_store((^simd.u8x16)(&tag[0]), t0)
intrinsics.unaligned_store((^simd.u8x16)(&tag[16]), t1)
}
}
// reset_state_hw securely erases the AEGIS state (keystream-equivalent
// secret material) via a non-elidable zeroization.
@(private)
reset_state_hw :: proc "contextless" (st: ^State_HW) {
crypto.zero_explicit(st, size_of(st^))
}

View File

@@ -1,4 +1,6 @@
#+build !amd64
#+build !arm64
#+build !arm32
package aegis
@(private = "file")
@@ -7,7 +9,7 @@ ERR_HW_NOT_SUPPORTED :: "crypto/aegis: hardware implementation unsupported"
@(private)
State_HW :: struct {}
// is_hardware_accelerated returns true iff hardware accelerated AEGIS
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AEGIS
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return false

View File

@@ -1,389 +0,0 @@
#+build amd64
package aegis
import "base:intrinsics"
import "core:crypto"
import "core:crypto/aes"
import "core:encoding/endian"
import "core:simd/x86"
@(private)
State_HW :: struct {
s0: x86.__m128i,
s1: x86.__m128i,
s2: x86.__m128i,
s3: x86.__m128i,
s4: x86.__m128i,
s5: x86.__m128i,
s6: x86.__m128i,
s7: x86.__m128i,
rate: int,
}
// is_hardware_accelerated returns true iff hardware accelerated AEGIS
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return aes.is_hardware_accelerated()
}
@(private, enable_target_feature = "sse2,aes")
init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) {
switch ctx._key_len {
case KEY_SIZE_128L:
key := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0]))
iv := intrinsics.unaligned_load((^x86.__m128i)(raw_data(iv)))
st.s0 = x86._mm_xor_si128(key, iv)
st.s1 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0]))
st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0]))
st.s3 = st.s1
st.s4 = st.s0
st.s5 = x86._mm_xor_si128(key, st.s2) // key ^ C0
st.s6 = x86._mm_xor_si128(key, st.s1) // key ^ C1
st.s7 = st.s5
st.rate = _RATE_128L
for _ in 0 ..< 10 {
update_hw_128l(st, iv, key)
}
case KEY_SIZE_256:
k0 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0]))
k1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[16]))
n0 := intrinsics.unaligned_load((^x86.__m128i)(&iv[0]))
n1 := intrinsics.unaligned_load((^x86.__m128i)(&iv[16]))
st.s0 = x86._mm_xor_si128(k0, n0)
st.s1 = x86._mm_xor_si128(k1, n1)
st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0]))
st.s3 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0]))
st.s4 = x86._mm_xor_si128(k0, st.s3) // k0 ^ C0
st.s5 = x86._mm_xor_si128(k1, st.s2) // k1 ^ C1
st.rate = _RATE_256
u0, u1 := st.s0, st.s1
for _ in 0 ..< 4 {
update_hw_256(st, k0)
update_hw_256(st, k1)
update_hw_256(st, u0)
update_hw_256(st, u1)
}
}
}
@(private = "file", enable_target_feature = "sse2,aes")
update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: x86.__m128i) {
s0_ := x86._mm_aesenc_si128(st.s7, x86._mm_xor_si128(st.s0, m0))
s1_ := x86._mm_aesenc_si128(st.s0, st.s1)
s2_ := x86._mm_aesenc_si128(st.s1, st.s2)
s3_ := x86._mm_aesenc_si128(st.s2, st.s3)
s4_ := x86._mm_aesenc_si128(st.s3, x86._mm_xor_si128(st.s4, m1))
s5_ := x86._mm_aesenc_si128(st.s4, st.s5)
s6_ := x86._mm_aesenc_si128(st.s5, st.s6)
s7_ := x86._mm_aesenc_si128(st.s6, st.s7)
st.s0, st.s1, st.s2, st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_
}
@(private = "file", enable_target_feature = "sse2,aes")
update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: x86.__m128i) {
s0_ := x86._mm_aesenc_si128(st.s5, x86._mm_xor_si128(st.s0, m))
s1_ := x86._mm_aesenc_si128(st.s0, st.s1)
s2_ := x86._mm_aesenc_si128(st.s1, st.s2)
s3_ := x86._mm_aesenc_si128(st.s2, st.s3)
s4_ := x86._mm_aesenc_si128(st.s3, st.s4)
s5_ := x86._mm_aesenc_si128(st.s4, st.s5)
st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_
}
@(private = "file", enable_target_feature = "sse2,aes")
absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
t0 := intrinsics.unaligned_load((^x86.__m128i)(&ai[0]))
t1 := intrinsics.unaligned_load((^x86.__m128i)(&ai[16]))
update_hw_128l(st, t0, t1)
}
@(private = "file", enable_target_feature = "sse2,aes")
absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
m := intrinsics.unaligned_load((^x86.__m128i)(&ai[0]))
update_hw_256(st, m)
}
@(private, enable_target_feature = "sse2,aes")
absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check {
ai, l := aad, len(aad)
switch st.rate {
case _RATE_128L:
for l >= _RATE_128L {
absorb_hw_128l(st, ai)
ai = ai[_RATE_128L:]
l -= _RATE_128L
}
case _RATE_256:
for l >= _RATE_256 {
absorb_hw_256(st, ai)
ai = ai[_RATE_256:]
l -= _RATE_256
}
}
// Pad out the remainder with `0`s till it is rate sized.
if l > 0 {
tmp: [_RATE_MAX]byte // AAD is not confidential.
copy(tmp[:], ai)
switch st.rate {
case _RATE_128L:
absorb_hw_128l(st, tmp[:])
case _RATE_256:
absorb_hw_256(st, tmp[:])
}
}
}
@(private = "file", enable_target_feature = "sse2", require_results)
z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (x86.__m128i, x86.__m128i) {
z0 := x86._mm_xor_si128(
st.s6,
x86._mm_xor_si128(
st.s1,
x86._mm_and_si128(st.s2, st.s3),
),
)
z1 := x86._mm_xor_si128(
st.s2,
x86._mm_xor_si128(
st.s5,
x86._mm_and_si128(st.s6, st.s7),
),
)
return z0, z1
}
@(private = "file", enable_target_feature = "sse2", require_results)
z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> x86.__m128i {
return x86._mm_xor_si128(
st.s1,
x86._mm_xor_si128(
st.s4,
x86._mm_xor_si128(
st.s5,
x86._mm_and_si128(st.s2, st.s3),
),
),
)
}
@(private = "file", enable_target_feature = "sse2,aes")
enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
z0, z1 := z_hw_128l(st)
t0 := intrinsics.unaligned_load((^x86.__m128i)(&xi[0]))
t1 := intrinsics.unaligned_load((^x86.__m128i)(&xi[16]))
update_hw_128l(st, t0, t1)
out0 := x86._mm_xor_si128(t0, z0)
out1 := x86._mm_xor_si128(t1, z1)
intrinsics.unaligned_store((^x86.__m128i)(&ci[0]), out0)
intrinsics.unaligned_store((^x86.__m128i)(&ci[16]), out1)
}
@(private = "file", enable_target_feature = "sse2,aes")
enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
z := z_hw_256(st)
xi_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(xi)))
update_hw_256(st, xi_)
ci_ := x86._mm_xor_si128(xi_, z)
intrinsics.unaligned_store((^x86.__m128i)(raw_data(ci)), ci_)
}
@(private, enable_target_feature = "sse2,aes")
enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
ci, xi, l := dst, src, len(src)
switch st.rate {
case _RATE_128L:
for l >= _RATE_128L {
enc_hw_128l(st, ci, xi)
ci = ci[_RATE_128L:]
xi = xi[_RATE_128L:]
l -= _RATE_128L
}
case _RATE_256:
for l >= _RATE_256 {
enc_hw_256(st, ci, xi)
ci = ci[_RATE_256:]
xi = xi[_RATE_256:]
l -= _RATE_256
}
}
// Pad out the remainder with `0`s till it is rate sized.
if l > 0 {
tmp: [_RATE_MAX]byte // Ciphertext is not confidential.
copy(tmp[:], xi)
switch st.rate {
case _RATE_128L:
enc_hw_128l(st, tmp[:], tmp[:])
case _RATE_256:
enc_hw_256(st, tmp[:], tmp[:])
}
copy(ci, tmp[:l])
}
}
@(private = "file", enable_target_feature = "sse2,aes")
dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
z0, z1 := z_hw_128l(st)
t0 := intrinsics.unaligned_load((^x86.__m128i)(&ci[0]))
t1 := intrinsics.unaligned_load((^x86.__m128i)(&ci[16]))
out0 := x86._mm_xor_si128(t0, z0)
out1 := x86._mm_xor_si128(t1, z1)
update_hw_128l(st, out0, out1)
intrinsics.unaligned_store((^x86.__m128i)(&xi[0]), out0)
intrinsics.unaligned_store((^x86.__m128i)(&xi[16]), out1)
}
@(private = "file", enable_target_feature = "sse2,aes")
dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
z := z_hw_256(st)
ci_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(ci)))
xi_ := x86._mm_xor_si128(ci_, z)
update_hw_256(st, xi_)
intrinsics.unaligned_store((^x86.__m128i)(raw_data(xi)), xi_)
}
@(private = "file", enable_target_feature = "sse2,aes")
dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
tmp: [_RATE_128L]byte
defer crypto.zero_explicit(&tmp, size_of(tmp))
z0, z1 := z_hw_128l(st)
copy(tmp[:], cn)
t0 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
t1 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[16]))
out0 := x86._mm_xor_si128(t0, z0)
out1 := x86._mm_xor_si128(t1, z1)
intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), out0)
intrinsics.unaligned_store((^x86.__m128i)(&tmp[16]), out1)
copy(xn, tmp[:])
for off := len(xn); off < _RATE_128L; off += 1 {
tmp[off] = 0
}
out0 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) // v0
out1 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[16])) // v1
update_hw_128l(st, out0, out1)
}
@(private = "file", enable_target_feature = "sse2,aes")
dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
tmp: [_RATE_256]byte
defer crypto.zero_explicit(&tmp, size_of(tmp))
z := z_hw_256(st)
copy(tmp[:], cn)
cn_ := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
xn_ := x86._mm_xor_si128(cn_, z)
intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), xn_)
copy(xn, tmp[:])
for off := len(xn); off < _RATE_256; off += 1 {
tmp[off] = 0
}
xn_ = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
update_hw_256(st, xn_)
}
@(private, enable_target_feature = "sse2,aes")
dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
xi, ci, l := dst, src, len(src)
switch st.rate {
case _RATE_128L:
for l >= _RATE_128L {
dec_hw_128l(st, xi, ci)
xi = xi[_RATE_128L:]
ci = ci[_RATE_128L:]
l -= _RATE_128L
}
case _RATE_256:
for l >= _RATE_256 {
dec_hw_256(st, xi, ci)
xi = xi[_RATE_256:]
ci = ci[_RATE_256:]
l -= _RATE_256
}
}
// Process the remainder.
if l > 0 {
switch st.rate {
case _RATE_128L:
dec_partial_hw_128l(st, xi, ci)
case _RATE_256:
dec_partial_hw_256(st, xi, ci)
}
}
}
@(private, enable_target_feature = "sse2,aes")
finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) {
tmp: [16]byte
endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8)
endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8)
t := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0]))
t0, t1: x86.__m128i = ---, ---
switch st.rate {
case _RATE_128L:
t = x86._mm_xor_si128(st.s2, t)
for _ in 0 ..< 7 {
update_hw_128l(st, t, t)
}
t0 = x86._mm_xor_si128(st.s0, st.s1)
t0 = x86._mm_xor_si128(t0, st.s2)
t0 = x86._mm_xor_si128(t0, st.s3)
t1 = x86._mm_xor_si128(st.s4, st.s5)
t1 = x86._mm_xor_si128(t1, st.s6)
if len(tag) == TAG_SIZE_256 {
t1 = x86._mm_xor_si128(t1, st.s7)
}
case _RATE_256:
t = x86._mm_xor_si128(st.s3, t)
for _ in 0 ..< 7 {
update_hw_256(st, t)
}
t0 = x86._mm_xor_si128(st.s0, st.s1)
t0 = x86._mm_xor_si128(t0, st.s2)
t1 = x86._mm_xor_si128(st.s3, st.s4)
t1 = x86._mm_xor_si128(t1, st.s5)
}
switch len(tag) {
case TAG_SIZE_128:
t0 = x86._mm_xor_si128(t0, t1)
intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0)
case TAG_SIZE_256:
intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0)
intrinsics.unaligned_store((^x86.__m128i)(&tag[16]), t1)
}
}
@(private)
reset_state_hw :: proc "contextless" (st: ^State_HW) {
crypto.zero_explicit(st, size_of(st^))
}

View File

@@ -1,30 +1,32 @@
#+build amd64
#+build amd64,arm32
package aes
import "base:intrinsics"
import "core:crypto/_aes"
import aes_hw "core:crypto/_aes/hw"
import "core:encoding/endian"
import "core:math/bits"
import "core:simd/x86"
import "core:simd"
@(private)
CTR_STRIDE_HW :: 4
@(private)
CTR_STRIDE_BYTES_HW :: CTR_STRIDE_HW * BLOCK_SIZE
@(private, enable_target_feature = "sse2,aes")
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
hw_ctx := ctx._impl.(Context_Impl_Hardware)
sks: [15]x86.__m128i = ---
sks: [15]simd.u8x16 = ---
for i in 0 ..= hw_ctx._num_rounds {
sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&hw_ctx._sk_exp_enc[i]))
sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&hw_ctx._sk_exp_enc[i]))
}
hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (x86.__m128i, u64, u64) {
ret := x86.__m128i{
i64(intrinsics.byte_swap(hi)),
i64(intrinsics.byte_swap(lo)),
}
hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (simd.u8x16, u64, u64) {
buf: [BLOCK_SIZE]byte = ---
endian.unchecked_put_u64be(buf[0:], hi)
endian.unchecked_put_u64be(buf[8:], lo)
ret := intrinsics.unaligned_load((^simd.u8x16)(&buf))
hi, lo := hi, lo
carry: u64
@@ -46,42 +48,42 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
nr_blocks := nr_blocks
ctr_hi, ctr_lo := ctx._ctr_hi, ctx._ctr_lo
blks: [CTR_STRIDE_HW]x86.__m128i = ---
blks: [CTR_STRIDE_HW]simd.u8x16 = ---
for nr_blocks >= CTR_STRIDE_HW {
#unroll for i in 0..< CTR_STRIDE_HW {
blks[i], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_xor_si128(blks[i], sks[0])
blks[i] = simd.bit_xor(blks[i], sks[0])
}
#unroll for i in 1 ..= 9 {
#unroll for j in 0 ..< CTR_STRIDE_HW {
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
blks[j] = aes_hw.aesenc(blks[j], sks[i])
}
}
switch hw_ctx._num_rounds {
case _aes.ROUNDS_128:
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
blks[i] = aes_hw.aesenclast(blks[i], sks[10])
}
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
#unroll for j in 0 ..< CTR_STRIDE_HW {
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
blks[j] = aes_hw.aesenc(blks[j], sks[i])
}
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
blks[i] = aes_hw.aesenclast(blks[i], sks[12])
}
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
#unroll for j in 0 ..< CTR_STRIDE_HW {
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
blks[j] = aes_hw.aesenc(blks[j], sks[i])
}
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
blks[i] = aes_hw.aesenclast(blks[i], sks[14])
}
}
@@ -98,23 +100,23 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
for nr_blocks > 0 {
blks[0], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
blks[0] = x86._mm_xor_si128(blks[0], sks[0])
blks[0] = simd.bit_xor(blks[0], sks[0])
#unroll for i in 1 ..= 9 {
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
blks[0] = aes_hw.aesenc(blks[0], sks[i])
}
switch hw_ctx._num_rounds {
case _aes.ROUNDS_128:
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
blks[0] = aes_hw.aesenclast(blks[0], sks[10])
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
blks[0] = aes_hw.aesenc(blks[0], sks[i])
}
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
blks[0] = aes_hw.aesenclast(blks[0], sks[12])
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
blks[0] = aes_hw.aesenc(blks[0], sks[i])
}
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
blks[0] = aes_hw.aesenclast(blks[0], sks[14])
}
xor_blocks_hw(dst, src, blks[:1])
@@ -133,18 +135,18 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_b
zero_explicit(&sks, size_of(sks))
}
@(private, enable_target_feature = "sse2")
xor_blocks_hw :: proc(dst, src: []byte, blocks: []x86.__m128i) {
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
xor_blocks_hw :: proc(dst, src: []byte, blocks: []simd.u8x16) {
#no_bounds_check {
if src != nil {
for i in 0 ..< len(blocks) {
off := i * BLOCK_SIZE
tmp := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[off:])))
blocks[i] = x86._mm_xor_si128(blocks[i], tmp)
tmp := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[off:])))
blocks[i] = simd.bit_xor(blocks[i], tmp)
}
}
for i in 0 ..< len(blocks) {
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
}
}
}

View File

@@ -0,0 +1,59 @@
#+build amd64,arm32
package aes
import "base:intrinsics"
import "core:crypto/_aes"
import aes_hw "core:crypto/_aes/hw"
import "core:simd"
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
blk := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src)))
blk = simd.bit_xor(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[0])))
#unroll for i in 1 ..= 9 {
blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])))
}
switch ctx._num_rounds {
case _aes.ROUNDS_128:
blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[10])))
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])))
}
blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[12])))
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
blk = aes_hw.aesenc(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i])))
}
blk = aes_hw.aesenclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[14])))
}
intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), blk)
}
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
blk := intrinsics.unaligned_load((^simd.u8x16)(raw_data(src)))
blk = simd.bit_xor(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[0])))
#unroll for i in 1 ..= 9 {
blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i])))
}
switch ctx._num_rounds {
case _aes.ROUNDS_128:
blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[10])))
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i])))
}
blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[12])))
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
blk = aes_hw.aesdec(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[i])))
}
blk = aes_hw.aesdeclast(blk, intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_dec[14])))
}
intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), blk)
}

View File

@@ -1,58 +0,0 @@
#+build amd64
package aes
import "base:intrinsics"
import "core:crypto/_aes"
import "core:simd/x86"
@(private, enable_target_feature = "sse2,aes")
encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[0])))
#unroll for i in 1 ..= 9 {
blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
}
switch ctx._num_rounds {
case _aes.ROUNDS_128:
blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[10])))
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
}
blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[12])))
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
}
blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[14])))
}
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
}
@(private, enable_target_feature = "sse2,aes")
decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[0])))
#unroll for i in 1 ..= 9 {
blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
}
switch ctx._num_rounds {
case _aes.ROUNDS_128:
blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[10])))
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
}
blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[12])))
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
}
blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[14])))
}
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
}

View File

@@ -4,6 +4,7 @@ import "core:bytes"
import "core:crypto"
import "core:crypto/_aes"
import "core:crypto/_aes/ct64"
import aes_hw "core:crypto/_aes/hw"
import "core:encoding/endian"
// GCM_IV_SIZE is the default size of the GCM IV in bytes.
@@ -26,6 +27,10 @@ Context_GCM :: struct {
// init_gcm initializes a Context_GCM with the provided key.
init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := DEFAULT_IMPLEMENTATION) {
when aes_hw.HAS_GHASH {
impl := aes_hw.is_ghash_supported() ? impl : .Portable
}
init_impl(&ctx._impl, key, impl)
ctx._is_initialized = true
}
@@ -65,7 +70,7 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) {
// open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext,
// with the provided Context_GCM, iv, and tag, and stores the output in dst,
// returning true iff the authentication was successful. If authentication
// returning true if and only if (⟺) the authentication was successful. If authentication
// fails, the destination buffer will be zeroed.
//
// dst and plaintext MUST alias exactly or not at all.

View File

@@ -1,12 +1,13 @@
#+build amd64
#+build amd64,arm32
package aes
import "base:intrinsics"
import "core:crypto"
import "core:crypto/_aes"
import "core:crypto/_aes/hw_intel"
@(require) import "core:crypto/_aes/ct64"
import aes_hw "core:crypto/_aes/hw"
import "core:encoding/endian"
import "core:simd/x86"
import "core:simd"
@(private)
gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) {
@@ -17,7 +18,11 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: [
init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
// Note: Our GHASH implementation handles appending padding.
hw_intel.ghash(s[:], h[:], aad)
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], aad)
} else {
ct64.ghash(s[:], h[:], aad)
}
gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true)
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext))
copy(tag, s[:])
@@ -35,7 +40,11 @@ gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag:
s: [_aes.GHASH_TAG_SIZE]byte
init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
hw_intel.ghash(s[:], h[:], aad)
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], aad)
} else {
ct64.ghash(s[:], h[:], aad)
}
gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false)
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext))
@@ -71,18 +80,26 @@ init_ghash_hw :: proc(
} else {
// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
// and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64).
hw_intel.ghash(j0[:], h[:], iv)
when aes_hw.HAS_GHASH {
aes_hw.ghash(j0[:], h[:], iv)
} else {
ct64.ghash(j0[:], h[:], iv)
}
tmp: [_aes.GHASH_BLOCK_SIZE]byte
endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
hw_intel.ghash(j0[:], h[:], tmp[:])
when aes_hw.HAS_GHASH {
aes_hw.ghash(j0[:], h[:], tmp[:])
} else {
ct64.ghash(j0[:], h[:], tmp[:])
}
}
// ECB encrypt j0, so that we can just XOR with the tag.
encrypt_block_hw(ctx, j0_enc[:], j0[:])
}
@(private = "file", enable_target_feature = "sse2")
@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
final_ghash_hw :: proc(
s: ^[_aes.GHASH_BLOCK_SIZE]byte,
h: ^[_aes.GHASH_KEY_SIZE]byte,
@@ -94,14 +111,18 @@ final_ghash_hw :: proc(
endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8)
endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8)
hw_intel.ghash(s[:], h[:], blk[:])
j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0))
s_vec := intrinsics.unaligned_load((^x86.__m128i)(s))
s_vec = x86._mm_xor_si128(s_vec, j0_vec)
intrinsics.unaligned_store((^x86.__m128i)(s), s_vec)
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], blk[:])
} else {
ct64.ghash(s[:], h[:], blk[:])
}
j0_vec := intrinsics.unaligned_load((^simd.u8x16)(j0))
s_vec := intrinsics.unaligned_load((^simd.u8x16)(s))
s_vec = simd.bit_xor(s_vec, j0_vec)
intrinsics.unaligned_store((^simd.u8x16)(s), s_vec)
}
@(private = "file", enable_target_feature = "sse2,sse4.1,aes")
@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
gctr_hw :: proc(
ctx: ^Context_Impl_Hardware,
dst: []byte,
@@ -111,13 +132,13 @@ gctr_hw :: proc(
iv: ^[_aes.GHASH_BLOCK_SIZE]byte,
is_seal: bool,
) #no_bounds_check {
sks: [15]x86.__m128i = ---
sks: [15]simd.u8x16 = ---
for i in 0 ..= ctx._num_rounds {
sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))
sks[i] = intrinsics.unaligned_load((^simd.u8x16)(&ctx._sk_exp_enc[i]))
}
// Setup the counter block
ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(iv))
ctr_blk := intrinsics.unaligned_load((^simd.u8x16)(iv))
ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1
src, dst := src, dst
@@ -127,11 +148,15 @@ gctr_hw :: proc(
// This results in an unreadable mess, so we opt for simplicity
// as performance is adequate.
blks: [CTR_STRIDE_HW]x86.__m128i = ---
blks: [CTR_STRIDE_HW]simd.u8x16 = ---
nr_blocks := len(src) / BLOCK_SIZE
for nr_blocks >= CTR_STRIDE_HW {
if !is_seal {
hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
} else {
ct64.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
}
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
@@ -139,42 +164,46 @@ gctr_hw :: proc(
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_xor_si128(blks[i], sks[0])
blks[i] = simd.bit_xor(blks[i], sks[0])
}
#unroll for i in 1 ..= 9 {
#unroll for j in 0 ..< CTR_STRIDE_HW {
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
blks[j] = aes_hw.aesenc(blks[j], sks[i])
}
}
switch ctx._num_rounds {
case _aes.ROUNDS_128:
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
blks[i] = aes_hw.aesenclast(blks[i], sks[10])
}
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
#unroll for j in 0 ..< CTR_STRIDE_HW {
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
blks[j] = aes_hw.aesenc(blks[j], sks[i])
}
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
blks[i] = aes_hw.aesenclast(blks[i], sks[12])
}
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
#unroll for j in 0 ..< CTR_STRIDE_HW {
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
blks[j] = aes_hw.aesenc(blks[j], sks[i])
}
}
#unroll for i in 0 ..< CTR_STRIDE_HW {
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
blks[i] = aes_hw.aesenclast(blks[i], sks[14])
}
}
xor_blocks_hw(dst, src, blks[:])
if is_seal {
hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
} else {
ct64.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
}
}
src = src[CTR_STRIDE_BYTES_HW:]
@@ -186,28 +215,32 @@ gctr_hw :: proc(
for n := len(src); n > 0; {
l := min(n, BLOCK_SIZE)
if !is_seal {
hw_intel.ghash(s[:], h[:], src[:l])
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], src[:l])
} else {
ct64.ghash(s[:], h[:], src[:l])
}
}
blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr)
blks[0] = x86._mm_xor_si128(blks[0], sks[0])
blks[0] = simd.bit_xor(blks[0], sks[0])
#unroll for i in 1 ..= 9 {
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
blks[0] = aes_hw.aesenc(blks[0], sks[i])
}
switch ctx._num_rounds {
case _aes.ROUNDS_128:
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
blks[0] = aes_hw.aesenclast(blks[0], sks[10])
case _aes.ROUNDS_192:
#unroll for i in 10 ..= 11 {
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
blks[0] = aes_hw.aesenc(blks[0], sks[i])
}
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
blks[0] = aes_hw.aesenclast(blks[0], sks[12])
case _aes.ROUNDS_256:
#unroll for i in 10 ..= 13 {
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
blks[0] = aes_hw.aesenc(blks[0], sks[i])
}
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
blks[0] = aes_hw.aesenclast(blks[0], sks[14])
}
if l == BLOCK_SIZE {
@@ -219,7 +252,11 @@ gctr_hw :: proc(
copy(dst, blk[:l])
}
if is_seal {
hw_intel.ghash(s[:], h[:], dst[:l])
when aes_hw.HAS_GHASH {
aes_hw.ghash(s[:], h[:], dst[:l])
} else {
ct64.ghash(s[:], h[:], dst[:l])
}
}
dst = dst[l:]
@@ -235,8 +272,17 @@ gctr_hw :: proc(
// the compiler.
//
// src/check_expr.cpp(8104): Assertion Failure: `c->curr_proc_decl->entity`
@(private = "file", enable_target_feature = "sse4.1")
hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) {
ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3)
@(private = "file", enable_target_feature = aes_hw.TARGET_FEATURES)
hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^simd.u8x16, ctr: u32) -> (simd.u8x16, u32) {
when ODIN_ENDIAN == .Little {
ctr_be := intrinsics.byte_swap(ctr)
} else {
ctr_be := ctr
}
ret := transmute(simd.u8x16)(
simd.replace(transmute(simd.u32x4)(src^), 3, ctr_be)
)
return ret, ctr + 1
}

View File

@@ -0,0 +1,18 @@
#+build amd64,arm32
package aes
import aes_hw "core:crypto/_aes/hw"
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return aes_hw.is_supported()
}
@(private)
Context_Impl_Hardware :: aes_hw.Context
@(private, enable_target_feature = aes_hw.TARGET_FEATURES)
init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
aes_hw.init(ctx, key)
}

View File

@@ -1,10 +1,12 @@
#+build !amd64
#+build !arm64
#+build !arm32
package aes
@(private = "file")
ERR_HW_NOT_SUPPORTED :: "crypto/aes: hardware implementation unsupported"
// is_hardware_accelerated returns true iff hardware accelerated AES
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return false

View File

@@ -1,18 +0,0 @@
#+build amd64
package aes
import "core:crypto/_aes/hw_intel"
// is_hardware_accelerated returns true iff hardware accelerated AES
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return hw_intel.is_supported()
}
@(private)
Context_Impl_Hardware :: hw_intel.Context
@(private, enable_target_feature = "sse2,aes")
init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
hw_intel.init(ctx, key)
}

View File

@@ -54,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
_blake2.final(ctx, hash, finalize_clone)

View File

@@ -54,7 +54,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
_blake2.final(ctx, hash, finalize_clone)

View File

@@ -136,7 +136,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
// with the provided Context, iv, and tag, and stores the output in dst,
// returning true iff the authentication was successful. If authentication
// returning true if and only if (⟺) the authentication was successful. If authentication
// fails, the destination buffer will be zeroed.
//
// dst and plaintext MUST alias exactly or not at all.

View File

@@ -8,15 +8,15 @@ import subtle "core:crypto/_subtle"
// Omit large precomputed tables, trading off performance for size.
COMPACT_IMPLS: bool : #config(ODIN_CRYPTO_COMPACT, false)
// HAS_RAND_BYTES is true iff the runtime provides a cryptographic
// HAS_RAND_BYTES is true if and only if (⟺) the runtime provides a cryptographic
// entropy source.
HAS_RAND_BYTES :: runtime.HAS_RAND_BYTES
// compare_constant_time returns 1 iff a and b are equal, 0 otherwise.
// compare_constant_time returns 1 if and only if (⟺) a and b are equal, 0 otherwise.
//
// The execution time of this routine is constant regardless of the contents
// of the slices being compared, as long as the length of the slices is equal.
// If the length of the two slices is different, it will early-return 0.
// If the length of the two slices is different, it will early-return 0.
compare_constant_time :: proc "contextless" (a, b: []byte) -> int {
// If the length of the slices is different, early return.
//
@@ -31,7 +31,7 @@ compare_constant_time :: proc "contextless" (a, b: []byte) -> int {
return compare_byte_ptrs_constant_time(raw_data(a), raw_data(b), n)
}
// compare_byte_ptrs_constant_time returns 1 iff the bytes pointed to by
// compare_byte_ptrs_constant_time returns 1 if and only if (⟺) the bytes pointed to by
// a and b are equal, 0 otherwise.
//
// The execution time of this routine is constant regardless of the
@@ -46,12 +46,12 @@ compare_byte_ptrs_constant_time :: proc "contextless" (a, b: ^byte, n: int) -> i
v |= x[i] ~ y[i]
}
// After the loop, v == 0 iff a == b. The subtraction will underflow
// iff v == 0, setting the sign-bit, which gets returned.
// After the loop, v == 0 if and only if (⟺) a == b. The subtraction will underflow
// if and only if (⟺) v == 0, setting the sign-bit, which gets returned.
return subtle.eq(0, v)
}
// is_zero_constant_time returns 1 iff b is all 0s, 0 otherwise.
// is_zero_constant_time returns 1 if and only if (⟺) b is all 0s, 0 otherwise.
is_zero_constant_time :: proc "contextless" (b: []byte) -> int {
v: byte
for b_ in b {

View File

@@ -122,7 +122,7 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
// open authenticates the aad and ciphertext, and decrypts the ciphertext,
// with the provided Context, iv, and tag, and stores the output in dst,
// returning true iff the authentication was successful. If authentication
// returning true if and only if (⟺) the authentication was successful. If authentication
// fails, the destination buffer will be zeroed.
//
// dst and plaintext MUST alias exactly or not at all.

View File

@@ -1,152 +1,183 @@
#+build amd64
#+build amd64,arm32
package deoxysii
import "base:intrinsics"
import "core:crypto"
import "core:crypto/aes"
import aes_hw "core:crypto/_aes/hw"
import "core:simd"
import "core:simd/x86"
// This processes a maximum of 4 blocks at a time, as that is suitable
// for most current hardware that doesn't say "Xeon".
//
// TODO/perf: ARM should be able to do 8 at a time.
when ODIN_ARCH == .amd64 {
@(private="file")
TARGET_FEATURES :: "sse2,ssse3,aes"
} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
@(private="file")
TARGET_FEATURES :: "neon,aes"
}
@(private = "file")
_BIT_ENC :: x86.__m128i{0x80, 0}
_BIT_ENC :: simd.u8x16{0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
@(private = "file")
_PREFIX_AD_BLOCK :: x86.__m128i{PREFIX_AD_BLOCK << PREFIX_SHIFT, 0}
_PREFIX_AD_BLOCK :: simd.u8x16{
PREFIX_AD_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
}
@(private = "file")
_PREFIX_AD_FINAL :: x86.__m128i{PREFIX_AD_FINAL << PREFIX_SHIFT, 0}
_PREFIX_AD_FINAL :: simd.u8x16{
PREFIX_AD_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
}
@(private = "file")
_PREFIX_MSG_BLOCK :: x86.__m128i{PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0}
_PREFIX_MSG_BLOCK :: simd.u8x16{
PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
}
@(private = "file")
_PREFIX_MSG_FINAL :: x86.__m128i{PREFIX_MSG_FINAL << PREFIX_SHIFT, 0}
_PREFIX_MSG_FINAL :: simd.u8x16{
PREFIX_MSG_FINAL << PREFIX_SHIFT, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
}
// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return aes.is_hardware_accelerated()
return aes_hw.is_supported()
}
@(private = "file", enable_target_feature = "sse4.1", require_results)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
auth_tweak :: #force_inline proc "contextless" (
prefix: x86.__m128i,
prefix: simd.u8x16,
block_nr: int,
) -> x86.__m128i {
return x86._mm_insert_epi64(prefix, i64(intrinsics.byte_swap(u64(block_nr))), 1)
}
) -> simd.u8x16 {
when ODIN_ENDIAN == .Little {
block_nr_u64 := intrinsics.byte_swap(u64(block_nr))
} else {
block_nr_u64 := u64(block_nr)
}
@(private = "file", enable_target_feature = "sse2", require_results)
enc_tweak :: #force_inline proc "contextless" (
tag: x86.__m128i,
block_nr: int,
) -> x86.__m128i {
return x86._mm_xor_si128(
x86._mm_or_si128(tag, _BIT_ENC),
x86.__m128i{0, i64(intrinsics.byte_swap(u64(block_nr)))},
return simd.bit_or(
prefix,
transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}),
)
}
@(private = "file", enable_target_feature = "ssse3", require_results)
h_ :: #force_inline proc "contextless" (tk1: x86.__m128i) -> x86.__m128i {
return transmute(x86.__m128i)h(transmute(simd.u8x16)tk1)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
enc_tweak :: #force_inline proc "contextless" (
tag: simd.u8x16,
block_nr: int,
) -> simd.u8x16 {
when ODIN_ENDIAN == .Little {
block_nr_u64 := intrinsics.byte_swap(u64(block_nr))
} else {
block_nr_u64 := u64(block_nr)
}
return simd.bit_xor(
simd.bit_or(tag, _BIT_ENC),
transmute(simd.u8x16)(simd.u64x2{0, block_nr_u64}),
)
}
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
bc_x4 :: #force_inline proc "contextless" (
ctx: ^Context,
s_0, s_1, s_2, s_3: x86.__m128i,
tweak_0, tweak_1, tweak_2, tweak_3: x86.__m128i,
) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) #no_bounds_check {
s_0, s_1, s_2, s_3: simd.u8x16,
tweak_0, tweak_1, tweak_2, tweak_3: simd.u8x16,
) -> (simd.u8x16, simd.u8x16, simd.u8x16, simd.u8x16) #no_bounds_check {
s_0, s_1, s_2, s_3 := s_0, s_1, s_2, s_3
tk1_0, tk1_1, tk1_2, tk1_3 := tweak_0, tweak_1, tweak_2, tweak_3
sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
stk_0 := x86._mm_xor_si128(tk1_0, sk)
stk_1 := x86._mm_xor_si128(tk1_1, sk)
stk_2 := x86._mm_xor_si128(tk1_2, sk)
stk_3 := x86._mm_xor_si128(tk1_3, sk)
sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0]))
stk_0 := simd.bit_xor(tk1_0, sk)
stk_1 := simd.bit_xor(tk1_1, sk)
stk_2 := simd.bit_xor(tk1_2, sk)
stk_3 := simd.bit_xor(tk1_3, sk)
s_0 = x86._mm_xor_si128(s_0, stk_0)
s_1 = x86._mm_xor_si128(s_1, stk_1)
s_2 = x86._mm_xor_si128(s_2, stk_2)
s_3 = x86._mm_xor_si128(s_3, stk_3)
s_0 = simd.bit_xor(s_0, stk_0)
s_1 = simd.bit_xor(s_1, stk_1)
s_2 = simd.bit_xor(s_2, stk_2)
s_3 = simd.bit_xor(s_3, stk_3)
for i in 1 ..= BC_ROUNDS {
sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i]))
tk1_0 = h_(tk1_0)
tk1_1 = h_(tk1_1)
tk1_2 = h_(tk1_2)
tk1_3 = h_(tk1_3)
tk1_0 = h(tk1_0)
tk1_1 = h(tk1_1)
tk1_2 = h(tk1_2)
tk1_3 = h(tk1_3)
stk_0 = x86._mm_xor_si128(tk1_0, sk)
stk_1 = x86._mm_xor_si128(tk1_1, sk)
stk_2 = x86._mm_xor_si128(tk1_2, sk)
stk_3 = x86._mm_xor_si128(tk1_3, sk)
stk_0 = simd.bit_xor(tk1_0, sk)
stk_1 = simd.bit_xor(tk1_1, sk)
stk_2 = simd.bit_xor(tk1_2, sk)
stk_3 = simd.bit_xor(tk1_3, sk)
s_0 = x86._mm_aesenc_si128(s_0, stk_0)
s_1 = x86._mm_aesenc_si128(s_1, stk_1)
s_2 = x86._mm_aesenc_si128(s_2, stk_2)
s_3 = x86._mm_aesenc_si128(s_3, stk_3)
s_0 = aes_hw.aesenc(s_0, stk_0)
s_1 = aes_hw.aesenc(s_1, stk_1)
s_2 = aes_hw.aesenc(s_2, stk_2)
s_3 = aes_hw.aesenc(s_3, stk_3)
}
return s_0, s_1, s_2, s_3
}
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
bc_x1 :: #force_inline proc "contextless" (
ctx: ^Context,
s: x86.__m128i,
tweak: x86.__m128i,
) -> x86.__m128i #no_bounds_check {
s: simd.u8x16,
tweak: simd.u8x16,
) -> simd.u8x16 #no_bounds_check {
s, tk1 := s, tweak
sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
stk := x86._mm_xor_si128(tk1, sk)
sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[0]))
stk := simd.bit_xor(tk1, sk)
s = x86._mm_xor_si128(s, stk)
s = simd.bit_xor(s, stk)
for i in 1 ..= BC_ROUNDS {
sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
sk = intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i]))
tk1 = h_(tk1)
tk1 = h(tk1)
stk = x86._mm_xor_si128(tk1, sk)
stk = simd.bit_xor(tk1, sk)
s = x86._mm_aesenc_si128(s, stk)
s = aes_hw.aesenc(s, stk)
}
return s
}
@(private = "file", enable_target_feature = "sse2,ssse3,sse4.1,aes", require_results)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
bc_absorb :: proc "contextless" (
ctx: ^Context,
tag: x86.__m128i,
tag: simd.u8x16,
src: []byte,
tweak_prefix: x86.__m128i,
tweak_prefix: simd.u8x16,
stk_block_nr: int,
) -> (x86.__m128i, int) #no_bounds_check {
) -> (simd.u8x16, int) #no_bounds_check {
src, stk_block_nr, tag := src, stk_block_nr, tag
nr_blocks := len(src) / BLOCK_SIZE
for nr_blocks >= 4 {
d_0, d_1, d_2, d_3 := bc_x4(
ctx,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))),
auth_tweak(tweak_prefix, stk_block_nr),
auth_tweak(tweak_prefix, stk_block_nr + 1),
auth_tweak(tweak_prefix, stk_block_nr + 2),
auth_tweak(tweak_prefix, stk_block_nr + 3),
)
tag = x86._mm_xor_si128(tag, d_0)
tag = x86._mm_xor_si128(tag, d_1)
tag = x86._mm_xor_si128(tag, d_2)
tag = x86._mm_xor_si128(tag, d_3)
tag = simd.bit_xor(tag, d_0)
tag = simd.bit_xor(tag, d_1)
tag = simd.bit_xor(tag, d_2)
tag = simd.bit_xor(tag, d_3)
src = src[4*BLOCK_SIZE:]
stk_block_nr += 4
@@ -156,11 +187,11 @@ bc_absorb :: proc "contextless" (
for nr_blocks > 0 {
d := bc_x1(
ctx,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
auth_tweak(tweak_prefix, stk_block_nr),
)
tag = x86._mm_xor_si128(tag, d)
tag = simd.bit_xor(tag, d)
src = src[BLOCK_SIZE:]
stk_block_nr += 1
@@ -170,29 +201,29 @@ bc_absorb :: proc "contextless" (
return tag, stk_block_nr
}
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
bc_final :: proc "contextless" (
ctx: ^Context,
tag: x86.__m128i,
tag: simd.u8x16,
iv: []byte,
) -> x86.__m128i {
) -> simd.u8x16 {
tmp: [BLOCK_SIZE]byte
tmp[0] = PREFIX_TAG << PREFIX_SHIFT
copy(tmp[1:], iv)
tweak := intrinsics.unaligned_load((^x86.__m128i)(&tmp))
tweak := intrinsics.unaligned_load((^simd.u8x16)(&tmp))
return bc_x1(ctx, tag, tweak)
}
@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
@(private = "file", enable_target_feature = TARGET_FEATURES, require_results)
bc_encrypt :: proc "contextless" (
ctx: ^Context,
dst: []byte,
src: []byte,
iv: x86.__m128i,
tweak_tag: x86.__m128i,
iv: simd.u8x16,
tweak_tag: simd.u8x16,
stk_block_nr: int,
) -> int {
dst, src, stk_block_nr := dst, src, stk_block_nr
@@ -209,31 +240,31 @@ bc_encrypt :: proc "contextless" (
)
intrinsics.unaligned_store(
(^x86.__m128i)(raw_data(dst)),
x86._mm_xor_si128(
(^simd.u8x16)(raw_data(dst)),
simd.bit_xor(
d_0,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
),
)
intrinsics.unaligned_store(
(^x86.__m128i)(raw_data(dst[BLOCK_SIZE:])),
x86._mm_xor_si128(
(^simd.u8x16)(raw_data(dst[BLOCK_SIZE:])),
simd.bit_xor(
d_1,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[BLOCK_SIZE:]))),
),
)
intrinsics.unaligned_store(
(^x86.__m128i)(raw_data(dst[2*BLOCK_SIZE:])),
x86._mm_xor_si128(
(^simd.u8x16)(raw_data(dst[2*BLOCK_SIZE:])),
simd.bit_xor(
d_2,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[2*BLOCK_SIZE:]))),
),
)
intrinsics.unaligned_store(
(^x86.__m128i)(raw_data(dst[3*BLOCK_SIZE:])),
x86._mm_xor_si128(
(^simd.u8x16)(raw_data(dst[3*BLOCK_SIZE:])),
simd.bit_xor(
d_3,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[3*BLOCK_SIZE:]))),
),
)
@@ -250,10 +281,10 @@ bc_encrypt :: proc "contextless" (
)
intrinsics.unaligned_store(
(^x86.__m128i)(raw_data(dst)),
x86._mm_xor_si128(
(^simd.u8x16)(raw_data(dst)),
simd.bit_xor(
d,
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
intrinsics.unaligned_load((^simd.u8x16)(raw_data(src))),
),
)
@@ -269,7 +300,7 @@ bc_encrypt :: proc "contextless" (
e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check {
tmp: [BLOCK_SIZE]byte
copy(tmp[1:], iv)
iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp)))
// Algorithm 3
//
@@ -282,7 +313,7 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte)
// if A_ != nil then
// Auth <- Auth ^ EK(0110 || la, pad10(A_))
// end
auth: x86.__m128i
auth: simd.u8x16
n: int
aad := aad
@@ -341,14 +372,14 @@ e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte)
copy(dst[n*BLOCK_SIZE:], m_star[:])
}
intrinsics.unaligned_store((^x86.__m128i)(raw_data(tag)), auth)
intrinsics.unaligned_store((^simd.u8x16)(raw_data(tag)), auth)
}
@(private, require_results)
d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
tmp: [BLOCK_SIZE]byte
copy(tmp[1:], iv)
iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
iv_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(&tmp)))
// Algorithm 4
//
@@ -360,7 +391,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
// if C_ != nil then
// M_ <- C_ ^ EK(1 || tag ^ l, 0^8 || N)
// end
auth := intrinsics.unaligned_load((^x86.__m128i)(raw_data(tag)))
auth := intrinsics.unaligned_load((^simd.u8x16)(raw_data(tag)))
m := ciphertext
n := bc_encrypt(ctx, dst, m, iv_, auth, 0)
@@ -385,7 +416,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
// if A != nil then
// Auth <- Auth ^ EK(0110 || l_a, pad10(A_))
// end
auth = x86.__m128i{0, 0}
auth = simd.u8x16{}
aad := aad
auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0)
aad = aad[BLOCK_SIZE*n:]
@@ -424,7 +455,7 @@ d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte
// Tag verification
// if tag0 = tag then return (M_1 || ... || M_l || M_)
// else return false
intrinsics.unaligned_store((^x86.__m128i)(raw_data(&tmp)), auth)
intrinsics.unaligned_store((^simd.u8x16)(raw_data(&tmp)), auth)
ok := crypto.compare_constant_time(tmp[:], tag) == 1
crypto.zero_explicit(&tmp, size_of(tmp))

View File

@@ -1,10 +1,12 @@
#+build !amd64
#+build !arm64
#+build !arm32
package deoxysii
@(private = "file")
ERR_HW_NOT_SUPPORTED :: "crypto/deoxysii: hardware implementation unsupported"
// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated Deoxys-II
// is supported.
is_hardware_accelerated :: proc "contextless" () -> bool {
return false

View File

@@ -104,7 +104,7 @@ Public_Key :: struct {
}
// private_key_generate uses the system entropy source to generate a new
// Private_Key. This will only fail iff the system entropy source is
// Private_Key. This will only fail if and only if (⟺) the system entropy source is
// missing or broken.
private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
private_key_clear(priv_key)
@@ -142,7 +142,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
}
// private_key_set_bytes decodes a byte-encoded private key, and returns
// true iff the operation was successful.
// true if and only if (⟺) the operation was successful.
private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool {
private_key_clear(priv_key)
@@ -245,7 +245,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
}
}
// private_key_equal returns true iff the private keys are equal,
// private_key_equal returns true if and only if (⟺) the private keys are equal,
// in constant time.
private_key_equal :: proc(p, q: ^Private_Key) -> bool {
if p._curve != q._curve {
@@ -276,7 +276,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {
}
// public_key_set_bytes decodes a byte-encoded public key, and returns
// true iff the operation was successful.
// true if and only if (⟺) the operation was successful.
public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool {
public_key_clear(pub_key)
@@ -365,7 +365,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
}
}
// public_key_equal returns true iff the public keys are equal,
// public_key_equal returns true if and only if (⟺) the public keys are equal,
// in constant time.
public_key_equal :: proc(p, q: ^Public_Key) -> bool {
if p._curve != q._curve {

View File

@@ -79,7 +79,7 @@ Public_Key :: struct {
}
// private_key_generate uses the system entropy source to generate a new
// Private_Key. This will only fail iff the system entropy source is
// Private_Key. This will only fail if and only if (⟺) the system entropy source is
// missing or broken.
private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
private_key_clear(priv_key)
@@ -111,7 +111,7 @@ private_key_generate :: proc(priv_key: ^Private_Key, curve: Curve) -> bool {
}
// private_key_set_bytes decodes a byte-encoded private key, and returns
// true iff the operation was successful.
// true if and only if (⟺) the operation was successful.
private_key_set_bytes :: proc(priv_key: ^Private_Key, curve: Curve, b: []byte) -> bool {
private_key_clear(priv_key)
@@ -194,7 +194,7 @@ private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
}
}
// private_key_equal returns true iff the private keys are equal,
// private_key_equal returns true if and only if (⟺) the private keys are equal,
// in constant time.
private_key_equal :: proc(p, q: ^Private_Key) -> bool {
if p._curve != q._curve {
@@ -219,7 +219,7 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {
}
// public_key_set_bytes decodes a byte-encoded public key, and returns
// true iff the operation was successful.
// true if and only if (⟺) the operation was successful.
public_key_set_bytes :: proc(pub_key: ^Public_Key, curve: Curve, b: []byte) -> bool {
public_key_clear(pub_key)
@@ -296,7 +296,7 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
}
}
// public_key_equal returns true iff the public keys are equal,
// public_key_equal returns true if and only if (⟺) the public keys are equal,
// in constant time.
public_key_equal :: proc(p, q: ^Public_Key) -> bool {
if p._curve != q._curve {

View File

@@ -141,7 +141,7 @@ parse_asn1_sig :: proc(sig: []byte) -> (r, s: []byte, ok: bool) {
return nil, nil, false
}
// DER requires a leading 0 iff the sign bit of the leading byte
// DER requires a leading 0 if and only if (⟺) the sign bit of the leading byte
// is set to distinguish between positive and negative integers,
// and the minimal length representation. `r` and `s` are always
// going to be unsigned, so we validate malformed DER and strip

View File

@@ -3,7 +3,7 @@ package ecdsa
import "core:crypto/hash"
import secec "core:crypto/_weierstrass"
// verify_raw returns true iff sig is a valid signature by pub_key over
// verify_raw returns true if and only if (⟺) sig is a valid signature by pub_key over
// msg, hashed using hash_algo, per the verification procedure specified
// in SEC 1, Version 2.0, Section 4.1.4.
//
@@ -33,7 +33,7 @@ verify_raw :: proc(pub_key: ^Public_Key, hash_algo: hash.Algorithm, msg, sig: []
panic("crypto/ecdsa: invalid curve")
}
// verify_asn1 returns true iff sig is a valid signature by pub_key over
// verify_asn1 returns true if and only if (⟺) sig is a valid signature by pub_key over
// msg, hashed using hash_algo, per the verification procedure specified
// in SEC 1, Version 2.0, Section 4.1.4.
//

View File

@@ -48,7 +48,7 @@ Public_Key :: struct {
}
// private_key_generate uses the system entropy source to generate a new
// Private_Key. This will only fail iff the system entropy source is
// Private_Key. This will only fail if and only if (⟺) the system entropy source is
// missing or broken.
private_key_generate :: proc(priv_key: ^Private_Key) -> bool {
private_key_clear(priv_key)
@@ -67,7 +67,7 @@ private_key_generate :: proc(priv_key: ^Private_Key) -> bool {
}
// private_key_set_bytes decodes a byte-encoded private key, and returns
// true iff the operation was successful.
// true if and only if (⟺) the operation was successful.
private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool {
if len(b) != PRIVATE_KEY_SIZE {
return false
@@ -167,7 +167,7 @@ sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) {
}
// public_key_set_bytes decodes a byte-encoded public key, and returns
// true iff the operation was successful.
// true if and only if (⟺) the operation was successful.
public_key_set_bytes :: proc "contextless" (pub_key: ^Public_Key, b: []byte) -> bool {
if len(b) != PUBLIC_KEY_SIZE {
return false
@@ -205,14 +205,14 @@ public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
copy(dst, pub_key._b[:])
}
// public_key_equal returns true iff pub_key is equal to other.
// public_key_equal returns true if and only if (⟺) pub_key is equal to other.
public_key_equal :: proc(pub_key, other: ^Public_Key) -> bool {
ensure(pub_key._is_initialized && other._is_initialized, "crypto/ed25519: uninitialized public key")
return crypto.compare_constant_time(pub_key._b[:], other._b[:]) == 1
}
// verify returns true iff sig is a valid signature by pub_key over msg.
// verify returns true if and only if (⟺) sig is a valid signature by pub_key over msg.
//
// The optional `allow_small_order_A` parameter will make this
// implementation strictly compatible with FIPS 186-5, at the expense of

View File

@@ -235,7 +235,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
switch &impl in ctx._impl {

View File

@@ -21,7 +21,7 @@ sum :: proc(algorithm: hash.Algorithm, dst, msg, key: []byte) {
}
// verify will verify the HMAC tag computed with the specified algorithm
// and key over msg and return true iff the tag is valid. It requires
// and key over msg and return true if and only if (⟺) the tag is valid. It requires
// that the tag is correctly sized.
verify :: proc(algorithm: hash.Algorithm, tag, msg, key: []byte) -> bool {
tag_buf: [hash.MAX_DIGEST_SIZE]byte

View File

@@ -32,7 +32,7 @@ sum :: proc(sec_strength: int, dst, msg, key, domain_sep: []byte) {
}
// verify will verify the KMAC tag computed with the specified security
// strength, key and domain separator over msg and return true iff the
// strength, key and domain separator over msg and return true if and only if (⟺) the
// tag is valid.
verify :: proc(sec_strength: int, tag, msg, key, domain_sep: []byte, allocator := context.temp_allocator) -> bool {
derived_tag := make([]byte, len(tag), allocator)

View File

@@ -77,7 +77,7 @@ update :: proc "contextless" (ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc "contextless" (ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
_sha3.final((^_sha3.Context)(ctx), hash, finalize_clone)

View File

@@ -69,7 +69,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
ensure(ctx.is_initialized)

View File

@@ -76,7 +76,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
ensure(ctx.is_initialized)

View File

@@ -66,7 +66,7 @@ derive :: proc(
dst_blk = dst_blk[h_len:]
}
// Instead of rounding l up, just proceass the one extra block iff
// Instead of rounding l up, just process the one extra block if and only if (⟺)
// r != 0.
if r > 0 {
tmp: [hash.MAX_DIGEST_SIZE]byte

View File

@@ -33,7 +33,7 @@ sum :: proc(dst, msg, key: []byte) {
}
// verify will verify the Poly1305 tag computed with the key over msg and
// return true iff the tag is valid. It requires that the tag is correctly
// return true if and only if (⟺) the tag is valid. It requires that the tag is correctly
// sized.
verify :: proc(tag, msg, key: []byte) -> bool {
ctx: Context = ---

View File

@@ -360,7 +360,7 @@ ge_double_scalarmult_generator_vartime :: proc(
ge._is_initialized = true
}
// ge_cond_negate sets `ge = a` iff `ctrl == 0` and `ge = -a` iff `ctrl == 1`.
// ge_cond_negate sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = -a` if and only if (⟺) `ctrl == 1`.
// Behavior for all other values of ctrl is undefined.
ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) {
_ge_ensure_initialized([]^Group_Element{a})
@@ -369,7 +369,7 @@ ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) {
ge._is_initialized = true
}
// ge_cond_assign sets `ge = ge` iff `ctrl == 0` and `ge = a` iff `ctrl == 1`.
// ge_cond_assign sets `ge = ge` if and only if (⟺) `ctrl == 0` and `ge = a` if and only if (⟺) `ctrl == 1`.
// Behavior for all other values of ctrl is undefined.
ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) {
_ge_ensure_initialized([]^Group_Element{ge, a})
@@ -377,7 +377,7 @@ ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) {
grp.ge_cond_assign(&ge._p, &a._p, ctrl)
}
// ge_cond_select sets `ge = a` iff `ctrl == 0` and `ge = b` iff `ctrl == 1`.
// ge_cond_select sets `ge = a` if and only if (⟺) `ctrl == 0` and `ge = b` if and only if (⟺) `ctrl == 1`.
// Behavior for all other values of ctrl is undefined.
ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) {
_ge_ensure_initialized([]^Group_Element{a, b})
@@ -386,7 +386,7 @@ ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) {
ge._is_initialized = true
}
// ge_equal returns 1 iff `a == b`, and 0 otherwise.
// ge_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise.
@(require_results)
ge_equal :: proc(a, b: ^Group_Element) -> int {
_ge_ensure_initialized([]^Group_Element{a, b})
@@ -405,7 +405,7 @@ ge_equal :: proc(a, b: ^Group_Element) -> int {
return ret
}
// ge_is_identity returns 1 iff `ge` is the identity element, and 0 otherwise.
// ge_is_identity returns 1 if and only if (⟺) `ge` is the identity element, and 0 otherwise.
@(require_results)
ge_is_identity :: proc(ge: ^Group_Element) -> int {
return ge_equal(ge, &GE_IDENTITY)

View File

@@ -80,13 +80,13 @@ sc_square :: proc "contextless" (sc, a: ^Scalar) {
grp.sc_square(sc, a)
}
// sc_cond_assign sets `sc = sc` iff `ctrl == 0` and `sc = a` iff `ctrl == 1`.
// sc_cond_assign sets `sc = sc` if and only if (⟺) `ctrl == 0` and `sc = a` if and only if (⟺) `ctrl == 1`.
// Behavior for all other values of ctrl is undefined.
sc_cond_assign :: proc(sc, a: ^Scalar, ctrl: int) {
grp.sc_cond_assign(sc, a, ctrl)
}
// sc_equal returns 1 iff `a == b`, and 0 otherwise.
// sc_equal returns 1 if and only if (⟺) `a == b`, and 0 otherwise.
@(require_results)
sc_equal :: proc(a, b: ^Scalar) -> int {
return grp.sc_equal(a, b)

View File

@@ -191,7 +191,7 @@ update :: proc(ctx: ^$T, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) {
ensure(ctx.is_initialized)

View File

@@ -0,0 +1,224 @@
#+build arm64,arm32
package sha2
// Based on the public domain code by Jeffrey Walton, though
// realistically, there only is one sensible way to write this.
//
// See: https://github.com/noloader/SHA-Intrinsics
import "base:intrinsics"
import "core:simd"
import "core:simd/arm"
import "core:sys/info"
// is_hardware_accelerated_256 returns true if and only if (⟺) hardware
// accelerated SHA-224/SHA-256 is supported.
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
// Both ASIMD (NEON) and the SHA-256 crypto extension must be reported by
// the CPU for sha256_transf_hw to be usable.
req_features :: info.CPU_Features{
.asimd,
.sha256,
}
return info.cpu_features() >= req_features
}
// SHA-256 round constants (FIPS 180-4, Section 4.2.2), packed four per
// vector so each K_i can be added to one 4-lane message-schedule block
// in sha256_transf_hw.
@(private = "file")
K_0 :: simd.u32x4{0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5}
@(private = "file")
K_1 :: simd.u32x4{0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5}
@(private = "file")
K_2 :: simd.u32x4{0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3}
@(private = "file")
K_3 :: simd.u32x4{0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174}
@(private = "file")
K_4 :: simd.u32x4{0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC}
@(private = "file")
K_5 :: simd.u32x4{0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA}
@(private = "file")
K_6 :: simd.u32x4{0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7}
@(private = "file")
K_7 :: simd.u32x4{0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967}
@(private = "file")
K_8 :: simd.u32x4{0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13}
@(private = "file")
K_9 :: simd.u32x4{0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85}
@(private = "file")
K_10 :: simd.u32x4{0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3}
@(private = "file")
K_11 :: simd.u32x4{0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070}
@(private = "file")
K_12 :: simd.u32x4{0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5}
@(private = "file")
K_13 :: simd.u32x4{0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3}
@(private = "file")
K_14 :: simd.u32x4{0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208}
@(private = "file")
K_15 :: simd.u32x4{0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2}
// sha256_transf_hw consumes as many full 64-byte (BLOCK_SIZE_256) blocks
// from data as are available, updating the hash state in ctx.h in place,
// using the ARMv8 SHA-256 crypto instructions: vsha256hq/vsha256h2q for
// the compression rounds and vsha256su0q/vsha256su1q for the message
// schedule expansion.
//
// Any trailing partial block (< 64 bytes) is left unprocessed; the caller
// is expected to buffer it.
//
// state_0 appears to hold the ABEF lanes and state_1 the CDGH lanes of
// the working state (see the abef_save/cdgh_save names below).
@(private, enable_target_feature = "neon,sha2")
sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check {
state_0 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[0]))
state_1 := intrinsics.unaligned_load((^simd.u32x4)(&ctx.h[4]))
data := data
for len(data) >= BLOCK_SIZE_256 {
// Save state for the feed-forward addition after the 64 rounds.
abef_save, cdgh_save := state_0, state_1
// Load message
msg_0 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data)))
msg_1 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[16:])))
msg_2 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[32:])))
msg_3 := intrinsics.unaligned_load((^simd.u32x4)(raw_data(data[48:])))
// Reverse for little endian (the message words are big-endian).
when ODIN_ENDIAN == .Little {
msg_0 = byteswap_u32x4(msg_0)
msg_1 = byteswap_u32x4(msg_1)
msg_2 = byteswap_u32x4(msg_2)
msg_3 = byteswap_u32x4(msg_3)
}
tmp_0 := simd.add(msg_0, K_0)
// Rounds 0-3
// In each 4-round group below, tmp_2 preserves state_0 from before
// the vsha256hq update, for use as an input to vsha256h2q.
msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
tmp_2 := state_0
tmp_1 := simd.add(msg_1, K_1)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
// Rounds 4-7
msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
tmp_2 = state_0
tmp_0 = simd.add(msg_2, K_2)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
// Rounds 8-11
msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
tmp_2 = state_0
tmp_1 = simd.add(msg_3, K_3)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
// Rounds 12-15
msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
tmp_2 = state_0
tmp_0 = simd.add(msg_0, K_4)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
// Rounds 16-19
msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
tmp_2 = state_0
tmp_1 = simd.add(msg_1, K_5)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
// Rounds 20-23
msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
tmp_2 = state_0
tmp_0 = simd.add(msg_2, K_6)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
// Rounds 24-27
msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
tmp_2 = state_0
tmp_1 = simd.add(msg_3, K_7)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
// Rounds 28-31
msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
tmp_2 = state_0
tmp_0 = simd.add(msg_0, K_8)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
// Rounds 32-35
msg_0 = arm.vsha256su0q_u32(msg_0, msg_1)
tmp_2 = state_0
tmp_1 = simd.add(msg_1, K_9)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
msg_0 = arm.vsha256su1q_u32(msg_0, msg_2, msg_3)
// Rounds 36-39
msg_1 = arm.vsha256su0q_u32(msg_1, msg_2)
tmp_2 = state_0
tmp_0 = simd.add(msg_2, K_10)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
msg_1 = arm.vsha256su1q_u32(msg_1, msg_3, msg_0)
// Rounds 40-43
msg_2 = arm.vsha256su0q_u32(msg_2, msg_3)
tmp_2 = state_0
tmp_1 = simd.add(msg_3, K_11)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
msg_2 = arm.vsha256su1q_u32(msg_2, msg_0, msg_1)
// Rounds 44-47
msg_3 = arm.vsha256su0q_u32(msg_3, msg_0)
tmp_2 = state_0
tmp_0 = simd.add(msg_0, K_12)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
msg_3 = arm.vsha256su1q_u32(msg_3, msg_1, msg_2)
// Rounds 48-51 (no further schedule expansion needed from here on).
tmp_2 = state_0
tmp_1 = simd.add(msg_1, K_13)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
// Rounds 52-55
tmp_2 = state_0
tmp_0 = simd.add(msg_2, K_14)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
// Rounds 56-59
tmp_2 = state_0
tmp_1 = simd.add(msg_3, K_15)
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_0)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_0)
// Rounds 60-63
tmp_2 = state_0
state_0 = arm.vsha256hq_u32(state_0, state_1, tmp_1)
state_1 = arm.vsha256h2q_u32(state_1, tmp_2, tmp_1)
// Combine state (Merkle-Damgard feed-forward with the saved state).
state_0 = simd.add(state_0, abef_save)
state_1 = simd.add(state_1, cdgh_save)
data = data[BLOCK_SIZE_256:]
}
intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[0]), state_0)
intrinsics.unaligned_store((^simd.u32x4)(&ctx.h[4]), state_1)
}
when ODIN_ENDIAN == .Little {
// byteswap_u32x4 reverses the byte order within each 32-bit lane of a,
// via a byte-level shuffle; used to convert the big-endian message words
// to the host's little-endian layout.
@(private = "file", enable_target_feature = "neon")
byteswap_u32x4 :: #force_inline proc "contextless" (a: simd.u32x4) -> simd.u32x4 {
return transmute(simd.u32x4)(
simd.shuffle(
transmute(simd.u8x16)(a),
transmute(simd.u8x16)(a),
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
)
)
}
}

View File

@@ -49,7 +49,7 @@ K_14 :: simd.u64x2{0x78a5636f748f82ee, 0x8cc7020884c87814}
K_15 :: simd.u64x2{0xa4506ceb90befffa, 0xc67178f2bef9a3f7}
// is_hardware_accelerated_256 returns true iff hardware accelerated
// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated
// SHA-224/SHA-256 is supported.
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
req_features :: info.CPU_Features{

View File

@@ -1,10 +1,12 @@
#+build !amd64
#+build !arm64
#+build !arm32
package sha2
@(private = "file")
ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported"
// is_hardware_accelerated_256 returns true iff hardware accelerated
// is_hardware_accelerated_256 returns true if and only if (⟺) hardware accelerated
// SHA-224/SHA-256 is supported.
is_hardware_accelerated_256 :: proc "contextless" () -> bool {
return false

View File

@@ -79,7 +79,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
_sha3.final((^_sha3.Context)(ctx), hash, finalize_clone)

View File

@@ -80,7 +80,7 @@ update :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
ensure(ctx.is_initialized)

View File

@@ -31,7 +31,7 @@ write_element :: proc(ctx: ^Context, data: []byte) {
// final finalizes the Context, writes the digest to hash, and calls
// reset on the Context.
//
// Iff finalize_clone is set, final will work on a copy of the Context,
// If and only if (⟺) finalize_clone is set, final will work on a copy of the Context,
// which is useful for calculating rolling digests.
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
_sha3.final_cshake((^_sha3.Context)(ctx), hash, finalize_clone)

View File

@@ -436,7 +436,7 @@ copy_buffer :: proc(dst: Writer, src: Reader, buf: []byte) -> (written: i64, err
// copy_n copies n bytes (or till an error) from src to dst.
// It returns the number of bytes copied and the first error that occurred whilst copying, if any.
// On return, written == n IFF err == nil
// On return, written == n if and only if (⟺) err == nil
copy_n :: proc(dst: Writer, src: Reader, n: i64) -> (written: i64, err: Error) {
nsrc := limited_reader_init(&Limited_Reader{}, src, n)
written, err = copy(dst, nsrc)

View File

@@ -101,7 +101,7 @@ internal_int_power_modulo :: proc(res, G, X, P: ^Int, allocator := context.alloc
If the modulus is odd or dr != 0 use the montgomery method.
*/
if internal_int_is_odd(P) || dr != 0 {
return _private_int_exponent_mod(res, G, X, P, dr)
return _private_int_exponent_mod_fast(res, G, X, P, dr)
}
/*

View File

@@ -439,8 +439,14 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex
return _private_int_mul_high_comba(dest, a, b, digits)
}
internal_grow(dest, a.used + b.used + 1) or_return
dest.used = a.used + b.used + 1
/*
Set up temporary output `Int`, which we'll swap for `dest` when done.
*/
t := &Int{}
internal_grow(t, a.used + b.used + 1) or_return
t.used = a.used + b.used + 1
pa := a.used
pb := b.used
@@ -451,20 +457,23 @@ _private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := contex
/*
Calculate the double precision result.
*/
r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)
r := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)
/*
Get the lower part.
*/
dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
t.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
/*
Carry the carry.
*/
carry = DIGIT(r >> _WORD(_DIGIT_BITS))
}
dest.digit[ix + pb] = carry
t.digit[ix + pb] = carry
}
internal_swap(dest, t)
internal_destroy(t)
return internal_clamp(dest)
}

View File

@@ -141,9 +141,9 @@ arena_alloc_unguarded :: proc(arena: ^Arena, size: uint, alignment: uint, loc :=
needed := mem.align_forward_uint(size, alignment)
needed = max(needed, arena.default_commit_size)
block_size := max(needed, arena.minimum_block_size)
block_size := max(needed, arena.minimum_block_size) + alignment
new_block := memory_block_alloc(needed, block_size, alignment, {}) or_return
new_block := memory_block_alloc(needed, block_size) or_return
new_block.prev = arena.curr_block
arena.curr_block = new_block
arena.total_reserved += new_block.reserved

View File

@@ -154,7 +154,7 @@ alloc_from_memory_block :: proc(block: ^Memory_Block, min_size, alignment: uint,
pmblock.committed = platform_total_commit
block.committed = pmblock.committed - base_offset
assert(block.committed <= block.reserved)
}
return
}
@@ -174,7 +174,7 @@ alloc_from_memory_block :: proc(block: ^Memory_Block, min_size, alignment: uint,
err = .Out_Of_Memory
return
}
assert(block.committed <= block.reserved)
do_commit_if_necessary(block, size, default_commit_size) or_return
data = block.base[block.used+alignment_offset:][:min_size]

View File

@@ -804,7 +804,7 @@ send_exec :: proc(op: ^Operation) -> Op_Result {
op.send.sent += n
if op.send.sent < total {
if n < total {
return send_exec(op)
}
@@ -868,7 +868,7 @@ recv_exec :: proc(op: ^Operation) -> Op_Result {
assert(is_tcp || op.recv.received == 0)
op.recv.received += n
if is_tcp && n != 0 && op.recv.received < total {
if is_tcp && n != 0 && n < total {
return recv_exec(op)
}

34
core/simd/arm/aes.odin Normal file
View File

@@ -0,0 +1,34 @@
#+build arm64,arm32
package simd_arm
// vaeseq_u8 performs a single AES encryption round step on data with the
// given round key (wraps the `llvm.*.crypto.aese` intrinsic bound below).
@(require_results, enable_target_feature = "aes")
vaeseq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t {
return _vaeseq_u8(data, key)
}
// vaesdq_u8 performs a single AES decryption round step on data with the
// given round key (wraps the `llvm.*.crypto.aesd` intrinsic bound below).
@(require_results, enable_target_feature = "aes")
vaesdq_u8 :: #force_inline proc "c" (data, key: uint8x16_t) -> uint8x16_t {
return _vaesdq_u8(data, key)
}
// vaesmcq_u8 applies the AES MixColumns transformation to data
// (wraps the `llvm.*.crypto.aesmc` intrinsic bound below).
@(require_results, enable_target_feature = "aes")
vaesmcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t {
return _vaesmcq_u8(data)
}
// vaesimcq_u8 applies the AES inverse MixColumns transformation to data
// (wraps the `llvm.*.crypto.aesimc` intrinsic bound below).
@(require_results, enable_target_feature = "aes")
vaesimcq_u8 :: #force_inline proc "c" (data: uint8x16_t) -> uint8x16_t {
return _vaesimcq_u8(data)
}
// Raw LLVM crypto-intrinsic bindings; the architecture-specific link name
// is selected at compile time. Callers should use the public wrappers,
// which gate on the `aes` target feature.
@(private, default_calling_convention = "none")
foreign _ {
@(link_name = "llvm.aarch64.crypto.aese" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aese")
_vaeseq_u8 :: proc(data, key: uint8x16_t) -> uint8x16_t ---
@(link_name = "llvm.aarch64.crypto.aesd" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesd")
_vaesdq_u8 :: proc(data, key: uint8x16_t) -> uint8x16_t ---
@(link_name = "llvm.aarch64.crypto.aesmc" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesmc")
_vaesmcq_u8 :: proc(data: uint8x16_t) -> uint8x16_t ---
@(link_name = "llvm.aarch64.crypto.aesimc" when ODIN_ARCH == .arm64 else "llvm.arm.neon.aesimc")
_vaesimcq_u8 :: proc(data: uint8x16_t) -> uint8x16_t ---
}

2
core/simd/arm/doc.odin Normal file
View File

@@ -0,0 +1,2 @@
// `SIMD` intrinsics specific to ARMv8 `arm32` and `arm64` architectures.
package simd_arm

108
core/simd/arm/sha.odin Normal file
View File

@@ -0,0 +1,108 @@
#+build arm64,arm32
package simd_arm
// Wrappers for the ARMv8 Cryptographic Extension SHA-1/SHA-256/SHA-512
// instructions, mirroring the `vsha*` intrinsics from `arm_neon.h`.
// SHA-1/SHA-256 procedures require the `sha2` target feature;
// SHA-512 procedures require `sha3` (see note below).

// SHA-1 hash update, "choose" variant (SHA1C instruction).
@(require_results, enable_target_feature = "sha2")
vsha1cq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t {
return _vsha1cq_u32(hash_abcd, e, wk)
}
// SHA-1 hash update, "parity" variant (SHA1P instruction).
@(require_results, enable_target_feature = "sha2")
vsha1pq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t {
return _vsha1pq_u32(hash_abcd, e, wk)
}
// SHA-1 hash update, "majority" variant (SHA1M instruction).
@(require_results, enable_target_feature = "sha2")
vsha1mq_u32 :: #force_inline proc "c" (hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t {
return _vsha1mq_u32(hash_abcd, e, wk)
}
// SHA-1 fixed rotate of the `e` working value (SHA1H instruction).
@(require_results, enable_target_feature = "sha2")
vsha1h_u32 :: #force_inline proc "c" (e: uint32_t) -> uint32_t {
return _vsha1h_u32(e)
}
// SHA-1 message schedule update, part 1 (SHA1SU0 instruction).
@(require_results, enable_target_feature = "sha2")
vsha1su0q_u32 :: #force_inline proc "c" (w0_3, w4_7, w8_11: uint32x4_t) -> uint32x4_t {
return _vsha1su0q_u32(w0_3, w4_7, w8_11)
}
// SHA-1 message schedule update, part 2 (SHA1SU1 instruction).
@(require_results, enable_target_feature = "sha2")
vsha1su1q_u32 :: #force_inline proc "c" (tw0_3, w12_15: uint32x4_t) -> uint32x4_t {
return _vsha1su1q_u32(tw0_3, w12_15)
}
// SHA-256 hash update, part 1 (SHA256H instruction).
@(require_results, enable_target_feature = "sha2")
vsha256hq_u32 :: #force_inline proc "c" (hash_abcd, hash_efgh, wk: uint32x4_t) -> uint32x4_t {
return _vsha256hq_u32(hash_abcd, hash_efgh, wk)
}
// SHA-256 hash update, part 2 (SHA256H2 instruction).
@(require_results, enable_target_feature = "sha2")
vsha256h2q_u32 :: #force_inline proc "c" (hash_efgh, hash_abcd, wk: uint32x4_t) -> uint32x4_t {
return _vsha256h2q_u32(hash_efgh, hash_abcd, wk)
}
// SHA-256 message schedule update, part 1 (SHA256SU0 instruction).
@(require_results, enable_target_feature = "sha2")
vsha256su0q_u32 :: #force_inline proc "c" (w0_3, w4_7: uint32x4_t) -> uint32x4_t {
return _vsha256su0q_u32(w0_3, w4_7)
}
// SHA-256 message schedule update, part 2 (SHA256SU1 instruction).
@(require_results, enable_target_feature = "sha2")
vsha256su1q_u32 :: #force_inline proc "c" (tw0_3, w8_11, w12_15: uint32x4_t) -> uint32x4_t {
return _vsha256su1q_u32(tw0_3, w8_11, w12_15)
}
// Note: The SHA512 instructions are part of the `sha3` feature set.
// SHA-512 hash update, part 1 (SHA512H instruction).
@(require_results, enable_target_feature = "sha3")
vsha512hq_u64 :: #force_inline proc "c" (hash_ed, hash_gf, kwh_kwh2: uint64x2_t) -> uint64x2_t {
return _vsha512hq_u64(hash_ed, hash_gf, kwh_kwh2)
}
// SHA-512 hash update, part 2 (SHA512H2 instruction).
@(require_results, enable_target_feature = "sha3")
vsha512h2q_u64 :: #force_inline proc "c" (sum_ab, hash_c_, hash_ab: uint64x2_t) -> uint64x2_t {
return _vsha512h2q_u64(sum_ab, hash_c_, hash_ab)
}
// SHA-512 message schedule update, part 1 (SHA512SU0 instruction).
@(require_results, enable_target_feature = "sha3")
vsha512su0q_u64 :: #force_inline proc "c" (w0_1, w2_: uint64x2_t) -> uint64x2_t {
return _vsha512su0q_u64(w0_1, w2_)
}
// SHA-512 message schedule update, part 2 (SHA512SU1 instruction).
@(require_results, enable_target_feature = "sha3")
vsha512su1q_u64 :: #force_inline proc "c" (s01_s02, w14_15, w9_10: uint64x2_t) -> uint64x2_t {
return _vsha512su1q_u64(s01_s02, w14_15, w9_10)
}
// Raw LLVM crypto intrinsics; the link name is selected per
// architecture (AArch64 vs 32-bit ARM NEON) at compile time.
@(private, default_calling_convention = "none")
foreign _ {
@(link_name = "llvm.aarch64.crypto.sha1c" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1c")
_vsha1cq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha1p" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1p")
_vsha1pq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha1m" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1m")
_vsha1mq_u32 :: proc(hash_abcd: uint32x4_t, e: uint32_t, wk: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha1h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1h")
_vsha1h_u32 :: proc(e: uint32_t) -> uint32_t ---
@(link_name = "llvm.aarch64.crypto.sha1su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1su0")
_vsha1su0q_u32 :: proc(w0_3, w4_7, w8_11: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha1su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha1su1")
_vsha1su1q_u32 :: proc(tw0_3, w12_15: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha256h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256h")
_vsha256hq_u32 :: proc(hash_abcd, hash_efgh, wk: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha256h2" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256h2")
_vsha256h2q_u32 :: proc(hash_efgh, hash_abcd, wk: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha256su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256su0")
_vsha256su0q_u32 :: proc(w0_3, w4_7: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha256su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha256su1")
_vsha256su1q_u32 :: proc(tw0_3, w8_11, w12_15: uint32x4_t) -> uint32x4_t ---
@(link_name = "llvm.aarch64.crypto.sha512h" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512h")
_vsha512hq_u64 :: proc(hash_ed, hash_gf, kwh_kwh2: uint64x2_t) -> uint64x2_t ---
@(link_name = "llvm.aarch64.crypto.sha512h2" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512h2")
_vsha512h2q_u64 :: proc(sum_ab, hash_c_, hash_ab: uint64x2_t) -> uint64x2_t ---
@(link_name = "llvm.aarch64.crypto.sha512su0" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512su0")
_vsha512su0q_u64 :: proc(w0_1, w2_: uint64x2_t) -> uint64x2_t ---
@(link_name = "llvm.aarch64.crypto.sha512su1" when ODIN_ARCH == .arm64 else "llvm.arm.neon.sha512su1")
_vsha512su1q_u64 :: proc(s01_s02, w14_15, w9_10: uint64x2_t) -> uint64x2_t ---
}

9
core/simd/arm/types.odin Normal file
View File

@@ -0,0 +1,9 @@
#+build arm64,arm32
package simd_arm
// Type aliases to match `arm_neon.h`.
// Scalar lane type used by the SHA-1 intrinsics.
uint32_t :: u32
// 128-bit vector types: 16x u8, 4x u32, and 2x u64 lanes respectively.
uint8x16_t :: #simd[16]u8
uint32x4_t :: #simd[4]u32
uint64x2_t :: #simd[2]u64

View File

@@ -82,7 +82,7 @@ internal_block_literal_make :: proc (is_global: bool, user_data: rawptr, user_pr
BLOCK_HAS_COPY_DISPOSE :: 1 << 25
BLOCK_HAS_CTOR :: 1 << 26 // helpers have C++ code
BLOCK_IS_GLOBAL :: 1 << 28
BLOCK_HAS_STRET :: 1 << 29 // IFF BLOCK_HAS_SIGNATURE
BLOCK_HAS_STRET :: 1 << 29 // if and only if (⟺) BLOCK_HAS_SIGNATURE
BLOCK_HAS_SIGNATURE :: 1 << 30
bl.isa = is_global ? &_NSConcreteGlobalBlock : &_NSConcreteStackBlock

View File

@@ -1683,7 +1683,20 @@ gb_internal void init_android_values(bool with_sdk) {
gb_exit(1);
}
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = concatenate_strings(permanent_allocator(), bc->ODIN_ANDROID_NDK_TOOLCHAIN, str_lit("sysroot/usr/lib/aarch64-linux-android/"));
switch (bc->metrics.arch) {
case TargetArch_arm64:
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("aarch64-linux-android");
break;
case TargetArch_arm32:
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("arm-linux-androideabi");
break;
case TargetArch_amd64:
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("x86_64-linux-android");
break;
case TargetArch_i386:
bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = str_lit("i686-linux-android");
break;
}
char buf[32] = {};
gb_snprintf(buf, gb_size_of(buf), "%d/", bc->ODIN_ANDROID_API_LEVEL);
@@ -1958,9 +1971,22 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta
} else if (metrics->os == TargetOs_linux && subtarget == Subtarget_Android) {
switch (metrics->arch) {
case TargetArch_arm64:
bc->metrics.target_triplet = str_lit("aarch64-none-linux-android");
bc->metrics.target_triplet = str_lit("aarch64-linux-android");
bc->reloc_mode = RelocMode_PIC;
break;
case TargetArch_arm32:
bc->metrics.target_triplet = str_lit("armv7a-linux-androideabi");
bc->reloc_mode = RelocMode_PIC;
break;
case TargetArch_amd64:
bc->metrics.target_triplet = str_lit("x86_64-linux-android");
bc->reloc_mode = RelocMode_PIC;
break;
case TargetArch_i386:
bc->metrics.target_triplet = str_lit("i686-linux-android");
bc->reloc_mode = RelocMode_PIC;
break;
default:
GB_PANIC("Unknown architecture for -subtarget:android");
}

View File

@@ -2971,14 +2971,21 @@ gb_internal void check_comparison(CheckerContext *c, Ast *node, Operand *x, Oper
if (check_is_assignable_to(c, x, y->type) ||
check_is_assignable_to(c, y, x->type)) {
if (x->type->failure || y->type->failure) {
// // skip any failures
x->mode = Addressing_Value;
x->type = t_untyped_bool;
return;
}
Type *err_type = x->type;
bool defined = false;
switch (op) {
case Token_CmpEq:
case Token_NotEq:
defined = (is_type_comparable(x->type) && is_type_comparable(y->type)) ||
(is_operand_nil(*x) && type_has_nil(y->type)) ||
(is_operand_nil(*y) && type_has_nil(x->type));
defined = ((is_operand_nil(*x) && type_has_nil(y->type)) ||
(is_operand_nil(*y) && type_has_nil(x->type)) ||
is_type_comparable(x->type) && is_type_comparable(y->type));
break;
case Token_Lt:
case Token_Gt:
@@ -4476,9 +4483,9 @@ gb_internal void check_binary_expr(CheckerContext *c, Operand *x, Ast *node, Typ
truncated: r = a - b*trunc(a/b)
floored: r = a - b*floor(a/b)
IFF a/0 == 0, then (a%0 == a) or (a%%0 == a)
IFF a/0 == a, then (a%0 == 0) or (a%%0 == 0)
IFF a/0 == 0b111..., then (a%0 == a) or (a%%0 == a)
If and only if (⟺) a/0 == 0, then (a%0 == a) or (a%%0 == a)
If and only if (⟺) a/0 == a, then (a%0 == 0) or (a%%0 == 0)
If and only if (⟺) a/0 == 0b111..., then (a%0 == a) or (a%%0 == a)
*/
switch (zero_behaviour) {

View File

@@ -1853,7 +1853,7 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
if (is_using && (feature_flags & OptInFeatureFlag_UsingStmt) == 0) {
ERROR_BLOCK();
error(param, "'using' has been disallowed as it is considered bad practice to use as a statement/procedure parameter outside of immediate refactoring");
error_line("\tIt you do require it for refactoring purposes or legacy code, it can be enabled on a per-file basis with '#+feature using-stmt'\n");
error_line("\tIf you do require it for refactoring purposes or legacy code, it can be enabled on a per-file basis with '#+feature using-stmt'\n");
}
if (type_expr == nullptr) {

View File

@@ -2240,7 +2240,7 @@ gb_internal void add_type_info_type_internal(CheckerContext *c, Type *t) {
case Type_BitSet:
add_type_info_type_internal(c, bt->BitSet.elem);
add_type_info_type_internal(c, bt->BitSet.underlying);
add_type_info_type_internal(c, bit_set_to_int(bt));
break;
case Type_Pointer:
@@ -2484,7 +2484,7 @@ gb_internal void add_min_dep_type_info(Checker *c, Type *t) {
case Type_BitSet:
add_min_dep_type_info(c, bt->BitSet.elem);
add_min_dep_type_info(c, bt->BitSet.underlying);
add_min_dep_type_info(c, bit_set_to_int(bt));
break;
case Type_Pointer:

View File

@@ -676,7 +676,7 @@ try_cross_linking:;
defer (gb_string_free(glue));
glue = gb_string_append_fmt(glue, "bin/clang");
glue = gb_string_append_fmt(glue, " --target=aarch64-linux-android%d ", ODIN_ANDROID_API_LEVEL);
glue = gb_string_append_fmt(glue, " --target=%s%d ", build_context.metrics.target_triplet, ODIN_ANDROID_API_LEVEL);
glue = gb_string_appendc(glue, "-c \"");
glue = gb_string_append_length(glue, ODIN_ANDROID_NDK.text, ODIN_ANDROID_NDK.len);
glue = gb_string_appendc(glue, "sources/android/native_app_glue/android_native_app_glue.c");
@@ -697,8 +697,9 @@ try_cross_linking:;
glue = gb_string_appendc(glue, "\"-I");
glue = gb_string_append_length(glue, ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len);
glue = gb_string_appendc(glue, "sysroot/usr/include/aarch64-linux-android/");
glue = gb_string_appendc(glue, "\" ");
glue = gb_string_appendc(glue, "sysroot/usr/include/");
glue = gb_string_append_length(glue, ODIN_ANDROID_NDK_TOOLCHAIN_LIB.text, ODIN_ANDROID_NDK_TOOLCHAIN_LIB.len);
glue = gb_string_appendc(glue, "/\" ");
glue = gb_string_appendc(glue, "-Wno-macro-redefined ");
@@ -969,7 +970,7 @@ try_cross_linking:;
gbString ndk_bin_directory = gb_string_make_length(temporary_allocator(), ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len);
link_command_line = gb_string_appendc(link_command_line, ndk_bin_directory);
link_command_line = gb_string_appendc(link_command_line, "bin/clang");
link_command_line = gb_string_append_fmt(link_command_line, " --target=aarch64-linux-android%d ", ODIN_ANDROID_API_LEVEL);
link_command_line = gb_string_append_fmt(link_command_line, " --target=%s%d ", build_context.metrics.target_triplet, ODIN_ANDROID_API_LEVEL);
} else {
link_command_line = gb_string_appendc(link_command_line, clang_path);
}

View File

@@ -66,6 +66,19 @@ gb_internal String get_final_microarchitecture() {
gb_internal String get_default_features() {
BuildContext *bc = &build_context;
if (bc->microarch == str_lit("native")) {
String features = make_string_c(LLVMGetHostCPUFeatures());
// Update the features string so LLVM uses it later.
if (bc->target_features_string.len > 0) {
bc->target_features_string = concatenate3_strings(permanent_allocator(), features, str_lit(","), bc->target_features_string);
} else {
bc->target_features_string = features;
}
return features;
}
int off = 0;
for (int i = 0; i < bc->metrics.arch; i += 1) {
off += target_microarch_counts[i];

View File

@@ -1424,8 +1424,8 @@ gb_internal LLVMValueRef lb_integer_modulo(lbProcedure *p, LLVMValueRef lhs, LLV
truncated: r = a - b*trunc(a/b)
floored: r = a - b*floor(a/b)
IFF a/0 == 0, then (a%0 == a) or (a%%0 == a)
IFF a/0 == a, then (a%0 == 0) or (a%%0 == 0)
If and only if (⟺) a/0 == 0, then (a%0 == a) or (a%%0 == a)
If and only if (⟺) a/0 == a, then (a%0 == 0) or (a%%0 == 0)
*/
switch (behaviour) {

View File

@@ -57,6 +57,33 @@ test_align_bumping_block_limit :: proc(t: ^testing.T) {
testing.expect(t, len(data) == 896)
}
@(test)
// Exercises a growing virtual arena whose minimum block size (16 MiB)
// is smaller than some requested allocations, checking that oversized
// and repeated allocations both succeed and honour alignment.
test_large_minimum_block_size :: proc(t: ^testing.T) {
a: virtual.Arena
defer virtual.arena_destroy(&a)
// 16 MiB minimum block size; allocations below exceed it (18874368 = 18 MiB),
// forcing the arena to allocate blocks larger than its configured minimum.
init_err := virtual.arena_init_growing(&a, 16*mem.Megabyte)
testing.expect_value(t, init_err, nil)
align : uint = 4
for _ in 0..<6 {
// Oversized allocation with a doubling alignment (4 up to 128);
// the arena is reset between iterations so each starts empty.
data, err := virtual.arena_alloc(&a, 18874368, align)
testing.expect_value(t, err, nil)
testing.expect(t, len(data) == 18874368)
// raw pointer must be aligned to `align` (align is a power of two here).
testing.expect(t, uintptr(raw_data(data)) & uintptr(align-1) == 0)
align *= 2
virtual.arena_free_all(&a)
}
align = 4
// 32 consecutive 1 MiB allocations without freeing, filling and growing
// the arena past a single block.
// NOTE(review): `align` stays 4 in this loop (no doubling) — presumably
// intentional, since the focus here is accumulation rather than alignment.
for _ in 0..<32 {
data, err := virtual.arena_alloc(&a, 1048576, align)
testing.expect_value(t, err, nil)
testing.expect(t, len(data) == 1048576)
testing.expect(t, uintptr(raw_data(data)) & uintptr(align-1) == 0)
}
}
@(test)
tlsf_test_overlap_and_zero :: proc(t: ^testing.T) {
default_allocator := context.allocator

View File

@@ -71,7 +71,7 @@
/*
@@ LUAI_IS32INT is true iff 'int' has (at least) 32 bits.
@@ LUAI_IS32INT is true if and only if (⟺) 'int' has (at least) 32 bits.
*/
#define LUAI_IS32INT ((UINT_MAX >> 30) >= 3)

View File

@@ -119,8 +119,8 @@ DeviceInfo :: struct {
structVersion: c.int, /**< this internal structure version */
interf: cstring, /**< underlying MIDI API, e.g. MMSystem or DirectX */
name: cstring, /**< device name, e.g. USB MidiSport 1x1 */
input: b32, /**< true iff input is available */
output: b32, /**< true iff output is available */
input: b32, /**< true if and only if () input is available */
output: b32, /**< true if and only if () output is available */
opened: b32, /**< used by generic PortMidi code to do error checking on arguments */
}