mirror of
https://github.com/odin-lang/Odin.git
synced 2026-04-06 06:38:20 +00:00
core/crypto/_aes/hw: Initial import
This commit is contained in:
69
core/crypto/_aes/hw/api.odin
Normal file
69
core/crypto/_aes/hw/api.odin
Normal file
@@ -0,0 +1,69 @@
|
||||
package aes_hw
|
||||
|
||||
@(require) import "core:sys/info"
|
||||
|
||||
// is_supported returns true if and only if (⟺) hardware accelerated AES
// is supported on the current CPU.
is_supported :: proc "contextless" () -> bool {
	when ODIN_ARCH == .amd64 {
		// Note: Everything with AES-NI also has support for
		// the required SSE extensions.
		required :: info.CPU_Features{.sse2, .ssse3, .sse41, .aes}
		return info.cpu_features() >= required
	} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
		required :: info.CPU_Features{.asimd, .aes}
		return info.cpu_features() >= required
	} else {
		return false
	}
}
|
||||
|
||||
// is_ghash_supported returns true if and only if (⟺) hardware accelerated
// GHASH is supported.
is_ghash_supported :: proc "contextless" () -> bool {
	// Hardware GHASH without hardware AES is not useful, so require both.
	if !is_supported() {
		return false
	}

	when ODIN_ARCH == .amd64 {
		return info.cpu_features() >= info.CPU_Features{.pclmulqdq}
	} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
		// Once we can actually use this, we can re-enable this.
		//
		// return info.cpu_features() >= info.CPU_Features{.pmull}
		return false
	} else {
		return false
	}
}
|
||||
|
||||
// Context is a keyed AES (ECB) instance.
Context :: struct {
	// Note: Ideally the expanded round keys would be arrays of
	// `u8x16`, however that implies alignment (or using AVX).
	//
	// All the people using e-waste processors that don't support an
	// instruction set that has been around for over 10 years are why
	// we can't have nice things.
	_sk_exp_enc: [15][16]byte, // Expanded encryption round keys.
	_sk_exp_dec: [15][16]byte, // Expanded decryption round keys.
	_num_rounds: int,          // Round count (10/12/14 per key size).
}

// init initializes a context for AES with the provided key.
init :: proc(ctx: ^Context, key: []byte) {
	keysched(ctx, key)
}
|
||||
@@ -21,7 +21,7 @@
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#+build amd64
|
||||
package aes_hw_intel
|
||||
package aes_hw
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
115
core/crypto/_aes/hw/intrinsics_arm.odin
Normal file
115
core/crypto/_aes/hw/intrinsics_arm.odin
Normal file
@@ -0,0 +1,115 @@
|
||||
#+build arm64,arm32
|
||||
package aes_hw
|
||||
|
||||
import "core:simd"
|
||||
import "core:simd/arm"
|
||||
|
||||
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/
|
||||
|
||||
TARGET_FEATURES :: "neon,aes"
|
||||
HAS_GHASH :: false // Temporary
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// AESD with a zero key performs AddRoundKey(0) + InvShiftRows +
	// InvSubBytes; AESIMC then applies InvMixColumns.  XOR-ing the round
	// key last yields Intel AESDEC semantics.
	mixed := arm.vaesimcq_u8(arm.vaesdq_u8(data, simd.u8x16{}))
	return simd.bit_xor(mixed, key)
}

@(require_results, enable_target_feature = "aes")
aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// Final decryption round: same as aesdec but without InvMixColumns.
	return simd.bit_xor(arm.vaesdq_u8(data, simd.u8x16{}), key)
}

@(require_results, enable_target_feature = "aes")
aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// AESE(0) + AESMC, then XOR the round key, matches Intel AESENC.
	mixed := arm.vaesmcq_u8(arm.vaeseq_u8(data, simd.u8x16{}))
	return simd.bit_xor(mixed, key)
}

@(require_results, enable_target_feature = "aes")
aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// Final encryption round: same as aesenc but without MixColumns.
	return simd.bit_xor(arm.vaeseq_u8(data, simd.u8x16{}), key)
}

aesimc :: arm.vaesimcq_u8
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
	// AESE with a zero key does ShiftRows and SubBytes on `data`.
	sub := arm.vaeseq_u8(data, simd.u8x16{})

	// Undo the ShiftRows step from AESE and extract X1 and X3.
	shuffled := simd.swizzle(
		sub,
		0x04, 0x01, 0x0e, 0x0b, // SubBytes(X1)
		0x01, 0x0e, 0x0b, 0x04, // ROT(SubBytes(X1))
		0x0c, 0x09, 0x06, 0x03, // SubBytes(X3)
		0x09, 0x06, 0x03, 0x0c, // ROT(SubBytes(X3))
	)

	// Fold the round constant into the two rotated words.
	rcon := simd.u8x16{
		0, 0, 0, 0,
		IMM8, 0, 0, 0,
		0, 0, 0, 0,
		IMM8, 0, 0, 0,
	}

	return simd.bit_xor(shuffled, rcon)
}
|
||||
|
||||
// The keyschedule implementation is easier to read with some extra
// Intel intrinsics that are emulated by built-in LLVM ops anyway.

@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
	count :: IMM8 & 0xff

	// This needs to emit behavior identical to PSLLDQ, which is:
	//
	//   TEMP := COUNT
	//   IF (TEMP > 15) THEN TEMP := 16; FI
	//   DEST := DEST << (TEMP * 8)
	//   DEST[MAXVL-1:128] (Unmodified)
	//
	// A byte-wise left shift is a shuffle over {zero, a}: lane i of the
	// result selects element 16 - count + i, pulling zeros in from the
	// low end.  Shift counts > 15 select only zeros.
	return simd.shuffle(
		simd.u8x16{},
		a,
		0 when count > 15 else (16 - count + 0),
		1 when count > 15 else (16 - count + 1),
		2 when count > 15 else (16 - count + 2),
		3 when count > 15 else (16 - count + 3),
		4 when count > 15 else (16 - count + 4),
		5 when count > 15 else (16 - count + 5),
		6 when count > 15 else (16 - count + 6),
		7 when count > 15 else (16 - count + 7),
		8 when count > 15 else (16 - count + 8),
		9 when count > 15 else (16 - count + 9),
		10 when count > 15 else (16 - count + 10),
		11 when count > 15 else (16 - count + 11),
		12 when count > 15 else (16 - count + 12),
		13 when count > 15 else (16 - count + 13),
		14 when count > 15 else (16 - count + 14),
		15 when count > 15 else (16 - count + 15),
	)
}
|
||||
|
||||
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
	// Each 2-bit field of IMM8 selects one 32-bit lane of `a`, matching
	// Intel PSHUFD semantics.
	lanes := transmute(simd.i32x4)a
	shuffled := simd.shuffle(
		lanes,
		lanes,
		IMM8 & 0b11,
		(IMM8 >> 2) & 0b11,
		(IMM8 >> 4) & 0b11,
		(IMM8 >> 6) & 0b11,
	)
	return transmute(simd.u8x16)shuffled
}
|
||||
|
||||
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
	// SHUFPS semantics: the low two selectors pick 32-bit lanes from `a`,
	// the high two pick lanes from `b` (offset by 4 in the concatenation).
	lo := transmute(simd.u32x4)(a)
	hi := transmute(simd.u32x4)(b)
	return transmute(simd.u8x16)simd.shuffle(
		lo,
		hi,
		u32(MASK) & 0b11,
		(u32(MASK) >> 2) & 0b11,
		((u32(MASK) >> 4) & 0b11) + 4,
		((u32(MASK) >> 6) & 0b11) + 4,
	)
}
|
||||
55
core/crypto/_aes/hw/intrinsics_intel.odin
Normal file
55
core/crypto/_aes/hw/intrinsics_intel.odin
Normal file
@@ -0,0 +1,55 @@
|
||||
#+build amd64
|
||||
package aes_hw
|
||||
|
||||
import "core:simd"
|
||||
import "core:simd/x86"
|
||||
|
||||
// Intel/RISC-V semantics.
|
||||
|
||||
TARGET_FEATURES :: "sse,sse2,ssse3,sse4.1,aes"
|
||||
HAS_GHASH :: true
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// One full AES decryption round via AES-NI.
	r := x86._mm_aesdec_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))
	return transmute(simd.u8x16)(r)
}

@(require_results, enable_target_feature = "aes")
aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// Final AES decryption round (no InvMixColumns).
	r := x86._mm_aesdeclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))
	return transmute(simd.u8x16)(r)
}

@(require_results, enable_target_feature = "aes")
aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// One full AES encryption round via AES-NI.
	r := x86._mm_aesenc_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))
	return transmute(simd.u8x16)(r)
}

@(require_results, enable_target_feature = "aes")
aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// Final AES encryption round (no MixColumns).
	r := x86._mm_aesenclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))
	return transmute(simd.u8x16)(r)
}

@(require_results, enable_target_feature = "aes")
aesimc :: #force_inline proc "c" (data: simd.u8x16) -> simd.u8x16 {
	// InvMixColumns, used to derive the decryption key schedule.
	r := x86._mm_aesimc_si128(transmute(x86.__m128i)(data))
	return transmute(simd.u8x16)(r)
}

@(require_results, enable_target_feature = "aes")
aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
	// Key expansion helper (SubWord/RotWord + round constant).
	r := x86._mm_aeskeygenassist_si128(transmute(x86.__m128i)(data), IMM8)
	return transmute(simd.u8x16)(r)
}
|
||||
|
||||
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
	// Byte-wise left shift of the 128-bit lane (PSLLDQ).
	r := x86._mm_slli_si128(transmute(x86.__m128i)(a), IMM8)
	return transmute(simd.u8x16)(r)
}

@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
	// 32-bit lane shuffle (PSHUFD).
	r := x86._mm_shuffle_epi32(transmute(x86.__m128i)(a), IMM8)
	return transmute(simd.u8x16)(r)
}

@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
	// Two-source 32-bit lane shuffle (SHUFPS).
	r := x86._mm_shuffle_ps(transmute(x86.__m128)(a), transmute(x86.__m128)(b), MASK)
	return transmute(simd.u8x16)(r)
}
|
||||
181
core/crypto/_aes/hw/keysched_hw.odin
Normal file
181
core/crypto/_aes/hw/keysched_hw.odin
Normal file
@@ -0,0 +1,181 @@
|
||||
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
|
||||
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
||||
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
// This key schedule is shared by every hardware backend: intrinsics_arm.odin
// is `#+build arm64,arm32` and unsupported.odin excludes amd64/arm64/arm32,
// so omitting arm64 here would leave `keysched` undefined on arm64 builds.
#+build amd64, arm64, arm32
package aes_hw
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:simd"
|
||||
|
||||
// Inspiration taken from BearSSL's AES-NI implementation.
|
||||
//
|
||||
// Note: This assumes that the SROA optimization pass is enabled to be
|
||||
// anything resembling performant otherwise, LLVM will not elide a massive
|
||||
// number of redundant loads/stores it generates for every intrinsic call.
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step128 :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
	// AES-128 expansion step: broadcast word 3 of the keygen-assist
	// output, then fold the previous round key's words into each other.
	prev, assist := k1, _mm_shuffle_epi32(k2, 0xff)

	prev = simd.bit_xor(prev, _mm_slli_si128(prev, 0x04))
	prev = simd.bit_xor(prev, _mm_slli_si128(prev, 0x04))
	prev = simd.bit_xor(prev, _mm_slli_si128(prev, 0x04))
	return simd.bit_xor(prev, assist)
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step192a :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> (simd.u8x16, simd.u8x16) {
	// AES-192 expansion step that yields two full round keys, updating
	// the caller's key state (k1_, k2_) in place.
	k1, k2 := k1_^, k2_^
	assist := _mm_shuffle_epi32(k3, 0x55)

	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	k1 = simd.bit_xor(k1, assist)

	prev_k2 := k2
	k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04))
	k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff))

	k1_^, k2_^ = k1, k2

	// Repack the 64-bit halves into the two output round keys.
	r1 := _mm_shuffle_ps(prev_k2, k1, 0x44)
	r2 := _mm_shuffle_ps(k1, k2, 0x4e)

	return r1, r2
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step192b :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> simd.u8x16 {
	// AES-192 expansion step that yields a single round key, updating
	// the caller's key state (k1_, k2_) in place.
	k1, k2 := k1_^, k2_^
	assist := _mm_shuffle_epi32(k3, 0x55)

	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	k1 = simd.bit_xor(k1, assist)

	k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04))
	k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff))

	k1_^, k2_^ = k1, k2

	return k1
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step256b :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
	// AES-256 "odd" expansion step: broadcasts word 2 (0xaa) of the
	// assist value rather than word 3 as expand_step128 does.
	prev, assist := k1, _mm_shuffle_epi32(k2, 0xaa)

	prev = simd.bit_xor(prev, _mm_slli_si128(prev, 0x04))
	prev = simd.bit_xor(prev, _mm_slli_si128(prev, 0x04))
	prev = simd.bit_xor(prev, _mm_slli_si128(prev, 0x04))
	return simd.bit_xor(prev, assist)
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]simd.u8x16, num_rounds: int) {
	// The decryption schedule is the encryption schedule reversed, with
	// InvMixColumns (aesimc) applied to every key except the first and
	// the last.
	intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[0]), sks[num_rounds])
	for i in 1 ..< num_rounds {
		intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds - i]), aesimc(sks[i]))
	}
	intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds]), sks[0])
}
|
||||
|
||||
// keysched expands `key` into the encryption and decryption round-key
// schedules stored in `ctx`, and records the round count.  Panics on an
// invalid key size.  Note: `aeskeygenassist` takes a compile-time round
// constant, so the expansion chains cannot be rolled into loops.
@(private, enable_target_feature = TARGET_FEATURES)
keysched :: proc(ctx: ^Context, key: []byte) {
	sks: [15]simd.u8x16 = ---

	// Expand the encryption round keys.
	num_rounds := 0
	switch len(key) {
	case _aes.KEY_SIZE_128:
		sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
		sks[1] = expand_step128(sks[0], aeskeygenassist(sks[0], 0x01))
		sks[2] = expand_step128(sks[1], aeskeygenassist(sks[1], 0x02))
		sks[3] = expand_step128(sks[2], aeskeygenassist(sks[2], 0x04))
		sks[4] = expand_step128(sks[3], aeskeygenassist(sks[3], 0x08))
		sks[5] = expand_step128(sks[4], aeskeygenassist(sks[4], 0x10))
		sks[6] = expand_step128(sks[5], aeskeygenassist(sks[5], 0x20))
		sks[7] = expand_step128(sks[6], aeskeygenassist(sks[6], 0x40))
		sks[8] = expand_step128(sks[7], aeskeygenassist(sks[7], 0x80))
		sks[9] = expand_step128(sks[8], aeskeygenassist(sks[8], 0x1b))
		sks[10] = expand_step128(sks[9], aeskeygenassist(sks[9], 0x36))
		num_rounds = _aes.ROUNDS_128
	case _aes.KEY_SIZE_192:
		k0 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))

		// Zero-pad the final 8 key bytes out to a full vector, and
		// scrub the stack copy afterwards.
		k1_tmp: [16]byte
		copy(k1_tmp[:], key[16:24])
		k1 := intrinsics.unaligned_load((^simd.u8x16)(&k1_tmp))
		crypto.zero_explicit(&k1_tmp, size_of(k1_tmp))

		sks[0] = k0
		sks[1], sks[2] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x01))
		sks[3] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x02))
		sks[4], sks[5] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x04))
		sks[6] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x08))
		sks[7], sks[8] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x10))
		sks[9] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x20))
		sks[10], sks[11] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x40))
		sks[12] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x80))
		num_rounds = _aes.ROUNDS_192
	case _aes.KEY_SIZE_256:
		sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
		sks[1] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:])))
		sks[2] = expand_step128(sks[0], aeskeygenassist(sks[1], 0x01))
		sks[3] = expand_step256b(sks[1], aeskeygenassist(sks[2], 0x01))
		sks[4] = expand_step128(sks[2], aeskeygenassist(sks[3], 0x02))
		sks[5] = expand_step256b(sks[3], aeskeygenassist(sks[4], 0x02))
		sks[6] = expand_step128(sks[4], aeskeygenassist(sks[5], 0x04))
		sks[7] = expand_step256b(sks[5], aeskeygenassist(sks[6], 0x04))
		sks[8] = expand_step128(sks[6], aeskeygenassist(sks[7], 0x08))
		sks[9] = expand_step256b(sks[7], aeskeygenassist(sks[8], 0x08))
		sks[10] = expand_step128(sks[8], aeskeygenassist(sks[9], 0x10))
		sks[11] = expand_step256b(sks[9], aeskeygenassist(sks[10], 0x10))
		sks[12] = expand_step128(sks[10], aeskeygenassist(sks[11], 0x20))
		sks[13] = expand_step256b(sks[11], aeskeygenassist(sks[12], 0x20))
		sks[14] = expand_step128(sks[12], aeskeygenassist(sks[13], 0x40))
		num_rounds = _aes.ROUNDS_256
	case:
		panic("crypto/aes: invalid AES key size")
	}
	for i in 0 ..= num_rounds {
		intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_enc[i]), sks[i])
	}

	// Compute the decryption keys. GCM and CTR do not need this, however
	// ECB, CBC, OCB3, etc do.
	derive_dec_keys(ctx, &sks, num_rounds)

	ctx._num_rounds = num_rounds

	crypto.zero_explicit(&sks, size_of(sks))
}
|
||||
11
core/crypto/_aes/hw/unsupported.odin
Normal file
11
core/crypto/_aes/hw/unsupported.odin
Normal file
@@ -0,0 +1,11 @@
|
||||
#+build !amd64
#+build !arm64
#+build !arm32
package aes_hw

// No hardware GHASH on targets without a hardware AES backend.
HAS_GHASH :: false

// keysched is the stub for targets with no hardware AES implementation;
// callers are expected to gate on is_supported() and never reach this.
@(private)
keysched :: proc(ctx: ^Context, key: []byte) {
	panic("crypto/aes: hardware implementation unsupported")
}
|
||||
@@ -1,38 +0,0 @@
|
||||
#+build amd64
|
||||
package aes_hw_intel
|
||||
|
||||
import "core:sys/info"
|
||||
|
||||
// is_supported returns true if and only if (⟺) hardware accelerated AES
|
||||
// is supported.
|
||||
is_supported :: proc "contextless" () -> bool {
|
||||
// Note: Everything with AES-NI and PCLMULQDQ has support for
|
||||
// the required SSE extxtensions.
|
||||
req_features :: info.CPU_Features{
|
||||
.sse2,
|
||||
.ssse3,
|
||||
.sse41,
|
||||
.aes,
|
||||
.pclmulqdq,
|
||||
}
|
||||
return info.cpu_features() >= req_features
|
||||
}
|
||||
|
||||
// Context is a keyed AES (ECB) instance.
|
||||
Context :: struct {
|
||||
// Note: The ideal thing to do is for the expanded round keys to be
|
||||
// arrays of `__m128i`, however that implies alignment (or using AVX).
|
||||
//
|
||||
// All the people using e-waste processors that don't support an
|
||||
// insturction set that has been around for over 10 years are why
|
||||
// we can't have nice things.
|
||||
_sk_exp_enc: [15][16]byte,
|
||||
_sk_exp_dec: [15][16]byte,
|
||||
_num_rounds: int,
|
||||
}
|
||||
|
||||
// init initializes a context for AES with the provided key.
|
||||
init :: proc(ctx: ^Context, key: []byte) {
|
||||
keysched(ctx, key)
|
||||
}
|
||||
|
||||
@@ -1,178 +0,0 @@
|
||||
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
|
||||
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
||||
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#+build amd64
|
||||
package aes_hw_intel
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:simd/x86"
|
||||
|
||||
// Intel AES-NI based implementation. Inspiration taken from BearSSL.
|
||||
//
|
||||
// Note: This assumes that the SROA optimization pass is enabled to be
|
||||
// anything resembling performat otherwise, LLVM will not elide a massive
|
||||
// number of redundant loads/stores it generates for every intrinsic call.
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2 := k1, k2
|
||||
|
||||
k2 = x86._mm_shuffle_epi32(k2, 0xff)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
return x86._mm_xor_si128(k1, k2)
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse,sse2")
|
||||
expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
|
||||
k1, k2, k3 := k1_^, k2_^, k3
|
||||
|
||||
k3 = x86._mm_shuffle_epi32(k3, 0x55)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, k3)
|
||||
|
||||
tmp := k2
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
|
||||
|
||||
k1_, k2_ := k1_, k2_
|
||||
k1_^, k2_^ = k1, k2
|
||||
|
||||
r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44))
|
||||
r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e))
|
||||
|
||||
return r1, r2
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2, k3 := k1_^, k2_^, k3
|
||||
|
||||
k3 = x86._mm_shuffle_epi32(k3, 0x55)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, k3)
|
||||
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
|
||||
|
||||
k1_, k2_ := k1_, k2_
|
||||
k1_^, k2_^ = k1, k2
|
||||
|
||||
return k1
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2 := k1, k2
|
||||
|
||||
k2 = x86._mm_shuffle_epi32(k2, 0xaa)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
return x86._mm_xor_si128(k1, k2)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "aes")
|
||||
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) {
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds])
|
||||
for i in 1 ..< num_rounds {
|
||||
tmp := x86._mm_aesimc_si128(sks[i])
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
|
||||
}
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0])
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse,sse2,aes")
|
||||
keysched :: proc(ctx: ^Context, key: []byte) {
|
||||
sks: [15]x86.__m128i = ---
|
||||
|
||||
// Compute the encryption keys.
|
||||
num_rounds, key_len := 0, len(key)
|
||||
switch key_len {
|
||||
case _aes.KEY_SIZE_128:
|
||||
sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01))
|
||||
sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02))
|
||||
sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04))
|
||||
sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08))
|
||||
sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10))
|
||||
sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20))
|
||||
sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40))
|
||||
sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80))
|
||||
sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b))
|
||||
sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36))
|
||||
num_rounds = _aes.ROUNDS_128
|
||||
case _aes.KEY_SIZE_192:
|
||||
k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
k1 := x86.__m128i{
|
||||
intrinsics.unaligned_load((^i64)(raw_data(key[16:]))),
|
||||
0,
|
||||
}
|
||||
sks[0] = k0
|
||||
sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01))
|
||||
sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02))
|
||||
sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04))
|
||||
sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08))
|
||||
sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10))
|
||||
sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20))
|
||||
sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40))
|
||||
sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80))
|
||||
num_rounds = _aes.ROUNDS_192
|
||||
case _aes.KEY_SIZE_256:
|
||||
sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
sks[1] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:])))
|
||||
sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01))
|
||||
sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01))
|
||||
sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02))
|
||||
sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02))
|
||||
sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04))
|
||||
sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04))
|
||||
sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08))
|
||||
sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08))
|
||||
sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10))
|
||||
sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10))
|
||||
sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20))
|
||||
sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20))
|
||||
sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40))
|
||||
num_rounds = _aes.ROUNDS_256
|
||||
case:
|
||||
panic("crypto/aes: invalid AES key size")
|
||||
}
|
||||
for i in 0 ..= num_rounds {
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i])
|
||||
}
|
||||
|
||||
// Compute the decryption keys. GCM and CTR do not need this, however
|
||||
// ECB, CBC, OCB3, etc do.
|
||||
derive_dec_keys(ctx, &sks, num_rounds)
|
||||
|
||||
ctx._num_rounds = num_rounds
|
||||
|
||||
crypto.zero_explicit(&sks, size_of(sks))
|
||||
}
|
||||
@@ -4,7 +4,7 @@ package aes
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:crypto/_aes/hw_intel"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:encoding/endian"
|
||||
import "core:simd/x86"
|
||||
|
||||
@@ -17,7 +17,7 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: [
|
||||
init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
|
||||
|
||||
// Note: Our GHASH implementation handles appending padding.
|
||||
hw_intel.ghash(s[:], h[:], aad)
|
||||
aes_hw.ghash(s[:], h[:], aad)
|
||||
gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true)
|
||||
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext))
|
||||
copy(tag, s[:])
|
||||
@@ -35,7 +35,7 @@ gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag:
|
||||
s: [_aes.GHASH_TAG_SIZE]byte
|
||||
init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
|
||||
|
||||
hw_intel.ghash(s[:], h[:], aad)
|
||||
aes_hw.ghash(s[:], h[:], aad)
|
||||
gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false)
|
||||
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext))
|
||||
|
||||
@@ -71,11 +71,11 @@ init_ghash_hw :: proc(
|
||||
} else {
|
||||
// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
|
||||
// and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64).
|
||||
hw_intel.ghash(j0[:], h[:], iv)
|
||||
aes_hw.ghash(j0[:], h[:], iv)
|
||||
|
||||
tmp: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
|
||||
hw_intel.ghash(j0[:], h[:], tmp[:])
|
||||
aes_hw.ghash(j0[:], h[:], tmp[:])
|
||||
}
|
||||
|
||||
// ECB encrypt j0, so that we can just XOR with the tag.
|
||||
@@ -94,7 +94,7 @@ final_ghash_hw :: proc(
|
||||
endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8)
|
||||
endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8)
|
||||
|
||||
hw_intel.ghash(s[:], h[:], blk[:])
|
||||
aes_hw.ghash(s[:], h[:], blk[:])
|
||||
j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0))
|
||||
s_vec := intrinsics.unaligned_load((^x86.__m128i)(s))
|
||||
s_vec = x86._mm_xor_si128(s_vec, j0_vec)
|
||||
@@ -131,7 +131,7 @@ gctr_hw :: proc(
|
||||
nr_blocks := len(src) / BLOCK_SIZE
|
||||
for nr_blocks >= CTR_STRIDE_HW {
|
||||
if !is_seal {
|
||||
hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
|
||||
aes_hw.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
|
||||
}
|
||||
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
@@ -174,7 +174,7 @@ gctr_hw :: proc(
|
||||
xor_blocks_hw(dst, src, blks[:])
|
||||
|
||||
if is_seal {
|
||||
hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
|
||||
aes_hw.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
|
||||
}
|
||||
|
||||
src = src[CTR_STRIDE_BYTES_HW:]
|
||||
@@ -186,7 +186,7 @@ gctr_hw :: proc(
|
||||
for n := len(src); n > 0; {
|
||||
l := min(n, BLOCK_SIZE)
|
||||
if !is_seal {
|
||||
hw_intel.ghash(s[:], h[:], src[:l])
|
||||
aes_hw.ghash(s[:], h[:], src[:l])
|
||||
}
|
||||
|
||||
blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr)
|
||||
@@ -219,7 +219,7 @@ gctr_hw :: proc(
|
||||
copy(dst, blk[:l])
|
||||
}
|
||||
if is_seal {
|
||||
hw_intel.ghash(s[:], h[:], dst[:l])
|
||||
aes_hw.ghash(s[:], h[:], dst[:l])
|
||||
}
|
||||
|
||||
dst = dst[l:]
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
#+build amd64
|
||||
package aes
|
||||
|
||||
import "core:crypto/_aes/hw_intel"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return hw_intel.is_supported()
|
||||
return aes_hw.is_supported()
|
||||
}
|
||||
|
||||
@(private)
|
||||
Context_Impl_Hardware :: hw_intel.Context
|
||||
Context_Impl_Hardware :: aes_hw.Context
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
|
||||
hw_intel.init(ctx, key)
|
||||
aes_hw.init(ctx, key)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user