diff --git a/core/crypto/_aes/hw/api.odin b/core/crypto/_aes/hw/api.odin
new file mode 100644
index 000000000..09f674657
--- /dev/null
+++ b/core/crypto/_aes/hw/api.odin
@@ -0,0 +1,69 @@
+package aes_hw
+
+@(require) import "core:sys/info"
+
+// is_supported returns true if and only if (⟺) hardware accelerated AES
+// is supported.
+is_supported :: proc "contextless" () -> bool {
+	when ODIN_ARCH == .amd64 {
+		// Note: Everything with AES-NI has support for
+		// the required SSE extensions.
+		req_features :: info.CPU_Features{
+			.sse2,
+			.ssse3,
+			.sse41,
+			.aes,
+		}
+		return info.cpu_features() >= req_features
+	} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
+		req_features :: info.CPU_Features{
+			.asimd,
+			.aes,
+		}
+		return info.cpu_features() >= req_features
+	} else {
+		return false
+	}
+}
+
+// is_ghash_supported returns true if and only if (⟺) hardware accelerated
+// GHASH is supported.
+is_ghash_supported :: proc "contextless" () -> bool {
+	// Just having hardware GHASH is silly.
+	if !is_supported() {
+		return false
+	}
+
+	when ODIN_ARCH == .amd64 {
+		return info.cpu_features() >= info.CPU_Features{
+			.pclmulqdq,
+		}
+	} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
+		// Once we can actually use this, we can re-enable this.
+		//
+		// return info.cpu_features() >= info.CPU_Features{
+		// 	.pmull,
+		// }
+		return false
+	} else {
+		return false
+	}
+}
+
+// Context is a keyed AES (ECB) instance.
+Context :: struct {
+	// Note: The ideal thing to do is for the expanded round keys to be
+	// arrays of `u8x16`, however that implies alignment (or using AVX).
+	//
+	// All the people using e-waste processors that don't support an
+	// instruction set that has been around for over 10 years are why
+	// we can't have nice things.
+	_sk_exp_enc: [15][16]byte,
+	_sk_exp_dec: [15][16]byte,
+	_num_rounds: int,
+}
+
+// init initializes a context for AES with the provided key.
+init :: proc(ctx: ^Context, key: []byte) { + keysched(ctx, key) +} diff --git a/core/crypto/_aes/hw_intel/ghash.odin b/core/crypto/_aes/hw/ghash_intel.odin similarity index 99% rename from core/crypto/_aes/hw_intel/ghash.odin rename to core/crypto/_aes/hw/ghash_intel.odin index 5f51b614b..d80816d5d 100644 --- a/core/crypto/_aes/hw_intel/ghash.odin +++ b/core/crypto/_aes/hw/ghash_intel.odin @@ -21,7 +21,7 @@ // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #+build amd64 -package aes_hw_intel +package aes_hw import "base:intrinsics" import "core:crypto/_aes" diff --git a/core/crypto/_aes/hw/intrinsics_arm.odin b/core/crypto/_aes/hw/intrinsics_arm.odin new file mode 100644 index 000000000..ccd8efa8f --- /dev/null +++ b/core/crypto/_aes/hw/intrinsics_arm.odin @@ -0,0 +1,115 @@ +#+build arm64,arm32 +package aes_hw + +import "core:simd" +import "core:simd/arm" + +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/ + +TARGET_FEATURES :: "neon,aes" +HAS_GHASH :: false // Temporary + +@(require_results, enable_target_feature = "aes") +aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return simd.bit_xor(arm.vaesimcq_u8(arm.vaesdq_u8(data, simd.u8x16{})), key) +} + +@(require_results, enable_target_feature = "aes") +aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return simd.bit_xor(arm.vaesdq_u8(data, simd.u8x16{}), key) +} + +@(require_results, enable_target_feature = "aes") +aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return simd.bit_xor(arm.vaesmcq_u8(arm.vaeseq_u8(data, simd.u8x16{})), key) +} + +@(require_results, enable_target_feature = "aes") +aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return simd.bit_xor(arm.vaeseq_u8(data, simd.u8x16{}), key) +} + +aesimc :: arm.vaesimcq_u8 + +@(require_results, enable_target_feature = "aes") +aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, 
$IMM8: u8) -> simd.u8x16 { + a := arm.vaeseq_u8(data, simd.u8x16{}) // AESE does ShiftRows and SubBytes on A + + // Undo ShiftRows step from AESE and extract X1 and X3 + dest := simd.swizzle( + a, + 0x04, 0x01, 0x0e, 0x0b, // SubBytes(X1) + 0x01, 0x0e, 0x0b, 0x04, // ROT(SubBytes(X1)) + 0x0c, 0x09, 0x06, 0x03, // SubBytes(X3) + 0x09, 0x06, 0x03, 0x0c, // ROT(SubBytes(X3)) + ) + + rcons := simd.u8x16{ + 0, 0, 0, 0, + IMM8, 0, 0, 0, + 0, 0, 0, 0, + IMM8, 0, 0, 0, + } + + return simd.bit_xor(dest, rcons) +} + +// The keyschedule implementation is easier to read with some extra +// Intel intrinsics that are emulated by built-in LLVM ops anyway. + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 { + shift :: IMM8 & 0xff + + // This needs to emit behavior identical to PSLLDQ which is as follows: + // + // TEMP := COUNT + // IF (TEMP > 15) THEN TEMP := 16; FI + // DEST := DEST << (TEMP * 8) + // DEST[MAXVL-1:128] (Unmodified) + + return simd.shuffle( + simd.u8x16{}, + a, + 0 when shift > 15 else (16 - shift + 0), + 1 when shift > 15 else (16 - shift + 1), + 2 when shift > 15 else (16 - shift + 2), + 3 when shift > 15 else (16 - shift + 3), + 4 when shift > 15 else (16 - shift + 4), + 5 when shift > 15 else (16 - shift + 5), + 6 when shift > 15 else (16 - shift + 6), + 7 when shift > 15 else (16 - shift + 7), + 8 when shift > 15 else (16 - shift + 8), + 9 when shift > 15 else (16 - shift + 9), + 10 when shift > 15 else (16 - shift + 10), + 11 when shift > 15 else (16 - shift + 11), + 12 when shift > 15 else (16 - shift + 12), + 13 when shift > 15 else (16 - shift + 13), + 14 when shift > 15 else (16 - shift + 14), + 15 when shift > 15 else (16 - shift + 15), + ) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 { + v := transmute(simd.i32x4)a + return 
transmute(simd.u8x16)simd.shuffle( + v, + v, + IMM8 & 0b11, + (IMM8 >> 2) & 0b11, + (IMM8 >> 4) & 0b11, + (IMM8 >> 6) & 0b11, + ) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 { + return transmute(simd.u8x16)simd.shuffle( + transmute(simd.u32x4)(a), + transmute(simd.u32x4)(b), + u32(MASK) & 0b11, + (u32(MASK)>>2) & 0b11, + ((u32(MASK)>>4) & 0b11)+4, + ((u32(MASK)>>6) & 0b11)+4) +} diff --git a/core/crypto/_aes/hw/intrinsics_intel.odin b/core/crypto/_aes/hw/intrinsics_intel.odin new file mode 100644 index 000000000..25399dfae --- /dev/null +++ b/core/crypto/_aes/hw/intrinsics_intel.odin @@ -0,0 +1,55 @@ +#+build amd64 +package aes_hw + +import "core:simd" +import "core:simd/x86" + +// Intel/RISC-V semantics. + +TARGET_FEATURES :: "sse,sse2,ssse3,sse4.1,aes" +HAS_GHASH :: true + +@(require_results, enable_target_feature = "aes") +aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesdec_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))) +} + +@(require_results, enable_target_feature = "aes") +aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesdeclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))) +} + +@(require_results, enable_target_feature = "aes") +aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesenc_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))) +} + +@(require_results, enable_target_feature = "aes") +aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesenclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))) +} + +@(require_results, enable_target_feature = "aes") +aesimc :: #force_inline proc "c" (data: simd.u8x16) -> 
simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aesimc_si128(transmute(x86.__m128i)(data))) +} + +@(require_results, enable_target_feature = "aes") +aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_aeskeygenassist_si128(transmute(x86.__m128i)(data), IMM8)) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_slli_si128(transmute(x86.__m128i)(a), IMM8)) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_shuffle_epi32(transmute(x86.__m128i)(a), IMM8)) +} + +@(private, require_results, enable_target_feature = TARGET_FEATURES) +_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 { + return transmute(simd.u8x16)(x86._mm_shuffle_ps(transmute(x86.__m128)(a), transmute(x86.__m128)(b), MASK)) +} diff --git a/core/crypto/_aes/hw/keysched_hw.odin b/core/crypto/_aes/hw/keysched_hw.odin new file mode 100644 index 000000000..7d85c43b7 --- /dev/null +++ b/core/crypto/_aes/hw/keysched_hw.odin @@ -0,0 +1,181 @@ +// Copyright (c) 2017 Thomas Pornin +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR +// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#+build amd64,arm64,arm32 +package aes_hw + +import "base:intrinsics" +import "core:crypto" +import "core:crypto/_aes" +import "core:simd" + +// Inspiration taken from BearSSL's AES-NI implementation. +// +// Note: This assumes that the SROA optimization pass is enabled to be +// anything resembling performant otherwise, LLVM will not elide a massive +// number of redundant loads/stores it generates for every intrinsic call. + +@(private = "file", require_results, enable_target_feature = TARGET_FEATURES) +expand_step128 :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 { + k1, k2 := k1, k2 + + k2 = _mm_shuffle_epi32(k2, 0xff) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + return simd.bit_xor(k1, k2) +} + +@(private = "file", require_results, enable_target_feature = TARGET_FEATURES) +expand_step192a :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> (simd.u8x16, simd.u8x16) { + k1, k2, k3 := k1_^, k2_^, k3 + + k3 = _mm_shuffle_epi32(k3, 0x55) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, k3) + + tmp := k2 + k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04)) + k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff)) + + k1_, k2_ := k1_, k2_ + k1_^, k2_^ = k1, k2 + + r1 := _mm_shuffle_ps(tmp, k1, 0x44) + r2 := 
_mm_shuffle_ps(k1, k2, 0x4e) + + return r1, r2 +} + +@(private = "file", require_results, enable_target_feature = TARGET_FEATURES) +expand_step192b :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> simd.u8x16 { + k1, k2, k3 := k1_^, k2_^, k3 + + k3 = _mm_shuffle_epi32(k3, 0x55) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, k3) + + k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04)) + k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff)) + + k1_, k2_ := k1_, k2_ + k1_^, k2_^ = k1, k2 + + return k1 +} + +@(private = "file", require_results, enable_target_feature = TARGET_FEATURES) +expand_step256b :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 { + k1, k2 := k1, k2 + + k2 = _mm_shuffle_epi32(k2, 0xaa) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04)) + return simd.bit_xor(k1, k2) +} + +@(private = "file", enable_target_feature = TARGET_FEATURES) +derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]simd.u8x16, num_rounds: int) { + intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[0]), sks[num_rounds]) + for i in 1 ..< num_rounds { + tmp := aesimc(sks[i]) + intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds - i]), tmp) + } + intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds]), sks[0]) +} + +@(private, enable_target_feature = TARGET_FEATURES) +keysched :: proc(ctx: ^Context, key: []byte) { + sks: [15]simd.u8x16 = --- + + // Compute the encryption keys. 
+ num_rounds, key_len := 0, len(key) + switch key_len { + case _aes.KEY_SIZE_128: + sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key))) + sks[1] = expand_step128(sks[0], aeskeygenassist(sks[0], 0x01)) + sks[2] = expand_step128(sks[1], aeskeygenassist(sks[1], 0x02)) + sks[3] = expand_step128(sks[2], aeskeygenassist(sks[2], 0x04)) + sks[4] = expand_step128(sks[3], aeskeygenassist(sks[3], 0x08)) + sks[5] = expand_step128(sks[4], aeskeygenassist(sks[4], 0x10)) + sks[6] = expand_step128(sks[5], aeskeygenassist(sks[5], 0x20)) + sks[7] = expand_step128(sks[6], aeskeygenassist(sks[6], 0x40)) + sks[8] = expand_step128(sks[7], aeskeygenassist(sks[7], 0x80)) + sks[9] = expand_step128(sks[8], aeskeygenassist(sks[8], 0x1b)) + sks[10] = expand_step128(sks[9], aeskeygenassist(sks[9], 0x36)) + num_rounds = _aes.ROUNDS_128 + case _aes.KEY_SIZE_192: + k0 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key))) + + k1_tmp: [16]byte + copy(k1_tmp[:], key[16:24]) + k1 := intrinsics.unaligned_load((^simd.u8x16)(&k1_tmp)) + crypto.zero_explicit(&k1_tmp, size_of(k1_tmp)) + + sks[0] = k0 + sks[1], sks[2] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x01)) + sks[3] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x02)) + sks[4], sks[5] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x04)) + sks[6] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x08)) + sks[7], sks[8] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x10)) + sks[9] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x20)) + sks[10], sks[11] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x40)) + sks[12] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x80)) + num_rounds = _aes.ROUNDS_192 + + case _aes.KEY_SIZE_256: + sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key))) + sks[1] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:]))) + sks[2] = expand_step128(sks[0], aeskeygenassist(sks[1], 0x01)) + sks[3] = expand_step256b(sks[1], aeskeygenassist(sks[2], 0x01)) + sks[4] = 
expand_step128(sks[2], aeskeygenassist(sks[3], 0x02)) + sks[5] = expand_step256b(sks[3], aeskeygenassist(sks[4], 0x02)) + sks[6] = expand_step128(sks[4], aeskeygenassist(sks[5], 0x04)) + sks[7] = expand_step256b(sks[5], aeskeygenassist(sks[6], 0x04)) + sks[8] = expand_step128(sks[6], aeskeygenassist(sks[7], 0x08)) + sks[9] = expand_step256b(sks[7], aeskeygenassist(sks[8], 0x08)) + sks[10] = expand_step128(sks[8], aeskeygenassist(sks[9], 0x10)) + sks[11] = expand_step256b(sks[9], aeskeygenassist(sks[10], 0x10)) + sks[12] = expand_step128(sks[10], aeskeygenassist(sks[11], 0x20)) + sks[13] = expand_step256b(sks[11], aeskeygenassist(sks[12], 0x20)) + sks[14] = expand_step128(sks[12], aeskeygenassist(sks[13], 0x40)) + num_rounds = _aes.ROUNDS_256 + case: + panic("crypto/aes: invalid AES key size") + } + for i in 0 ..= num_rounds { + intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_enc[i]), sks[i]) + } + + // Compute the decryption keys. GCM and CTR do not need this, however + // ECB, CBC, OCB3, etc do. + derive_dec_keys(ctx, &sks, num_rounds) + + ctx._num_rounds = num_rounds + + crypto.zero_explicit(&sks, size_of(sks)) +} diff --git a/core/crypto/_aes/hw/unsupported.odin b/core/crypto/_aes/hw/unsupported.odin new file mode 100644 index 000000000..3fb31b6b8 --- /dev/null +++ b/core/crypto/_aes/hw/unsupported.odin @@ -0,0 +1,11 @@ +#+build !amd64 +#+build !arm64 +#+build !arm32 +package aes_hw + +HAS_GHASH :: false + +@(private) +keysched :: proc(ctx: ^Context, key: []byte) { + panic("crypto/aes: hardware implementation unsupported") +} diff --git a/core/crypto/_aes/hw_intel/api.odin b/core/crypto/_aes/hw_intel/api.odin deleted file mode 100644 index 9547d8f84..000000000 --- a/core/crypto/_aes/hw_intel/api.odin +++ /dev/null @@ -1,38 +0,0 @@ -#+build amd64 -package aes_hw_intel - -import "core:sys/info" - -// is_supported returns true if and only if (⟺) hardware accelerated AES -// is supported. 
-is_supported :: proc "contextless" () -> bool { - // Note: Everything with AES-NI and PCLMULQDQ has support for - // the required SSE extxtensions. - req_features :: info.CPU_Features{ - .sse2, - .ssse3, - .sse41, - .aes, - .pclmulqdq, - } - return info.cpu_features() >= req_features -} - -// Context is a keyed AES (ECB) instance. -Context :: struct { - // Note: The ideal thing to do is for the expanded round keys to be - // arrays of `__m128i`, however that implies alignment (or using AVX). - // - // All the people using e-waste processors that don't support an - // insturction set that has been around for over 10 years are why - // we can't have nice things. - _sk_exp_enc: [15][16]byte, - _sk_exp_dec: [15][16]byte, - _num_rounds: int, -} - -// init initializes a context for AES with the provided key. -init :: proc(ctx: ^Context, key: []byte) { - keysched(ctx, key) -} - diff --git a/core/crypto/_aes/hw_intel/hw_intel_keysched.odin b/core/crypto/_aes/hw_intel/hw_intel_keysched.odin deleted file mode 100644 index 7b339c5f5..000000000 --- a/core/crypto/_aes/hw_intel/hw_intel_keysched.odin +++ /dev/null @@ -1,178 +0,0 @@ -// Copyright (c) 2017 Thomas Pornin -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR -// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY -// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE -// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#+build amd64 -package aes_hw_intel - -import "base:intrinsics" -import "core:crypto" -import "core:crypto/_aes" -import "core:simd/x86" - -// Intel AES-NI based implementation. Inspiration taken from BearSSL. -// -// Note: This assumes that the SROA optimization pass is enabled to be -// anything resembling performat otherwise, LLVM will not elide a massive -// number of redundant loads/stores it generates for every intrinsic call. - -@(private = "file", require_results, enable_target_feature = "sse2") -expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i { - k1, k2 := k1, k2 - - k2 = x86._mm_shuffle_epi32(k2, 0xff) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - return x86._mm_xor_si128(k1, k2) -} - -@(private = "file", require_results, enable_target_feature = "sse,sse2") -expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) { - k1, k2, k3 := k1_^, k2_^, k3 - - k3 = x86._mm_shuffle_epi32(k3, 0x55) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, k3) - - tmp := k2 - k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04)) - k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff)) - - k1_, k2_ 
:= k1_, k2_ - k1_^, k2_^ = k1, k2 - - r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44)) - r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e)) - - return r1, r2 -} - -@(private = "file", require_results, enable_target_feature = "sse2") -expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i { - k1, k2, k3 := k1_^, k2_^, k3 - - k3 = x86._mm_shuffle_epi32(k3, 0x55) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, k3) - - k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04)) - k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff)) - - k1_, k2_ := k1_, k2_ - k1_^, k2_^ = k1, k2 - - return k1 -} - -@(private = "file", require_results, enable_target_feature = "sse2") -expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i { - k1, k2 := k1, k2 - - k2 = x86._mm_shuffle_epi32(k2, 0xaa) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04)) - return x86._mm_xor_si128(k1, k2) -} - -@(private = "file", enable_target_feature = "aes") -derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) { - intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds]) - for i in 1 ..< num_rounds { - tmp := x86._mm_aesimc_si128(sks[i]) - intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp) - } - intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0]) -} - -@(private, enable_target_feature = "sse,sse2,aes") -keysched :: proc(ctx: ^Context, key: []byte) { - sks: [15]x86.__m128i = --- - - // Compute the encryption keys. 
- num_rounds, key_len := 0, len(key) - switch key_len { - case _aes.KEY_SIZE_128: - sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) - sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01)) - sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02)) - sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04)) - sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08)) - sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10)) - sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20)) - sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40)) - sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80)) - sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b)) - sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36)) - num_rounds = _aes.ROUNDS_128 - case _aes.KEY_SIZE_192: - k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) - k1 := x86.__m128i{ - intrinsics.unaligned_load((^i64)(raw_data(key[16:]))), - 0, - } - sks[0] = k0 - sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01)) - sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02)) - sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04)) - sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08)) - sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10)) - sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20)) - sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40)) - sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80)) - num_rounds = _aes.ROUNDS_192 - case _aes.KEY_SIZE_256: - sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key))) - sks[1] = 
intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:]))) - sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01)) - sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01)) - sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02)) - sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02)) - sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04)) - sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04)) - sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08)) - sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08)) - sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10)) - sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10)) - sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20)) - sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20)) - sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40)) - num_rounds = _aes.ROUNDS_256 - case: - panic("crypto/aes: invalid AES key size") - } - for i in 0 ..= num_rounds { - intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i]) - } - - // Compute the decryption keys. GCM and CTR do not need this, however - // ECB, CBC, OCB3, etc do. 
- derive_dec_keys(ctx, &sks, num_rounds) - - ctx._num_rounds = num_rounds - - crypto.zero_explicit(&sks, size_of(sks)) -} diff --git a/core/crypto/aes/aes_gcm_hw_intel.odin b/core/crypto/aes/aes_gcm_hw_intel.odin index c6e564773..75c97be80 100644 --- a/core/crypto/aes/aes_gcm_hw_intel.odin +++ b/core/crypto/aes/aes_gcm_hw_intel.odin @@ -4,7 +4,7 @@ package aes import "base:intrinsics" import "core:crypto" import "core:crypto/_aes" -import "core:crypto/_aes/hw_intel" +import aes_hw "core:crypto/_aes/hw" import "core:encoding/endian" import "core:simd/x86" @@ -17,7 +17,7 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: [ init_ghash_hw(ctx, &h, &j0, &j0_enc, iv) // Note: Our GHASH implementation handles appending padding. - hw_intel.ghash(s[:], h[:], aad) + aes_hw.ghash(s[:], h[:], aad) gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true) final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext)) copy(tag, s[:]) @@ -35,7 +35,7 @@ gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag: s: [_aes.GHASH_TAG_SIZE]byte init_ghash_hw(ctx, &h, &j0, &j0_enc, iv) - hw_intel.ghash(s[:], h[:], aad) + aes_hw.ghash(s[:], h[:], aad) gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false) final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext)) @@ -71,11 +71,11 @@ init_ghash_hw :: proc( } else { // If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV), // and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64). - hw_intel.ghash(j0[:], h[:], iv) + aes_hw.ghash(j0[:], h[:], iv) tmp: [_aes.GHASH_BLOCK_SIZE]byte endian.unchecked_put_u64be(tmp[8:], u64(l) * 8) - hw_intel.ghash(j0[:], h[:], tmp[:]) + aes_hw.ghash(j0[:], h[:], tmp[:]) } // ECB encrypt j0, so that we can just XOR with the tag. 
@@ -94,7 +94,7 @@ final_ghash_hw :: proc( endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8) endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8) - hw_intel.ghash(s[:], h[:], blk[:]) + aes_hw.ghash(s[:], h[:], blk[:]) j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0)) s_vec := intrinsics.unaligned_load((^x86.__m128i)(s)) s_vec = x86._mm_xor_si128(s_vec, j0_vec) @@ -131,7 +131,7 @@ gctr_hw :: proc( nr_blocks := len(src) / BLOCK_SIZE for nr_blocks >= CTR_STRIDE_HW { if !is_seal { - hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW]) + aes_hw.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW]) } #unroll for i in 0 ..< CTR_STRIDE_HW { @@ -174,7 +174,7 @@ gctr_hw :: proc( xor_blocks_hw(dst, src, blks[:]) if is_seal { - hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW]) + aes_hw.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW]) } src = src[CTR_STRIDE_BYTES_HW:] @@ -186,7 +186,7 @@ gctr_hw :: proc( for n := len(src); n > 0; { l := min(n, BLOCK_SIZE) if !is_seal { - hw_intel.ghash(s[:], h[:], src[:l]) + aes_hw.ghash(s[:], h[:], src[:l]) } blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr) @@ -219,7 +219,7 @@ gctr_hw :: proc( copy(dst, blk[:l]) } if is_seal { - hw_intel.ghash(s[:], h[:], dst[:l]) + aes_hw.ghash(s[:], h[:], dst[:l]) } dst = dst[l:] diff --git a/core/crypto/aes/aes_impl_hw_intel.odin b/core/crypto/aes/aes_impl_hw_intel.odin index 96a1811f3..fe3849eda 100644 --- a/core/crypto/aes/aes_impl_hw_intel.odin +++ b/core/crypto/aes/aes_impl_hw_intel.odin @@ -1,18 +1,18 @@ #+build amd64 package aes -import "core:crypto/_aes/hw_intel" +import aes_hw "core:crypto/_aes/hw" // is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES // is supported. 
is_hardware_accelerated :: proc "contextless" () -> bool { - return hw_intel.is_supported() + return aes_hw.is_supported() } @(private) -Context_Impl_Hardware :: hw_intel.Context +Context_Impl_Hardware :: aes_hw.Context @(private, enable_target_feature = "sse2,aes") init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) { - hw_intel.init(ctx, key) + aes_hw.init(ctx, key) }