mirror of
https://github.com/odin-lang/Odin.git
synced 2026-04-06 06:38:20 +00:00
core/crypto/_aes/hw: Initial import
This commit is contained in:
69
core/crypto/_aes/hw/api.odin
Normal file
69
core/crypto/_aes/hw/api.odin
Normal file
@@ -0,0 +1,69 @@
|
||||
package aes_hw
|
||||
|
||||
@(require) import "core:sys/info"
|
||||
|
||||
// is_supported returns true if and only if (⟺) hardware accelerated AES
// is supported on the current CPU.
is_supported :: proc "contextless" () -> bool {
	when ODIN_ARCH == .amd64 {
		// Note: Everything with AES-NI also has support for
		// the required SSE extensions.
		required :: info.CPU_Features{.sse2, .ssse3, .sse41, .aes}
		return info.cpu_features() >= required
	} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
		required :: info.CPU_Features{.asimd, .aes}
		return info.cpu_features() >= required
	} else {
		return false
	}
}
|
||||
|
||||
// is_ghash_supported returns true if and only if (⟺) hardware accelerated
// GHASH is supported.
is_ghash_supported :: proc "contextless" () -> bool {
	// Hardware GHASH without hardware AES is not useful, so require both.
	if !is_supported() {
		return false
	}

	when ODIN_ARCH == .amd64 {
		return info.cpu_features() >= info.CPU_Features{.pclmulqdq}
	} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
		// Once we can actually use this, we can re-enable this.
		//
		// return info.cpu_features() >= info.CPU_Features{.pmull}
		return false
	} else {
		return false
	}
}
|
||||
|
||||
// Context is a keyed AES (ECB) instance.
Context :: struct {
	// Note: Ideally the expanded round keys would be arrays of
	// `u8x16`, however that implies alignment (or using AVX).
	//
	// All the people using e-waste processors that don't support an
	// instruction set that has been around for over 10 years are why
	// we can't have nice things.
	_sk_exp_enc: [15][16]byte, // Expanded encryption round keys.
	_sk_exp_dec: [15][16]byte, // Expanded decryption round keys.
	_num_rounds: int,          // Round count (10/12/14 per key size).
}

// init initializes a context for AES with the provided key.
init :: proc(ctx: ^Context, key: []byte) {
	keysched(ctx, key)
}
|
||||
@@ -21,7 +21,7 @@
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#+build amd64
|
||||
package aes_hw_intel
|
||||
package aes_hw
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
115
core/crypto/_aes/hw/intrinsics_arm.odin
Normal file
115
core/crypto/_aes/hw/intrinsics_arm.odin
Normal file
@@ -0,0 +1,115 @@
|
||||
#+build arm64,arm32
|
||||
package aes_hw
|
||||
|
||||
import "core:simd"
|
||||
import "core:simd/arm"
|
||||
|
||||
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a/
|
||||
|
||||
TARGET_FEATURES :: "neon,aes"
|
||||
HAS_GHASH :: false // Temporary
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// AESD with a zero key performs AddRoundKey(0) + InvShiftRows +
	// InvSubBytes; AESIMC then applies InvMixColumns.  XOR-ing the round
	// key last yields Intel AESDEC semantics.
	mixed := arm.vaesimcq_u8(arm.vaesdq_u8(data, simd.u8x16{}))
	return simd.bit_xor(mixed, key)
}

@(require_results, enable_target_feature = "aes")
aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// Final decryption round: same as aesdec but without InvMixColumns.
	return simd.bit_xor(arm.vaesdq_u8(data, simd.u8x16{}), key)
}

@(require_results, enable_target_feature = "aes")
aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// AESE(0) + AESMC, then XOR the round key, matches Intel AESENC.
	mixed := arm.vaesmcq_u8(arm.vaeseq_u8(data, simd.u8x16{}))
	return simd.bit_xor(mixed, key)
}

@(require_results, enable_target_feature = "aes")
aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// Final encryption round: same as aesenc but without MixColumns.
	return simd.bit_xor(arm.vaeseq_u8(data, simd.u8x16{}), key)
}

aesimc :: arm.vaesimcq_u8
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
	// AESE with a zero key does ShiftRows and SubBytes on `data`.
	sub := arm.vaeseq_u8(data, simd.u8x16{})

	// Undo the ShiftRows step from AESE and extract X1 and X3.
	shuffled := simd.swizzle(
		sub,
		0x04, 0x01, 0x0e, 0x0b, // SubBytes(X1)
		0x01, 0x0e, 0x0b, 0x04, // ROT(SubBytes(X1))
		0x0c, 0x09, 0x06, 0x03, // SubBytes(X3)
		0x09, 0x06, 0x03, 0x0c, // ROT(SubBytes(X3))
	)

	// Fold the round constant into the two rotated words.
	rcon := simd.u8x16{
		0, 0, 0, 0,
		IMM8, 0, 0, 0,
		0, 0, 0, 0,
		IMM8, 0, 0, 0,
	}

	return simd.bit_xor(shuffled, rcon)
}
|
||||
|
||||
// The keyschedule implementation is easier to read with some extra
// Intel intrinsics that are emulated by built-in LLVM ops anyway.

@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
	count :: IMM8 & 0xff

	// This needs to emit behavior identical to PSLLDQ, which is:
	//
	//   TEMP := COUNT
	//   IF (TEMP > 15) THEN TEMP := 16; FI
	//   DEST := DEST << (TEMP * 8)
	//   DEST[MAXVL-1:128] (Unmodified)
	//
	// A byte-wise left shift is a shuffle over {zero, a}: lane i of the
	// result selects element 16 - count + i, pulling zeros in from the
	// low end.  Shift counts > 15 select only zeros.
	return simd.shuffle(
		simd.u8x16{},
		a,
		0 when count > 15 else (16 - count + 0),
		1 when count > 15 else (16 - count + 1),
		2 when count > 15 else (16 - count + 2),
		3 when count > 15 else (16 - count + 3),
		4 when count > 15 else (16 - count + 4),
		5 when count > 15 else (16 - count + 5),
		6 when count > 15 else (16 - count + 6),
		7 when count > 15 else (16 - count + 7),
		8 when count > 15 else (16 - count + 8),
		9 when count > 15 else (16 - count + 9),
		10 when count > 15 else (16 - count + 10),
		11 when count > 15 else (16 - count + 11),
		12 when count > 15 else (16 - count + 12),
		13 when count > 15 else (16 - count + 13),
		14 when count > 15 else (16 - count + 14),
		15 when count > 15 else (16 - count + 15),
	)
}
|
||||
|
||||
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
	// Each 2-bit field of IMM8 selects one 32-bit lane of `a`, matching
	// Intel PSHUFD semantics.
	lanes := transmute(simd.i32x4)a
	shuffled := simd.shuffle(
		lanes,
		lanes,
		IMM8 & 0b11,
		(IMM8 >> 2) & 0b11,
		(IMM8 >> 4) & 0b11,
		(IMM8 >> 6) & 0b11,
	)
	return transmute(simd.u8x16)shuffled
}
|
||||
|
||||
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
	// SHUFPS semantics: the low two selectors pick 32-bit lanes from `a`,
	// the high two pick lanes from `b` (offset by 4 in the concatenation).
	lo := transmute(simd.u32x4)(a)
	hi := transmute(simd.u32x4)(b)
	return transmute(simd.u8x16)simd.shuffle(
		lo,
		hi,
		u32(MASK) & 0b11,
		(u32(MASK) >> 2) & 0b11,
		((u32(MASK) >> 4) & 0b11) + 4,
		((u32(MASK) >> 6) & 0b11) + 4,
	)
}
|
||||
55
core/crypto/_aes/hw/intrinsics_intel.odin
Normal file
55
core/crypto/_aes/hw/intrinsics_intel.odin
Normal file
@@ -0,0 +1,55 @@
|
||||
#+build amd64
|
||||
package aes_hw
|
||||
|
||||
import "core:simd"
|
||||
import "core:simd/x86"
|
||||
|
||||
// Intel/RISC-V semantics.
|
||||
|
||||
TARGET_FEATURES :: "sse,sse2,ssse3,sse4.1,aes"
|
||||
HAS_GHASH :: true
|
||||
|
||||
@(require_results, enable_target_feature = "aes")
aesdec :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// One full AES decryption round via AES-NI.
	r := x86._mm_aesdec_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))
	return transmute(simd.u8x16)(r)
}

@(require_results, enable_target_feature = "aes")
aesdeclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// Final AES decryption round (no InvMixColumns).
	r := x86._mm_aesdeclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))
	return transmute(simd.u8x16)(r)
}

@(require_results, enable_target_feature = "aes")
aesenc :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// One full AES encryption round via AES-NI.
	r := x86._mm_aesenc_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))
	return transmute(simd.u8x16)(r)
}

@(require_results, enable_target_feature = "aes")
aesenclast :: #force_inline proc "c" (data, key: simd.u8x16) -> simd.u8x16 {
	// Final AES encryption round (no MixColumns).
	r := x86._mm_aesenclast_si128(transmute(x86.__m128i)(data), transmute(x86.__m128i)(key))
	return transmute(simd.u8x16)(r)
}

@(require_results, enable_target_feature = "aes")
aesimc :: #force_inline proc "c" (data: simd.u8x16) -> simd.u8x16 {
	// InvMixColumns, used to derive the decryption key schedule.
	r := x86._mm_aesimc_si128(transmute(x86.__m128i)(data))
	return transmute(simd.u8x16)(r)
}

@(require_results, enable_target_feature = "aes")
aeskeygenassist :: #force_inline proc "c" (data: simd.u8x16, $IMM8: u8) -> simd.u8x16 {
	// Key expansion helper (SubWord/RotWord + round constant).
	r := x86._mm_aeskeygenassist_si128(transmute(x86.__m128i)(data), IMM8)
	return transmute(simd.u8x16)(r)
}
|
||||
|
||||
@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_slli_si128 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
	// Byte-wise left shift of the 128-bit lane (PSLLDQ).
	r := x86._mm_slli_si128(transmute(x86.__m128i)(a), IMM8)
	return transmute(simd.u8x16)(r)
}

@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_epi32 :: #force_inline proc "c" (a: simd.u8x16, $IMM8: u32) -> simd.u8x16 {
	// 32-bit lane shuffle (PSHUFD).
	r := x86._mm_shuffle_epi32(transmute(x86.__m128i)(a), IMM8)
	return transmute(simd.u8x16)(r)
}

@(private, require_results, enable_target_feature = TARGET_FEATURES)
_mm_shuffle_ps :: #force_inline proc "c" (a, b: simd.u8x16, $MASK: u32) -> simd.u8x16 {
	// Two-source 32-bit lane shuffle (SHUFPS).
	r := x86._mm_shuffle_ps(transmute(x86.__m128)(a), transmute(x86.__m128)(b), MASK)
	return transmute(simd.u8x16)(r)
}
|
||||
181
core/crypto/_aes/hw/keysched_hw.odin
Normal file
181
core/crypto/_aes/hw/keysched_hw.odin
Normal file
@@ -0,0 +1,181 @@
|
||||
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
|
||||
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
||||
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
// This key schedule is shared by every hardware backend: intrinsics_arm.odin
// is `#+build arm64,arm32` and unsupported.odin excludes amd64/arm64/arm32,
// so omitting arm64 here would leave `keysched` undefined on arm64 builds.
#+build amd64, arm64, arm32
package aes_hw
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:simd"
|
||||
|
||||
// Inspiration taken from BearSSL's AES-NI implementation.
|
||||
//
|
||||
// Note: This assumes that the SROA optimization pass is enabled to be
|
||||
// anything resembling performant otherwise, LLVM will not elide a massive
|
||||
// number of redundant loads/stores it generates for every intrinsic call.
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step128 :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
	// AES-128 expansion step: broadcast word 3 of the keygen-assist
	// output, then fold the previous round key's words into each other.
	prev, assist := k1, _mm_shuffle_epi32(k2, 0xff)

	prev = simd.bit_xor(prev, _mm_slli_si128(prev, 0x04))
	prev = simd.bit_xor(prev, _mm_slli_si128(prev, 0x04))
	prev = simd.bit_xor(prev, _mm_slli_si128(prev, 0x04))
	return simd.bit_xor(prev, assist)
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step192a :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> (simd.u8x16, simd.u8x16) {
	// AES-192 expansion step that yields two full round keys, updating
	// the caller's key state (k1_, k2_) in place.
	k1, k2 := k1_^, k2_^
	assist := _mm_shuffle_epi32(k3, 0x55)

	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	k1 = simd.bit_xor(k1, assist)

	prev_k2 := k2
	k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04))
	k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff))

	k1_^, k2_^ = k1, k2

	// Repack the 64-bit halves into the two output round keys.
	r1 := _mm_shuffle_ps(prev_k2, k1, 0x44)
	r2 := _mm_shuffle_ps(k1, k2, 0x4e)

	return r1, r2
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step192b :: #force_inline proc (k1_, k2_: ^simd.u8x16, k3: simd.u8x16) -> simd.u8x16 {
	// AES-192 expansion step that yields a single round key, updating
	// the caller's key state (k1_, k2_) in place.
	k1, k2 := k1_^, k2_^
	assist := _mm_shuffle_epi32(k3, 0x55)

	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	k1 = simd.bit_xor(k1, _mm_slli_si128(k1, 0x04))
	k1 = simd.bit_xor(k1, assist)

	k2 = simd.bit_xor(k2, _mm_slli_si128(k2, 0x04))
	k2 = simd.bit_xor(k2, _mm_shuffle_epi32(k1, 0xff))

	k1_^, k2_^ = k1, k2

	return k1
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = TARGET_FEATURES)
expand_step256b :: #force_inline proc(k1, k2: simd.u8x16) -> simd.u8x16 {
	// AES-256 "odd" expansion step: broadcasts word 2 (0xaa) of the
	// assist value rather than word 3 as expand_step128 does.
	prev, assist := k1, _mm_shuffle_epi32(k2, 0xaa)

	prev = simd.bit_xor(prev, _mm_slli_si128(prev, 0x04))
	prev = simd.bit_xor(prev, _mm_slli_si128(prev, 0x04))
	prev = simd.bit_xor(prev, _mm_slli_si128(prev, 0x04))
	return simd.bit_xor(prev, assist)
}
|
||||
|
||||
@(private = "file", enable_target_feature = TARGET_FEATURES)
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]simd.u8x16, num_rounds: int) {
	// The decryption schedule is the encryption schedule reversed, with
	// InvMixColumns (aesimc) applied to every key except the first and
	// the last.
	intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[0]), sks[num_rounds])
	for i in 1 ..< num_rounds {
		intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds - i]), aesimc(sks[i]))
	}
	intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_dec[num_rounds]), sks[0])
}
|
||||
|
||||
// keysched expands `key` into the encryption and decryption round-key
// schedules stored in `ctx`, and records the round count.  Panics on an
// invalid key size.  Note: `aeskeygenassist` takes a compile-time round
// constant, so the expansion chains cannot be rolled into loops.
@(private, enable_target_feature = TARGET_FEATURES)
keysched :: proc(ctx: ^Context, key: []byte) {
	sks: [15]simd.u8x16 = ---

	// Expand the encryption round keys.
	num_rounds := 0
	switch len(key) {
	case _aes.KEY_SIZE_128:
		sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
		sks[1] = expand_step128(sks[0], aeskeygenassist(sks[0], 0x01))
		sks[2] = expand_step128(sks[1], aeskeygenassist(sks[1], 0x02))
		sks[3] = expand_step128(sks[2], aeskeygenassist(sks[2], 0x04))
		sks[4] = expand_step128(sks[3], aeskeygenassist(sks[3], 0x08))
		sks[5] = expand_step128(sks[4], aeskeygenassist(sks[4], 0x10))
		sks[6] = expand_step128(sks[5], aeskeygenassist(sks[5], 0x20))
		sks[7] = expand_step128(sks[6], aeskeygenassist(sks[6], 0x40))
		sks[8] = expand_step128(sks[7], aeskeygenassist(sks[7], 0x80))
		sks[9] = expand_step128(sks[8], aeskeygenassist(sks[8], 0x1b))
		sks[10] = expand_step128(sks[9], aeskeygenassist(sks[9], 0x36))
		num_rounds = _aes.ROUNDS_128
	case _aes.KEY_SIZE_192:
		k0 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))

		// Zero-pad the final 8 key bytes out to a full vector, and
		// scrub the stack copy afterwards.
		k1_tmp: [16]byte
		copy(k1_tmp[:], key[16:24])
		k1 := intrinsics.unaligned_load((^simd.u8x16)(&k1_tmp))
		crypto.zero_explicit(&k1_tmp, size_of(k1_tmp))

		sks[0] = k0
		sks[1], sks[2] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x01))
		sks[3] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x02))
		sks[4], sks[5] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x04))
		sks[6] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x08))
		sks[7], sks[8] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x10))
		sks[9] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x20))
		sks[10], sks[11] = expand_step192a(&k0, &k1, aeskeygenassist(k1, 0x40))
		sks[12] = expand_step192b(&k0, &k1, aeskeygenassist(k1, 0x80))
		num_rounds = _aes.ROUNDS_192
	case _aes.KEY_SIZE_256:
		sks[0] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
		sks[1] = intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:])))
		sks[2] = expand_step128(sks[0], aeskeygenassist(sks[1], 0x01))
		sks[3] = expand_step256b(sks[1], aeskeygenassist(sks[2], 0x01))
		sks[4] = expand_step128(sks[2], aeskeygenassist(sks[3], 0x02))
		sks[5] = expand_step256b(sks[3], aeskeygenassist(sks[4], 0x02))
		sks[6] = expand_step128(sks[4], aeskeygenassist(sks[5], 0x04))
		sks[7] = expand_step256b(sks[5], aeskeygenassist(sks[6], 0x04))
		sks[8] = expand_step128(sks[6], aeskeygenassist(sks[7], 0x08))
		sks[9] = expand_step256b(sks[7], aeskeygenassist(sks[8], 0x08))
		sks[10] = expand_step128(sks[8], aeskeygenassist(sks[9], 0x10))
		sks[11] = expand_step256b(sks[9], aeskeygenassist(sks[10], 0x10))
		sks[12] = expand_step128(sks[10], aeskeygenassist(sks[11], 0x20))
		sks[13] = expand_step256b(sks[11], aeskeygenassist(sks[12], 0x20))
		sks[14] = expand_step128(sks[12], aeskeygenassist(sks[13], 0x40))
		num_rounds = _aes.ROUNDS_256
	case:
		panic("crypto/aes: invalid AES key size")
	}
	for i in 0 ..= num_rounds {
		intrinsics.unaligned_store((^simd.u8x16)(&ctx._sk_exp_enc[i]), sks[i])
	}

	// Compute the decryption keys. GCM and CTR do not need this, however
	// ECB, CBC, OCB3, etc do.
	derive_dec_keys(ctx, &sks, num_rounds)

	ctx._num_rounds = num_rounds

	crypto.zero_explicit(&sks, size_of(sks))
}
|
||||
11
core/crypto/_aes/hw/unsupported.odin
Normal file
11
core/crypto/_aes/hw/unsupported.odin
Normal file
@@ -0,0 +1,11 @@
|
||||
#+build !amd64
#+build !arm64
#+build !arm32
package aes_hw

// No hardware GHASH on targets without a hardware AES backend.
HAS_GHASH :: false

// keysched is the stub for targets with no hardware AES implementation;
// callers are expected to gate on is_supported() and never reach this.
@(private)
keysched :: proc(ctx: ^Context, key: []byte) {
	panic("crypto/aes: hardware implementation unsupported")
}
|
||||
@@ -1,38 +0,0 @@
|
||||
#+build amd64
|
||||
package aes_hw_intel
|
||||
|
||||
import "core:sys/info"
|
||||
|
||||
// is_supported returns true if and only if (⟺) hardware accelerated AES
|
||||
// is supported.
|
||||
is_supported :: proc "contextless" () -> bool {
|
||||
// Note: Everything with AES-NI and PCLMULQDQ has support for
|
||||
// the required SSE extxtensions.
|
||||
req_features :: info.CPU_Features{
|
||||
.sse2,
|
||||
.ssse3,
|
||||
.sse41,
|
||||
.aes,
|
||||
.pclmulqdq,
|
||||
}
|
||||
return info.cpu_features() >= req_features
|
||||
}
|
||||
|
||||
// Context is a keyed AES (ECB) instance.
|
||||
Context :: struct {
|
||||
// Note: The ideal thing to do is for the expanded round keys to be
|
||||
// arrays of `__m128i`, however that implies alignment (or using AVX).
|
||||
//
|
||||
// All the people using e-waste processors that don't support an
|
||||
// insturction set that has been around for over 10 years are why
|
||||
// we can't have nice things.
|
||||
_sk_exp_enc: [15][16]byte,
|
||||
_sk_exp_dec: [15][16]byte,
|
||||
_num_rounds: int,
|
||||
}
|
||||
|
||||
// init initializes a context for AES with the provided key.
|
||||
init :: proc(ctx: ^Context, key: []byte) {
|
||||
keysched(ctx, key)
|
||||
}
|
||||
|
||||
@@ -1,178 +0,0 @@
|
||||
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
|
||||
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
||||
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#+build amd64
|
||||
package aes_hw_intel
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:simd/x86"
|
||||
|
||||
// Intel AES-NI based implementation. Inspiration taken from BearSSL.
|
||||
//
|
||||
// Note: This assumes that the SROA optimization pass is enabled to be
|
||||
// anything resembling performat otherwise, LLVM will not elide a massive
|
||||
// number of redundant loads/stores it generates for every intrinsic call.
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2 := k1, k2
|
||||
|
||||
k2 = x86._mm_shuffle_epi32(k2, 0xff)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
return x86._mm_xor_si128(k1, k2)
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse,sse2")
|
||||
expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
|
||||
k1, k2, k3 := k1_^, k2_^, k3
|
||||
|
||||
k3 = x86._mm_shuffle_epi32(k3, 0x55)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, k3)
|
||||
|
||||
tmp := k2
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
|
||||
|
||||
k1_, k2_ := k1_, k2_
|
||||
k1_^, k2_^ = k1, k2
|
||||
|
||||
r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44))
|
||||
r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e))
|
||||
|
||||
return r1, r2
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2, k3 := k1_^, k2_^, k3
|
||||
|
||||
k3 = x86._mm_shuffle_epi32(k3, 0x55)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, k3)
|
||||
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
|
||||
|
||||
k1_, k2_ := k1_, k2_
|
||||
k1_^, k2_^ = k1, k2
|
||||
|
||||
return k1
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2 := k1, k2
|
||||
|
||||
k2 = x86._mm_shuffle_epi32(k2, 0xaa)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
return x86._mm_xor_si128(k1, k2)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "aes")
|
||||
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) {
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds])
|
||||
for i in 1 ..< num_rounds {
|
||||
tmp := x86._mm_aesimc_si128(sks[i])
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
|
||||
}
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0])
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse,sse2,aes")
|
||||
keysched :: proc(ctx: ^Context, key: []byte) {
|
||||
sks: [15]x86.__m128i = ---
|
||||
|
||||
// Compute the encryption keys.
|
||||
num_rounds, key_len := 0, len(key)
|
||||
switch key_len {
|
||||
case _aes.KEY_SIZE_128:
|
||||
sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01))
|
||||
sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02))
|
||||
sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04))
|
||||
sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08))
|
||||
sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10))
|
||||
sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20))
|
||||
sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40))
|
||||
sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80))
|
||||
sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b))
|
||||
sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36))
|
||||
num_rounds = _aes.ROUNDS_128
|
||||
case _aes.KEY_SIZE_192:
|
||||
k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
k1 := x86.__m128i{
|
||||
intrinsics.unaligned_load((^i64)(raw_data(key[16:]))),
|
||||
0,
|
||||
}
|
||||
sks[0] = k0
|
||||
sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01))
|
||||
sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02))
|
||||
sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04))
|
||||
sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08))
|
||||
sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10))
|
||||
sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20))
|
||||
sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40))
|
||||
sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80))
|
||||
num_rounds = _aes.ROUNDS_192
|
||||
case _aes.KEY_SIZE_256:
|
||||
sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
sks[1] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:])))
|
||||
sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01))
|
||||
sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01))
|
||||
sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02))
|
||||
sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02))
|
||||
sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04))
|
||||
sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04))
|
||||
sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08))
|
||||
sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08))
|
||||
sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10))
|
||||
sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10))
|
||||
sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20))
|
||||
sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20))
|
||||
sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40))
|
||||
num_rounds = _aes.ROUNDS_256
|
||||
case:
|
||||
panic("crypto/aes: invalid AES key size")
|
||||
}
|
||||
for i in 0 ..= num_rounds {
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i])
|
||||
}
|
||||
|
||||
// Compute the decryption keys. GCM and CTR do not need this, however
|
||||
// ECB, CBC, OCB3, etc do.
|
||||
derive_dec_keys(ctx, &sks, num_rounds)
|
||||
|
||||
ctx._num_rounds = num_rounds
|
||||
|
||||
crypto.zero_explicit(&sks, size_of(sks))
|
||||
}
|
||||
@@ -4,7 +4,7 @@ package aes
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:crypto/_aes/hw_intel"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
import "core:encoding/endian"
|
||||
import "core:simd/x86"
|
||||
|
||||
@@ -17,7 +17,7 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: [
|
||||
init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
|
||||
|
||||
// Note: Our GHASH implementation handles appending padding.
|
||||
hw_intel.ghash(s[:], h[:], aad)
|
||||
aes_hw.ghash(s[:], h[:], aad)
|
||||
gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true)
|
||||
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext))
|
||||
copy(tag, s[:])
|
||||
@@ -35,7 +35,7 @@ gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag:
|
||||
s: [_aes.GHASH_TAG_SIZE]byte
|
||||
init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
|
||||
|
||||
hw_intel.ghash(s[:], h[:], aad)
|
||||
aes_hw.ghash(s[:], h[:], aad)
|
||||
gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false)
|
||||
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext))
|
||||
|
||||
@@ -71,11 +71,11 @@ init_ghash_hw :: proc(
|
||||
} else {
|
||||
// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
|
||||
// and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64).
|
||||
hw_intel.ghash(j0[:], h[:], iv)
|
||||
aes_hw.ghash(j0[:], h[:], iv)
|
||||
|
||||
tmp: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
|
||||
hw_intel.ghash(j0[:], h[:], tmp[:])
|
||||
aes_hw.ghash(j0[:], h[:], tmp[:])
|
||||
}
|
||||
|
||||
// ECB encrypt j0, so that we can just XOR with the tag.
|
||||
@@ -94,7 +94,7 @@ final_ghash_hw :: proc(
|
||||
endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8)
|
||||
endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8)
|
||||
|
||||
hw_intel.ghash(s[:], h[:], blk[:])
|
||||
aes_hw.ghash(s[:], h[:], blk[:])
|
||||
j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0))
|
||||
s_vec := intrinsics.unaligned_load((^x86.__m128i)(s))
|
||||
s_vec = x86._mm_xor_si128(s_vec, j0_vec)
|
||||
@@ -131,7 +131,7 @@ gctr_hw :: proc(
|
||||
nr_blocks := len(src) / BLOCK_SIZE
|
||||
for nr_blocks >= CTR_STRIDE_HW {
|
||||
if !is_seal {
|
||||
hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
|
||||
aes_hw.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
|
||||
}
|
||||
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
@@ -174,7 +174,7 @@ gctr_hw :: proc(
|
||||
xor_blocks_hw(dst, src, blks[:])
|
||||
|
||||
if is_seal {
|
||||
hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
|
||||
aes_hw.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
|
||||
}
|
||||
|
||||
src = src[CTR_STRIDE_BYTES_HW:]
|
||||
@@ -186,7 +186,7 @@ gctr_hw :: proc(
|
||||
for n := len(src); n > 0; {
|
||||
l := min(n, BLOCK_SIZE)
|
||||
if !is_seal {
|
||||
hw_intel.ghash(s[:], h[:], src[:l])
|
||||
aes_hw.ghash(s[:], h[:], src[:l])
|
||||
}
|
||||
|
||||
blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr)
|
||||
@@ -219,7 +219,7 @@ gctr_hw :: proc(
|
||||
copy(dst, blk[:l])
|
||||
}
|
||||
if is_seal {
|
||||
hw_intel.ghash(s[:], h[:], dst[:l])
|
||||
aes_hw.ghash(s[:], h[:], dst[:l])
|
||||
}
|
||||
|
||||
dst = dst[l:]
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
#+build amd64
|
||||
package aes
|
||||
|
||||
import "core:crypto/_aes/hw_intel"
|
||||
import aes_hw "core:crypto/_aes/hw"
|
||||
|
||||
// is_hardware_accelerated returns true if and only if (⟺) hardware accelerated AES
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return hw_intel.is_supported()
|
||||
return aes_hw.is_supported()
|
||||
}
|
||||
|
||||
@(private)
|
||||
Context_Impl_Hardware :: hw_intel.Context
|
||||
Context_Impl_Hardware :: aes_hw.Context
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
|
||||
hw_intel.init(ctx, key)
|
||||
aes_hw.init(ctx, key)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user