Mirror of https://github.com/odin-lang/Odin.git, synced 2026-01-03 03:32:37 +00:00
Merge pull request #3928 from Yawning/feature/aes-ni
core/crypto: Support AES-NI + PCLMUL
33 .gitignore vendored
@@ -24,38 +24,6 @@ bld/
![Cc]ore/[Ll]og/
tests/documentation/verify/
tests/documentation/all.odin-doc
tests/internal/test_map
tests/internal/test_pow
tests/internal/test_rtti
tests/core/test_base64
tests/core/test_cbor
tests/core/test_core_compress
tests/core/test_core_container
tests/core/test_core_filepath
tests/core/test_core_fmt
tests/core/test_core_i18n
tests/core/test_core_image
tests/core/test_core_libc
tests/core/test_core_match
tests/core/test_core_math
tests/core/test_core_net
tests/core/test_core_os_exit
tests/core/test_core_reflect
tests/core/test_core_strings
tests/core/test_core_time
tests/core/test_crypto
tests/core/test_hash
tests/core/test_hex
tests/core/test_hxa
tests/core/test_json
tests/core/test_linalg_glsl_math
tests/core/test_noise
tests/core/test_varint
tests/core/test_xml
tests/core/test_core_slice
tests/core/test_core_thread
tests/core/test_core_runtime
tests/vendor/vendor_botan
# Visual Studio 2015 cache/options directory
.vs/
# Visual Studio Code options directory
@@ -63,6 +31,7 @@ tests/vendor/vendor_botan
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
demo
benchmark

# MSTest test Results
[Tt]est[Rr]esult*/

@@ -1167,3 +1167,28 @@ fields_proc :: proc(s: []byte, f: proc(rune) -> bool, allocator := context.alloc
|
||||
|
||||
return subslices[:]
|
||||
}
|
||||
|
||||
// alias returns true iff a and b have a non-zero length, and any part of
|
||||
// a overlaps with b.
|
||||
alias :: proc "contextless" (a, b: []byte) -> bool {
|
||||
a_len, b_len := len(a), len(b)
|
||||
if a_len == 0 || b_len == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
a_start, b_start := uintptr(raw_data(a)), uintptr(raw_data(b))
|
||||
a_end, b_end := a_start + uintptr(a_len-1), b_start + uintptr(b_len-1)
|
||||
|
||||
return a_start <= b_end && b_start <= a_end
|
||||
}
|
||||
|
||||
// alias_inexactly returns true iff a and b have a non-zero length,
|
||||
// the base pointer of a and b are NOT equal, and any part of a overlaps
|
||||
// with b (ie: `alias(a, b)` with an exception that returns false for
|
||||
// `a == b`, `b = a[:len(a)-69]` and similar conditions).
|
||||
alias_inexactly :: proc "contextless" (a, b: []byte) -> bool {
|
||||
if raw_data(a) == raw_data(b) {
|
||||
return false
|
||||
}
|
||||
return alias(a, b)
|
||||
}
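As a quick illustration of how the two new helpers behave, here is a standalone sketch (not part of this change set) that only uses the `core:bytes` procedures added above:

package alias_example

import "core:bytes"
import "core:fmt"

main :: proc() {
	buf: [32]byte

	a := buf[:16]
	b := buf[8:24] // overlaps a, but with a different base pointer

	fmt.println(bytes.alias(a, b))           // true: the slices overlap
	fmt.println(bytes.alias_inexactly(a, b)) // true: overlap and raw_data(a) != raw_data(b)
	fmt.println(bytes.alias(a, a))           // true: exact aliasing still counts as aliasing
	fmt.println(bytes.alias_inexactly(a, a)) // false: identical base pointers are exempt
}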
|
||||
|
||||
@@ -7,9 +7,8 @@ STRIDE :: 4
|
||||
|
||||
// Context is a keyed AES (ECB) instance.
|
||||
Context :: struct {
|
||||
_sk_exp: [120]u64,
|
||||
_num_rounds: int,
|
||||
_is_initialized: bool,
|
||||
_sk_exp: [120]u64,
|
||||
_num_rounds: int,
|
||||
}
|
||||
|
||||
// init initializes a context for AES with the provided key.
|
||||
@@ -18,13 +17,10 @@ init :: proc(ctx: ^Context, key: []byte) {
|
||||
|
||||
ctx._num_rounds = keysched(skey[:], key)
|
||||
skey_expand(ctx._sk_exp[:], skey[:], ctx._num_rounds)
|
||||
ctx._is_initialized = true
|
||||
}
|
||||
|
||||
// encrypt_block sets `dst` to `AES-ECB-Encrypt(src)`.
|
||||
encrypt_block :: proc(ctx: ^Context, dst, src: []byte) {
|
||||
assert(ctx._is_initialized)
|
||||
|
||||
q: [8]u64
|
||||
load_blockx1(&q, src)
|
||||
_encrypt(&q, ctx._sk_exp[:], ctx._num_rounds)
|
||||
@@ -33,8 +29,6 @@ encrypt_block :: proc(ctx: ^Context, dst, src: []byte) {
|
||||
|
||||
// decrypt_block sets `dst` to `AES-ECB-Decrypt(src)`.
|
||||
decrypt_block :: proc(ctx: ^Context, dst, src: []byte) {
|
||||
assert(ctx._is_initialized)
|
||||
|
||||
q: [8]u64
|
||||
load_blockx1(&q, src)
|
||||
_decrypt(&q, ctx._sk_exp[:], ctx._num_rounds)
|
||||
@@ -43,8 +37,6 @@ decrypt_block :: proc(ctx: ^Context, dst, src: []byte) {
|
||||
|
||||
// encrypt_blocks sets `dst` to `AES-ECB-Encrypt(src[0], .. src[n])`.
|
||||
encrypt_blocks :: proc(ctx: ^Context, dst, src: [][]byte) {
|
||||
assert(ctx._is_initialized)
|
||||
|
||||
q: [8]u64 = ---
|
||||
src, dst := src, dst
|
||||
|
||||
@@ -67,8 +59,6 @@ encrypt_blocks :: proc(ctx: ^Context, dst, src: [][]byte) {
|
||||
|
||||
// decrypt_blocks sets dst to `AES-ECB-Decrypt(src[0], .. src[n])`.
|
||||
decrypt_blocks :: proc(ctx: ^Context, dst, src: [][]byte) {
|
||||
assert(ctx._is_initialized)
|
||||
|
||||
q: [8]u64 = ---
|
||||
src, dst := src, dst
|
||||
|
||||
|
||||
43 core/crypto/_aes/hw_intel/api.odin Normal file
@@ -0,0 +1,43 @@
|
||||
//+build amd64
|
||||
package aes_hw_intel
|
||||
|
||||
import "core:sys/info"
|
||||
|
||||
// is_supported returns true iff hardware accelerated AES
|
||||
// is supported.
|
||||
is_supported :: proc "contextless" () -> bool {
|
||||
features, ok := info.cpu_features.?
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
// Note: Everything with AES-NI and PCLMULQDQ has support for
|
||||
// the required SSE extensions.
|
||||
req_features :: info.CPU_Features{
|
||||
.sse2,
|
||||
.ssse3,
|
||||
.sse41,
|
||||
.aes,
|
||||
.pclmulqdq,
|
||||
}
|
||||
return features >= req_features
|
||||
}
|
||||
|
||||
// Context is a keyed AES (ECB) instance.
|
||||
Context :: struct {
|
||||
// Note: The ideal thing to do is for the expanded round keys to be
|
||||
// arrays of `__m128i`, however that implies alignment (or using AVX).
|
||||
//
|
||||
// All the people using e-waste processors that don't support an
|
||||
// instruction set that has been around for over 10 years are why
|
||||
// we can't have nice things.
|
||||
_sk_exp_enc: [15][16]byte,
|
||||
_sk_exp_dec: [15][16]byte,
|
||||
_num_rounds: int,
|
||||
}
|
||||
|
||||
// init initializes a context for AES with the provided key.
|
||||
init :: proc(ctx: ^Context, key: []byte) {
|
||||
keysched(ctx, key)
|
||||
}
|
||||
|
||||
281 core/crypto/_aes/hw_intel/ghash.odin Normal file
@@ -0,0 +1,281 @@
|
||||
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
|
||||
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
||||
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
//+build amd64
|
||||
package aes_hw_intel
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
import "core:simd"
|
||||
import "core:simd/x86"
|
||||
|
||||
@(private = "file")
|
||||
GHASH_STRIDE_HW :: 4
|
||||
@(private = "file")
|
||||
GHASH_STRIDE_BYTES_HW :: GHASH_STRIDE_HW * _aes.GHASH_BLOCK_SIZE
|
||||
|
||||
// GHASH is defined over elements of GF(2^128) with "full little-endian"
|
||||
// representation: leftmost byte is least significant, and, within each
|
||||
// byte, leftmost _bit_ is least significant. The natural ordering in
|
||||
// x86 is "mixed little-endian": bytes are ordered from least to most
|
||||
// significant, but bits within a byte are in most-to-least significant
|
||||
// order. Going to full little-endian representation would require
|
||||
// reversing bits within each byte, which is doable but expensive.
|
||||
//
|
||||
// Instead, we go to full big-endian representation, by swapping bytes
|
||||
// around, which is done with a single _mm_shuffle_epi8() opcode (it
|
||||
// comes with SSSE3; all CPU that offer pclmulqdq also have SSSE3). We
|
||||
// can use a full big-endian representation because in a carryless
|
||||
// multiplication, we have a nice bit reversal property:
|
||||
//
|
||||
// rev_128(x) * rev_128(y) = rev_255(x * y)
|
||||
//
|
||||
// So by using full big-endian, we still get the right result, except
|
||||
// that it is right-shifted by 1 bit. The left-shift is relatively
|
||||
// inexpensive, and it can be mutualised.
|
||||
//
|
||||
// Since SSE2 opcodes do not have facilities for shifting full 128-bit
|
||||
// values with bit precision, we have to break down values into 64-bit
|
||||
// chunks. We number chunks from 0 to 3 in left to right order.
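The reversal identity is easy to sanity-check at a smaller width. The following standalone sketch (not part of this patch) verifies rev_4(x) * rev_4(y) == rev_7(x * y) for carryless multiplication over all 4-bit operands, mirroring the rev_128/rev_255 claim above:

package rev_clmul_example

import "core:fmt"

// Carryless (GF(2)[X]) multiply of two 4-bit values into a 7-bit product.
clmul4 :: proc(x, y: u8) -> u8 {
	r: u8
	for i in 0 ..< 4 {
		if y & (1 << uint(i)) != 0 {
			r ~= x << uint(i)
		}
	}
	return r
}

// Reverse the low n bits of x.
rev :: proc(x: u8, n: int) -> u8 {
	r: u8
	for i in 0 ..< n {
		if x & (1 << uint(i)) != 0 {
			r |= 1 << uint(n - 1 - i)
		}
	}
	return r
}

main :: proc() {
	for x in 0 ..< 16 {
		for y in 0 ..< 16 {
			assert(clmul4(rev(u8(x), 4), rev(u8(y), 4)) == rev(clmul4(u8(x), u8(y)), 7))
		}
	}
	fmt.println("rev_4(x) * rev_4(y) == rev_7(x * y) for all 4-bit x, y")
}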
|
||||
|
||||
@(private = "file")
|
||||
byteswap_index := transmute(x86.__m128i)simd.i8x16{
|
||||
// Note: simd.i8x16 is reverse order from x86._mm_set_epi8.
|
||||
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2,ssse3")
|
||||
byteswap :: #force_inline proc "contextless" (x: x86.__m128i) -> x86.__m128i {
|
||||
return x86._mm_shuffle_epi8(x, byteswap_index)
|
||||
}
|
||||
|
||||
// From a 128-bit value kw, compute kx as the XOR of the two 64-bit
|
||||
// halves of kw (into the right half of kx; left half is unspecified),
|
||||
// and return kx.
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
bk :: #force_inline proc "contextless" (kw: x86.__m128i) -> x86.__m128i {
|
||||
return x86._mm_xor_si128(kw, x86._mm_shuffle_epi32(kw, 0x0e))
|
||||
}
|
||||
|
||||
// Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
|
||||
// the XOR of the two values (kx), and return (kw, kx).
|
||||
@(private = "file", enable_target_feature = "sse2")
|
||||
pbk :: #force_inline proc "contextless" (k0, k1: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
|
||||
kw := x86._mm_unpacklo_epi64(k1, k0)
|
||||
kx := x86._mm_xor_si128(k0, k1)
|
||||
return kw, kx
|
||||
}
|
||||
|
||||
// Left-shift by 1 bit a 256-bit value (in four 64-bit words).
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
sl_256 :: #force_inline proc "contextless" (x0, x1, x2, x3: x86.__m128i) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) {
|
||||
x0, x1, x2, x3 := x0, x1, x2, x3
|
||||
|
||||
x0 = x86._mm_or_si128(x86._mm_slli_epi64(x0, 1), x86._mm_srli_epi64(x1, 63))
|
||||
x1 = x86._mm_or_si128(x86._mm_slli_epi64(x1, 1), x86._mm_srli_epi64(x2, 63))
|
||||
x2 = x86._mm_or_si128(x86._mm_slli_epi64(x2, 1), x86._mm_srli_epi64(x3, 63))
|
||||
x3 = x86._mm_slli_epi64(x3, 1)
|
||||
|
||||
return x0, x1, x2, x3
|
||||
}
|
||||
|
||||
// Perform reduction in GF(2^128).
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
reduce_f128 :: #force_inline proc "contextless" (x0, x1, x2, x3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
|
||||
x0, x1, x2 := x0, x1, x2
|
||||
|
||||
x1 = x86._mm_xor_si128(
|
||||
x1,
|
||||
x86._mm_xor_si128(
|
||||
x86._mm_xor_si128(
|
||||
x3,
|
||||
x86._mm_srli_epi64(x3, 1)),
|
||||
x86._mm_xor_si128(
|
||||
x86._mm_srli_epi64(x3, 2),
|
||||
x86._mm_srli_epi64(x3, 7))))
|
||||
x2 = x86._mm_xor_si128(
|
||||
x86._mm_xor_si128(
|
||||
x2,
|
||||
x86._mm_slli_epi64(x3, 63)),
|
||||
x86._mm_xor_si128(
|
||||
x86._mm_slli_epi64(x3, 62),
|
||||
x86._mm_slli_epi64(x3, 57)))
|
||||
x0 = x86._mm_xor_si128(
|
||||
x0,
|
||||
x86._mm_xor_si128(
|
||||
x86._mm_xor_si128(
|
||||
x2,
|
||||
x86._mm_srli_epi64(x2, 1)),
|
||||
x86._mm_xor_si128(
|
||||
x86._mm_srli_epi64(x2, 2),
|
||||
x86._mm_srli_epi64(x2, 7))))
|
||||
x1 = x86._mm_xor_si128(
|
||||
x86._mm_xor_si128(
|
||||
x1,
|
||||
x86._mm_slli_epi64(x2, 63)),
|
||||
x86._mm_xor_si128(
|
||||
x86._mm_slli_epi64(x2, 62),
|
||||
x86._mm_slli_epi64(x2, 57)))
|
||||
|
||||
return x0, x1
|
||||
}
|
||||
|
||||
// Square value kw in GF(2^128) into (dw,dx).
|
||||
@(private = "file", require_results, enable_target_feature = "sse2,pclmul")
|
||||
square_f128 :: #force_inline proc "contextless" (kw: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
|
||||
z1 := x86._mm_clmulepi64_si128(kw, kw, 0x11)
|
||||
z3 := x86._mm_clmulepi64_si128(kw, kw, 0x00)
|
||||
z0 := x86._mm_shuffle_epi32(z1, 0x0E)
|
||||
z2 := x86._mm_shuffle_epi32(z3, 0x0E)
|
||||
z0, z1, z2, z3 = sl_256(z0, z1, z2, z3)
|
||||
z0, z1 = reduce_f128(z0, z1, z2, z3)
|
||||
return pbk(z0, z1)
|
||||
}
|
||||
|
||||
// ghash calculates the GHASH over the inputs `dst` and `data`, with the
// key `key`, and stores the resulting digest in `dst`.
|
||||
//
|
||||
// Note: `dst` is both an input and an output, to support easy implementation
|
||||
// of GCM.
|
||||
@(enable_target_feature = "sse2,ssse3,pclmul")
|
||||
ghash :: proc "contextless" (dst, key, data: []byte) #no_bounds_check {
|
||||
if len(dst) != _aes.GHASH_BLOCK_SIZE || len(key) != _aes.GHASH_BLOCK_SIZE {
|
||||
intrinsics.trap()
|
||||
}
|
||||
|
||||
// Note: BearSSL opts to copy the remainder into a zero-filled
|
||||
// 64-byte buffer. We do something slightly simpler.
|
||||
|
||||
// Load key and dst (h and y).
|
||||
yw := intrinsics.unaligned_load((^x86.__m128i)(raw_data(dst)))
|
||||
h1w := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
yw = byteswap(yw)
|
||||
h1w = byteswap(h1w)
|
||||
h1x := bk(h1w)
|
||||
|
||||
// Process 4 blocks at a time
|
||||
buf := data
|
||||
l := len(buf)
|
||||
if l >= GHASH_STRIDE_BYTES_HW {
|
||||
// Compute h2 = h^2
|
||||
h2w, h2x := square_f128(h1w)
|
||||
|
||||
// Compute h3 = h^3 = h*(h^2)
|
||||
t1 := x86._mm_clmulepi64_si128(h1w, h2w, 0x11)
|
||||
t3 := x86._mm_clmulepi64_si128(h1w, h2w, 0x00)
|
||||
t2 := x86._mm_xor_si128(
|
||||
x86._mm_clmulepi64_si128(h1x, h2x, 0x00),
|
||||
x86._mm_xor_si128(t1, t3))
|
||||
t0 := x86._mm_shuffle_epi32(t1, 0x0E)
|
||||
t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
|
||||
t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
|
||||
t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
|
||||
t0, t1 = reduce_f128(t0, t1, t2, t3)
|
||||
h3w, h3x := pbk(t0, t1)
|
||||
|
||||
// Compute h4 = h^4 = (h^2)^2
|
||||
h4w, h4x := square_f128(h2w)
|
||||
|
||||
for l >= GHASH_STRIDE_BYTES_HW {
|
||||
aw0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf)))
|
||||
aw1 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[16:])))
|
||||
aw2 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[32:])))
|
||||
aw3 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[48:])))
|
||||
aw0 = byteswap(aw0)
|
||||
aw1 = byteswap(aw1)
|
||||
aw2 = byteswap(aw2)
|
||||
aw3 = byteswap(aw3)
|
||||
buf, l = buf[GHASH_STRIDE_BYTES_HW:], l - GHASH_STRIDE_BYTES_HW
|
||||
|
||||
aw0 = x86._mm_xor_si128(aw0, yw)
|
||||
ax1 := bk(aw1)
|
||||
ax2 := bk(aw2)
|
||||
ax3 := bk(aw3)
|
||||
ax0 := bk(aw0)
|
||||
|
||||
t1 = x86._mm_xor_si128(
|
||||
x86._mm_xor_si128(
|
||||
x86._mm_clmulepi64_si128(aw0, h4w, 0x11),
|
||||
x86._mm_clmulepi64_si128(aw1, h3w, 0x11)),
|
||||
x86._mm_xor_si128(
|
||||
x86._mm_clmulepi64_si128(aw2, h2w, 0x11),
|
||||
x86._mm_clmulepi64_si128(aw3, h1w, 0x11)))
|
||||
t3 = x86._mm_xor_si128(
|
||||
x86._mm_xor_si128(
|
||||
x86._mm_clmulepi64_si128(aw0, h4w, 0x00),
|
||||
x86._mm_clmulepi64_si128(aw1, h3w, 0x00)),
|
||||
x86._mm_xor_si128(
|
||||
x86._mm_clmulepi64_si128(aw2, h2w, 0x00),
|
||||
x86._mm_clmulepi64_si128(aw3, h1w, 0x00)))
|
||||
t2 = x86._mm_xor_si128(
|
||||
x86._mm_xor_si128(
|
||||
x86._mm_clmulepi64_si128(ax0, h4x, 0x00),
|
||||
x86._mm_clmulepi64_si128(ax1, h3x, 0x00)),
|
||||
x86._mm_xor_si128(
|
||||
x86._mm_clmulepi64_si128(ax2, h2x, 0x00),
|
||||
x86._mm_clmulepi64_si128(ax3, h1x, 0x00)))
|
||||
t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3))
|
||||
t0 = x86._mm_shuffle_epi32(t1, 0x0E)
|
||||
t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
|
||||
t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
|
||||
t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
|
||||
t0, t1 = reduce_f128(t0, t1, t2, t3)
|
||||
yw = x86._mm_unpacklo_epi64(t1, t0)
|
||||
}
|
||||
}
|
||||
|
||||
// Process 1 block at a time
|
||||
src: []byte
|
||||
for l > 0 {
|
||||
if l >= _aes.GHASH_BLOCK_SIZE {
|
||||
src = buf
|
||||
buf = buf[_aes.GHASH_BLOCK_SIZE:]
|
||||
l -= _aes.GHASH_BLOCK_SIZE
|
||||
} else {
|
||||
tmp: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
copy(tmp[:], buf)
|
||||
src = tmp[:]
|
||||
l = 0
|
||||
}
|
||||
|
||||
aw := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
|
||||
aw = byteswap(aw)
|
||||
|
||||
aw = x86._mm_xor_si128(aw, yw)
|
||||
ax := bk(aw)
|
||||
|
||||
t1 := x86._mm_clmulepi64_si128(aw, h1w, 0x11)
|
||||
t3 := x86._mm_clmulepi64_si128(aw, h1w, 0x00)
|
||||
t2 := x86._mm_clmulepi64_si128(ax, h1x, 0x00)
|
||||
t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3))
|
||||
t0 := x86._mm_shuffle_epi32(t1, 0x0E)
|
||||
t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
|
||||
t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
|
||||
t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
|
||||
t0, t1 = reduce_f128(t0, t1, t2, t3)
|
||||
yw = x86._mm_unpacklo_epi64(t1, t0)
|
||||
}
|
||||
|
||||
// Write back the hash (dst, aka y)
|
||||
yw = byteswap(yw)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), yw)
|
||||
}
|
||||
178 core/crypto/_aes/hw_intel/hw_intel_keysched.odin Normal file
@@ -0,0 +1,178 @@
|
||||
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
|
||||
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
||||
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
//+build amd64
|
||||
package aes_hw_intel
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
import "core:mem"
|
||||
import "core:simd/x86"
|
||||
|
||||
// Intel AES-NI based implementation. Inspiration taken from BearSSL.
|
||||
//
|
||||
// Note: This assumes that the SROA optimization pass is enabled to be
|
||||
// anything resembling performant; otherwise, LLVM will not elide a massive
|
||||
// number of redundant loads/stores it generates for every intrinsic call.
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2 := k1, k2
|
||||
|
||||
k2 = x86._mm_shuffle_epi32(k2, 0xff)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
return x86._mm_xor_si128(k1, k2)
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse,sse2")
|
||||
expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
|
||||
k1, k2, k3 := k1_^, k2_^, k3
|
||||
|
||||
k3 = x86._mm_shuffle_epi32(k3, 0x55)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, k3)
|
||||
|
||||
tmp := k2
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
|
||||
|
||||
k1_, k2_ := k1_, k2_
|
||||
k1_^, k2_^ = k1, k2
|
||||
|
||||
r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44))
|
||||
r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e))
|
||||
|
||||
return r1, r2
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2, k3 := k1_^, k2_^, k3
|
||||
|
||||
k3 = x86._mm_shuffle_epi32(k3, 0x55)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, k3)
|
||||
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
|
||||
k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
|
||||
|
||||
k1_, k2_ := k1_, k2_
|
||||
k1_^, k2_^ = k1, k2
|
||||
|
||||
return k1
|
||||
}
|
||||
|
||||
@(private = "file", require_results, enable_target_feature = "sse2")
|
||||
expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
|
||||
k1, k2 := k1, k2
|
||||
|
||||
k2 = x86._mm_shuffle_epi32(k2, 0xaa)
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
|
||||
return x86._mm_xor_si128(k1, k2)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "aes")
|
||||
derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) {
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds])
|
||||
for i in 1 ..< num_rounds {
|
||||
tmp := x86._mm_aesimc_si128(sks[i])
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
|
||||
}
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0])
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse,sse2,aes")
|
||||
keysched :: proc(ctx: ^Context, key: []byte) {
|
||||
sks: [15]x86.__m128i = ---
|
||||
|
||||
// Compute the encryption keys.
|
||||
num_rounds, key_len := 0, len(key)
|
||||
switch key_len {
|
||||
case _aes.KEY_SIZE_128:
|
||||
sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01))
|
||||
sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02))
|
||||
sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04))
|
||||
sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08))
|
||||
sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10))
|
||||
sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20))
|
||||
sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40))
|
||||
sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80))
|
||||
sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b))
|
||||
sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36))
|
||||
num_rounds = _aes.ROUNDS_128
|
||||
case _aes.KEY_SIZE_192:
|
||||
k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
k1 := x86.__m128i{
|
||||
intrinsics.unaligned_load((^i64)(raw_data(key[16:]))),
|
||||
0,
|
||||
}
|
||||
sks[0] = k0
|
||||
sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01))
|
||||
sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02))
|
||||
sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04))
|
||||
sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08))
|
||||
sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10))
|
||||
sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20))
|
||||
sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40))
|
||||
sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80))
|
||||
num_rounds = _aes.ROUNDS_192
|
||||
case _aes.KEY_SIZE_256:
|
||||
sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
|
||||
sks[1] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:])))
|
||||
sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01))
|
||||
sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01))
|
||||
sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02))
|
||||
sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02))
|
||||
sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04))
|
||||
sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04))
|
||||
sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08))
|
||||
sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08))
|
||||
sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10))
|
||||
sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10))
|
||||
sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20))
|
||||
sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20))
|
||||
sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40))
|
||||
num_rounds = _aes.ROUNDS_256
|
||||
case:
|
||||
panic("crypto/aes: invalid AES key size")
|
||||
}
|
||||
for i in 0 ..= num_rounds {
|
||||
intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i])
|
||||
}
|
||||
|
||||
// Compute the decryption keys. GCM and CTR do not need this, however
|
||||
// ECB, CBC, OCB3, etc do.
|
||||
derive_dec_keys(ctx, &sks, num_rounds)
|
||||
|
||||
ctx._num_rounds = num_rounds
|
||||
|
||||
mem.zero_explicit(&sks, size_of(sks))
|
||||
}
|
||||
@@ -6,7 +6,6 @@ See:
|
||||
- https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf
|
||||
- https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
|
||||
*/
|
||||
|
||||
package aes
|
||||
|
||||
import "core:crypto/_aes"
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
package aes
|
||||
|
||||
import "core:bytes"
|
||||
import "core:crypto/_aes/ct64"
|
||||
import "core:encoding/endian"
|
||||
import "core:math/bits"
|
||||
@@ -37,14 +38,15 @@ init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := Implementation.Hard
|
||||
xor_bytes_ctr :: proc(ctx: ^Context_CTR, dst, src: []byte) {
|
||||
assert(ctx._is_initialized)
|
||||
|
||||
// TODO: Enforcing that dst and src alias exactly or not at all
|
||||
// is a good idea, though odd aliasing should be extremely uncommon.
|
||||
|
||||
src, dst := src, dst
|
||||
if dst_len := len(dst); dst_len < len(src) {
|
||||
src = src[:dst_len]
|
||||
}
|
||||
|
||||
if bytes.alias_inexactly(dst, src) {
|
||||
panic("crypto/aes: dst and src alias inexactly")
|
||||
}
|
||||
|
||||
for remaining := len(src); remaining > 0; {
|
||||
// Process multiple blocks at once
|
||||
if ctx._off == BLOCK_SIZE {
|
||||
@@ -123,8 +125,8 @@ reset_ctr :: proc "contextless" (ctx: ^Context_CTR) {
|
||||
ctx._is_initialized = false
|
||||
}
|
||||
|
||||
@(private)
|
||||
ctr_blocks :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) {
|
||||
@(private = "file")
|
||||
ctr_blocks :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
|
||||
// Use the optimized hardware implementation if available.
|
||||
if _, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
|
||||
ctr_blocks_hw(ctx, dst, src, nr_blocks)
|
||||
@@ -183,17 +185,17 @@ xor_blocks :: #force_inline proc "contextless" (dst, src: []byte, blocks: [][]by
|
||||
// performance of this implementation matters to where that
|
||||
// optimization would be worth it, use chacha20poly1305, or a
|
||||
// CPU that isn't e-waste.
|
||||
if src != nil {
|
||||
#no_bounds_check {
|
||||
for i in 0 ..< len(blocks) {
|
||||
off := i * BLOCK_SIZE
|
||||
for j in 0 ..< BLOCK_SIZE {
|
||||
blocks[i][j] ~= src[off + j]
|
||||
#no_bounds_check {
|
||||
if src != nil {
|
||||
for i in 0 ..< len(blocks) {
|
||||
off := i * BLOCK_SIZE
|
||||
for j in 0 ..< BLOCK_SIZE {
|
||||
blocks[i][j] ~= src[off + j]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for i in 0 ..< len(blocks) {
|
||||
copy(dst[i * BLOCK_SIZE:], blocks[i])
|
||||
}
|
||||
}
|
||||
for i in 0 ..< len(blocks) {
|
||||
copy(dst[i * BLOCK_SIZE:], blocks[i])
|
||||
}
|
||||
}
|
||||
|
||||
151 core/crypto/aes/aes_ctr_hw_intel.odin Normal file
@@ -0,0 +1,151 @@
|
||||
//+build amd64
|
||||
package aes
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
import "core:math/bits"
|
||||
import "core:mem"
|
||||
import "core:simd/x86"
|
||||
|
||||
@(private)
|
||||
CTR_STRIDE_HW :: 4
|
||||
@(private)
|
||||
CTR_STRIDE_BYTES_HW :: CTR_STRIDE_HW * BLOCK_SIZE
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
|
||||
hw_ctx := ctx._impl.(Context_Impl_Hardware)
|
||||
|
||||
sks: [15]x86.__m128i = ---
|
||||
for i in 0 ..= hw_ctx._num_rounds {
|
||||
sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&hw_ctx._sk_exp_enc[i]))
|
||||
}
|
||||
|
||||
hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (x86.__m128i, u64, u64) {
|
||||
ret := x86.__m128i{
|
||||
i64(intrinsics.byte_swap(hi)),
|
||||
i64(intrinsics.byte_swap(lo)),
|
||||
}
|
||||
|
||||
hi, lo := hi, lo
|
||||
carry: u64
|
||||
|
||||
lo, carry = bits.add_u64(lo, 1, 0)
|
||||
hi, _ = bits.add_u64(hi, 0, carry)
|
||||
return ret, hi, lo
|
||||
}
|
||||
|
||||
// The latency of AESENC depends on the manufacturer and microarchitecture:
|
||||
// - 7 -> up to Broadwell
|
||||
// - 4 -> AMD and Skylake - Cascade Lake
|
||||
// - 3 -> Ice Lake and newer
|
||||
//
|
||||
// This implementation does 4 blocks at once, since performance
|
||||
// should be "adequate" across most CPUs.
|
||||
|
||||
src, dst := src, dst
|
||||
nr_blocks := nr_blocks
|
||||
ctr_hi, ctr_lo := ctx._ctr_hi, ctx._ctr_lo
|
||||
|
||||
blks: [CTR_STRIDE_HW]x86.__m128i = ---
|
||||
for nr_blocks >= CTR_STRIDE_HW {
|
||||
#unroll for i in 0..< CTR_STRIDE_HW {
|
||||
blks[i], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
|
||||
}
|
||||
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_xor_si128(blks[i], sks[0])
|
||||
}
|
||||
#unroll for i in 1 ..= 9 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
switch hw_ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
|
||||
}
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
|
||||
}
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
|
||||
}
|
||||
}
|
||||
|
||||
xor_blocks_hw(dst, src, blks[:])
|
||||
|
||||
if src != nil {
|
||||
src = src[CTR_STRIDE_BYTES_HW:]
|
||||
}
|
||||
dst = dst[CTR_STRIDE_BYTES_HW:]
|
||||
nr_blocks -= CTR_STRIDE_HW
|
||||
}
|
||||
|
||||
// Handle the remainder.
|
||||
for nr_blocks > 0 {
|
||||
blks[0], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
|
||||
|
||||
blks[0] = x86._mm_xor_si128(blks[0], sks[0])
|
||||
#unroll for i in 1 ..= 9 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
}
|
||||
switch hw_ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
}
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
}
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
|
||||
}
|
||||
|
||||
xor_blocks_hw(dst, src, blks[:1])
|
||||
|
||||
if src != nil {
|
||||
src = src[BLOCK_SIZE:]
|
||||
}
|
||||
dst = dst[BLOCK_SIZE:]
|
||||
nr_blocks -= 1
|
||||
}
|
||||
|
||||
// Write back the counter.
|
||||
ctx._ctr_hi, ctx._ctr_lo = ctr_hi, ctr_lo
|
||||
|
||||
mem.zero_explicit(&blks, size_of(blks))
|
||||
mem.zero_explicit(&sks, size_of(sks))
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse2")
|
||||
xor_blocks_hw :: proc(dst, src: []byte, blocks: []x86.__m128i) {
|
||||
#no_bounds_check {
|
||||
if src != nil {
|
||||
for i in 0 ..< len(blocks) {
|
||||
off := i * BLOCK_SIZE
|
||||
tmp := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[off:])))
|
||||
blocks[i] = x86._mm_xor_si128(blocks[i], tmp)
|
||||
}
|
||||
}
|
||||
for i in 0 ..< len(blocks) {
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
58 core/crypto/aes/aes_ecb_hw_intel.odin Normal file
@@ -0,0 +1,58 @@
|
||||
//+build amd64
|
||||
package aes
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto/_aes"
|
||||
import "core:simd/x86"
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
|
||||
blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
|
||||
|
||||
blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[0])))
|
||||
#unroll for i in 1 ..= 9 {
|
||||
blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
|
||||
}
|
||||
switch ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[10])))
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
|
||||
}
|
||||
blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[12])))
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
|
||||
}
|
||||
blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[14])))
|
||||
}
|
||||
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
|
||||
}
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
|
||||
blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
|
||||
|
||||
blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[0])))
|
||||
#unroll for i in 1 ..= 9 {
|
||||
blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
|
||||
}
|
||||
switch ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[10])))
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
|
||||
}
|
||||
blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[12])))
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
|
||||
}
|
||||
blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[14])))
|
||||
}
|
||||
|
||||
intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
|
||||
}
|
||||
@@ -1,13 +1,16 @@
|
||||
package aes
|
||||
|
||||
import "core:bytes"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:crypto/_aes/ct64"
|
||||
import "core:encoding/endian"
|
||||
import "core:mem"
|
||||
|
||||
// GCM_NONCE_SIZE is the size of the GCM nonce in bytes.
|
||||
// GCM_NONCE_SIZE is the default size of the GCM nonce in bytes.
|
||||
GCM_NONCE_SIZE :: 12
|
||||
// GCM_NONCE_SIZE_MAX is the maximum size of the GCM nonce in bytes.
|
||||
GCM_NONCE_SIZE_MAX :: 0x2000000000000000 // floor((2^64 - 1) / 8) bits
|
||||
// GCM_TAG_SIZE is the size of a GCM tag in bytes.
|
||||
GCM_TAG_SIZE :: _aes.GHASH_TAG_SIZE
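For reference, an end-to-end usage sketch of the GCM interface defined in this file. It is not part of the patch; it assumes an `init_gcm` initializer analogous to `init_ctr` above, and uses `crypto.rand_bytes` from the parent package:

package gcm_example

import "core:crypto"
import "core:crypto/aes"

main :: proc() {
	key: [32]byte // AES-256
	nonce: [aes.GCM_NONCE_SIZE]byte
	crypto.rand_bytes(key[:])
	crypto.rand_bytes(nonce[:])

	ctx: aes.Context_GCM
	aes.init_gcm(&ctx, key[:]) // assumed initializer, by analogy with init_ctr
	defer aes.reset_gcm(&ctx)

	msg := transmute([]byte)string("hello from AES-NI")
	ct := make([]byte, len(msg))
	defer delete(ct)
	tag: [aes.GCM_TAG_SIZE]byte
	aes.seal_gcm(&ctx, ct, tag[:], nonce[:], nil, msg)

	pt := make([]byte, len(ct))
	defer delete(pt)
	ok := aes.open_gcm(&ctx, pt, nonce[:], nil, ct, tag[:])
	assert(ok && string(pt) == "hello from AES-NI")
}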
|
||||
|
||||
@@ -39,6 +42,9 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) {
|
||||
if len(dst) != len(plaintext) {
|
||||
panic("crypto/aes: invalid destination ciphertext size")
|
||||
}
|
||||
if bytes.alias_inexactly(dst, plaintext) {
|
||||
panic("crypto/aes: dst and plaintext alias inexactly")
|
||||
}
|
||||
|
||||
if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
|
||||
gcm_seal_hw(&impl, dst, tag, nonce, aad, plaintext)
|
||||
@@ -47,17 +53,19 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) {
|
||||
|
||||
h: [_aes.GHASH_KEY_SIZE]byte
|
||||
j0: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
s: [_aes.GHASH_TAG_SIZE]byte
|
||||
init_ghash_ct64(ctx, &h, &j0, nonce)
|
||||
init_ghash_ct64(ctx, &h, &j0, &j0_enc, nonce)
|
||||
|
||||
// Note: Our GHASH implementation handles appending padding.
|
||||
ct64.ghash(s[:], h[:], aad)
|
||||
gctr_ct64(ctx, dst, &s, plaintext, &h, nonce, true)
|
||||
final_ghash_ct64(&s, &h, &j0, len(aad), len(plaintext))
|
||||
gctr_ct64(ctx, dst, &s, plaintext, &h, &j0, true)
|
||||
final_ghash_ct64(&s, &h, &j0_enc, len(aad), len(plaintext))
|
||||
copy(tag, s[:])
|
||||
|
||||
mem.zero_explicit(&h, len(h))
|
||||
mem.zero_explicit(&j0, len(j0))
|
||||
mem.zero_explicit(&j0_enc, len(j0_enc))
|
||||
}
|
||||
|
||||
// open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext,
|
||||
@@ -73,6 +81,9 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) ->
|
||||
if len(dst) != len(ciphertext) {
|
||||
panic("crypto/aes: invalid destination plaintext size")
|
||||
}
|
||||
if bytes.alias_inexactly(dst, ciphertext) {
|
||||
panic("crypto/aes: dst and ciphertext alias inexactly")
|
||||
}
|
||||
|
||||
if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
|
||||
return gcm_open_hw(&impl, dst, nonce, aad, ciphertext, tag)
|
||||
@@ -80,12 +91,13 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) ->
|
||||
|
||||
h: [_aes.GHASH_KEY_SIZE]byte
|
||||
j0: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
s: [_aes.GHASH_TAG_SIZE]byte
|
||||
init_ghash_ct64(ctx, &h, &j0, nonce)
|
||||
init_ghash_ct64(ctx, &h, &j0, &j0_enc, nonce)
|
||||
|
||||
ct64.ghash(s[:], h[:], aad)
|
||||
gctr_ct64(ctx, dst, &s, ciphertext, &h, nonce, false)
|
||||
final_ghash_ct64(&s, &h, &j0, len(aad), len(ciphertext))
|
||||
gctr_ct64(ctx, dst, &s, ciphertext, &h, &j0, false)
|
||||
final_ghash_ct64(&s, &h, &j0_enc, len(aad), len(ciphertext))
|
||||
|
||||
ok := crypto.compare_constant_time(s[:], tag) == 1
|
||||
if !ok {
|
||||
@@ -94,6 +106,7 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) ->
|
||||
|
||||
mem.zero_explicit(&h, len(h))
|
||||
mem.zero_explicit(&j0, len(j0))
|
||||
mem.zero_explicit(&j0_enc, len(j0_enc))
|
||||
mem.zero_explicit(&s, len(s))
|
||||
|
||||
return ok
|
||||
@@ -106,19 +119,14 @@ reset_gcm :: proc "contextless" (ctx: ^Context_GCM) {
|
||||
ctx._is_initialized = false
|
||||
}
|
||||
|
||||
@(private)
|
||||
@(private = "file")
|
||||
gcm_validate_common_slice_sizes :: proc(tag, nonce, aad, text: []byte) {
|
||||
if len(tag) != GCM_TAG_SIZE {
|
||||
panic("crypto/aes: invalid GCM tag size")
|
||||
}
|
||||
|
||||
// The specification supports nonces in the range [1, 2^64) bits
|
||||
// however per NIST SP 800-38D 5.2.1.1:
|
||||
//
|
||||
// > For IVs, it is recommended that implementations restrict support
|
||||
// > to the length of 96 bits, to promote interoperability, efficiency,
|
||||
// > and simplicity of design.
|
||||
if len(nonce) != GCM_NONCE_SIZE {
|
||||
// The specification supports nonces in the range [1, 2^64) bits.
|
||||
if l := len(nonce); l == 0 || u64(l) >= GCM_NONCE_SIZE_MAX {
|
||||
panic("crypto/aes: invalid GCM nonce size")
|
||||
}
|
||||
|
||||
@@ -135,6 +143,7 @@ init_ghash_ct64 :: proc(
|
||||
ctx: ^Context_GCM,
|
||||
h: ^[_aes.GHASH_KEY_SIZE]byte,
|
||||
j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
j0_enc: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
nonce: []byte,
|
||||
) {
|
||||
impl := &ctx._impl.(ct64.Context)
|
||||
@@ -142,12 +151,25 @@ init_ghash_ct64 :: proc(
|
||||
// 1. Let H = CIPH(k, 0^128)
|
||||
ct64.encrypt_block(impl, h[:], h[:])
|
||||
|
||||
// Define a block, J0, as follows:
|
||||
if l := len(nonce); l == GCM_NONCE_SIZE {
|
||||
// if len(IV) = 96, then let J0 = IV || 0^31 || 1
|
||||
copy(j0[:], nonce)
|
||||
j0[_aes.GHASH_BLOCK_SIZE - 1] = 1
|
||||
} else {
|
||||
// If len(IV) != 96, then let s = 128 * ceil(len(IV)/128) - len(IV),
// and let J0 = GHASH_H(IV || 0^(s+64) || [len(IV)]_64).
|
||||
ct64.ghash(j0[:], h[:], nonce)
|
||||
|
||||
tmp: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
|
||||
ct64.ghash(j0[:], h[:], tmp[:])
|
||||
}
|
||||
|
||||
// ECB encrypt j0, so that we can just XOR with the tag. In theory
|
||||
// this could be processed along with the final GCTR block, to
|
||||
// potentially save a call to AES-ECB, but... just use AES-NI.
|
||||
copy(j0[:], nonce)
|
||||
j0[_aes.GHASH_BLOCK_SIZE - 1] = 1
|
||||
ct64.encrypt_block(impl, j0[:], j0[:])
|
||||
ct64.encrypt_block(impl, j0_enc[:], j0[:])
|
||||
}
|
||||
|
||||
@(private = "file")
|
||||
@@ -175,33 +197,27 @@ gctr_ct64 :: proc(
|
||||
s: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
src: []byte,
|
||||
h: ^[_aes.GHASH_KEY_SIZE]byte,
|
||||
nonce: []byte,
|
||||
nonce: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
is_seal: bool,
|
||||
) {
|
||||
) #no_bounds_check {
|
||||
ct64_inc_ctr32 := #force_inline proc "contextless" (dst: []byte, ctr: u32) -> u32 {
|
||||
endian.unchecked_put_u32be(dst[12:], ctr)
|
||||
return ctr + 1
|
||||
}
|
||||
|
||||
// 2. Define a block J_0 as follows:
|
||||
// if len(IV) = 96, then let J0 = IV || 0^31 || 1
|
||||
//
|
||||
// Note: We only support 96 bit IVs.
|
||||
// Setup the counter blocks.
|
||||
tmp, tmp2: [ct64.STRIDE][BLOCK_SIZE]byte = ---, ---
|
||||
ctrs, blks: [ct64.STRIDE][]byte = ---, ---
|
||||
ctr: u32 = 2
|
||||
ctr := endian.unchecked_get_u32be(nonce[GCM_NONCE_SIZE:]) + 1
|
||||
for i in 0 ..< ct64.STRIDE {
|
||||
// Setup scratch space for the keystream.
|
||||
blks[i] = tmp2[i][:]
|
||||
|
||||
// Pre-copy the IV to all the counter blocks.
|
||||
ctrs[i] = tmp[i][:]
|
||||
copy(ctrs[i], nonce)
|
||||
copy(ctrs[i], nonce[:GCM_NONCE_SIZE])
|
||||
}
|
||||
|
||||
// We stitch the GCTR and GHASH operations together, so that only
|
||||
// one pass over the ciphertext is required.
|
||||
|
||||
impl := &ctx._impl.(ct64.Context)
|
||||
src, dst := src, dst
|
||||
|
||||
|
||||
243 core/crypto/aes/aes_gcm_hw_intel.odin Normal file
@@ -0,0 +1,243 @@
|
||||
//+build amd64
|
||||
package aes
|
||||
|
||||
import "base:intrinsics"
|
||||
import "core:crypto"
|
||||
import "core:crypto/_aes"
|
||||
import "core:crypto/_aes/hw_intel"
|
||||
import "core:encoding/endian"
|
||||
import "core:mem"
|
||||
import "core:simd/x86"
|
||||
|
||||
@(private)
|
||||
gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, nonce, aad, plaintext: []byte) {
|
||||
h: [_aes.GHASH_KEY_SIZE]byte
|
||||
j0: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
s: [_aes.GHASH_TAG_SIZE]byte
|
||||
init_ghash_hw(ctx, &h, &j0, &j0_enc, nonce)
|
||||
|
||||
// Note: Our GHASH implementation handles appending padding.
|
||||
hw_intel.ghash(s[:], h[:], aad)
|
||||
gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true)
|
||||
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext))
|
||||
copy(tag, s[:])
|
||||
|
||||
mem.zero_explicit(&h, len(h))
|
||||
mem.zero_explicit(&j0, len(j0))
|
||||
mem.zero_explicit(&j0_enc, len(j0_enc))
|
||||
}
|
||||
|
||||
@(private)
|
||||
gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, nonce, aad, ciphertext, tag: []byte) -> bool {
|
||||
h: [_aes.GHASH_KEY_SIZE]byte
|
||||
j0: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
s: [_aes.GHASH_TAG_SIZE]byte
|
||||
init_ghash_hw(ctx, &h, &j0, &j0_enc, nonce)
|
||||
|
||||
hw_intel.ghash(s[:], h[:], aad)
|
||||
gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false)
|
||||
final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext))
|
||||
|
||||
ok := crypto.compare_constant_time(s[:], tag) == 1
|
||||
if !ok {
|
||||
mem.zero_explicit(raw_data(dst), len(dst))
|
||||
}
|
||||
|
||||
mem.zero_explicit(&h, len(h))
|
||||
mem.zero_explicit(&j0, len(j0))
|
||||
mem.zero_explicit(&j0_enc, len(j0_enc))
|
||||
mem.zero_explicit(&s, len(s))
|
||||
|
||||
return ok
|
||||
}
|
||||
|
||||
@(private = "file")
|
||||
init_ghash_hw :: proc(
|
||||
ctx: ^Context_Impl_Hardware,
|
||||
h: ^[_aes.GHASH_KEY_SIZE]byte,
|
||||
j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
j0_enc: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
nonce: []byte,
|
||||
) {
|
||||
// 1. Let H = CIPH(k, 0^128)
|
||||
encrypt_block_hw(ctx, h[:], h[:])
|
||||
|
||||
// Define a block, J0, as follows:
|
||||
if l := len(nonce); l == GCM_NONCE_SIZE {
|
||||
// if len(IV) = 96, then let J0 = IV || 0^31 || 1
|
||||
copy(j0[:], nonce)
|
||||
j0[_aes.GHASH_BLOCK_SIZE - 1] = 1
|
||||
} else {
|
||||
// If len(IV) != 96, then let s = 128 * ceil(len(IV)/128) - len(IV),
// and let J0 = GHASH_H(IV || 0^(s+64) || [len(IV)]_64).
|
||||
hw_intel.ghash(j0[:], h[:], nonce)
|
||||
|
||||
tmp: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
|
||||
hw_intel.ghash(j0[:], h[:], tmp[:])
|
||||
}
|
||||
|
||||
// ECB encrypt j0, so that we can just XOR with the tag.
|
||||
encrypt_block_hw(ctx, j0_enc[:], j0[:])
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2")
|
||||
final_ghash_hw :: proc(
|
||||
s: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
h: ^[_aes.GHASH_KEY_SIZE]byte,
|
||||
j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
a_len: int,
|
||||
t_len: int,
|
||||
) {
|
||||
blk: [_aes.GHASH_BLOCK_SIZE]byte
|
||||
endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8)
|
||||
endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8)
|
||||
|
||||
hw_intel.ghash(s[:], h[:], blk[:])
|
||||
j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0))
|
||||
s_vec := intrinsics.unaligned_load((^x86.__m128i)(s))
|
||||
s_vec = x86._mm_xor_si128(s_vec, j0_vec)
|
||||
intrinsics.unaligned_store((^x86.__m128i)(s), s_vec)
|
||||
}
|
||||
|
||||
@(private = "file", enable_target_feature = "sse2,sse4.1,aes")
|
||||
gctr_hw :: proc(
|
||||
ctx: ^Context_Impl_Hardware,
|
||||
dst: []byte,
|
||||
s: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
src: []byte,
|
||||
h: ^[_aes.GHASH_KEY_SIZE]byte,
|
||||
nonce: ^[_aes.GHASH_BLOCK_SIZE]byte,
|
||||
is_seal: bool,
|
||||
) #no_bounds_check {
|
||||
sks: [15]x86.__m128i = ---
|
||||
for i in 0 ..= ctx._num_rounds {
|
||||
sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))
|
||||
}
|
||||
|
||||
// Setup the counter block
|
||||
ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(nonce))
|
||||
ctr := endian.unchecked_get_u32be(nonce[GCM_NONCE_SIZE:]) + 1
|
||||
|
||||
src, dst := src, dst
|
||||
|
||||
// Note: Instead of doing GHASH and CTR separately, it is more
|
||||
// performant to interleave (stitch) the two operations together.
|
||||
// This results in an unreadable mess, so we opt for simplicity
|
||||
// as performance is adequate.
|
||||
|
||||
blks: [CTR_STRIDE_HW]x86.__m128i = ---
|
||||
nr_blocks := len(src) / BLOCK_SIZE
|
||||
for nr_blocks >= CTR_STRIDE_HW {
|
||||
if !is_seal {
|
||||
hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
|
||||
}
|
||||
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i], ctr = hw_inc_ctr32(&ctr_blk, ctr)
|
||||
}
|
||||
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_xor_si128(blks[i], sks[0])
|
||||
}
|
||||
#unroll for i in 1 ..= 9 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
switch ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
|
||||
}
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
|
||||
}
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
#unroll for j in 0 ..< CTR_STRIDE_HW {
|
||||
blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
|
||||
}
|
||||
}
|
||||
#unroll for i in 0 ..< CTR_STRIDE_HW {
|
||||
blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
|
||||
}
|
||||
}
|
||||
|
||||
xor_blocks_hw(dst, src, blks[:])
|
||||
|
||||
if is_seal {
|
||||
hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
|
||||
}
|
||||
|
||||
src = src[CTR_STRIDE_BYTES_HW:]
|
||||
dst = dst[CTR_STRIDE_BYTES_HW:]
|
||||
nr_blocks -= CTR_STRIDE_HW
|
||||
}
|
||||
|
||||
// Handle the remainder.
|
||||
for n := len(src); n > 0; {
|
||||
l := min(n, BLOCK_SIZE)
|
||||
if !is_seal {
|
||||
hw_intel.ghash(s[:], h[:], src[:l])
|
||||
}
|
||||
|
||||
blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr)
|
||||
|
||||
blks[0] = x86._mm_xor_si128(blks[0], sks[0])
|
||||
#unroll for i in 1 ..= 9 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
}
|
||||
switch ctx._num_rounds {
|
||||
case _aes.ROUNDS_128:
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
|
||||
case _aes.ROUNDS_192:
|
||||
#unroll for i in 10 ..= 11 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
}
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
|
||||
case _aes.ROUNDS_256:
|
||||
#unroll for i in 10 ..= 13 {
|
||||
blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
|
||||
}
|
||||
blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
|
||||
}
|
||||
|
||||
if l == BLOCK_SIZE {
|
||||
xor_blocks_hw(dst, src, blks[:1])
|
||||
} else {
|
||||
blk: [BLOCK_SIZE]byte
|
||||
copy(blk[:], src)
|
||||
xor_blocks_hw(blk[:], blk[:], blks[:1])
|
||||
copy(dst, blk[:l])
|
||||
}
|
||||
if is_seal {
|
||||
hw_intel.ghash(s[:], h[:], dst[:l])
|
||||
}
|
||||
|
||||
dst = dst[l:]
|
||||
src = src[l:]
|
||||
n -= l
|
||||
}
|
||||
|
||||
mem.zero_explicit(&blks, size_of(blks))
|
||||
mem.zero_explicit(&sks, size_of(sks))
|
||||
}
|
||||
|
||||
// BUG: Sticking this in gctr_hw (like the other implementations) crashes
|
||||
// the compiler.
|
||||
//
|
||||
// src/check_expr.cpp(7892): Assertion Failure: `c->curr_proc_decl->entity`
|
||||
@(private = "file", enable_target_feature = "sse4.1")
|
||||
hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) {
|
||||
ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3)
|
||||
return ret, ctr + 1
|
||||
}
|
||||
@@ -1,3 +1,4 @@
|
||||
//+build !amd64
|
||||
package aes
|
||||
|
||||
@(private = "file")
|
||||
|
||||
18 core/crypto/aes/aes_impl_hw_intel.odin Normal file
@@ -0,0 +1,18 @@
|
||||
//+build amd64
|
||||
package aes
|
||||
|
||||
import "core:crypto/_aes/hw_intel"
|
||||
|
||||
// is_hardware_accelerated returns true iff hardware accelerated AES
|
||||
// is supported.
|
||||
is_hardware_accelerated :: proc "contextless" () -> bool {
|
||||
return hw_intel.is_supported()
|
||||
}
|
||||
|
||||
@(private)
|
||||
Context_Impl_Hardware :: hw_intel.Context
|
||||
|
||||
@(private, enable_target_feature = "sse2,aes")
|
||||
init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
|
||||
hw_intel.init(ctx, key)
|
||||
}
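A minimal sketch (separate from the patch) of probing for the accelerated implementation from user code:

package hwcheck_example

import "core:crypto/aes"
import "core:fmt"

main :: proc() {
	// Reports whether the AES-NI + PCLMULQDQ code paths can be used on this CPU.
	if aes.is_hardware_accelerated() {
		fmt.println("using the hardware AES implementation")
	} else {
		fmt.println("falling back to the portable ct64 implementation")
	}
}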
|
||||
@@ -7,6 +7,7 @@ See:
|
||||
*/
|
||||
package chacha20
|
||||
|
||||
import "core:bytes"
|
||||
import "core:encoding/endian"
|
||||
import "core:math/bits"
|
||||
import "core:mem"
|
||||
@@ -121,14 +122,15 @@ seek :: proc(ctx: ^Context, block_nr: u64) {
|
||||
xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {
|
||||
assert(ctx._is_initialized)
|
||||
|
||||
// TODO: Enforcing that dst and src alias exactly or not at all
|
||||
// is a good idea, though odd aliasing should be extremely uncommon.
|
||||
|
||||
src, dst := src, dst
|
||||
if dst_len := len(dst); dst_len < len(src) {
|
||||
src = src[:dst_len]
|
||||
}
|
||||
|
||||
if bytes.alias_inexactly(dst, src) {
|
||||
panic("crypto/chacha20: dst and src alias inexactly")
|
||||
}
|
||||
|
||||
for remaining := len(src); remaining > 0; {
|
||||
// Process multiple blocks at once
|
||||
if ctx._off == _BLOCK_SIZE {
|
||||
|
||||
@@ -60,7 +60,11 @@ rand_bytes :: proc (dst: []byte) {
|
||||
_rand_bytes(dst)
|
||||
}
|
||||
|
||||
|
||||
// random_generator returns a `runtime.Random_Generator` backed by the
|
||||
// system entropy source.
|
||||
//
|
||||
// Support for the system entropy source can be checked with the
|
||||
// `HAS_RAND_BYTES` boolean constant.
|
||||
random_generator :: proc() -> runtime.Random_Generator {
|
||||
return {
|
||||
procedure = proc(data: rawptr, mode: runtime.Random_Generator_Mode, p: []byte) {
|
||||
|
||||
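A short usage sketch for the new procedure (not part of the diff; it assumes the `context.random_generator` field and that `core:math/rand` draws from it, which may differ across Odin versions):

package rng_example

import "core:crypto"
import "core:math/rand"

main :: proc() {
	// Install the system entropy source as the context-wide generator, so that
	// anything reading context.random_generator (e.g. core:math/rand) uses it.
	context.random_generator = crypto.random_generator()

	n := rand.uint64()
	_ = n
}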
@@ -2,33 +2,33 @@
package simd_x86

@(require_results, enable_target_feature = "aes")
_mm_aesdec :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
_mm_aesdec_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return aesdec(a, b)
}

@(require_results, enable_target_feature = "aes")
_mm_aesdeclast :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
_mm_aesdeclast_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return aesdeclast(a, b)
}

@(require_results, enable_target_feature = "aes")
_mm_aesenc :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
_mm_aesenc_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return aesenc(a, b)
}

@(require_results, enable_target_feature = "aes")
_mm_aesenclast :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
_mm_aesenclast_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return aesenclast(a, b)
}

@(require_results, enable_target_feature = "aes")
_mm_aesimc :: #force_inline proc "c" (a: __m128i) -> __m128i {
_mm_aesimc_si128 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	return aesimc(a)
}

@(require_results, enable_target_feature = "aes")
_mm_aeskeygenassist :: #force_inline proc "c" (a: __m128i, $IMM8: u8) -> __m128i {
	return aeskeygenassist(a, u8(IMM8))
_mm_aeskeygenassist_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u8) -> __m128i {
	return aeskeygenassist(a, IMM8)
}
@@ -45,5 +45,5 @@ foreign _ {
	@(link_name = "llvm.x86.aesni.aesimc")
	aesimc :: proc(a: __m128i) -> __m128i ---
	@(link_name = "llvm.x86.aesni.aeskeygenassist")
	aeskeygenassist :: proc(a: __m128i, imm8: u8) -> __m128i ---
	aeskeygenassist :: proc(a: __m128i, #const imm8: u8) -> __m128i ---
}
@@ -144,19 +144,26 @@ _mm_subs_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
_mm_slli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
	shift :: IMM8 & 0xff

	// This needs to emit behavior identical to PSLLDQ which is as follows:
	//
	// TEMP := COUNT
	// IF (TEMP > 15) THEN TEMP := 16; FI
	// DEST := DEST << (TEMP * 8)
	// DEST[MAXVL-1:128] (Unmodified)

	return transmute(__m128i)simd.shuffle(
		transmute(i8x16)a,
		i8x16(0),
		0 when shift > 15 else (16 - shift + 0),
		1 when shift > 15 else (16 - shift + 1),
		2 when shift > 15 else (16 - shift + 2),
		3 when shift > 15 else (16 - shift + 3),
		4 when shift > 15 else (16 - shift + 4),
		5 when shift > 15 else (16 - shift + 5),
		6 when shift > 15 else (16 - shift + 6),
		7 when shift > 15 else (16 - shift + 7),
		8 when shift > 15 else (16 - shift + 8),
		9 when shift > 15 else (16 - shift + 9),
		transmute(i8x16)a,
		0 when shift > 15 else (16 - shift + 0),
		1 when shift > 15 else (16 - shift + 1),
		2 when shift > 15 else (16 - shift + 2),
		3 when shift > 15 else (16 - shift + 3),
		4 when shift > 15 else (16 - shift + 4),
		5 when shift > 15 else (16 - shift + 5),
		6 when shift > 15 else (16 - shift + 6),
		7 when shift > 15 else (16 - shift + 7),
		8 when shift > 15 else (16 - shift + 8),
		9 when shift > 15 else (16 - shift + 9),
		10 when shift > 15 else (16 - shift + 10),
		11 when shift > 15 else (16 - shift + 11),
		12 when shift > 15 else (16 - shift + 12),
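Concretely, the PSLLDQ pseudo-code above describes a byte-wise left shift of the 128-bit lane; for a byte shift of 3:

	// a      = [a0, a1, a2, a3, ..., a15]   (byte 0 is the least significant)
	// result = [ 0,  0,  0, a0, ..., a12]   (byte i takes a[i - 3] for i >= 3)
	//
	// and any shift greater than 15 yields the all-zero vector.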
@@ -435,7 +442,7 @@ _mm_store_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
}
@(enable_target_feature="sse2")
_mm_storeu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
	storeudq(mem_addr, a)
	intrinsics.unaligned_store(mem_addr, a)
}
@(enable_target_feature="sse2")
_mm_storel_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
@@ -1178,8 +1185,6 @@ foreign _ {
	cvttsd2si :: proc(a: __m128d) -> i32 ---
	@(link_name="llvm.x86.sse2.cvttps2dq")
	cvttps2dq :: proc(a: __m128) -> i32x4 ---
	@(link_name="llvm.x86.sse2.storeu.dq")
	storeudq :: proc(mem_addr: rawptr, a: __m128i) ---
	@(link_name="llvm.x86.sse2.storeu.pd")
	storeupd :: proc(mem_addr: rawptr, a: __m128d) ---
@@ -28,6 +28,32 @@ benchmark_crypto :: proc(t: ^testing.T) {
		strings.builder_destroy(&str)
	}

	{
		name := "AES256-CTR 64 bytes"
		options := &time.Benchmark_Options {
			rounds = 1_000,
			bytes = 64,
			setup = _setup_sized_buf,
			bench = _benchmark_aes256_ctr,
			teardown = _teardown_sized_buf,
		}

		err := time.benchmark(options, context.allocator)
		testing.expect(t, err == nil, name)
		benchmark_print(&str, name, options)

		name = "AES256-CTR 1024 bytes"
		options.bytes = 1024
		err = time.benchmark(options, context.allocator)
		testing.expect(t, err == nil, name)
		benchmark_print(&str, name, options)

		name = "AES256-CTR 65536 bytes"
		options.bytes = 65536
		err = time.benchmark(options, context.allocator)
		testing.expect(t, err == nil, name)
		benchmark_print(&str, name, options)
	}
	{
		name := "ChaCha20 64 bytes"
		options := &time.Benchmark_Options {
@@ -323,6 +349,36 @@ _benchmark_chacha20poly1305 :: proc(
	return nil
}

@(private)
_benchmark_aes256_ctr :: proc(
	options: ^time.Benchmark_Options,
	allocator := context.allocator,
) -> (
	err: time.Benchmark_Error,
) {
	buf := options.input
	key := [aes.KEY_SIZE_256]byte {
		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
	}
	nonce := [aes.CTR_IV_SIZE]byte {
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	}

	ctx: aes.Context_CTR = ---
	aes.init_ctr(&ctx, key[:], nonce[:])

	for _ in 0 ..= options.rounds {
		aes.xor_bytes_ctr(&ctx, buf, buf)
	}
	options.count = options.rounds
	options.processed = options.rounds * options.bytes
	return nil
}

_benchmark_aes256_gcm :: proc(
	options: ^time.Benchmark_Options,
	allocator := context.allocator,
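Outside the benchmark harness the same CTR API is used the same way; a hedged one-shot sketch, assuming the import path `core:crypto/aes` and a (dst, src) argument order for `xor_bytes_ctr` matching chacha20's `xor_bytes`:

import "core:crypto/aes"

ctr_encrypt :: proc(key, nonce, plaintext, ciphertext: []byte) {
	// key must be a supported AES key size (e.g. aes.KEY_SIZE_256 bytes)
	// and nonce must be aes.CTR_IV_SIZE bytes.
	ctx: aes.Context_CTR
	aes.init_ctr(&ctx, key, nonce)

	// Exact aliasing is permitted, so ciphertext may be the same slice
	// as plaintext for in-place encryption.
	aes.xor_bytes_ctr(&ctx, ciphertext, plaintext)
}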
@@ -12,8 +12,6 @@ import "core:crypto/sha2"
test_aes :: proc(t: ^testing.T) {
	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()

	log.info("Testing AES")

	impls := make([dynamic]aes.Implementation, 0, 2)
	defer delete(impls)
	append(&impls, aes.Implementation.Portable)
@@ -29,7 +27,7 @@ test_aes :: proc(t: ^testing.T) {
}

test_aes_ecb :: proc(t: ^testing.T, impl: aes.Implementation) {
	log.infof("Testing AES-ECB/%v", impl)
	log.debugf("Testing AES-ECB/%v", impl)

	test_vectors := []struct {
		key: string,
@@ -136,7 +134,7 @@ test_aes_ecb :: proc(t: ^testing.T, impl: aes.Implementation) {
}

test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) {
	log.infof("Testing AES-CTR/%v", impl)
	log.debugf("Testing AES-CTR/%v", impl)

	test_vectors := []struct {
		key: string,
@@ -200,7 +198,7 @@ test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) {
	ctx: aes.Context_CTR
	key: [aes.KEY_SIZE_256]byte
	nonce: [aes.CTR_IV_SIZE]byte
	aes.init_ctr(&ctx, key[:], nonce[:])
	aes.init_ctr(&ctx, key[:], nonce[:], impl)

	h_ctx: sha2.Context_512
	sha2.init_512_256(&h_ctx)
@@ -226,7 +224,7 @@ test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) {
}

test_aes_gcm :: proc(t: ^testing.T, impl: aes.Implementation) {
	log.infof("Testing AES-GCM/%v", impl)
	log.debugf("Testing AES-GCM/%v", impl)

	// NIST did a reorg of their site, so the source of the test vectors
	// is only available from an archive. The commented out tests are
@@ -431,7 +429,7 @@ test_aes_gcm :: proc(t: ^testing.T, impl: aes.Implementation) {
		testing.expectf(
			t,
			ok && dst_str == v.plaintext,
			"AES-GCM/%v: Expected: (%s, true) for open(%s, %s, %s, %s, %s), but got (%s, %s) instead",
			"AES-GCM/%v: Expected: (%s, true) for open(%s, %s, %s, %s, %s), but got (%s, %v) instead",
			impl,
			v.plaintext,
			v.key,
@@ -58,9 +58,9 @@ test_sqrt_ratio_m1 :: proc(t: ^testing.T) {
		v_bytes, _ := hex.decode(transmute([]byte)(v.v), context.temp_allocator)
		r_bytes, _ := hex.decode(transmute([]byte)(v.r), context.temp_allocator)

		u_ := transmute(^[32]byte)(raw_data(u_bytes))
		v_ := transmute(^[32]byte)(raw_data(v_bytes))
		r_ := transmute(^[32]byte)(raw_data(r_bytes))
		u_ := (^[32]byte)(raw_data(u_bytes))
		v_ := (^[32]byte)(raw_data(v_bytes))
		r_ := (^[32]byte)(raw_data(r_bytes))

		u, vee, r: field.Tight_Field_Element
		field.fe_from_bytes(&u, u_)
@@ -161,7 +161,7 @@ test_pbkdf2 :: proc(t: ^testing.T) {
		testing.expectf(
			t,
			dst_str == v.dk,
			"HMAC-%s: Expected: %s for input of (%s, %s, %d), but got %s instead",
			"PBKDF2-%s: Expected: %s for input of (%s, %s, %d), but got %s instead",
			algo_name,
			v.dk,
			v.password,