diff --git a/core/crypto/_aes/ct64/ghash.odin b/core/crypto/_aes/ct64/ghash.odin index 21ac2ca97..a522a481a 100644 --- a/core/crypto/_aes/ct64/ghash.odin +++ b/core/crypto/_aes/ct64/ghash.odin @@ -80,8 +80,8 @@ ghash :: proc "contextless" (dst, key, data: []byte) { h2 := h0 ~ h1 h2r := h0r ~ h1r - src: []byte for l > 0 { + src: []byte = --- if l >= _aes.GHASH_BLOCK_SIZE { src = buf buf = buf[_aes.GHASH_BLOCK_SIZE:] diff --git a/core/crypto/_aes/hw_intel/api.odin b/core/crypto/_aes/hw_intel/api.odin index 5cb5a68bb..1796bb093 100644 --- a/core/crypto/_aes/hw_intel/api.odin +++ b/core/crypto/_aes/hw_intel/api.odin @@ -3,7 +3,7 @@ package aes_hw_intel import "core:sys/info" -// is_supporte returns true iff hardware accelerated AES +// is_supported returns true iff hardware accelerated AES // is supported. is_supported :: proc "contextless" () -> bool { features, ok := info.cpu_features.? diff --git a/core/crypto/_aes/hw_intel/ghash.odin b/core/crypto/_aes/hw_intel/ghash.odin index 9a5208523..d61e71b3a 100644 --- a/core/crypto/_aes/hw_intel/ghash.odin +++ b/core/crypto/_aes/hw_intel/ghash.odin @@ -25,7 +25,6 @@ package aes_hw_intel import "base:intrinsics" import "core:crypto/_aes" -import "core:simd" import "core:simd/x86" @(private = "file") @@ -58,14 +57,11 @@ GHASH_STRIDE_BYTES_HW :: GHASH_STRIDE_HW * _aes.GHASH_BLOCK_SIZE // chunks. We number chunks from 0 to 3 in left to right order. @(private = "file") -byteswap_index := transmute(x86.__m128i)simd.i8x16{ - // Note: simd.i8x16 is reverse order from x86._mm_set_epi8. - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -} +_BYTESWAP_INDEX: x86.__m128i : { 0x08090a0b0c0d0e0f, 0x0001020304050607 } @(private = "file", require_results, enable_target_feature = "sse2,ssse3") byteswap :: #force_inline proc "contextless" (x: x86.__m128i) -> x86.__m128i { - return x86._mm_shuffle_epi8(x, byteswap_index) + return x86._mm_shuffle_epi8(x, _BYTESWAP_INDEX) } // From a 128-bit value kw, compute kx as the XOR of the two 64-bit @@ -244,8 +240,8 @@ ghash :: proc "contextless" (dst, key, data: []byte) #no_bounds_check { } // Process 1 block at a time - src: []byte for l > 0 { + src: []byte = --- if l >= _aes.GHASH_BLOCK_SIZE { src = buf buf = buf[_aes.GHASH_BLOCK_SIZE:] diff --git a/core/crypto/_chacha20/chacha20.odin b/core/crypto/_chacha20/chacha20.odin new file mode 100644 index 000000000..a907209de --- /dev/null +++ b/core/crypto/_chacha20/chacha20.odin @@ -0,0 +1,123 @@ +package _chacha20 + +import "base:intrinsics" +import "core:encoding/endian" +import "core:math/bits" +import "core:mem" + +// KEY_SIZE is the (X)ChaCha20 key size in bytes. +KEY_SIZE :: 32 +// IV_SIZE is the ChaCha20 IV size in bytes. +IV_SIZE :: 12 +// XIV_SIZE is the XChaCha20 IV size in bytes. +XIV_SIZE :: 24 + +// MAX_CTR_IETF is the maximum counter value for the IETF flavor of ChaCha20. +MAX_CTR_IETF :: 0xffffffff +// BLOCK_SIZE is the (X)ChaCha20 block size in bytes. +BLOCK_SIZE :: 64 +// STATE_SIZE_U32 is the (X)ChaCha20 state size in u32s. +STATE_SIZE_U32 :: 16 +// ROUNDS is the (X)ChaCha20 round count. +ROUNDS :: 20 + +// SIGMA_0 is sigma[0:4]. +SIGMA_0: u32 : 0x61707865 +// SIGMA_1 is sigma[4:8]. +SIGMA_1: u32 : 0x3320646e +// SIGMA_2 is sigma[8:12]. +SIGMA_2: u32 : 0x79622d32 +// SIGMA_3 is sigma[12:16]. +SIGMA_3: u32 : 0x6b206574 + +// Context is a ChaCha20 or XChaCha20 instance.
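+// +// A Context must be initialized via init before use.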
+Context :: struct { + _s: [STATE_SIZE_U32]u32, + _buffer: [BLOCK_SIZE]byte, + _off: int, + _is_ietf_flavor: bool, + _is_initialized: bool, +} + +// init initializes a Context for ChaCha20 with the provided key and +// iv. +// +// WARNING: This ONLY handles ChaCha20. XChaCha20 sub-key and IV +// derivation is expected to be handled by the caller, so that the +// HChaCha call can be suitably accelerated. +init :: proc "contextless" (ctx: ^Context, key, iv: []byte, is_xchacha: bool) { + if len(key) != KEY_SIZE || len(iv) != IV_SIZE { + intrinsics.trap() + } + + k, n := key, iv + + ctx._s[0] = SIGMA_0 + ctx._s[1] = SIGMA_1 + ctx._s[2] = SIGMA_2 + ctx._s[3] = SIGMA_3 + ctx._s[4] = endian.unchecked_get_u32le(k[0:4]) + ctx._s[5] = endian.unchecked_get_u32le(k[4:8]) + ctx._s[6] = endian.unchecked_get_u32le(k[8:12]) + ctx._s[7] = endian.unchecked_get_u32le(k[12:16]) + ctx._s[8] = endian.unchecked_get_u32le(k[16:20]) + ctx._s[9] = endian.unchecked_get_u32le(k[20:24]) + ctx._s[10] = endian.unchecked_get_u32le(k[24:28]) + ctx._s[11] = endian.unchecked_get_u32le(k[28:32]) + ctx._s[12] = 0 + ctx._s[13] = endian.unchecked_get_u32le(n[0:4]) + ctx._s[14] = endian.unchecked_get_u32le(n[4:8]) + ctx._s[15] = endian.unchecked_get_u32le(n[8:12]) + + ctx._off = BLOCK_SIZE + ctx._is_ietf_flavor = !is_xchacha + ctx._is_initialized = true +} + +// seek seeks the (X)ChaCha20 stream counter to the specified block. +seek :: proc(ctx: ^Context, block_nr: u64) { + assert(ctx._is_initialized) + + if ctx._is_ietf_flavor { + if block_nr > MAX_CTR_IETF { + panic("crypto/chacha20: attempted to seek past maximum counter") + } + } else { + ctx._s[13] = u32(block_nr >> 32) + } + ctx._s[12] = u32(block_nr) + ctx._off = BLOCK_SIZE +} + +// reset sanitizes the Context. The Context must be re-initialized to +// be used again. +reset :: proc(ctx: ^Context) { + mem.zero_explicit(&ctx._s, size_of(ctx._s)) + mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer)) + + ctx._is_initialized = false +} + +check_counter_limit :: proc(ctx: ^Context, nr_blocks: int) { + // Enforce the maximum consumed keystream per IV. + // + // While all modern "standard" definitions of ChaCha20 use + // the IETF 32-bit counter, for XChaCha20 most common + // implementations allow for a 64-bit counter. + // + // Honestly, the answer here is "use an MRAE primitive", but + // go with "common" practice in the case of XChaCha20. + + ERR_CTR_EXHAUSTED :: "crypto/chacha20: maximum (X)ChaCha20 keystream per IV reached" + + if ctx._is_ietf_flavor { + if u64(ctx._s[12]) + u64(nr_blocks) > MAX_CTR_IETF { + panic(ERR_CTR_EXHAUSTED) + } + } else { + ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12]) + if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 { + panic(ERR_CTR_EXHAUSTED) + } + } +} diff --git a/core/crypto/_chacha20/ref/chacha20_ref.odin b/core/crypto/_chacha20/ref/chacha20_ref.odin new file mode 100644 index 000000000..c111c1c76 --- /dev/null +++ b/core/crypto/_chacha20/ref/chacha20_ref.odin @@ -0,0 +1,360 @@ +package chacha20_ref + +import "core:crypto/_chacha20" +import "core:encoding/endian" +import "core:math/bits" + +stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) { + // Enforce the maximum consumed keystream per IV.
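+	// (For the IETF flavor, the 32-bit counter caps this at 2^32 64-byte blocks, i.e. 256 GiB of keystream, per IV.)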
+ _chacha20.check_counter_limit(ctx, nr_blocks) + + dst, src := dst, src + x := &ctx._s + for n := 0; n < nr_blocks; n = n + 1 { + x0, x1, x2, x3 := + _chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3 + x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := + x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + // Even when forcing inlining manually inlining all of + // these is decently faster. + + // quarterround(x, 0, 4, 8, 12) + x0 += x4 + x12 ~= x0 + x12 = bits.rotate_left32(x12, 16) + x8 += x12 + x4 ~= x8 + x4 = bits.rotate_left32(x4, 12) + x0 += x4 + x12 ~= x0 + x12 = bits.rotate_left32(x12, 8) + x8 += x12 + x4 ~= x8 + x4 = bits.rotate_left32(x4, 7) + + // quarterround(x, 1, 5, 9, 13) + x1 += x5 + x13 ~= x1 + x13 = bits.rotate_left32(x13, 16) + x9 += x13 + x5 ~= x9 + x5 = bits.rotate_left32(x5, 12) + x1 += x5 + x13 ~= x1 + x13 = bits.rotate_left32(x13, 8) + x9 += x13 + x5 ~= x9 + x5 = bits.rotate_left32(x5, 7) + + // quarterround(x, 2, 6, 10, 14) + x2 += x6 + x14 ~= x2 + x14 = bits.rotate_left32(x14, 16) + x10 += x14 + x6 ~= x10 + x6 = bits.rotate_left32(x6, 12) + x2 += x6 + x14 ~= x2 + x14 = bits.rotate_left32(x14, 8) + x10 += x14 + x6 ~= x10 + x6 = bits.rotate_left32(x6, 7) + + // quarterround(x, 3, 7, 11, 15) + x3 += x7 + x15 ~= x3 + x15 = bits.rotate_left32(x15, 16) + x11 += x15 + x7 ~= x11 + x7 = bits.rotate_left32(x7, 12) + x3 += x7 + x15 ~= x3 + x15 = bits.rotate_left32(x15, 8) + x11 += x15 + x7 ~= x11 + x7 = bits.rotate_left32(x7, 7) + + // quarterround(x, 0, 5, 10, 15) + x0 += x5 + x15 ~= x0 + x15 = bits.rotate_left32(x15, 16) + x10 += x15 + x5 ~= x10 + x5 = bits.rotate_left32(x5, 12) + x0 += x5 + x15 ~= x0 + x15 = bits.rotate_left32(x15, 8) + x10 += x15 + x5 ~= x10 + x5 = bits.rotate_left32(x5, 7) + + // quarterround(x, 1, 6, 11, 12) + x1 += x6 + x12 ~= x1 + x12 = bits.rotate_left32(x12, 16) + x11 += x12 + x6 ~= x11 + x6 = bits.rotate_left32(x6, 12) + x1 += x6 + x12 ~= x1 + x12 = bits.rotate_left32(x12, 8) + x11 += x12 + x6 ~= x11 + x6 = bits.rotate_left32(x6, 7) + + // quarterround(x, 2, 7, 8, 13) + x2 += x7 + x13 ~= x2 + x13 = bits.rotate_left32(x13, 16) + x8 += x13 + x7 ~= x8 + x7 = bits.rotate_left32(x7, 12) + x2 += x7 + x13 ~= x2 + x13 = bits.rotate_left32(x13, 8) + x8 += x13 + x7 ~= x8 + x7 = bits.rotate_left32(x7, 7) + + // quarterround(x, 3, 4, 9, 14) + x3 += x4 + x14 ~= x3 + x14 = bits.rotate_left32(x14, 16) + x9 += x14 + x4 ~= x9 + x4 = bits.rotate_left32(x4, 12) + x3 += x4 + x14 ~= x3 + x14 = bits.rotate_left32(x14, 8) + x9 += x14 + x4 ~= x9 + x4 = bits.rotate_left32(x4, 7) + } + + x0 += _chacha20.SIGMA_0 + x1 += _chacha20.SIGMA_1 + x2 += _chacha20.SIGMA_2 + x3 += _chacha20.SIGMA_3 + x4 += x[4] + x5 += x[5] + x6 += x[6] + x7 += x[7] + x8 += x[8] + x9 += x[9] + x10 += x[10] + x11 += x[11] + x12 += x[12] + x13 += x[13] + x14 += x[14] + x15 += x[15] + + // - The caller(s) ensure that src/dst are valid. + // - The compiler knows if the target is picky about alignment. 
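+		// - A nil src requests raw keystream: dst is written directly, without the XOR.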
+ + #no_bounds_check { + if src != nil { + endian.unchecked_put_u32le(dst[0:4], endian.unchecked_get_u32le(src[0:4]) ~ x0) + endian.unchecked_put_u32le(dst[4:8], endian.unchecked_get_u32le(src[4:8]) ~ x1) + endian.unchecked_put_u32le(dst[8:12], endian.unchecked_get_u32le(src[8:12]) ~ x2) + endian.unchecked_put_u32le(dst[12:16], endian.unchecked_get_u32le(src[12:16]) ~ x3) + endian.unchecked_put_u32le(dst[16:20], endian.unchecked_get_u32le(src[16:20]) ~ x4) + endian.unchecked_put_u32le(dst[20:24], endian.unchecked_get_u32le(src[20:24]) ~ x5) + endian.unchecked_put_u32le(dst[24:28], endian.unchecked_get_u32le(src[24:28]) ~ x6) + endian.unchecked_put_u32le(dst[28:32], endian.unchecked_get_u32le(src[28:32]) ~ x7) + endian.unchecked_put_u32le(dst[32:36], endian.unchecked_get_u32le(src[32:36]) ~ x8) + endian.unchecked_put_u32le(dst[36:40], endian.unchecked_get_u32le(src[36:40]) ~ x9) + endian.unchecked_put_u32le( + dst[40:44], + endian.unchecked_get_u32le(src[40:44]) ~ x10, + ) + endian.unchecked_put_u32le( + dst[44:48], + endian.unchecked_get_u32le(src[44:48]) ~ x11, + ) + endian.unchecked_put_u32le( + dst[48:52], + endian.unchecked_get_u32le(src[48:52]) ~ x12, + ) + endian.unchecked_put_u32le( + dst[52:56], + endian.unchecked_get_u32le(src[52:56]) ~ x13, + ) + endian.unchecked_put_u32le( + dst[56:60], + endian.unchecked_get_u32le(src[56:60]) ~ x14, + ) + endian.unchecked_put_u32le( + dst[60:64], + endian.unchecked_get_u32le(src[60:64]) ~ x15, + ) + src = src[_chacha20.BLOCK_SIZE:] + } else { + endian.unchecked_put_u32le(dst[0:4], x0) + endian.unchecked_put_u32le(dst[4:8], x1) + endian.unchecked_put_u32le(dst[8:12], x2) + endian.unchecked_put_u32le(dst[12:16], x3) + endian.unchecked_put_u32le(dst[16:20], x4) + endian.unchecked_put_u32le(dst[20:24], x5) + endian.unchecked_put_u32le(dst[24:28], x6) + endian.unchecked_put_u32le(dst[28:32], x7) + endian.unchecked_put_u32le(dst[32:36], x8) + endian.unchecked_put_u32le(dst[36:40], x9) + endian.unchecked_put_u32le(dst[40:44], x10) + endian.unchecked_put_u32le(dst[44:48], x11) + endian.unchecked_put_u32le(dst[48:52], x12) + endian.unchecked_put_u32le(dst[52:56], x13) + endian.unchecked_put_u32le(dst[56:60], x14) + endian.unchecked_put_u32le(dst[60:64], x15) + } + dst = dst[_chacha20.BLOCK_SIZE:] + } + + // Increment the counter. Overflow checking is done upon + // entry into the routine, so a 64-bit increment safely + // covers both cases. 
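+		// (For the IETF flavor, x[13] holds the first IV word; since the counter cannot wrap, writing back `new_ctr >> 32` leaves it unchanged.)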
+ new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1 + x[12] = u32(new_ctr) + x[13] = u32(new_ctr >> 32) + } +} + +hchacha20 :: proc "contextless" (dst, key, iv: []byte) { + x0, x1, x2, x3 := _chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3 + x4 := endian.unchecked_get_u32le(key[0:4]) + x5 := endian.unchecked_get_u32le(key[4:8]) + x6 := endian.unchecked_get_u32le(key[8:12]) + x7 := endian.unchecked_get_u32le(key[12:16]) + x8 := endian.unchecked_get_u32le(key[16:20]) + x9 := endian.unchecked_get_u32le(key[20:24]) + x10 := endian.unchecked_get_u32le(key[24:28]) + x11 := endian.unchecked_get_u32le(key[28:32]) + x12 := endian.unchecked_get_u32le(iv[0:4]) + x13 := endian.unchecked_get_u32le(iv[4:8]) + x14 := endian.unchecked_get_u32le(iv[8:12]) + x15 := endian.unchecked_get_u32le(iv[12:16]) + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + // quarterround(x, 0, 4, 8, 12) + x0 += x4 + x12 ~= x0 + x12 = bits.rotate_left32(x12, 16) + x8 += x12 + x4 ~= x8 + x4 = bits.rotate_left32(x4, 12) + x0 += x4 + x12 ~= x0 + x12 = bits.rotate_left32(x12, 8) + x8 += x12 + x4 ~= x8 + x4 = bits.rotate_left32(x4, 7) + + // quarterround(x, 1, 5, 9, 13) + x1 += x5 + x13 ~= x1 + x13 = bits.rotate_left32(x13, 16) + x9 += x13 + x5 ~= x9 + x5 = bits.rotate_left32(x5, 12) + x1 += x5 + x13 ~= x1 + x13 = bits.rotate_left32(x13, 8) + x9 += x13 + x5 ~= x9 + x5 = bits.rotate_left32(x5, 7) + + // quarterround(x, 2, 6, 10, 14) + x2 += x6 + x14 ~= x2 + x14 = bits.rotate_left32(x14, 16) + x10 += x14 + x6 ~= x10 + x6 = bits.rotate_left32(x6, 12) + x2 += x6 + x14 ~= x2 + x14 = bits.rotate_left32(x14, 8) + x10 += x14 + x6 ~= x10 + x6 = bits.rotate_left32(x6, 7) + + // quarterround(x, 3, 7, 11, 15) + x3 += x7 + x15 ~= x3 + x15 = bits.rotate_left32(x15, 16) + x11 += x15 + x7 ~= x11 + x7 = bits.rotate_left32(x7, 12) + x3 += x7 + x15 ~= x3 + x15 = bits.rotate_left32(x15, 8) + x11 += x15 + x7 ~= x11 + x7 = bits.rotate_left32(x7, 7) + + // quarterround(x, 0, 5, 10, 15) + x0 += x5 + x15 ~= x0 + x15 = bits.rotate_left32(x15, 16) + x10 += x15 + x5 ~= x10 + x5 = bits.rotate_left32(x5, 12) + x0 += x5 + x15 ~= x0 + x15 = bits.rotate_left32(x15, 8) + x10 += x15 + x5 ~= x10 + x5 = bits.rotate_left32(x5, 7) + + // quarterround(x, 1, 6, 11, 12) + x1 += x6 + x12 ~= x1 + x12 = bits.rotate_left32(x12, 16) + x11 += x12 + x6 ~= x11 + x6 = bits.rotate_left32(x6, 12) + x1 += x6 + x12 ~= x1 + x12 = bits.rotate_left32(x12, 8) + x11 += x12 + x6 ~= x11 + x6 = bits.rotate_left32(x6, 7) + + // quarterround(x, 2, 7, 8, 13) + x2 += x7 + x13 ~= x2 + x13 = bits.rotate_left32(x13, 16) + x8 += x13 + x7 ~= x8 + x7 = bits.rotate_left32(x7, 12) + x2 += x7 + x13 ~= x2 + x13 = bits.rotate_left32(x13, 8) + x8 += x13 + x7 ~= x8 + x7 = bits.rotate_left32(x7, 7) + + // quarterround(x, 3, 4, 9, 14) + x3 += x4 + x14 ~= x3 + x14 = bits.rotate_left32(x14, 16) + x9 += x14 + x4 ~= x9 + x4 = bits.rotate_left32(x4, 12) + x3 += x4 + x14 ~= x3 + x14 = bits.rotate_left32(x14, 8) + x9 += x14 + x4 ~= x9 + x4 = bits.rotate_left32(x4, 7) + } + + endian.unchecked_put_u32le(dst[0:4], x0) + endian.unchecked_put_u32le(dst[4:8], x1) + endian.unchecked_put_u32le(dst[8:12], x2) + endian.unchecked_put_u32le(dst[12:16], x3) + endian.unchecked_put_u32le(dst[16:20], x12) + endian.unchecked_put_u32le(dst[20:24], x13) + endian.unchecked_put_u32le(dst[24:28], x14) + endian.unchecked_put_u32le(dst[28:32], x15) +} diff --git a/core/crypto/_chacha20/simd128/chacha20_simd128.odin b/core/crypto/_chacha20/simd128/chacha20_simd128.odin new file mode 100644 index 
000000000..4cab3c5e8 --- /dev/null +++ b/core/crypto/_chacha20/simd128/chacha20_simd128.odin @@ -0,0 +1,481 @@ +package chacha20_simd128 + +import "base:intrinsics" +import "core:crypto/_chacha20" +import "core:simd" +import "core:sys/info" + +// Portable 128-bit `core:simd` implementation. +// +// This is loosely based on Ted Krovetz's public domain C intrinsic +// implementation. +// +// This is written to perform adequately on any target that has "enough" +// 128-bit vector registers; the current thought is that 4 blocks at a +// time is reasonable for amd64, though Ted's code is more conservative. +// +// See: +// supercop-20230530/crypto_stream/chacha20/krovetz/vec128 + +// Ensure the compiler emits SIMD instructions. This is a minimum, and +// setting the microarchitecture at compile time will allow for better +// code gen when applicable (eg: AVX). This is somewhat redundant with +// the default microarchitecture configurations. +when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 { + @(private = "file") + TARGET_SIMD_FEATURES :: "neon" +} else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 { + // Note: LLVM appears to be smart enough to use PSHUFB despite not + // explicitly using simd.u8x16 shuffles. + @(private = "file") + TARGET_SIMD_FEATURES :: "sse2,ssse3" +} else { + @(private = "file") + TARGET_SIMD_FEATURES :: "" +} + +@(private = "file") +_ROT_7L: simd.u32x4 : {7, 7, 7, 7} +@(private = "file") +_ROT_7R: simd.u32x4 : {25, 25, 25, 25} +@(private = "file") +_ROT_12L: simd.u32x4 : {12, 12, 12, 12} +@(private = "file") +_ROT_12R: simd.u32x4 : {20, 20, 20, 20} +@(private = "file") +_ROT_8L: simd.u32x4 : {8, 8, 8, 8} +@(private = "file") +_ROT_8R: simd.u32x4 : {24, 24, 24, 24} +@(private = "file") +_ROT_16: simd.u32x4 : {16, 16, 16, 16} + +when ODIN_ENDIAN == .Big { + @(private = "file") + _increment_counter :: #force_inline proc "contextless" (ctx: ^_chacha20.Context) -> simd.u32x4 { + // In the Big Endian case, the low and high portions in the vector + // are flipped, so the 64-bit addition can't be done with a simple + // vector add. + x := &ctx._s + + new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1 + x[12] = u32(new_ctr) + x[13] = u32(new_ctr >> 32) + + return intrinsics.unaligned_load(transmute(^simd.u32x4)&x[12]) + } + + // Convert the endian-ness of the components of a u32x4 vector, for + // the purposes of output.
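	+ // (The u8x16 shuffle indices reverse the 4 bytes within each 32-bit lane.)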
+ @(private = "file") + _byteswap_u32x4 :: #force_inline proc "contextless" (v: simd.u32x4) -> simd.u32x4 { + return( + transmute(simd.u32x4)simd.shuffle( + transmute(simd.u8x16)v, + transmute(simd.u8x16)v, + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, + ) + ) + } +} else { + @(private = "file") + _VEC_ONE: simd.u64x2 : {1, 0} +} + +@(private = "file") +_dq_round_simd128 :: #force_inline proc "contextless" ( + v0, v1, v2, v3: simd.u32x4, +) -> ( + simd.u32x4, + simd.u32x4, + simd.u32x4, + simd.u32x4, +) { + v0, v1, v2, v3 := v0, v1, v2, v3 + + // a += b; d ^= a; d = ROTW16(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16)) + + // c += d; b ^= c; b = ROTW12(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R)) + + // a += b; d ^= a; d = ROTW8(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R)) + + // c += d; b ^= c; b = ROTW7(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R)) + + // b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + v1 = simd.shuffle(v1, v1, 1, 2, 3, 0) + v2 = simd.shuffle(v2, v2, 2, 3, 0, 1) + v3 = simd.shuffle(v3, v3, 3, 0, 1, 2) + + // a += b; d ^= a; d = ROTW16(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16)) + + // c += d; b ^= c; b = ROTW12(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R)) + + // a += b; d ^= a; d = ROTW8(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R)) + + // c += d; b ^= c; b = ROTW7(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R)) + + // b = ROTV3(b); c = ROTV2(c); d = ROTV1(d); + v1 = simd.shuffle(v1, v1, 3, 0, 1, 2) + v2 = simd.shuffle(v2, v2, 2, 3, 0, 1) + v3 = simd.shuffle(v3, v3, 1, 2, 3, 0) + + return v0, v1, v2, v3 +} + +@(private = "file") +_add_state_simd128 :: #force_inline proc "contextless" ( + v0, v1, v2, v3, s0, s1, s2, s3: simd.u32x4, +) -> ( + simd.u32x4, + simd.u32x4, + simd.u32x4, + simd.u32x4, +) { + v0, v1, v2, v3 := v0, v1, v2, v3 + + v0 = simd.add(v0, s0) + v1 = simd.add(v1, s1) + v2 = simd.add(v2, s2) + v3 = simd.add(v3, s3) + + when ODIN_ENDIAN == .Big { + v0 = _byteswap_u32x4(v0) + v1 = _byteswap_u32x4(v1) + v2 = _byteswap_u32x4(v2) + v3 = _byteswap_u32x4(v3) + } + + return v0, v1, v2, v3 +} + +@(private = "file") +_xor_simd128 :: #force_inline proc "contextless" ( + src: [^]simd.u32x4, + v0, v1, v2, v3: simd.u32x4, +) -> ( + simd.u32x4, + simd.u32x4, + simd.u32x4, + simd.u32x4, +) { + v0, v1, v2, v3 := v0, v1, v2, v3 + + v0 = simd.bit_xor(v0, intrinsics.unaligned_load((^simd.u32x4)(src[0:]))) + v1 = simd.bit_xor(v1, intrinsics.unaligned_load((^simd.u32x4)(src[1:]))) + v2 = simd.bit_xor(v2, intrinsics.unaligned_load((^simd.u32x4)(src[2:]))) + v3 = simd.bit_xor(v3, intrinsics.unaligned_load((^simd.u32x4)(src[3:]))) + + return v0, v1, v2, v3 +} + +@(private = "file") +_store_simd128 :: #force_inline proc "contextless" ( + dst: [^]simd.u32x4, + v0, v1, v2, v3: simd.u32x4, +) { + intrinsics.unaligned_store((^simd.u32x4)(dst[0:]), v0) + intrinsics.unaligned_store((^simd.u32x4)(dst[1:]), v1) + intrinsics.unaligned_store((^simd.u32x4)(dst[2:]), v2) + 
intrinsics.unaligned_store((^simd.u32x4)(dst[3:]), v3) +} + +// is_performant returns true iff the target and current host both support +// "enough" 128-bit SIMD to make this implementation performant. +is_performant :: proc "contextless" () -> bool { + when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 { + when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 { + req_features :: info.CPU_Features{.asimd} + } else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 { + req_features :: info.CPU_Features{.sse2, .ssse3} + } + + features, ok := info.cpu_features.? + if !ok { + return false + } + + return features >= req_features + } else when ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32 { + return intrinsics.has_target_feature("simd128") + } else { + return false + } +} + +@(enable_target_feature = TARGET_SIMD_FEATURES) +stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) { + // Enforce the maximum consumed keystream per IV. + _chacha20.check_counter_limit(ctx, nr_blocks) + + dst_v := ([^]simd.u32x4)(raw_data(dst)) + src_v := ([^]simd.u32x4)(raw_data(src)) + + x := &ctx._s + n := nr_blocks + + // The state vector is an array of uint32s in native byte-order. + x_v := ([^]simd.u32x4)(raw_data(x)) + s0 := intrinsics.unaligned_load((^simd.u32x4)(x_v[0:])) + s1 := intrinsics.unaligned_load((^simd.u32x4)(x_v[1:])) + s2 := intrinsics.unaligned_load((^simd.u32x4)(x_v[2:])) + s3 := intrinsics.unaligned_load((^simd.u32x4)(x_v[3:])) + + // 8 blocks at a time. + // + // Note: This is only worth it on Aarch64. + when ODIN_ARCH == .arm64 { + for ; n >= 8; n = n - 8 { + v0, v1, v2, v3 := s0, s1, s2, s3 + + when ODIN_ENDIAN == .Little { + s7 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE) + } else { + s7 := _increment_counter(ctx) + } + v4, v5, v6, v7 := s0, s1, s2, s7 + + when ODIN_ENDIAN == .Little { + s11 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s7, _VEC_ONE) + } else { + s11 := _increment_counter(ctx) + } + v8, v9, v10, v11 := s0, s1, s2, s11 + + when ODIN_ENDIAN == .Little { + s15 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s11, _VEC_ONE) + } else { + s15 := _increment_counter(ctx) + } + v12, v13, v14, v15 := s0, s1, s2, s15 + + when ODIN_ENDIAN == .Little { + s19 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s15, _VEC_ONE) + } else { + s19 := _increment_counter(ctx) + } + + v16, v17, v18, v19 := s0, s1, s2, s19 + when ODIN_ENDIAN == .Little { + s23 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s19, _VEC_ONE) + } else { + s23 := _increment_counter(ctx) + } + + v20, v21, v22, v23 := s0, s1, s2, s23 + when ODIN_ENDIAN == .Little { + s27 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s23, _VEC_ONE) + } else { + s27 := _increment_counter(ctx) + } + + v24, v25, v26, v27 := s0, s1, s2, s27 + when ODIN_ENDIAN == .Little { + s31 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s27, _VEC_ONE) + } else { + s31 := _increment_counter(ctx) + } + v28, v29, v30, v31 := s0, s1, s2, s31 + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3) + v4, v5, v6, v7 = _dq_round_simd128(v4, v5, v6, v7) + v8, v9, v10, v11 = _dq_round_simd128(v8, v9, v10, v11) + v12, v13, v14, v15 = _dq_round_simd128(v12, v13, v14, v15) + v16, v17, v18, v19 = _dq_round_simd128(v16, v17, v18, v19) + v20, v21, v22, v23 = _dq_round_simd128(v20, v21, v22, v23) + v24, v25, v26, v27 = _dq_round_simd128(v24, v25, v26, v27) + v28, v29, v30, v31 = _dq_round_simd128(v28, v29, v30, 
v31) + } + + v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3) + v4, v5, v6, v7 = _add_state_simd128(v4, v5, v6, v7, s0, s1, s2, s7) + v8, v9, v10, v11 = _add_state_simd128(v8, v9, v10, v11, s0, s1, s2, s11) + v12, v13, v14, v15 = _add_state_simd128(v12, v13, v14, v15, s0, s1, s2, s15) + v16, v17, v18, v19 = _add_state_simd128(v16, v17, v18, v19, s0, s1, s2, s19) + v20, v21, v22, v23 = _add_state_simd128(v20, v21, v22, v23, s0, s1, s2, s23) + v24, v25, v26, v27 = _add_state_simd128(v24, v25, v26, v27, s0, s1, s2, s27) + v28, v29, v30, v31 = _add_state_simd128(v28, v29, v30, v31, s0, s1, s2, s31) + + #no_bounds_check { + if src != nil { + v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3) + v4, v5, v6, v7 = _xor_simd128(src_v[4:], v4, v5, v6, v7) + v8, v9, v10, v11 = _xor_simd128(src_v[8:], v8, v9, v10, v11) + v12, v13, v14, v15 = _xor_simd128(src_v[12:], v12, v13, v14, v15) + v16, v17, v18, v19 = _xor_simd128(src_v[16:], v16, v17, v18, v19) + v20, v21, v22, v23 = _xor_simd128(src_v[20:], v20, v21, v22, v23) + v24, v25, v26, v27 = _xor_simd128(src_v[24:], v24, v25, v26, v27) + v28, v29, v30, v31 = _xor_simd128(src_v[28:], v28, v29, v30, v31) + src_v = src_v[32:] + } + + _store_simd128(dst_v, v0, v1, v2, v3) + _store_simd128(dst_v[4:], v4, v5, v6, v7) + _store_simd128(dst_v[8:], v8, v9, v10, v11) + _store_simd128(dst_v[12:], v12, v13, v14, v15) + _store_simd128(dst_v[16:], v16, v17, v18, v19) + _store_simd128(dst_v[20:], v20, v21, v22, v23) + _store_simd128(dst_v[24:], v24, v25, v26, v27) + _store_simd128(dst_v[28:], v28, v29, v30, v31) + dst_v = dst_v[32:] + } + + when ODIN_ENDIAN == .Little { + // s31 holds the most current counter, so `s3 = s31 + 1`. + s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s31, _VEC_ONE) + } else { + s3 = _increment_counter(ctx) + } + } + } + + // 4 blocks at a time. + // + // Note: The i386 target lacks the required number of registers + // for this to be performant, so it is skipped. 
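	+ // (32-bit x86 only exposes 8 XMM registers, versus 16 on amd64.)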
+ when ODIN_ARCH != .i386 { + for ; n >= 4; n = n - 4 { + v0, v1, v2, v3 := s0, s1, s2, s3 + + when ODIN_ENDIAN == .Little { + s7 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE) + } else { + s7 := _increment_counter(ctx) + } + v4, v5, v6, v7 := s0, s1, s2, s7 + + when ODIN_ENDIAN == .Little { + s11 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s7, _VEC_ONE) + } else { + s11 := _increment_counter(ctx) + } + v8, v9, v10, v11 := s0, s1, s2, s11 + + when ODIN_ENDIAN == .Little { + s15 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s11, _VEC_ONE) + } else { + s15 := _increment_counter(ctx) + } + v12, v13, v14, v15 := s0, s1, s2, s15 + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3) + v4, v5, v6, v7 = _dq_round_simd128(v4, v5, v6, v7) + v8, v9, v10, v11 = _dq_round_simd128(v8, v9, v10, v11) + v12, v13, v14, v15 = _dq_round_simd128(v12, v13, v14, v15) + } + + v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3) + v4, v5, v6, v7 = _add_state_simd128(v4, v5, v6, v7, s0, s1, s2, s7) + v8, v9, v10, v11 = _add_state_simd128(v8, v9, v10, v11, s0, s1, s2, s11) + v12, v13, v14, v15 = _add_state_simd128(v12, v13, v14, v15, s0, s1, s2, s15) + + #no_bounds_check { + if src != nil { + v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3) + v4, v5, v6, v7 = _xor_simd128(src_v[4:], v4, v5, v6, v7) + v8, v9, v10, v11 = _xor_simd128(src_v[8:], v8, v9, v10, v11) + v12, v13, v14, v15 = _xor_simd128(src_v[12:], v12, v13, v14, v15) + src_v = src_v[16:] + } + + _store_simd128(dst_v, v0, v1, v2, v3) + _store_simd128(dst_v[4:], v4, v5, v6, v7) + _store_simd128(dst_v[8:], v8, v9, v10, v11) + _store_simd128(dst_v[12:], v12, v13, v14, v15) + dst_v = dst_v[16:] + } + + when ODIN_ENDIAN == .Little { + // s15 holds the most current counter, so `s3 = s15 + 1`. + s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s15, _VEC_ONE) + } else { + s3 = _increment_counter(ctx) + } + } + } + + // 1 block at a time. + for ; n > 0; n = n - 1 { + v0, v1, v2, v3 := s0, s1, s2, s3 + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3) + } + v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3) + + #no_bounds_check { + if src != nil { + v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3) + src_v = src_v[4:] + } + + _store_simd128(dst_v, v0, v1, v2, v3) + dst_v = dst_v[4:] + } + + // Increment the counter. Overflow checking is done upon + // entry into the routine, so a 64-bit increment safely + // covers both cases. + when ODIN_ENDIAN == .Little { + s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE) + } else { + s3 = _increment_counter(ctx) + } + } + + when ODIN_ENDIAN == .Little { + // Write back the counter to the state. 
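		+ // (Only the counter lanes of s3 ever change; the IV lanes pass through untouched, so storing the full row is safe.)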
+ intrinsics.unaligned_store((^simd.u32x4)(x_v[3:]), s3) + } +} + +@(enable_target_feature = TARGET_SIMD_FEATURES) +hchacha20 :: proc "contextless" (dst, key, iv: []byte) { + v0 := simd.u32x4{_chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3} + v1 := intrinsics.unaligned_load((^simd.u32x4)(&key[0])) + v2 := intrinsics.unaligned_load((^simd.u32x4)(&key[16])) + v3 := intrinsics.unaligned_load((^simd.u32x4)(&iv[0])) + + when ODIN_ENDIAN == .Big { + v1 = _byteswap_u32x4(v1) + v2 = _byteswap_u32x4(v2) + v3 = _byteswap_u32x4(v3) + } + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3) + } + + when ODIN_ENDIAN == .Big { + v0 = _byteswap_u32x4(v0) + v3 = _byteswap_u32x4(v3) + } + + dst_v := ([^]simd.u32x4)(raw_data(dst)) + intrinsics.unaligned_store((^simd.u32x4)(dst_v[0:]), v0) + intrinsics.unaligned_store((^simd.u32x4)(dst_v[1:]), v3) +} diff --git a/core/crypto/_chacha20/simd256/chacha20_simd256.odin b/core/crypto/_chacha20/simd256/chacha20_simd256.odin new file mode 100644 index 000000000..10f2d75fe --- /dev/null +++ b/core/crypto/_chacha20/simd256/chacha20_simd256.odin @@ -0,0 +1,319 @@ +//+build amd64 +package chacha20_simd256 + +import "base:intrinsics" +import "core:crypto/_chacha20" +import chacha_simd128 "core:crypto/_chacha20/simd128" +import "core:simd" +import "core:sys/info" + +// This is loosely based on Ted Krovetz's public domain C intrinsic +// implementations. While written using `core:simd`, this is currently +// amd64 specific because we do not have a way to detect ARM SVE. +// +// See: +// supercop-20230530/crypto_stream/chacha20/krovetz/vec128 +// supercop-20230530/crypto_stream/chacha20/krovetz/avx2 + +#assert(ODIN_ENDIAN == .Little) + +@(private = "file") +_ROT_7L: simd.u32x8 : {7, 7, 7, 7, 7, 7, 7, 7} +@(private = "file") +_ROT_7R: simd.u32x8 : {25, 25, 25, 25, 25, 25, 25, 25} +@(private = "file") +_ROT_12L: simd.u32x8 : {12, 12, 12, 12, 12, 12, 12, 12} +@(private = "file") +_ROT_12R: simd.u32x8 : {20, 20, 20, 20, 20, 20, 20, 20} +@(private = "file") +_ROT_8L: simd.u32x8 : {8, 8, 8, 8, 8, 8, 8, 8} +@(private = "file") +_ROT_8R: simd.u32x8 : {24, 24, 24, 24, 24, 24, 24, 24} +@(private = "file") +_ROT_16: simd.u32x8 : {16, 16, 16, 16, 16, 16, 16, 16} +@(private = "file") +_VEC_ZERO_ONE: simd.u64x4 : {0, 0, 1, 0} +@(private = "file") +_VEC_TWO: simd.u64x4 : {2, 0, 2, 0} + +// is_performant returns true iff the target and current host both support +// "enough" SIMD to make this implementation performant. +is_performant :: proc "contextless" () -> bool { + req_features :: info.CPU_Features{.avx, .avx2} + + features, ok := info.cpu_features.? 
+ if !ok { + return false + } + + return features >= req_features +} + +@(private = "file") +_dq_round_simd256 :: #force_inline proc "contextless" ( + v0, v1, v2, v3: simd.u32x8, +) -> ( + simd.u32x8, + simd.u32x8, + simd.u32x8, + simd.u32x8, +) { + v0, v1, v2, v3 := v0, v1, v2, v3 + + // a += b; d ^= a; d = ROTW16(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16)) + + // c += d; b ^= c; b = ROTW12(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R)) + + // a += b; d ^= a; d = ROTW8(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R)) + + // c += d; b ^= c; b = ROTW7(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R)) + + // b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + v1 = simd.shuffle(v1, v1, 1, 2, 3, 0, 5, 6, 7, 4) + v2 = simd.shuffle(v2, v2, 2, 3, 0, 1, 6, 7, 4, 5) + v3 = simd.shuffle(v3, v3, 3, 0, 1, 2, 7, 4, 5, 6) + + // a += b; d ^= a; d = ROTW16(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16)) + + // c += d; b ^= c; b = ROTW12(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R)) + + // a += b; d ^= a; d = ROTW8(d); + v0 = simd.add(v0, v1) + v3 = simd.bit_xor(v3, v0) + v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R)) + + // c += d; b ^= c; b = ROTW7(b); + v2 = simd.add(v2, v3) + v1 = simd.bit_xor(v1, v2) + v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R)) + + // b = ROTV3(b); c = ROTV2(c); d = ROTV1(d); + v1 = simd.shuffle(v1, v1, 3, 0, 1, 2, 7, 4, 5, 6) + v2 = simd.shuffle(v2, v2, 2, 3, 0, 1, 6, 7, 4, 5) + v3 = simd.shuffle(v3, v3, 1, 2, 3, 0, 5, 6, 7, 4) + + return v0, v1, v2, v3 +} + +@(private = "file") +_add_and_permute_state_simd256 :: #force_inline proc "contextless" ( + v0, v1, v2, v3, s0, s1, s2, s3: simd.u32x8, +) -> ( + simd.u32x8, + simd.u32x8, + simd.u32x8, + simd.u32x8, +) { + t0 := simd.add(v0, s0) + t1 := simd.add(v1, s1) + t2 := simd.add(v2, s2) + t3 := simd.add(v3, s3) + + // Big Endian would byteswap here. + + // Each of v0 .. v3 has 128 bits of keystream for 2 separate blocks. + // Permute the state such that (r0, r1) contains block 0, and (r2, r3) + // contains block 1.
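	+ // (Each shuffle selects one 128-bit half from each input: r0/r1 gather the low halves, which hold block 0, and r2/r3 the high halves, which hold block 1.)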
+ r0 := simd.shuffle(t0, t1, 0, 1, 2, 3, 8, 9, 10, 11) + r2 := simd.shuffle(t0, t1, 4, 5, 6, 7, 12, 13, 14, 15) + r1 := simd.shuffle(t2, t3, 0, 1, 2, 3, 8, 9, 10, 11) + r3 := simd.shuffle(t2, t3, 4, 5, 6, 7, 12, 13, 14, 15) + + return r0, r1, r2, r3 +} + +@(private = "file") +_xor_simd256 :: #force_inline proc "contextless" ( + src: [^]simd.u32x8, + v0, v1, v2, v3: simd.u32x8, +) -> ( + simd.u32x8, + simd.u32x8, + simd.u32x8, + simd.u32x8, +) { + v0, v1, v2, v3 := v0, v1, v2, v3 + + v0 = simd.bit_xor(v0, intrinsics.unaligned_load((^simd.u32x8)(src[0:]))) + v1 = simd.bit_xor(v1, intrinsics.unaligned_load((^simd.u32x8)(src[1:]))) + v2 = simd.bit_xor(v2, intrinsics.unaligned_load((^simd.u32x8)(src[2:]))) + v3 = simd.bit_xor(v3, intrinsics.unaligned_load((^simd.u32x8)(src[3:]))) + + return v0, v1, v2, v3 +} + +@(private = "file") +_xor_simd256_x1 :: #force_inline proc "contextless" ( + src: [^]simd.u32x8, + v0, v1: simd.u32x8, +) -> ( + simd.u32x8, + simd.u32x8, +) { + v0, v1 := v0, v1 + + v0 = simd.bit_xor(v0, intrinsics.unaligned_load((^simd.u32x8)(src[0:]))) + v1 = simd.bit_xor(v1, intrinsics.unaligned_load((^simd.u32x8)(src[1:]))) + + return v0, v1 +} + +@(private = "file") +_store_simd256 :: #force_inline proc "contextless" ( + dst: [^]simd.u32x8, + v0, v1, v2, v3: simd.u32x8, +) { + intrinsics.unaligned_store((^simd.u32x8)(dst[0:]), v0) + intrinsics.unaligned_store((^simd.u32x8)(dst[1:]), v1) + intrinsics.unaligned_store((^simd.u32x8)(dst[2:]), v2) + intrinsics.unaligned_store((^simd.u32x8)(dst[3:]), v3) +} + +@(private = "file") +_store_simd256_x1 :: #force_inline proc "contextless" ( + dst: [^]simd.u32x8, + v0, v1: simd.u32x8, +) { + intrinsics.unaligned_store((^simd.u32x8)(dst[0:]), v0) + intrinsics.unaligned_store((^simd.u32x8)(dst[1:]), v1) +} + +@(enable_target_feature = "sse2,ssse3,avx,avx2") +stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) { + // Enforce the maximum consumed keystream per IV. + _chacha20.check_counter_limit(ctx, nr_blocks) + + dst_v := ([^]simd.u32x8)(raw_data(dst)) + src_v := ([^]simd.u32x8)(raw_data(src)) + + x := &ctx._s + n := nr_blocks + + // The state vector is an array of uint32s in native byte-order. + // Setup s0 .. s3 such that each register stores 2 copies of the + // state. + x_v := ([^]simd.u32x4)(raw_data(x)) + t0 := intrinsics.unaligned_load((^simd.u32x4)(x_v[0:])) + t1 := intrinsics.unaligned_load((^simd.u32x4)(x_v[1:])) + t2 := intrinsics.unaligned_load((^simd.u32x4)(x_v[2:])) + t3 := intrinsics.unaligned_load((^simd.u32x4)(x_v[3:])) + s0 := simd.swizzle(t0, 0, 1, 2, 3, 0, 1, 2, 3) + s1 := simd.swizzle(t1, 0, 1, 2, 3, 0, 1, 2, 3) + s2 := simd.swizzle(t2, 0, 1, 2, 3, 0, 1, 2, 3) + s3 := simd.swizzle(t3, 0, 1, 2, 3, 0, 1, 2, 3) + + // Advance the counter in the 2nd copy of the state by one. + s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_ZERO_ONE) + + // 8 blocks at a time. 
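	+ // (Four sets of 4 u32x8 registers, each set carrying 2 interleaved blocks.)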
+ for ; n >= 8; n = n - 8 { + v0, v1, v2, v3 := s0, s1, s2, s3 + + s7 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_TWO) + v4, v5, v6, v7 := s0, s1, s2, s7 + + s11 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s7, _VEC_TWO) + v8, v9, v10, v11 := s0, s1, s2, s11 + + s15 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s11, _VEC_TWO) + v12, v13, v14, v15 := s0, s1, s2, s15 + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + v0, v1, v2, v3 = _dq_round_simd256(v0, v1, v2, v3) + v4, v5, v6, v7 = _dq_round_simd256(v4, v5, v6, v7) + v8, v9, v10, v11 = _dq_round_simd256(v8, v9, v10, v11) + v12, v13, v14, v15 = _dq_round_simd256(v12, v13, v14, v15) + } + + v0, v1, v2, v3 = _add_and_permute_state_simd256(v0, v1, v2, v3, s0, s1, s2, s3) + v4, v5, v6, v7 = _add_and_permute_state_simd256(v4, v5, v6, v7, s0, s1, s2, s7) + v8, v9, v10, v11 = _add_and_permute_state_simd256(v8, v9, v10, v11, s0, s1, s2, s11) + v12, v13, v14, v15 = _add_and_permute_state_simd256(v12, v13, v14, v15, s0, s1, s2, s15) + + #no_bounds_check { + if src != nil { + v0, v1, v2, v3 = _xor_simd256(src_v, v0, v1, v2, v3) + v4, v5, v6, v7 = _xor_simd256(src_v[4:], v4, v5, v6, v7) + v8, v9, v10, v11 = _xor_simd256(src_v[8:], v8, v9, v10, v11) + v12, v13, v14, v15 = _xor_simd256(src_v[12:], v12, v13, v14, v15) + src_v = src_v[16:] + } + + _store_simd256(dst_v, v0, v1, v2, v3) + _store_simd256(dst_v[4:], v4, v5, v6, v7) + _store_simd256(dst_v[8:], v8, v9, v10, v11) + _store_simd256(dst_v[12:], v12, v13, v14, v15) + dst_v = dst_v[16:] + } + + s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s15, _VEC_TWO) + } + + + // 2 (or 1) blocks at a time. + for ; n > 0; n = n - 2 { + v0, v1, v2, v3 := s0, s1, s2, s3 + + for i := _chacha20.ROUNDS; i > 0; i = i - 2 { + v0, v1, v2, v3 = _dq_round_simd256(v0, v1, v2, v3) + } + v0, v1, v2, v3 = _add_and_permute_state_simd256(v0, v1, v2, v3, s0, s1, s2, s3) + + if n == 1 { + // Note: No need to advance src_v, dst_v, or increment the counter + // since this is guaranteed to be the final block. + #no_bounds_check { + if src != nil { + v0, v1 = _xor_simd256_x1(src_v, v0, v1) + } + + _store_simd256_x1(dst_v, v0, v1) + } + break + } + + #no_bounds_check { + if src != nil { + v0, v1, v2, v3 = _xor_simd256(src_v, v0, v1, v2, v3) + src_v = src_v[4:] + } + + _store_simd256(dst_v, v0, v1, v2, v3) + dst_v = dst_v[4:] + } + + s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_TWO) + } + + // Write back the counter. Doing it this way saves having to + // pull out the correct counter value from s3. + new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + u64(nr_blocks) + ctx._s[12] = u32(new_ctr) + ctx._s[13] = u32(new_ctr >> 32) +} + +@(enable_target_feature = "sse2,ssse3,avx") +hchacha20 :: proc "contextless" (dst, key, iv: []byte) { + // We can just enable AVX and call the simd128 code, as going + // wider has no performance benefit, but VEX-encoded instructions + // are nice.
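	+ // (HChaCha20 is a single-block operation, so there is no block-level parallelism for 256-bit vectors to exploit.)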
#force_inline chacha_simd128.hchacha20(dst, key, iv) +} \ No newline at end of file diff --git a/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin b/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin new file mode 100644 index 000000000..039d6cb96 --- /dev/null +++ b/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin @@ -0,0 +1,17 @@ +//+build !amd64 +package chacha20_simd256 + +import "base:intrinsics" +import "core:crypto/_chacha20" + +is_performant :: proc "contextless" () -> bool { + return false +} + +stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) { + panic("crypto/chacha20: simd256 implementation unsupported") +} + +hchacha20 :: proc "contextless" (dst, key, iv: []byte) { + intrinsics.trap() +} \ No newline at end of file diff --git a/core/crypto/aead/aead.odin b/core/crypto/aead/aead.odin new file mode 100644 index 000000000..9b7d810e4 --- /dev/null +++ b/core/crypto/aead/aead.odin @@ -0,0 +1,36 @@ +package aead + +// seal_oneshot encrypts the plaintext and authenticates the aad and ciphertext, +// with the provided algorithm, key, and iv, stores the output in dst and tag. +// +// dst and plaintext MUST alias exactly or not at all. +seal_oneshot :: proc(algo: Algorithm, dst, tag, key, iv, aad, plaintext: []byte, impl: Implementation = nil) { + ctx: Context + init(&ctx, algo, key, impl) + defer reset(&ctx) + seal_ctx(&ctx, dst, tag, iv, aad, plaintext) +} + +// open_oneshot authenticates the aad and ciphertext, and decrypts the ciphertext, +// with the provided algorithm, key, iv, and tag, and stores the output in dst, +// returning true iff the authentication was successful. If authentication +// fails, the destination buffer will be zeroed. +// +// dst and plaintext MUST alias exactly or not at all. +@(require_results) +open_oneshot :: proc(algo: Algorithm, dst, key, iv, aad, ciphertext, tag: []byte, impl: Implementation = nil) -> bool { + ctx: Context + init(&ctx, algo, key, impl) + defer reset(&ctx) + return open_ctx(&ctx, dst, iv, aad, ciphertext, tag) +} + +seal :: proc { + seal_ctx, + seal_oneshot, +} + +open :: proc { + open_ctx, + open_oneshot, +} diff --git a/core/crypto/aead/doc.odin b/core/crypto/aead/doc.odin new file mode 100644 index 000000000..388d31453 --- /dev/null +++ b/core/crypto/aead/doc.odin @@ -0,0 +1,58 @@ +/* +package aead provides a generic interface to the supported Authenticated +Encryption with Associated Data algorithms. + +Both a one-shot and context-based interface are provided, with similar +usage. If multiple messages are to be sealed/opened via the same key, +the context-based interface may be more efficient, depending on the +algorithm. + +WARNING: Reusing the same key + iv to seal (encrypt) multiple messages +results in catastrophic loss of security for most algorithms. + +```odin +package aead_example + +import "core:bytes" +import "core:crypto" +import "core:crypto/aead" + +main :: proc() { + algo := aead.Algorithm.XCHACHA20POLY1305 + + // The example associated data (AAD) and plaintext. + aad_str := "Get your ass in gear boys." + pt_str := "They're immanetizing the Eschaton." + + aad := transmute([]byte)aad_str + plaintext := transmute([]byte)pt_str + pt_len := len(plaintext) + + // Generate a random key for the purposes of illustration. + key := make([]byte, aead.KEY_SIZES[algo]) + defer delete(key) + crypto.rand_bytes(key) + + // `ciphertext || tag` is a common way data is transmitted, so + // demonstrate that.
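	+ // buf layout: ciphertext (pt_len bytes) followed by the tag.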
+ buf := make([]byte, pt_len + aead.TAG_SIZES[algo]) + defer delete(buf) + ciphertext, tag := buf[:pt_len], buf[pt_len:] + + // Seal the AAD + Plaintext. + iv := make([]byte, aead.IV_SIZES[algo]) + defer delete(iv) + crypto.rand_bytes(iv) // Random IVs are safe with XChaCha20-Poly1305. + aead.seal(algo, ciphertext, tag, key, iv, aad, plaintext) + + // Open the AAD + Ciphertext. + opened_pt := buf[:pt_len] + if ok := aead.open(algo, opened_pt, key, iv, aad, ciphertext, tag); !ok { + panic("aead example: failed to open") + } + + assert(bytes.equal(opened_pt, plaintext)) +} +``` +*/ +package aead \ No newline at end of file diff --git a/core/crypto/aead/low_level.odin b/core/crypto/aead/low_level.odin new file mode 100644 index 000000000..38a0c84ba --- /dev/null +++ b/core/crypto/aead/low_level.odin @@ -0,0 +1,187 @@ +package aead + +import "core:crypto/aes" +import "core:crypto/chacha20" +import "core:crypto/chacha20poly1305" +import "core:reflect" + +// Implementation is an AEAD implementation. Most callers will not need +// to use this as the package will automatically select the most performant +// implementation available. +Implementation :: union { + aes.Implementation, + chacha20.Implementation, +} + +// MAX_TAG_SIZE is the maximum size tag that can be returned by any of the +// Algorithms supported via this package. +MAX_TAG_SIZE :: 16 + +// Algorithm is the algorithm identifier associated with a given Context. +Algorithm :: enum { + Invalid, + AES_GCM_128, + AES_GCM_192, + AES_GCM_256, + CHACHA20POLY1305, + XCHACHA20POLY1305, +} + +// ALGORITHM_NAMES is the Algorithm to algorithm name string. +ALGORITHM_NAMES := [Algorithm]string { + .Invalid = "Invalid", + .AES_GCM_128 = "AES-GCM-128", + .AES_GCM_192 = "AES-GCM-192", + .AES_GCM_256 = "AES-GCM-256", + .CHACHA20POLY1305 = "chacha20poly1305", + .XCHACHA20POLY1305 = "xchacha20poly1305", +} + +// TAG_SIZES is the Algorithm to tag size in bytes. +TAG_SIZES := [Algorithm]int { + .Invalid = 0, + .AES_GCM_128 = aes.GCM_TAG_SIZE, + .AES_GCM_192 = aes.GCM_TAG_SIZE, + .AES_GCM_256 = aes.GCM_TAG_SIZE, + .CHACHA20POLY1305 = chacha20poly1305.TAG_SIZE, + .XCHACHA20POLY1305 = chacha20poly1305.TAG_SIZE, +} + +// KEY_SIZES is the Algorithm to key size in bytes. +KEY_SIZES := [Algorithm]int { + .Invalid = 0, + .AES_GCM_128 = aes.KEY_SIZE_128, + .AES_GCM_192 = aes.KEY_SIZE_192, + .AES_GCM_256 = aes.KEY_SIZE_256, + .CHACHA20POLY1305 = chacha20poly1305.KEY_SIZE, + .XCHACHA20POLY1305 = chacha20poly1305.KEY_SIZE, +} + +// IV_SIZES is the Algorithm to initialization vector size in bytes. +// +// Note: Some algorithms (such as AES-GCM) support variable IV sizes. +IV_SIZES := [Algorithm]int { + .Invalid = 0, + .AES_GCM_128 = aes.GCM_IV_SIZE, + .AES_GCM_192 = aes.GCM_IV_SIZE, + .AES_GCM_256 = aes.GCM_IV_SIZE, + .CHACHA20POLY1305 = chacha20poly1305.IV_SIZE, + .XCHACHA20POLY1305 = chacha20poly1305.XIV_SIZE, +} + +// Context is a concrete instantiation of a specific AEAD algorithm. +Context :: struct { + _algo: Algorithm, + _impl: union { + aes.Context_GCM, + chacha20poly1305.Context, + }, +} + +@(private) +_IMPL_IDS := [Algorithm]typeid { + .Invalid = nil, + .AES_GCM_128 = typeid_of(aes.Context_GCM), + .AES_GCM_192 = typeid_of(aes.Context_GCM), + .AES_GCM_256 = typeid_of(aes.Context_GCM), + .CHACHA20POLY1305 = typeid_of(chacha20poly1305.Context), + .XCHACHA20POLY1305 = typeid_of(chacha20poly1305.Context), +} + +// init initializes a Context with a specific AEAD Algorithm.
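+// +// The key must be KEY_SIZES[algorithm] bytes, and a nil impl selects the +// default implementation. A minimal usage sketch (key, iv, aad, plaintext, +// ciphertext, and tag are caller-supplied slices of the appropriate sizes): +// +//	ctx: Context +//	init(&ctx, .CHACHA20POLY1305, key) +//	defer reset(&ctx) +//	seal(&ctx, ciphertext, tag, iv, aad, plaintext)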
+init :: proc(ctx: ^Context, algorithm: Algorithm, key: []byte, impl: Implementation = nil) { + if ctx._impl != nil { + reset(ctx) + } + + if len(key) != KEY_SIZES[algorithm] { + panic("crypto/aead: invalid key size") + } + + // Directly specialize the union by setting the type ID (save a copy). + reflect.set_union_variant_typeid( + ctx._impl, + _IMPL_IDS[algorithm], + ) + switch algorithm { + case .AES_GCM_128, .AES_GCM_192, .AES_GCM_256: + impl_ := impl != nil ? impl.(aes.Implementation) : aes.DEFAULT_IMPLEMENTATION + aes.init_gcm(&ctx._impl.(aes.Context_GCM), key, impl_) + case .CHACHA20POLY1305: + impl_ := impl != nil ? impl.(chacha20.Implementation) : chacha20.DEFAULT_IMPLEMENTATION + chacha20poly1305.init(&ctx._impl.(chacha20poly1305.Context), key, impl_) + case .XCHACHA20POLY1305: + impl_ := impl != nil ? impl.(chacha20.Implementation) : chacha20.DEFAULT_IMPLEMENTATION + chacha20poly1305.init_xchacha(&ctx._impl.(chacha20poly1305.Context), key, impl_) + case .Invalid: + panic("crypto/aead: uninitialized algorithm") + case: + panic("crypto/aead: invalid algorithm") + } + + ctx._algo = algorithm +} + +// seal_ctx encrypts the plaintext and authenticates the aad and ciphertext, +// with the provided Context and iv, stores the output in dst and tag. +// +// dst and plaintext MUST alias exactly or not at all. +seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { + switch &impl in ctx._impl { + case aes.Context_GCM: + aes.seal_gcm(&impl, dst, tag, iv, aad, plaintext) + case chacha20poly1305.Context: + chacha20poly1305.seal(&impl, dst, tag, iv, aad, plaintext) + case: + panic("crypto/aead: uninitialized algorithm") + } +} + +// open_ctx authenticates the aad and ciphertext, and decrypts the ciphertext, +// with the provided Context, iv, and tag, and stores the output in dst, +// returning true iff the authentication was successful. If authentication +// fails, the destination buffer will be zeroed. +// +// dst and plaintext MUST alias exactly or not at all. +@(require_results) +open_ctx :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool { + switch &impl in ctx._impl { + case aes.Context_GCM: + return aes.open_gcm(&impl, dst, iv, aad, ciphertext, tag) + case chacha20poly1305.Context: + return chacha20poly1305.open(&impl, dst, iv, aad, ciphertext, tag) + case: + panic("crypto/aead: uninitialized algorithm") + } +} + +// reset sanitizes the Context. The Context must be re-initialized to +// be used again. +reset :: proc(ctx: ^Context) { + switch &impl in ctx._impl { + case aes.Context_GCM: + aes.reset_gcm(&impl) + case chacha20poly1305.Context: + chacha20poly1305.reset(&impl) + case: + // Calling reset repeatedly is fine. + } + + ctx._algo = .Invalid + ctx._impl = nil +} + +// algorithm returns the Algorithm used by a Context instance. +algorithm :: proc(ctx: ^Context) -> Algorithm { + return ctx._algo +} + +// iv_size returns the IV size of a Context instance in bytes. +iv_size :: proc(ctx: ^Context) -> int { + return IV_SIZES[ctx._algo] +} + +// tag_size returns the tag size of a Context instance in bytes. +tag_size :: proc(ctx: ^Context) -> int { + return TAG_SIZES[ctx._algo] +} diff --git a/core/crypto/aes/aes_ctr.odin b/core/crypto/aes/aes_ctr.odin index 1c5fe31e8..20b75e57f 100644 --- a/core/crypto/aes/aes_ctr.odin +++ b/core/crypto/aes/aes_ctr.odin @@ -20,7 +20,7 @@ Context_CTR :: struct { } // init_ctr initializes a Context_CTR with the provided key and IV. 
-init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := Implementation.Hardware) { +init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := DEFAULT_IMPLEMENTATION) { if len(iv) != CTR_IV_SIZE { panic("crypto/aes: invalid CTR IV size") } @@ -47,7 +47,7 @@ xor_bytes_ctr :: proc(ctx: ^Context_CTR, dst, src: []byte) { panic("crypto/aes: dst and src alias inexactly") } - for remaining := len(src); remaining > 0; { + #no_bounds_check for remaining := len(src); remaining > 0; { // Process multiple blocks at once if ctx._off == BLOCK_SIZE { if nr_blocks := remaining / BLOCK_SIZE; nr_blocks > 0 { @@ -85,7 +85,7 @@ keystream_bytes_ctr :: proc(ctx: ^Context_CTR, dst: []byte) { assert(ctx._is_initialized) dst := dst - for remaining := len(dst); remaining > 0; { + #no_bounds_check for remaining := len(dst); remaining > 0; { // Process multiple blocks at once if ctx._off == BLOCK_SIZE { if nr_blocks := remaining / BLOCK_SIZE; nr_blocks > 0 { diff --git a/core/crypto/aes/aes_ecb.odin b/core/crypto/aes/aes_ecb.odin index 498429e29..32476006c 100644 --- a/core/crypto/aes/aes_ecb.odin +++ b/core/crypto/aes/aes_ecb.odin @@ -12,7 +12,7 @@ Context_ECB :: struct { } // init_ecb initializes a Context_ECB with the provided key. -init_ecb :: proc(ctx: ^Context_ECB, key: []byte, impl := Implementation.Hardware) { +init_ecb :: proc(ctx: ^Context_ECB, key: []byte, impl := DEFAULT_IMPLEMENTATION) { init_impl(&ctx._impl, key, impl) ctx._is_initialized = true } diff --git a/core/crypto/aes/aes_gcm.odin b/core/crypto/aes/aes_gcm.odin index 25e0cc35b..8616821ce 100644 --- a/core/crypto/aes/aes_gcm.odin +++ b/core/crypto/aes/aes_gcm.odin @@ -7,10 +7,10 @@ import "core:crypto/_aes/ct64" import "core:encoding/endian" import "core:mem" -// GCM_NONCE_SIZE is the default size of the GCM nonce in bytes. -GCM_NONCE_SIZE :: 12 -// GCM_NONCE_SIZE_MAX is the maximum size of the GCM nonce in bytes. -GCM_NONCE_SIZE_MAX :: 0x2000000000000000 // floor((2^64 - 1) / 8) bits +// GCM_IV_SIZE is the default size of the GCM IV in bytes. +GCM_IV_SIZE :: 12 +// GCM_IV_SIZE_MAX is the maximum size of the GCM IV in bytes. +GCM_IV_SIZE_MAX :: 0x2000000000000000 // floor((2^64 - 1) / 8) bits // GCM_TAG_SIZE is the size of a GCM tag in bytes. GCM_TAG_SIZE :: _aes.GHASH_TAG_SIZE @@ -26,19 +26,19 @@ Context_GCM :: struct { } // init_gcm initializes a Context_GCM with the provided key. -init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := Implementation.Hardware) { +init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := DEFAULT_IMPLEMENTATION) { init_impl(&ctx._impl, key, impl) ctx._is_initialized = true } // seal_gcm encrypts the plaintext and authenticates the aad and ciphertext, -// with the provided Context_GCM and nonce, stores the output in dst and tag. +// with the provided Context_GCM and iv, stores the output in dst and tag. // // dst and plaintext MUST alias exactly or not at all. 
-seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) { +seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) { assert(ctx._is_initialized) - gcm_validate_common_slice_sizes(tag, nonce, aad, plaintext) + gcm_validate_common_slice_sizes(tag, iv, aad, plaintext) if len(dst) != len(plaintext) { panic("crypto/aes: invalid destination ciphertext size") } @@ -47,7 +47,7 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) { } if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw { - gcm_seal_hw(&impl, dst, tag, nonce, aad, plaintext) + gcm_seal_hw(&impl, dst, tag, iv, aad, plaintext) return } @@ -55,7 +55,7 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) { j0: [_aes.GHASH_BLOCK_SIZE]byte j0_enc: [_aes.GHASH_BLOCK_SIZE]byte s: [_aes.GHASH_TAG_SIZE]byte - init_ghash_ct64(ctx, &h, &j0, &j0_enc, nonce) + init_ghash_ct64(ctx, &h, &j0, &j0_enc, iv) // Note: Our GHASH implementation handles appending padding. ct64.ghash(s[:], h[:], aad) @@ -69,15 +69,16 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) { } // open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext, -// with the provided Context_GCM, nonce, and tag, and stores the output in dst, +// with the provided Context_GCM, iv, and tag, and stores the output in dst, // returning true iff the authentication was successful. If authentication // fails, the destination buffer will be zeroed. // // dst and plaintext MUST alias exactly or not at all. -open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) -> bool { +@(require_results) +open_gcm :: proc(ctx: ^Context_GCM, dst, iv, aad, ciphertext, tag: []byte) -> bool { assert(ctx._is_initialized) - gcm_validate_common_slice_sizes(tag, nonce, aad, ciphertext) + gcm_validate_common_slice_sizes(tag, iv, aad, ciphertext) if len(dst) != len(ciphertext) { panic("crypto/aes: invalid destination plaintext size") } @@ -86,14 +87,14 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) -> } if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw { - return gcm_open_hw(&impl, dst, nonce, aad, ciphertext, tag) + return gcm_open_hw(&impl, dst, iv, aad, ciphertext, tag) } h: [_aes.GHASH_KEY_SIZE]byte j0: [_aes.GHASH_BLOCK_SIZE]byte j0_enc: [_aes.GHASH_BLOCK_SIZE]byte s: [_aes.GHASH_TAG_SIZE]byte - init_ghash_ct64(ctx, &h, &j0, &j0_enc, nonce) + init_ghash_ct64(ctx, &h, &j0, &j0_enc, iv) ct64.ghash(s[:], h[:], aad) gctr_ct64(ctx, dst, &s, ciphertext, &h, &j0, false) @@ -112,7 +113,7 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) -> return ok } -// reset_ctr sanitizes the Context_GCM. The Context_GCM must be +// reset_gcm sanitizes the Context_GCM. The Context_GCM must be // re-initialized to be used again. reset_gcm :: proc "contextless" (ctx: ^Context_GCM) { reset_impl(&ctx._impl) @@ -120,14 +121,14 @@ reset_gcm :: proc "contextless" (ctx: ^Context_GCM) { } @(private = "file") -gcm_validate_common_slice_sizes :: proc(tag, nonce, aad, text: []byte) { +gcm_validate_common_slice_sizes :: proc(tag, iv, aad, text: []byte) { if len(tag) != GCM_TAG_SIZE { panic("crypto/aes: invalid GCM tag size") } - // The specification supports nonces in the range [1, 2^64) bits. - if l := len(nonce); l == 0 || u64(l) >= GCM_NONCE_SIZE_MAX { - panic("crypto/aes: invalid GCM nonce size") + // The specification supports IVs in the range [1, 2^64) bits. 
+ if l := len(iv); l == 0 || u64(l) >= GCM_IV_SIZE_MAX { + panic("crypto/aes: invalid GCM IV size") } if aad_len := u64(len(aad)); aad_len > GCM_A_MAX { @@ -144,7 +145,7 @@ init_ghash_ct64 :: proc( h: ^[_aes.GHASH_KEY_SIZE]byte, j0: ^[_aes.GHASH_BLOCK_SIZE]byte, j0_enc: ^[_aes.GHASH_BLOCK_SIZE]byte, - nonce: []byte, + iv: []byte, ) { impl := &ctx._impl.(ct64.Context) @@ -152,14 +153,14 @@ init_ghash_ct64 :: proc( ct64.encrypt_block(impl, h[:], h[:]) // Define a block, J0, as follows: - if l := len(nonce); l == GCM_NONCE_SIZE { + if l := len(iv); l == GCM_IV_SIZE { // if len(IV) = 96, then let J0 = IV || 0^31 || 1 - copy(j0[:], nonce) + copy(j0[:], iv) j0[_aes.GHASH_BLOCK_SIZE - 1] = 1 } else { // If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV), // and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64). - ct64.ghash(j0[:], h[:], nonce) + ct64.ghash(j0[:], h[:], iv) tmp: [_aes.GHASH_BLOCK_SIZE]byte endian.unchecked_put_u64be(tmp[8:], u64(l) * 8) @@ -197,7 +198,7 @@ gctr_ct64 :: proc( s: ^[_aes.GHASH_BLOCK_SIZE]byte, src: []byte, h: ^[_aes.GHASH_KEY_SIZE]byte, - nonce: ^[_aes.GHASH_BLOCK_SIZE]byte, + iv: ^[_aes.GHASH_BLOCK_SIZE]byte, is_seal: bool, ) #no_bounds_check { ct64_inc_ctr32 := #force_inline proc "contextless" (dst: []byte, ctr: u32) -> u32 { @@ -208,14 +209,14 @@ gctr_ct64 :: proc( // Setup the counter blocks. tmp, tmp2: [ct64.STRIDE][BLOCK_SIZE]byte = ---, --- ctrs, blks: [ct64.STRIDE][]byte = ---, --- - ctr := endian.unchecked_get_u32be(nonce[GCM_NONCE_SIZE:]) + 1 + ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1 for i in 0 ..< ct64.STRIDE { // Setup scratch space for the keystream. blks[i] = tmp2[i][:] // Pre-copy the IV to all the counter blocks. ctrs[i] = tmp[i][:] - copy(ctrs[i], nonce[:GCM_NONCE_SIZE]) + copy(ctrs[i], iv[:GCM_IV_SIZE]) } impl := &ctx._impl.(ct64.Context) diff --git a/core/crypto/aes/aes_gcm_hw_intel.odin b/core/crypto/aes/aes_gcm_hw_intel.odin index 7d32d4d96..ffd8ed642 100644 --- a/core/crypto/aes/aes_gcm_hw_intel.odin +++ b/core/crypto/aes/aes_gcm_hw_intel.odin @@ -10,12 +10,12 @@ import "core:mem" import "core:simd/x86" @(private) -gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, nonce, aad, plaintext: []byte) { +gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) { h: [_aes.GHASH_KEY_SIZE]byte j0: [_aes.GHASH_BLOCK_SIZE]byte j0_enc: [_aes.GHASH_BLOCK_SIZE]byte s: [_aes.GHASH_TAG_SIZE]byte - init_ghash_hw(ctx, &h, &j0, &j0_enc, nonce) + init_ghash_hw(ctx, &h, &j0, &j0_enc, iv) // Note: Our GHASH implementation handles appending padding. hw_intel.ghash(s[:], h[:], aad) @@ -29,12 +29,12 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, nonce, aad, plaintext } @(private) -gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, nonce, aad, ciphertext, tag: []byte) -> bool { +gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag: []byte) -> bool { h: [_aes.GHASH_KEY_SIZE]byte j0: [_aes.GHASH_BLOCK_SIZE]byte j0_enc: [_aes.GHASH_BLOCK_SIZE]byte s: [_aes.GHASH_TAG_SIZE]byte - init_ghash_hw(ctx, &h, &j0, &j0_enc, nonce) + init_ghash_hw(ctx, &h, &j0, &j0_enc, iv) hw_intel.ghash(s[:], h[:], aad) gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false) @@ -59,20 +59,20 @@ init_ghash_hw :: proc( h: ^[_aes.GHASH_KEY_SIZE]byte, j0: ^[_aes.GHASH_BLOCK_SIZE]byte, j0_enc: ^[_aes.GHASH_BLOCK_SIZE]byte, - nonce: []byte, + iv: []byte, ) { // 1. 
Let H = CIPH(k, 0^128) encrypt_block_hw(ctx, h[:], h[:]) // Define a block, J0, as follows: - if l := len(nonce); l == GCM_NONCE_SIZE { + if l := len(iv); l == GCM_IV_SIZE { // if len(IV) = 96, then let J0 = IV || 0^31 || 1 - copy(j0[:], nonce) + copy(j0[:], iv) j0[_aes.GHASH_BLOCK_SIZE - 1] = 1 } else { // If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV), // and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64). - hw_intel.ghash(j0[:], h[:], nonce) + hw_intel.ghash(j0[:], h[:], iv) tmp: [_aes.GHASH_BLOCK_SIZE]byte endian.unchecked_put_u64be(tmp[8:], u64(l) * 8) @@ -109,7 +109,7 @@ gctr_hw :: proc( s: ^[_aes.GHASH_BLOCK_SIZE]byte, src: []byte, h: ^[_aes.GHASH_KEY_SIZE]byte, - nonce: ^[_aes.GHASH_BLOCK_SIZE]byte, + iv: ^[_aes.GHASH_BLOCK_SIZE]byte, is_seal: bool, ) #no_bounds_check { sks: [15]x86.__m128i = --- @@ -118,8 +118,8 @@ gctr_hw :: proc( } // Setup the counter block - ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(nonce)) - ctr := endian.unchecked_get_u32be(nonce[GCM_NONCE_SIZE:]) + 1 + ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(iv)) + ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1 src, dst := src, dst diff --git a/core/crypto/aes/aes_impl.odin b/core/crypto/aes/aes_impl.odin index 03747f1fb..f26874809 100644 --- a/core/crypto/aes/aes_impl.odin +++ b/core/crypto/aes/aes_impl.odin @@ -10,6 +10,10 @@ Context_Impl :: union { Context_Impl_Hardware, } +// DEFAULT_IMPLEMENTATION is the implementation that will be used by +// default if possible. +DEFAULT_IMPLEMENTATION :: Implementation.Hardware + // Implementation is an AES implementation. Most callers will not need // to use this as the package will automatically select the most performant // implementation available (See `is_hardware_accelerated()`). diff --git a/core/crypto/aes/aes_impl_hw_gen.odin b/core/crypto/aes/aes_impl_hw_gen.odin index 5361c6ef0..3557b1aae 100644 --- a/core/crypto/aes/aes_impl_hw_gen.odin +++ b/core/crypto/aes/aes_impl_hw_gen.odin @@ -34,11 +34,11 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) { } @(private) -gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, nonce, aad, plaintext: []byte) { +gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) { panic(ERR_HW_NOT_SUPPORTED) } @(private) -gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, nonce, aad, ciphertext, tag: []byte) -> bool { +gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag: []byte) -> bool { panic(ERR_HW_NOT_SUPPORTED) } diff --git a/core/crypto/chacha20/chacha20.odin b/core/crypto/chacha20/chacha20.odin index 73d3e1ea2..6d1a6bfc5 100644 --- a/core/crypto/chacha20/chacha20.odin +++ b/core/crypto/chacha20/chacha20.odin @@ -8,119 +8,66 @@ See: package chacha20 import "core:bytes" -import "core:encoding/endian" -import "core:math/bits" +import "core:crypto/_chacha20" import "core:mem" // KEY_SIZE is the (X)ChaCha20 key size in bytes. -KEY_SIZE :: 32 -// NONCE_SIZE is the ChaCha20 nonce size in bytes. -NONCE_SIZE :: 12 -// XNONCE_SIZE is the XChaCha20 nonce size in bytes. -XNONCE_SIZE :: 24 - -@(private) -_MAX_CTR_IETF :: 0xffffffff - -@(private) -_BLOCK_SIZE :: 64 -@(private) -_STATE_SIZE_U32 :: 16 -@(private) -_ROUNDS :: 20 - -@(private) -_SIGMA_0: u32 : 0x61707865 -@(private) -_SIGMA_1: u32 : 0x3320646e -@(private) -_SIGMA_2: u32 : 0x79622d32 -@(private) -_SIGMA_3: u32 : 0x6b206574 +KEY_SIZE :: _chacha20.KEY_SIZE +// IV_SIZE is the ChaCha20 IV size in bytes. 
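+// (Formerly NONCE_SIZE; the IETF flavor of ChaCha20 uses a 96-bit IV.)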
+IV_SIZE :: _chacha20.IV_SIZE +// XIV_SIZE is the XChaCha20 IV size in bytes. +XIV_SIZE :: _chacha20.XIV_SIZE // Context is a ChaCha20 or XChaCha20 instance. Context :: struct { - _s: [_STATE_SIZE_U32]u32, - _buffer: [_BLOCK_SIZE]byte, - _off: int, - _is_ietf_flavor: bool, - _is_initialized: bool, + _state: _chacha20.Context, + _impl: Implementation, } // init inititializes a Context for ChaCha20 or XChaCha20 with the provided -// key and nonce. -init :: proc(ctx: ^Context, key, nonce: []byte) { +// key and iv. +init :: proc(ctx: ^Context, key, iv: []byte, impl := DEFAULT_IMPLEMENTATION) { if len(key) != KEY_SIZE { - panic("crypto/chacha20: invalid ChaCha20 key size") + panic("crypto/chacha20: invalid (X)ChaCha20 key size") } - if n_len := len(nonce); n_len != NONCE_SIZE && n_len != XNONCE_SIZE { - panic("crypto/chacha20: invalid (X)ChaCha20 nonce size") + if l := len(iv); l != IV_SIZE && l != XIV_SIZE { + panic("crypto/chacha20: invalid (X)ChaCha20 IV size") } - k, n := key, nonce + k, n := key, iv - // Derive the XChaCha20 subkey and sub-nonce via HChaCha20. - is_xchacha := len(nonce) == XNONCE_SIZE + init_impl(ctx, impl) + + is_xchacha := len(iv) == XIV_SIZE if is_xchacha { - sub_key := ctx._buffer[:KEY_SIZE] - _hchacha20(sub_key, k, n) + sub_iv: [IV_SIZE]byte + sub_key := ctx._state._buffer[:KEY_SIZE] + hchacha20(sub_key, k, n, ctx._impl) k = sub_key - n = n[16:24] + copy(sub_iv[4:], n[16:]) + n = sub_iv[:] } - ctx._s[0] = _SIGMA_0 - ctx._s[1] = _SIGMA_1 - ctx._s[2] = _SIGMA_2 - ctx._s[3] = _SIGMA_3 - ctx._s[4] = endian.unchecked_get_u32le(k[0:4]) - ctx._s[5] = endian.unchecked_get_u32le(k[4:8]) - ctx._s[6] = endian.unchecked_get_u32le(k[8:12]) - ctx._s[7] = endian.unchecked_get_u32le(k[12:16]) - ctx._s[8] = endian.unchecked_get_u32le(k[16:20]) - ctx._s[9] = endian.unchecked_get_u32le(k[20:24]) - ctx._s[10] = endian.unchecked_get_u32le(k[24:28]) - ctx._s[11] = endian.unchecked_get_u32le(k[28:32]) - ctx._s[12] = 0 - if !is_xchacha { - ctx._s[13] = endian.unchecked_get_u32le(n[0:4]) - ctx._s[14] = endian.unchecked_get_u32le(n[4:8]) - ctx._s[15] = endian.unchecked_get_u32le(n[8:12]) - } else { - ctx._s[13] = 0 - ctx._s[14] = endian.unchecked_get_u32le(n[0:4]) - ctx._s[15] = endian.unchecked_get_u32le(n[4:8]) + _chacha20.init(&ctx._state, k, n, is_xchacha) + if is_xchacha { // The sub-key is stored in the keystream buffer. While // this will be overwritten in most circumstances, explicitly // clear it out early. - mem.zero_explicit(&ctx._buffer, KEY_SIZE) + mem.zero_explicit(&ctx._state._buffer, KEY_SIZE) } - - ctx._off = _BLOCK_SIZE - ctx._is_ietf_flavor = !is_xchacha - ctx._is_initialized = true } // seek seeks the (X)ChaCha20 stream counter to the specified block. seek :: proc(ctx: ^Context, block_nr: u64) { - assert(ctx._is_initialized) - - if ctx._is_ietf_flavor { - if block_nr > _MAX_CTR_IETF { - panic("crypto/chacha20: attempted to seek past maximum counter") - } - } else { - ctx._s[13] = u32(block_nr >> 32) - } - ctx._s[12] = u32(block_nr) - ctx._off = _BLOCK_SIZE + _chacha20.seek(&ctx._state, block_nr) } // xor_bytes XORs each byte in src with bytes taken from the (X)ChaCha20 // keystream, and writes the resulting output to dst. Dst and src MUST // alias exactly or not at all. 
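+//
+// Example (a minimal sketch; assumes `key` is KEY_SIZE bytes, `iv` is IV_SIZE
+// or XIV_SIZE bytes, and `dst` is at least as large as `src`):
+//
+//	ctx: chacha20.Context
+//	chacha20.init(&ctx, key, iv)
+//	chacha20.xor_bytes(&ctx, dst, src)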
xor_bytes :: proc(ctx: ^Context, dst, src: []byte) { - assert(ctx._is_initialized) + assert(ctx._state._is_initialized) src, dst := src, dst if dst_len := len(dst); dst_len < len(src) { @@ -131,12 +78,13 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) { panic("crypto/chacha20: dst and src alias inexactly") } - for remaining := len(src); remaining > 0; { + st := &ctx._state + #no_bounds_check for remaining := len(src); remaining > 0; { // Process multiple blocks at once - if ctx._off == _BLOCK_SIZE { - if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 { - direct_bytes := nr_blocks * _BLOCK_SIZE - _do_blocks(ctx, dst, src, nr_blocks) + if st._off == _chacha20.BLOCK_SIZE { + if nr_blocks := remaining / _chacha20.BLOCK_SIZE; nr_blocks > 0 { + direct_bytes := nr_blocks * _chacha20.BLOCK_SIZE + stream_blocks(ctx, dst, src, nr_blocks) remaining -= direct_bytes if remaining == 0 { return @@ -147,17 +95,17 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) { // If there is a partial block, generate and buffer 1 block // worth of keystream. - _do_blocks(ctx, ctx._buffer[:], nil, 1) - ctx._off = 0 + stream_blocks(ctx, st._buffer[:], nil, 1) + st._off = 0 } // Process partial blocks from the buffered keystream. - to_xor := min(_BLOCK_SIZE - ctx._off, remaining) - buffered_keystream := ctx._buffer[ctx._off:] + to_xor := min(_chacha20.BLOCK_SIZE - st._off, remaining) + buffered_keystream := st._buffer[st._off:] for i := 0; i < to_xor; i = i + 1 { dst[i] = buffered_keystream[i] ~ src[i] } - ctx._off += to_xor + st._off += to_xor dst = dst[to_xor:] src = src[to_xor:] remaining -= to_xor @@ -166,15 +114,15 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) { // keystream_bytes fills dst with the raw (X)ChaCha20 keystream output. keystream_bytes :: proc(ctx: ^Context, dst: []byte) { - assert(ctx._is_initialized) + assert(ctx._state._is_initialized) - dst := dst - for remaining := len(dst); remaining > 0; { + dst, st := dst, &ctx._state + #no_bounds_check for remaining := len(dst); remaining > 0; { // Process multiple blocks at once - if ctx._off == _BLOCK_SIZE { - if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 { - direct_bytes := nr_blocks * _BLOCK_SIZE - _do_blocks(ctx, dst, nil, nr_blocks) + if st._off == _chacha20.BLOCK_SIZE { + if nr_blocks := remaining / _chacha20.BLOCK_SIZE; nr_blocks > 0 { + direct_bytes := nr_blocks * _chacha20.BLOCK_SIZE + stream_blocks(ctx, dst, nil, nr_blocks) remaining -= direct_bytes if remaining == 0 { return @@ -184,15 +132,15 @@ keystream_bytes :: proc(ctx: ^Context, dst: []byte) { // If there is a partial block, generate and buffer 1 block // worth of keystream. - _do_blocks(ctx, ctx._buffer[:], nil, 1) - ctx._off = 0 + stream_blocks(ctx, st._buffer[:], nil, 1) + st._off = 0 } // Process partial blocks from the buffered keystream. - to_copy := min(_BLOCK_SIZE - ctx._off, remaining) - buffered_keystream := ctx._buffer[ctx._off:] + to_copy := min(_chacha20.BLOCK_SIZE - st._off, remaining) + buffered_keystream := st._buffer[st._off:] copy(dst[:to_copy], buffered_keystream[:to_copy]) - ctx._off += to_copy + st._off += to_copy dst = dst[to_copy:] remaining -= to_copy } @@ -201,366 +149,5 @@ keystream_bytes :: proc(ctx: ^Context, dst: []byte) { // reset sanitizes the Context. The Context must be re-initialized to // be used again. 
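+// (Only the wrapped _state holds secret material; _impl is a plain enum and
+// needs no sanitizing.)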
reset :: proc(ctx: ^Context) { - mem.zero_explicit(&ctx._s, size_of(ctx._s)) - mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer)) - - ctx._is_initialized = false -} - -@(private) -_do_blocks :: proc(ctx: ^Context, dst, src: []byte, nr_blocks: int) { - // Enforce the maximum consumed keystream per nonce. - // - // While all modern "standard" definitions of ChaCha20 use - // the IETF 32-bit counter, for XChaCha20 most common - // implementations allow for a 64-bit counter. - // - // Honestly, the answer here is "use a MRAE primitive", but - // go with common practice in the case of XChaCha20. - if ctx._is_ietf_flavor { - if u64(ctx._s[12]) + u64(nr_blocks) > 0xffffffff { - panic("crypto/chacha20: maximum ChaCha20 keystream per nonce reached") - } - } else { - ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12]) - if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 { - panic("crypto/chacha20: maximum XChaCha20 keystream per nonce reached") - } - } - - dst, src := dst, src - x := &ctx._s - for n := 0; n < nr_blocks; n = n + 1 { - x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3 - x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] - - for i := _ROUNDS; i > 0; i = i - 2 { - // Even when forcing inlining manually inlining all of - // these is decently faster. - - // quarterround(x, 0, 4, 8, 12) - x0 += x4 - x12 ~= x0 - x12 = bits.rotate_left32(x12, 16) - x8 += x12 - x4 ~= x8 - x4 = bits.rotate_left32(x4, 12) - x0 += x4 - x12 ~= x0 - x12 = bits.rotate_left32(x12, 8) - x8 += x12 - x4 ~= x8 - x4 = bits.rotate_left32(x4, 7) - - // quarterround(x, 1, 5, 9, 13) - x1 += x5 - x13 ~= x1 - x13 = bits.rotate_left32(x13, 16) - x9 += x13 - x5 ~= x9 - x5 = bits.rotate_left32(x5, 12) - x1 += x5 - x13 ~= x1 - x13 = bits.rotate_left32(x13, 8) - x9 += x13 - x5 ~= x9 - x5 = bits.rotate_left32(x5, 7) - - // quarterround(x, 2, 6, 10, 14) - x2 += x6 - x14 ~= x2 - x14 = bits.rotate_left32(x14, 16) - x10 += x14 - x6 ~= x10 - x6 = bits.rotate_left32(x6, 12) - x2 += x6 - x14 ~= x2 - x14 = bits.rotate_left32(x14, 8) - x10 += x14 - x6 ~= x10 - x6 = bits.rotate_left32(x6, 7) - - // quarterround(x, 3, 7, 11, 15) - x3 += x7 - x15 ~= x3 - x15 = bits.rotate_left32(x15, 16) - x11 += x15 - x7 ~= x11 - x7 = bits.rotate_left32(x7, 12) - x3 += x7 - x15 ~= x3 - x15 = bits.rotate_left32(x15, 8) - x11 += x15 - x7 ~= x11 - x7 = bits.rotate_left32(x7, 7) - - // quarterround(x, 0, 5, 10, 15) - x0 += x5 - x15 ~= x0 - x15 = bits.rotate_left32(x15, 16) - x10 += x15 - x5 ~= x10 - x5 = bits.rotate_left32(x5, 12) - x0 += x5 - x15 ~= x0 - x15 = bits.rotate_left32(x15, 8) - x10 += x15 - x5 ~= x10 - x5 = bits.rotate_left32(x5, 7) - - // quarterround(x, 1, 6, 11, 12) - x1 += x6 - x12 ~= x1 - x12 = bits.rotate_left32(x12, 16) - x11 += x12 - x6 ~= x11 - x6 = bits.rotate_left32(x6, 12) - x1 += x6 - x12 ~= x1 - x12 = bits.rotate_left32(x12, 8) - x11 += x12 - x6 ~= x11 - x6 = bits.rotate_left32(x6, 7) - - // quarterround(x, 2, 7, 8, 13) - x2 += x7 - x13 ~= x2 - x13 = bits.rotate_left32(x13, 16) - x8 += x13 - x7 ~= x8 - x7 = bits.rotate_left32(x7, 12) - x2 += x7 - x13 ~= x2 - x13 = bits.rotate_left32(x13, 8) - x8 += x13 - x7 ~= x8 - x7 = bits.rotate_left32(x7, 7) - - // quarterround(x, 3, 4, 9, 14) - x3 += x4 - x14 ~= x3 - x14 = bits.rotate_left32(x14, 16) - x9 += x14 - x4 ~= x9 - x4 = bits.rotate_left32(x4, 12) - x3 += x4 - x14 ~= x3 - x14 = bits.rotate_left32(x14, 8) - x9 += x14 - x4 ~= x9 - x4 = bits.rotate_left32(x4, 7) - } - - x0 += _SIGMA_0 - x1 += 
_SIGMA_1 - x2 += _SIGMA_2 - x3 += _SIGMA_3 - x4 += x[4] - x5 += x[5] - x6 += x[6] - x7 += x[7] - x8 += x[8] - x9 += x[9] - x10 += x[10] - x11 += x[11] - x12 += x[12] - x13 += x[13] - x14 += x[14] - x15 += x[15] - - // While the "correct" answer to getting more performance out of - // this is "use vector operations", support for that is currently - // a work in progress/to be designed. - // - // In the meantime: - // - The caller(s) ensure that src/dst are valid. - // - The compiler knows if the target is picky about alignment. - - #no_bounds_check { - if src != nil { - endian.unchecked_put_u32le(dst[0:4], endian.unchecked_get_u32le(src[0:4]) ~ x0) - endian.unchecked_put_u32le(dst[4:8], endian.unchecked_get_u32le(src[4:8]) ~ x1) - endian.unchecked_put_u32le(dst[8:12], endian.unchecked_get_u32le(src[8:12]) ~ x2) - endian.unchecked_put_u32le(dst[12:16], endian.unchecked_get_u32le(src[12:16]) ~ x3) - endian.unchecked_put_u32le(dst[16:20], endian.unchecked_get_u32le(src[16:20]) ~ x4) - endian.unchecked_put_u32le(dst[20:24], endian.unchecked_get_u32le(src[20:24]) ~ x5) - endian.unchecked_put_u32le(dst[24:28], endian.unchecked_get_u32le(src[24:28]) ~ x6) - endian.unchecked_put_u32le(dst[28:32], endian.unchecked_get_u32le(src[28:32]) ~ x7) - endian.unchecked_put_u32le(dst[32:36], endian.unchecked_get_u32le(src[32:36]) ~ x8) - endian.unchecked_put_u32le(dst[36:40], endian.unchecked_get_u32le(src[36:40]) ~ x9) - endian.unchecked_put_u32le(dst[40:44], endian.unchecked_get_u32le(src[40:44]) ~ x10) - endian.unchecked_put_u32le(dst[44:48], endian.unchecked_get_u32le(src[44:48]) ~ x11) - endian.unchecked_put_u32le(dst[48:52], endian.unchecked_get_u32le(src[48:52]) ~ x12) - endian.unchecked_put_u32le(dst[52:56], endian.unchecked_get_u32le(src[52:56]) ~ x13) - endian.unchecked_put_u32le(dst[56:60], endian.unchecked_get_u32le(src[56:60]) ~ x14) - endian.unchecked_put_u32le(dst[60:64], endian.unchecked_get_u32le(src[60:64]) ~ x15) - src = src[_BLOCK_SIZE:] - } else { - endian.unchecked_put_u32le(dst[0:4], x0) - endian.unchecked_put_u32le(dst[4:8], x1) - endian.unchecked_put_u32le(dst[8:12], x2) - endian.unchecked_put_u32le(dst[12:16], x3) - endian.unchecked_put_u32le(dst[16:20], x4) - endian.unchecked_put_u32le(dst[20:24], x5) - endian.unchecked_put_u32le(dst[24:28], x6) - endian.unchecked_put_u32le(dst[28:32], x7) - endian.unchecked_put_u32le(dst[32:36], x8) - endian.unchecked_put_u32le(dst[36:40], x9) - endian.unchecked_put_u32le(dst[40:44], x10) - endian.unchecked_put_u32le(dst[44:48], x11) - endian.unchecked_put_u32le(dst[48:52], x12) - endian.unchecked_put_u32le(dst[52:56], x13) - endian.unchecked_put_u32le(dst[56:60], x14) - endian.unchecked_put_u32le(dst[60:64], x15) - } - dst = dst[_BLOCK_SIZE:] - } - - // Increment the counter. Overflow checking is done upon - // entry into the routine, so a 64-bit increment safely - // covers both cases. 
- new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1 - x[12] = u32(new_ctr) - x[13] = u32(new_ctr >> 32) - } -} - -@(private) -_hchacha20 :: proc "contextless" (dst, key, nonce: []byte) { - x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3 - x4 := endian.unchecked_get_u32le(key[0:4]) - x5 := endian.unchecked_get_u32le(key[4:8]) - x6 := endian.unchecked_get_u32le(key[8:12]) - x7 := endian.unchecked_get_u32le(key[12:16]) - x8 := endian.unchecked_get_u32le(key[16:20]) - x9 := endian.unchecked_get_u32le(key[20:24]) - x10 := endian.unchecked_get_u32le(key[24:28]) - x11 := endian.unchecked_get_u32le(key[28:32]) - x12 := endian.unchecked_get_u32le(nonce[0:4]) - x13 := endian.unchecked_get_u32le(nonce[4:8]) - x14 := endian.unchecked_get_u32le(nonce[8:12]) - x15 := endian.unchecked_get_u32le(nonce[12:16]) - - for i := _ROUNDS; i > 0; i = i - 2 { - // quarterround(x, 0, 4, 8, 12) - x0 += x4 - x12 ~= x0 - x12 = bits.rotate_left32(x12, 16) - x8 += x12 - x4 ~= x8 - x4 = bits.rotate_left32(x4, 12) - x0 += x4 - x12 ~= x0 - x12 = bits.rotate_left32(x12, 8) - x8 += x12 - x4 ~= x8 - x4 = bits.rotate_left32(x4, 7) - - // quarterround(x, 1, 5, 9, 13) - x1 += x5 - x13 ~= x1 - x13 = bits.rotate_left32(x13, 16) - x9 += x13 - x5 ~= x9 - x5 = bits.rotate_left32(x5, 12) - x1 += x5 - x13 ~= x1 - x13 = bits.rotate_left32(x13, 8) - x9 += x13 - x5 ~= x9 - x5 = bits.rotate_left32(x5, 7) - - // quarterround(x, 2, 6, 10, 14) - x2 += x6 - x14 ~= x2 - x14 = bits.rotate_left32(x14, 16) - x10 += x14 - x6 ~= x10 - x6 = bits.rotate_left32(x6, 12) - x2 += x6 - x14 ~= x2 - x14 = bits.rotate_left32(x14, 8) - x10 += x14 - x6 ~= x10 - x6 = bits.rotate_left32(x6, 7) - - // quarterround(x, 3, 7, 11, 15) - x3 += x7 - x15 ~= x3 - x15 = bits.rotate_left32(x15, 16) - x11 += x15 - x7 ~= x11 - x7 = bits.rotate_left32(x7, 12) - x3 += x7 - x15 ~= x3 - x15 = bits.rotate_left32(x15, 8) - x11 += x15 - x7 ~= x11 - x7 = bits.rotate_left32(x7, 7) - - // quarterround(x, 0, 5, 10, 15) - x0 += x5 - x15 ~= x0 - x15 = bits.rotate_left32(x15, 16) - x10 += x15 - x5 ~= x10 - x5 = bits.rotate_left32(x5, 12) - x0 += x5 - x15 ~= x0 - x15 = bits.rotate_left32(x15, 8) - x10 += x15 - x5 ~= x10 - x5 = bits.rotate_left32(x5, 7) - - // quarterround(x, 1, 6, 11, 12) - x1 += x6 - x12 ~= x1 - x12 = bits.rotate_left32(x12, 16) - x11 += x12 - x6 ~= x11 - x6 = bits.rotate_left32(x6, 12) - x1 += x6 - x12 ~= x1 - x12 = bits.rotate_left32(x12, 8) - x11 += x12 - x6 ~= x11 - x6 = bits.rotate_left32(x6, 7) - - // quarterround(x, 2, 7, 8, 13) - x2 += x7 - x13 ~= x2 - x13 = bits.rotate_left32(x13, 16) - x8 += x13 - x7 ~= x8 - x7 = bits.rotate_left32(x7, 12) - x2 += x7 - x13 ~= x2 - x13 = bits.rotate_left32(x13, 8) - x8 += x13 - x7 ~= x8 - x7 = bits.rotate_left32(x7, 7) - - // quarterround(x, 3, 4, 9, 14) - x3 += x4 - x14 ~= x3 - x14 = bits.rotate_left32(x14, 16) - x9 += x14 - x4 ~= x9 - x4 = bits.rotate_left32(x4, 12) - x3 += x4 - x14 ~= x3 - x14 = bits.rotate_left32(x14, 8) - x9 += x14 - x4 ~= x9 - x4 = bits.rotate_left32(x4, 7) - } - - endian.unchecked_put_u32le(dst[0:4], x0) - endian.unchecked_put_u32le(dst[4:8], x1) - endian.unchecked_put_u32le(dst[8:12], x2) - endian.unchecked_put_u32le(dst[12:16], x3) - endian.unchecked_put_u32le(dst[16:20], x12) - endian.unchecked_put_u32le(dst[20:24], x13) - endian.unchecked_put_u32le(dst[24:28], x14) - endian.unchecked_put_u32le(dst[28:32], x15) + _chacha20.reset(&ctx._state) } diff --git a/core/crypto/chacha20/chacha20_impl.odin b/core/crypto/chacha20/chacha20_impl.odin new file mode 100644 index 000000000..be2ee06b4 --- 
/dev/null +++ b/core/crypto/chacha20/chacha20_impl.odin @@ -0,0 +1,56 @@ +package chacha20 + +import "base:intrinsics" +import "core:crypto/_chacha20/ref" +import "core:crypto/_chacha20/simd128" +import "core:crypto/_chacha20/simd256" + +// DEFAULT_IMPLEMENTATION is the implementation that will be used by +// default if possible. +DEFAULT_IMPLEMENTATION :: Implementation.Simd256 + +// Implementation is a ChaCha20 implementation. Most callers will not need +// to use this as the package will automatically select the most performant +// implementation available. +Implementation :: enum { + Portable, + Simd128, + Simd256, +} + +@(private) +init_impl :: proc(ctx: ^Context, impl: Implementation) { + impl := impl + if impl == .Simd256 && !simd256.is_performant() { + impl = .Simd128 + } + if impl == .Simd128 && !simd128.is_performant() { + impl = .Portable + } + + ctx._impl = impl +} + +@(private) +stream_blocks :: proc(ctx: ^Context, dst, src: []byte, nr_blocks: int) { + switch ctx._impl { + case .Simd256: + simd256.stream_blocks(&ctx._state, dst, src, nr_blocks) + case .Simd128: + simd128.stream_blocks(&ctx._state, dst, src, nr_blocks) + case .Portable: + ref.stream_blocks(&ctx._state, dst, src, nr_blocks) + } +} + +@(private) +hchacha20 :: proc "contextless" (dst, key, iv: []byte, impl: Implementation) { + switch impl { + case .Simd256: + simd256.hchacha20(dst, key, iv) + case .Simd128: + simd128.hchacha20(dst, key, iv) + case .Portable: + ref.hchacha20(dst, key, iv) + } +} diff --git a/core/crypto/chacha20poly1305/chacha20poly1305.odin b/core/crypto/chacha20poly1305/chacha20poly1305.odin index 7fc112d0d..e2cd35a7e 100644 --- a/core/crypto/chacha20poly1305/chacha20poly1305.odin +++ b/core/crypto/chacha20poly1305/chacha20poly1305.odin @@ -1,9 +1,11 @@ /* -package chacha20poly1305 implements the AEAD_CHACHA20_POLY1305 Authenticated -Encryption with Additional Data algorithm. +package chacha20poly1305 implements the AEAD_CHACHA20_POLY1305 and +AEAD_XChaCha20_Poly1305 Authenticated Encryption with Additional Data +algorithms. See: - https://www.rfc-editor.org/rfc/rfc8439 +- https://datatracker.ietf.org/doc/html/draft-arciszewski-xchacha-03 */ package chacha20poly1305 @@ -15,8 +17,10 @@ import "core:mem" // KEY_SIZE is the chacha20poly1305 key size in bytes. KEY_SIZE :: chacha20.KEY_SIZE -// NONCE_SIZE is the chacha20poly1305 nonce size in bytes. -NONCE_SIZE :: chacha20.NONCE_SIZE +// IV_SIZE is the chacha20poly1305 IV size in bytes. +IV_SIZE :: chacha20.IV_SIZE +// XIV_SIZE is the xchacha20poly1305 IV size in bytes. +XIV_SIZE :: chacha20.XIV_SIZE // TAG_SIZE is the chacha20poly1305 tag size in bytes. TAG_SIZE :: poly1305.TAG_SIZE @@ -24,15 +28,13 @@ TAG_SIZE :: poly1305.TAG_SIZE _P_MAX :: 64 * 0xffffffff // 64 * (2^32-1) @(private) -_validate_common_slice_sizes :: proc (tag, key, nonce, aad, text: []byte) { +_validate_common_slice_sizes :: proc (tag, iv, aad, text: []byte, is_xchacha: bool) { if len(tag) != TAG_SIZE { panic("crypto/chacha20poly1305: invalid destination tag size") } - if len(key) != KEY_SIZE { - panic("crypto/chacha20poly1305: invalid key size") - } - if len(nonce) != NONCE_SIZE { - panic("crypto/chacha20poly1305: invalid nonce size") + expected_iv_len := is_xchacha ? 
XIV_SIZE : IV_SIZE + if len(iv) != expected_iv_len { + panic("crypto/chacha20poly1305: invalid IV size") } #assert(size_of(int) == 8 || size_of(int) <= 4) @@ -59,18 +61,52 @@ _update_mac_pad16 :: #force_inline proc (ctx: ^poly1305.Context, x_len: int) { } } -// encrypt encrypts the plaintext and authenticates the aad and ciphertext, -// with the provided key and nonce, stores the output in ciphertext and tag. -encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) { - _validate_common_slice_sizes(tag, key, nonce, aad, plaintext) +// Context is a keyed (X)Chacha20Poly1305 instance. +Context :: struct { + _key: [KEY_SIZE]byte, + _impl: chacha20.Implementation, + _is_xchacha: bool, + _is_initialized: bool, +} + +// init initializes a Context with the provided key, for AEAD_CHACHA20_POLY1305. +init :: proc(ctx: ^Context, key: []byte, impl := chacha20.DEFAULT_IMPLEMENTATION) { + if len(key) != KEY_SIZE { + panic("crypto/chacha20poly1305: invalid key size") + } + + copy(ctx._key[:], key) + ctx._impl = impl + ctx._is_xchacha = false + ctx._is_initialized = true +} + +// init_xchacha initializes a Context with the provided key, for +// AEAD_XChaCha20_Poly1305. +// +// Note: While there are multiple definitions of XChaCha20-Poly1305 +// this sticks to the IETF draft and uses a 32-bit counter. +init_xchacha :: proc(ctx: ^Context, key: []byte, impl := chacha20.DEFAULT_IMPLEMENTATION) { + init(ctx, key, impl) + ctx._is_xchacha = true +} + +// seal encrypts the plaintext and authenticates the aad and ciphertext, +// with the provided Context and iv, stores the output in dst and tag. +// +// dst and plaintext MUST alias exactly or not at all. +seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { + ciphertext := dst + _validate_common_slice_sizes(tag, iv, aad, plaintext, ctx._is_xchacha) if len(ciphertext) != len(plaintext) { panic("crypto/chacha20poly1305: invalid destination ciphertext size") } stream_ctx: chacha20.Context = --- - chacha20.init(&stream_ctx, key, nonce) + chacha20.init(&stream_ctx, ctx._key[:],iv, ctx._impl) + stream_ctx._state._is_ietf_flavor = true - // otk = poly1305_key_gen(key, nonce) + // otk = poly1305_key_gen(key, iv) otk: [poly1305.KEY_SIZE]byte = --- chacha20.keystream_bytes(&stream_ctx, otk[:]) mac_ctx: poly1305.Context = --- @@ -87,7 +123,7 @@ encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) { poly1305.update(&mac_ctx, aad) _update_mac_pad16(&mac_ctx, aad_len) - // ciphertext = chacha20_encrypt(key, 1, nonce, plaintext) + // ciphertext = chacha20_encrypt(key, 1, iv, plaintext) chacha20.seek(&stream_ctx, 1) chacha20.xor_bytes(&stream_ctx, ciphertext, plaintext) chacha20.reset(&stream_ctx) // Don't need the stream context anymore. @@ -107,13 +143,16 @@ encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) { poly1305.final(&mac_ctx, tag) // Implicitly sanitizes context. } -// decrypt authenticates the aad and ciphertext, and decrypts the ciphertext, -// with the provided key, nonce, and tag, and stores the output in plaintext, -// returning true iff the authentication was successful. +// open authenticates the aad and ciphertext, and decrypts the ciphertext, +// with the provided Context, iv, and tag, and stores the output in dst, +// returning true iff the authentication was successful. If authentication +// fails, the destination buffer will be zeroed. // -// If authentication fails, the destination plaintext buffer will be zeroed. 
-decrypt :: proc (plaintext, tag, key, nonce, aad, ciphertext: []byte) -> bool { - _validate_common_slice_sizes(tag, key, nonce, aad, ciphertext) +// dst and plaintext MUST alias exactly or not at all. +@(require_results) +open :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool { + plaintext := dst + _validate_common_slice_sizes(tag, iv, aad, ciphertext, ctx._is_xchacha) if len(ciphertext) != len(plaintext) { panic("crypto/chacha20poly1305: invalid destination plaintext size") } @@ -123,9 +162,10 @@ decrypt :: proc (plaintext, tag, key, nonce, aad, ciphertext: []byte) -> bool { // points where needed. stream_ctx: chacha20.Context = --- - chacha20.init(&stream_ctx, key, nonce) + chacha20.init(&stream_ctx, ctx._key[:], iv, ctx._impl) + stream_ctx._state._is_ietf_flavor = true - // otk = poly1305_key_gen(key, nonce) + // otk = poly1305_key_gen(key, iv) otk: [poly1305.KEY_SIZE]byte = --- chacha20.keystream_bytes(&stream_ctx, otk[:]) defer chacha20.reset(&stream_ctx) @@ -160,9 +200,17 @@ decrypt :: proc (plaintext, tag, key, nonce, aad, ciphertext: []byte) -> bool { return false } - // plaintext = chacha20_decrypt(key, 1, nonce, ciphertext) + // plaintext = chacha20_decrypt(key, 1, iv, ciphertext) chacha20.seek(&stream_ctx, 1) chacha20.xor_bytes(&stream_ctx, plaintext, ciphertext) return true } + +// reset sanitizes the Context. The Context must be +// re-initialized to be used again. +reset :: proc "contextless" (ctx: ^Context) { + mem.zero_explicit(&ctx._key, len(ctx._key)) + ctx._is_xchacha = false + ctx._is_initialized = false +} diff --git a/core/crypto/ed25519/ed25519.odin b/core/crypto/ed25519/ed25519.odin index 86da35669..5584b06f7 100644 --- a/core/crypto/ed25519/ed25519.odin +++ b/core/crypto/ed25519/ed25519.odin @@ -21,7 +21,7 @@ PUBLIC_KEY_SIZE :: 32 SIGNATURE_SIZE :: 64 @(private) -NONCE_SIZE :: 32 +HDIGEST2_SIZE :: 32 // Private_Key is an Ed25519 private key. Private_Key :: struct { @@ -33,7 +33,7 @@ Private_Key :: struct { // See: https://github.com/MystenLabs/ed25519-unsafe-libs _b: [PRIVATE_KEY_SIZE]byte, _s: grp.Scalar, - _nonce: [NONCE_SIZE]byte, + _hdigest2: [HDIGEST2_SIZE]byte, _pub_key: Public_Key, _is_initialized: bool, } @@ -63,7 +63,7 @@ private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool { sha2.final(&ctx, h_bytes[:]) copy(priv_key._b[:], b) - copy(priv_key._nonce[:], h_bytes[32:]) + copy(priv_key._hdigest2[:], h_bytes[32:]) grp.sc_set_bytes_rfc8032(&priv_key._s, h_bytes[:32]) // Derive the corresponding public key. @@ -116,7 +116,7 @@ sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) { ctx: sha2.Context_512 = --- digest_bytes: [sha2.DIGEST_SIZE_512]byte = --- sha2.init_512(&ctx) - sha2.update(&ctx, priv_key._nonce[:]) + sha2.update(&ctx, priv_key._hdigest2[:]) sha2.update(&ctx, msg) sha2.final(&ctx, digest_bytes[:]) diff --git a/core/crypto/hash/hash.odin b/core/crypto/hash/hash.odin index e4b3d4be1..f7671270a 100644 --- a/core/crypto/hash/hash.odin +++ b/core/crypto/hash/hash.odin @@ -28,20 +28,26 @@ hash_bytes :: proc(algorithm: Algorithm, data: []byte, allocator := context.allo // hash_string_to_buffer will hash the given input and assign the // computed digest to the third parameter. It requires that the -// destination buffer is at least as big as the digest size. -hash_string_to_buffer :: proc(algorithm: Algorithm, data: string, hash: []byte) { - hash_bytes_to_buffer(algorithm, transmute([]byte)(data), hash) +// destination buffer is at least as big as the digest size. 
The +// provided destination buffer is returned to match the behavior of +// `hash_string`. +hash_string_to_buffer :: proc(algorithm: Algorithm, data: string, hash: []byte) -> []byte { + return hash_bytes_to_buffer(algorithm, transmute([]byte)(data), hash) } // hash_bytes_to_buffer will hash the given input and write the // computed digest into the third parameter. It requires that the -// destination buffer is at least as big as the digest size. -hash_bytes_to_buffer :: proc(algorithm: Algorithm, data, hash: []byte) { +// destination buffer is at least as big as the digest size. The +// provided destination buffer is returned to match the behavior of +// `hash_bytes`. +hash_bytes_to_buffer :: proc(algorithm: Algorithm, data, hash: []byte) -> []byte { ctx: Context init(&ctx, algorithm) update(&ctx, data) final(&ctx, hash) + + return hash } // hash_stream will incrementally fully consume a stream, and return the diff --git a/examples/all/all_main.odin b/examples/all/all_main.odin index d92a6b8c4..62b891352 100644 --- a/examples/all/all_main.odin +++ b/examples/all/all_main.odin @@ -25,6 +25,7 @@ import rbtree "core:container/rbtree" import topological_sort "core:container/topological_sort" import crypto "core:crypto" +import aead "core:crypto/aead" import aes "core:crypto/aes" import blake2b "core:crypto/blake2b" import blake2s "core:crypto/blake2s" @@ -164,6 +165,7 @@ _ :: rbtree _ :: topological_sort _ :: crypto _ :: crypto_hash +_ :: aead _ :: aes _ :: blake2b _ :: blake2s diff --git a/tests/benchmark/crypto/benchmark_crypto.odin b/tests/benchmark/crypto/benchmark_crypto.odin index b2ac4bca3..66c9f89d3 100644 --- a/tests/benchmark/crypto/benchmark_crypto.odin +++ b/tests/benchmark/crypto/benchmark_crypto.odin @@ -279,13 +279,13 @@ _benchmark_chacha20 :: proc( 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, } - nonce := [chacha20.NONCE_SIZE]byte { + iv := [chacha20.IV_SIZE]byte { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, } ctx: chacha20.Context = --- - chacha20.init(&ctx, key[:], nonce[:]) + chacha20.init(&ctx, key[:], iv[:]) for _ in 0 ..= options.rounds { chacha20.xor_bytes(&ctx, buf, buf) @@ -334,15 +334,18 @@ _benchmark_chacha20poly1305 :: proc( 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, } - nonce := [chacha20.NONCE_SIZE]byte { + iv := [chacha20.IV_SIZE]byte { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, } + ctx: chacha20poly1305.Context = --- + chacha20poly1305.init(&ctx, key[:]) // Basically 0 overhead. 
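+	// (Context init is little more than a key copy, so keying once outside
+	// the loop does not skew the measurement.)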
+ tag: [chacha20poly1305.TAG_SIZE]byte = --- for _ in 0 ..= options.rounds { - chacha20poly1305.encrypt(buf, tag[:], key[:], nonce[:], nil, buf) + chacha20poly1305.seal(&ctx, buf, tag[:], iv[:], nil, buf) } options.count = options.rounds options.processed = options.rounds * options.bytes @@ -363,13 +366,13 @@ _benchmark_aes256_ctr :: proc( 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, } - nonce := [aes.CTR_IV_SIZE]byte { + iv := [aes.CTR_IV_SIZE]byte { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, } ctx: aes.Context_CTR = --- - aes.init_ctr(&ctx, key[:], nonce[:]) + aes.init_ctr(&ctx, key[:], iv[:]) for _ in 0 ..= options.rounds { aes.xor_bytes_ctr(&ctx, buf, buf) @@ -386,13 +389,13 @@ _benchmark_aes256_gcm :: proc( err: time.Benchmark_Error, ) { buf := options.input - nonce: [aes.GCM_NONCE_SIZE]byte + iv: [aes.GCM_IV_SIZE]byte tag: [aes.GCM_TAG_SIZE]byte = --- ctx := transmute(^aes.Context_GCM)context.user_ptr for _ in 0 ..= options.rounds { - aes.seal_gcm(ctx, buf, tag[:], nonce[:], nil, buf) + aes.seal_gcm(ctx, buf, tag[:], iv[:], nil, buf) } options.count = options.rounds options.processed = options.rounds * options.bytes diff --git a/tests/core/crypto/test_core_crypto.odin b/tests/core/crypto/test_core_crypto.odin index f3f76646b..b3eb6e041 100644 --- a/tests/core/crypto/test_core_crypto.odin +++ b/tests/core/crypto/test_core_crypto.odin @@ -19,15 +19,39 @@ import "base:runtime" import "core:log" import "core:crypto" +import chacha_simd128 "core:crypto/_chacha20/simd128" +import chacha_simd256 "core:crypto/_chacha20/simd256" import "core:crypto/chacha20" -import "core:crypto/chacha20poly1305" +import "core:crypto/sha2" +@(private) _PLAINTEXT_SUNSCREEN_STR := "Ladies and Gentlemen of the class of '99: If I could offer you only one tip for the future, sunscreen would be it." @(test) test_chacha20 :: proc(t: ^testing.T) { runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD() + impls := supported_chacha_impls() + + for impl in impls { + test_chacha20_stream(t, impl) + } +} + +supported_chacha_impls :: proc() -> [dynamic]chacha20.Implementation { + impls := make([dynamic]chacha20.Implementation, 0, 3, context.temp_allocator) + append(&impls, chacha20.Implementation.Portable) + if chacha_simd128.is_performant() { + append(&impls, chacha20.Implementation.Simd128) + } + if chacha_simd256.is_performant() { + append(&impls, chacha20.Implementation.Simd256) + } + + return impls +} + +test_chacha20_stream :: proc(t: ^testing.T, impl: chacha20.Implementation) { // Test cases taken from RFC 8439, and draft-irtf-cfrg-xchacha-03 plaintext := transmute([]byte)(_PLAINTEXT_SUNSCREEN_STR) @@ -38,7 +62,7 @@ test_chacha20 :: proc(t: ^testing.T) { 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, } - nonce := [chacha20.NONCE_SIZE]byte { + iv := [chacha20.IV_SIZE]byte { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x00, } @@ -64,7 +88,7 @@ test_chacha20 :: proc(t: ^testing.T) { derived_ciphertext: [114]byte ctx: chacha20.Context = --- - chacha20.init(&ctx, key[:], nonce[:]) + chacha20.init(&ctx, key[:], iv[:], impl) chacha20.seek(&ctx, 1) // The test vectors start the counter at 1. 
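+	// (Counter 0 is reserved for deriving the Poly1305 one-time key in the
+	// RFC 8439 AEAD construction, hence encryption starts at block 1.)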
chacha20.xor_bytes(&ctx, derived_ciphertext[:], plaintext[:]) @@ -72,7 +96,8 @@ test_chacha20 :: proc(t: ^testing.T) { testing.expectf( t, derived_ciphertext_str == ciphertext_str, - "Expected %s for xor_bytes(plaintext_str), but got %s instead", + "chacha20/%v: Expected %s for xor_bytes(plaintext_str), but got %s instead", + impl, ciphertext_str, derived_ciphertext_str, ) @@ -84,7 +109,7 @@ test_chacha20 :: proc(t: ^testing.T) { 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, } - xnonce := [chacha20.XNONCE_SIZE]byte { + xiv := [chacha20.XIV_SIZE]byte { 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, @@ -109,7 +134,7 @@ test_chacha20 :: proc(t: ^testing.T) { } xciphertext_str := string(hex.encode(xciphertext[:], context.temp_allocator)) - chacha20.init(&ctx, xkey[:], xnonce[:]) + chacha20.init(&ctx, xkey[:], xiv[:], impl) chacha20.seek(&ctx, 1) chacha20.xor_bytes(&ctx, derived_ciphertext[:], plaintext[:]) @@ -117,128 +142,44 @@ test_chacha20 :: proc(t: ^testing.T) { testing.expectf( t, derived_ciphertext_str == xciphertext_str, - "Expected %s for xor_bytes(plaintext_str), but got %s instead", + "chacha20/%v: Expected %s for xor_bytes(plaintext_str), but got %s instead", + impl, xciphertext_str, derived_ciphertext_str, ) -} -@(test) -test_chacha20poly1305 :: proc(t: ^testing.T) { - plaintext := transmute([]byte)(_PLAINTEXT_SUNSCREEN_STR) + // Incrementally read 1, 2, 3, ..., 2048 bytes of keystream, and + // compare the SHA-512/256 digest with a known value. Results + // and testcase taken from a known good implementation by the + // same author as the Odin test case. - aad := [12]byte { - 0x50, 0x51, 0x52, 0x53, 0xc0, 0xc1, 0xc2, 0xc3, - 0xc4, 0xc5, 0xc6, 0xc7, + tmp := make([]byte, 2048, context.temp_allocator) + + mem.zero(&key, size_of(key)) + mem.zero(&iv, size_of(iv)) + chacha20.init(&ctx, key[:], iv[:], impl) + + h_ctx: sha2.Context_512 + sha2.init_512_256(&h_ctx) + + for i := 1; i <= 2048; i = i + 1 { + chacha20.keystream_bytes(&ctx, tmp[:i]) + sha2.update(&h_ctx, tmp[:i]) } - key := [chacha20poly1305.KEY_SIZE]byte { - 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, - 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, - 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, - 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, - } + digest: [32]byte + sha2.final(&h_ctx, digest[:]) + digest_str := string(hex.encode(digest[:], context.temp_allocator)) - nonce := [chacha20poly1305.NONCE_SIZE]byte { - 0x07, 0x00, 0x00, 0x00, 0x40, 0x41, 0x42, 0x43, - 0x44, 0x45, 0x46, 0x47, - } - - ciphertext := [114]byte { - 0xd3, 0x1a, 0x8d, 0x34, 0x64, 0x8e, 0x60, 0xdb, - 0x7b, 0x86, 0xaf, 0xbc, 0x53, 0xef, 0x7e, 0xc2, - 0xa4, 0xad, 0xed, 0x51, 0x29, 0x6e, 0x08, 0xfe, - 0xa9, 0xe2, 0xb5, 0xa7, 0x36, 0xee, 0x62, 0xd6, - 0x3d, 0xbe, 0xa4, 0x5e, 0x8c, 0xa9, 0x67, 0x12, - 0x82, 0xfa, 0xfb, 0x69, 0xda, 0x92, 0x72, 0x8b, - 0x1a, 0x71, 0xde, 0x0a, 0x9e, 0x06, 0x0b, 0x29, - 0x05, 0xd6, 0xa5, 0xb6, 0x7e, 0xcd, 0x3b, 0x36, - 0x92, 0xdd, 0xbd, 0x7f, 0x2d, 0x77, 0x8b, 0x8c, - 0x98, 0x03, 0xae, 0xe3, 0x28, 0x09, 0x1b, 0x58, - 0xfa, 0xb3, 0x24, 0xe4, 0xfa, 0xd6, 0x75, 0x94, - 0x55, 0x85, 0x80, 0x8b, 0x48, 0x31, 0xd7, 0xbc, - 0x3f, 0xf4, 0xde, 0xf0, 0x8e, 0x4b, 0x7a, 0x9d, - 0xe5, 0x76, 0xd2, 0x65, 0x86, 0xce, 0xc6, 0x4b, - 0x61, 0x16, - } - ciphertext_str := string(hex.encode(ciphertext[:], context.temp_allocator)) - - tag := [chacha20poly1305.TAG_SIZE]byte { - 0x1a, 0xe1, 0x0b, 0x59, 0x4f, 0x09, 0xe2, 0x6a, - 0x7e, 0x90, 0x2e, 0xcb, 
0xd0, 0x60, 0x06, 0x91, - } - tag_str := string(hex.encode(tag[:], context.temp_allocator)) - - derived_tag: [chacha20poly1305.TAG_SIZE]byte - derived_ciphertext: [114]byte - - chacha20poly1305.encrypt( - derived_ciphertext[:], - derived_tag[:], - key[:], - nonce[:], - aad[:], - plaintext, - ) - - derived_ciphertext_str := string(hex.encode(derived_ciphertext[:], context.temp_allocator)) + expected_digest_str := "cfd6e949225b854fe04946491e6935ff05ff983d1554bc885bca0ec8082dd5b8" testing.expectf( t, - derived_ciphertext_str == ciphertext_str, - "Expected ciphertext %s for encrypt(aad, plaintext), but got %s instead", - ciphertext_str, - derived_ciphertext_str, + expected_digest_str == digest_str, + "chacha20/%v: Expected %s for keystream digest, but got %s instead", + impl, + expected_digest_str, + digest_str, ) - - derived_tag_str := string(hex.encode(derived_tag[:], context.temp_allocator)) - testing.expectf( - t, - derived_tag_str == tag_str, - "Expected tag %s for encrypt(aad, plaintext), but got %s instead", - tag_str, - derived_tag_str, - ) - - derived_plaintext: [114]byte - ok := chacha20poly1305.decrypt( - derived_plaintext[:], - tag[:], - key[:], - nonce[:], - aad[:], - ciphertext[:], - ) - derived_plaintext_str := string(derived_plaintext[:]) - testing.expect(t, ok, "Expected true for decrypt(tag, aad, ciphertext)") - testing.expectf( - t, - derived_plaintext_str == _PLAINTEXT_SUNSCREEN_STR, - "Expected plaintext %s for decrypt(tag, aad, ciphertext), but got %s instead", - _PLAINTEXT_SUNSCREEN_STR, - derived_plaintext_str, - ) - - derived_ciphertext[0] ~= 0xa5 - ok = chacha20poly1305.decrypt( - derived_plaintext[:], - tag[:], - key[:], - nonce[:], - aad[:], - derived_ciphertext[:], - ) - testing.expect(t, !ok, "Expected false for decrypt(tag, aad, corrupted_ciphertext)") - - aad[0] ~= 0xa5 - ok = chacha20poly1305.decrypt( - derived_plaintext[:], - tag[:], - key[:], - nonce[:], - aad[:], - ciphertext[:], - ) - testing.expect(t, !ok, "Expected false for decrypt(tag, corrupted_aad, ciphertext)") } @(test) diff --git a/tests/core/crypto/test_core_crypto_aead.odin b/tests/core/crypto/test_core_crypto_aead.odin new file mode 100644 index 000000000..90eedc0b2 --- /dev/null +++ b/tests/core/crypto/test_core_crypto_aead.odin @@ -0,0 +1,339 @@ +package test_core_crypto + +import "base:runtime" +import "core:crypto/aead" +import "core:encoding/hex" +import "core:testing" + +@(test) +test_aead :: proc(t: ^testing.T) { + runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD() + + aes_impls := make([dynamic]aead.Implementation, context.temp_allocator) + for impl in supported_aes_impls() { + append(&aes_impls, impl) + } + chacha_impls := make([dynamic]aead.Implementation, context.temp_allocator) + for impl in supported_chacha_impls() { + append(&chacha_impls, impl) + } + impls := [aead.Algorithm][dynamic]aead.Implementation{ + .Invalid = nil, + .AES_GCM_128 = aes_impls, + .AES_GCM_192 = aes_impls, + .AES_GCM_256 = aes_impls, + .CHACHA20POLY1305 = chacha_impls, + .XCHACHA20POLY1305 = chacha_impls, + } + + test_vectors := []struct{ + algo: aead.Algorithm, + key: string, + iv: string, + aad: string, + plaintext: string, + ciphertext: string, + tag: string, + } { + // AES-GCM + // - https://csrc.nist.rip/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf + // + // Note: NIST did a reorg of their site, so the source of the test vectors + // is only available from an archive. 
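+	// Each entry is (algo, key, iv, aad, plaintext, ciphertext, tag), with
+	// every byte-string field hex-encoded.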
+ { + .AES_GCM_128, + "00000000000000000000000000000000", + "000000000000000000000000", + "", + "", + "", + "58e2fccefa7e3061367f1d57a4e7455a", + }, + { + .AES_GCM_128, + "00000000000000000000000000000000", + "000000000000000000000000", + "", + "00000000000000000000000000000000", + "0388dace60b6a392f328c2b971b2fe78", + "ab6e47d42cec13bdf53a67b21257bddf", + }, + { + .AES_GCM_128, + "feffe9928665731c6d6a8f9467308308", + "cafebabefacedbaddecaf888", + "", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255", + "42831ec2217774244b7221b784d0d49ce3aa212f2c02a4e035c17e2329aca12e21d514b25466931c7d8f6a5aac84aa051ba30b396a0aac973d58e091473f5985", + "4d5c2af327cd64a62cf35abd2ba6fab4", + }, + { + .AES_GCM_128, + "feffe9928665731c6d6a8f9467308308", + "cafebabefacedbaddecaf888", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "42831ec2217774244b7221b784d0d49ce3aa212f2c02a4e035c17e2329aca12e21d514b25466931c7d8f6a5aac84aa051ba30b396a0aac973d58e091", + "5bc94fbc3221a5db94fae95ae7121a47", + }, + { + .AES_GCM_128, + "feffe9928665731c6d6a8f9467308308", + "cafebabefacedbad", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "61353b4c2806934a777ff51fa22a4755699b2a714fcdc6f83766e5f97b6c742373806900e49f24b22b097544d4896b424989b5e1ebac0f07c23f4598", + "3612d2e79e3b0785561be14aaca2fccb", + }, + { + .AES_GCM_128, + "feffe9928665731c6d6a8f9467308308", + "9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "8ce24998625615b603a033aca13fb894be9112a5c3a211a8ba262a3cca7e2ca701e4a9a4fba43c90ccdcb281d48c7c6fd62875d2aca417034c34aee5", + "619cc5aefffe0bfa462af43c1699d050", + }, + { + .AES_GCM_192, + "000000000000000000000000000000000000000000000000", + "000000000000000000000000", + "", + "", + "", + "cd33b28ac773f74ba00ed1f312572435", + }, + { + .AES_GCM_192, + "000000000000000000000000000000000000000000000000", + "000000000000000000000000", + "", + "00000000000000000000000000000000", + "98e7247c07f0fe411c267e4384b0f600", + "2ff58d80033927ab8ef4d4587514f0fb", + }, + { + .AES_GCM_192, + "feffe9928665731c6d6a8f9467308308feffe9928665731c", + "cafebabefacedbaddecaf888", + "", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255", + "3980ca0b3c00e841eb06fac4872a2757859e1ceaa6efd984628593b40ca1e19c7d773d00c144c525ac619d18c84a3f4718e2448b2fe324d9ccda2710acade256", + "9924a7c8587336bfb118024db8674a14", + }, + { + .AES_GCM_192, + "feffe9928665731c6d6a8f9467308308feffe9928665731c", + "cafebabefacedbaddecaf888", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "3980ca0b3c00e841eb06fac4872a2757859e1ceaa6efd984628593b40ca1e19c7d773d00c144c525ac619d18c84a3f4718e2448b2fe324d9ccda2710", + "2519498e80f1478f37ba55bd6d27618c", + }, + { + .AES_GCM_192, + "feffe9928665731c6d6a8f9467308308feffe9928665731c", + "cafebabefacedbad", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + 
"d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "0f10f599ae14a154ed24b36e25324db8c566632ef2bbb34f8347280fc4507057fddc29df9a471f75c66541d4d4dad1c9e93a19a58e8b473fa0f062f7", + "65dcc57fcf623a24094fcca40d3533f8", + }, + { + .AES_GCM_192, + "feffe9928665731c6d6a8f9467308308feffe9928665731c", + "9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "d27e88681ce3243c4830165a8fdcf9ff1de9a1d8e6b447ef6ef7b79828666e4581e79012af34ddd9e2f037589b292db3e67c036745fa22e7e9b7373b", + "dcf566ff291c25bbb8568fc3d376a6d9", + }, + { + .AES_GCM_256, + "0000000000000000000000000000000000000000000000000000000000000000", + "000000000000000000000000", + "", + "", + "", + "530f8afbc74536b9a963b4f1c4cb738b", + }, + { + .AES_GCM_256, + "0000000000000000000000000000000000000000000000000000000000000000", + "000000000000000000000000", + "", + "00000000000000000000000000000000", + "cea7403d4d606b6e074ec5d3baf39d18", + "d0d1c8a799996bf0265b98b5d48ab919", + }, + { + .AES_GCM_256, + "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308", + "cafebabefacedbaddecaf888", + "", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255", + "522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662898015ad", + "b094dac5d93471bdec1a502270e3cc6c", + }, + { + .AES_GCM_256, + "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308", + "cafebabefacedbaddecaf888", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662", + "76fc6ece0f4e1768cddf8853bb2d551b", + }, + { + .AES_GCM_256, + "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308", + "cafebabefacedbad", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "c3762df1ca787d32ae47c13bf19844cbaf1ae14d0b976afac52ff7d79bba9de0feb582d33934a4f0954cc2363bc73f7862ac430e64abe499f47c9b1f", + "3a337dbf46a792c45e454913fe2ea8f2", + }, + { + .AES_GCM_256, + "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308", + "9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "5a8def2f0c9e53f1f75d7853659e2a20eeb2b22aafde6419a058ab4f6f746bf40fc0c3b780f244452da3ebf1c5d82cdea2418997200ef82e44ae7e3f", + "a44a8266ee1c8eb0c8b5d4cf5ae9f19a", + }, + // Chacha20-Poly1305 + // https://www.rfc-editor.org/rfc/rfc8439 + { + .CHACHA20POLY1305, + "808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f", + "070000004041424344454647", + "50515253c0c1c2c3c4c5c6c7", + string(hex.encode(transmute([]byte)(_PLAINTEXT_SUNSCREEN_STR), context.temp_allocator)), + 
"d31a8d34648e60db7b86afbc53ef7ec2a4aded51296e08fea9e2b5a736ee62d63dbea45e8ca9671282fafb69da92728b1a71de0a9e060b2905d6a5b67ecd3b3692ddbd7f2d778b8c9803aee328091b58fab324e4fad675945585808b4831d7bc3ff4def08e4b7a9de576d26586cec64b6116", + "1ae10b594f09e26a7e902ecbd0600691", + }, + // XChaCha20-Poly1305-IETF + // - https://datatracker.ietf.org/doc/html/draft-arciszewski-xchacha-03 + { + .XCHACHA20POLY1305, + "808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f", + "404142434445464748494a4b4c4d4e4f5051525354555657", + "50515253c0c1c2c3c4c5c6c7", + "4c616469657320616e642047656e746c656d656e206f662074686520636c617373206f66202739393a204966204920636f756c64206f6666657220796f75206f6e6c79206f6e652074697020666f7220746865206675747572652c2073756e73637265656e20776f756c642062652069742e", + "bd6d179d3e83d43b9576579493c0e939572a1700252bfaccbed2902c21396cbb731c7f1b0b4aa6440bf3a82f4eda7e39ae64c6708c54c216cb96b72e1213b4522f8c9ba40db5d945b11b69b982c1bb9e3f3fac2bc369488f76b2383565d3fff921f9664c97637da9768812f615c68b13b52e", + "c0875924c1c7987947deafd8780acf49", + }, + } + for v, _ in test_vectors { + algo_name := aead.ALGORITHM_NAMES[v.algo] + + key, _ := hex.decode(transmute([]byte)(v.key), context.temp_allocator) + iv, _ := hex.decode(transmute([]byte)(v.iv), context.temp_allocator) + aad, _ := hex.decode(transmute([]byte)(v.aad), context.temp_allocator) + plaintext, _ := hex.decode(transmute([]byte)(v.plaintext), context.temp_allocator) + ciphertext, _ := hex.decode(transmute([]byte)(v.ciphertext), context.temp_allocator) + tag, _ := hex.decode(transmute([]byte)(v.tag), context.temp_allocator) + + tag_ := make([]byte, len(tag), context.temp_allocator) + dst := make([]byte, len(ciphertext), context.temp_allocator) + + ctx: aead.Context + for impl in impls[v.algo] { + aead.init(&ctx, v.algo, key, impl) + + aead.seal(&ctx, dst, tag_, iv, aad, plaintext) + dst_str := string(hex.encode(dst, context.temp_allocator)) + tag_str := string(hex.encode(tag_, context.temp_allocator)) + testing.expectf( + t, + dst_str == v.ciphertext && tag_str == v.tag, + "%s/%v: Expected: (%s, %s) for seal_ctx(%s, %s, %s, %s), but got (%s, %s) instead", + algo_name, + impl, + v.ciphertext, + v.tag, + v.key, + v.iv, + v.aad, + v.plaintext, + dst_str, + tag_str, + ) + + aead.seal(v.algo, dst, tag_, key, iv, aad, plaintext, impl) + dst_str = string(hex.encode(dst, context.temp_allocator)) + tag_str = string(hex.encode(tag_, context.temp_allocator)) + testing.expectf( + t, + dst_str == v.ciphertext && tag_str == v.tag, + "%s/%v: Expected: (%s, %s) for seal_oneshot(%s, %s, %s, %s), but got (%s, %s) instead", + algo_name, + impl, + v.ciphertext, + v.tag, + v.key, + v.iv, + v.aad, + v.plaintext, + dst_str, + tag_str, + ) + + ok := aead.open(&ctx, dst, iv, aad, ciphertext, tag) + dst_str = string(hex.encode(dst, context.temp_allocator)) + testing.expectf( + t, + ok && dst_str == v.plaintext, + "%s/%v: Expected: (%s, true) for open_ctx(%s, %s, %s, %s, %s), but got (%s, %v) instead", + algo_name, + impl, + v.plaintext, + v.key, + v.iv, + v.aad, + v.ciphertext, + v.tag, + dst_str, + ok, + ) + + ok = aead.open(v.algo, dst, key, iv, aad, ciphertext, tag, impl) + dst_str = string(hex.encode(dst, context.temp_allocator)) + testing.expectf( + t, + ok && dst_str == v.plaintext, + "%s/%v: Expected: (%s, true) for open_oneshot(%s, %s, %s, %s, %s), but got (%s, %v) instead", + algo_name, + impl, + v.plaintext, + v.key, + v.iv, + v.aad, + v.ciphertext, + v.tag, + dst_str, + ok, + ) + + tag_[0] ~= 0xa5 + ok = aead.open(&ctx, dst, iv, aad, ciphertext, 
tag_) + testing.expectf(t, !ok, "%s/%v: Expected false for open(bad_tag, aad, ciphertext)", algo_name, impl) + + if len(dst) > 0 { + copy(dst, ciphertext[:]) + dst[0] ~= 0xa5 + ok = aead.open(&ctx, dst, iv, aad, dst, tag) + testing.expectf(t, !ok, "%s/%v: Expected false for open(tag, aad, bad_ciphertext)", algo_name, impl) + } + + if len(aad) > 0 { + aad_ := make([]byte, len(aad), context.temp_allocator) + copy(aad_, aad) + aad_[0] ~= 0xa5 + ok = aead.open(&ctx, dst, iv, aad_, ciphertext, tag) + testing.expectf(t, !ok, "%s/%v: Expected false for open(tag, bad_aad, ciphertext)", algo_name, impl) + } + } + } +} diff --git a/tests/core/crypto/test_core_crypto_aes.odin b/tests/core/crypto/test_core_crypto_aes.odin index c2fa2835c..b68b30976 100644 --- a/tests/core/crypto/test_core_crypto_aes.odin +++ b/tests/core/crypto/test_core_crypto_aes.odin @@ -12,18 +12,22 @@ import "core:crypto/sha2" test_aes :: proc(t: ^testing.T) { runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD() - impls := make([dynamic]aes.Implementation, 0, 2) - defer delete(impls) + impls := supported_aes_impls() + + for impl in impls { + test_aes_ecb(t, impl) + test_aes_ctr(t, impl) + } +} + +supported_aes_impls :: proc() -> [dynamic]aes.Implementation { + impls := make([dynamic]aes.Implementation, 0, 2, context.temp_allocator) append(&impls, aes.Implementation.Portable) if aes.is_hardware_accelerated() { append(&impls, aes.Implementation.Hardware) } - for impl in impls { - test_aes_ecb(t, impl) - test_aes_ctr(t, impl) - test_aes_gcm(t, impl) - } + return impls } test_aes_ecb :: proc(t: ^testing.T, impl: aes.Implementation) { @@ -197,13 +201,13 @@ test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) { ctx: aes.Context_CTR key: [aes.KEY_SIZE_256]byte - nonce: [aes.CTR_IV_SIZE]byte - aes.init_ctr(&ctx, key[:], nonce[:], impl) + iv: [aes.CTR_IV_SIZE]byte + aes.init_ctr(&ctx, key[:], iv[:], impl) h_ctx: sha2.Context_512 sha2.init_512_256(&h_ctx) - for i := 1; i < 2048; i = i + 1 { + for i := 1; i <= 2048; i = i + 1 { aes.keystream_bytes_ctr(&ctx, tmp[:i]) sha2.update(&h_ctx, tmp[:i]) } @@ -212,7 +216,7 @@ test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) { sha2.final(&h_ctx, digest[:]) digest_str := string(hex.encode(digest[:], context.temp_allocator)) - expected_digest_str := "d4445343afeb9d1237f95b10d00358aed4c1d7d57c9fe480cd0afb5e2ffd448c" + expected_digest_str := "b5ba4e7d6e3d1ff2bb54387fc1528573a6b351610ce7bcc80b00da089f4b1bf0" testing.expectf( t, expected_digest_str == digest_str, @@ -222,223 +226,3 @@ test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) { digest_str, ) } - -test_aes_gcm :: proc(t: ^testing.T, impl: aes.Implementation) { - log.debugf("Testing AES-GCM/%v", impl) - - // NIST did a reorg of their site, so the source of the test vectors - // is only available from an archive. The commented out tests are - // for non-96-bit IVs which our implementation does not support. 
- // - // https://csrc.nist.rip/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf - test_vectors := []struct { - key: string, - iv: string, - aad: string, - plaintext: string, - ciphertext: string, - tag: string, - } { - { - "00000000000000000000000000000000", - "000000000000000000000000", - "", - "", - "", - "58e2fccefa7e3061367f1d57a4e7455a", - }, - { - "00000000000000000000000000000000", - "000000000000000000000000", - "", - "00000000000000000000000000000000", - "0388dace60b6a392f328c2b971b2fe78", - "ab6e47d42cec13bdf53a67b21257bddf", - }, - { - "feffe9928665731c6d6a8f9467308308", - "cafebabefacedbaddecaf888", - "", - "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255", - "42831ec2217774244b7221b784d0d49ce3aa212f2c02a4e035c17e2329aca12e21d514b25466931c7d8f6a5aac84aa051ba30b396a0aac973d58e091473f5985", - "4d5c2af327cd64a62cf35abd2ba6fab4", - }, - { - "feffe9928665731c6d6a8f9467308308", - "cafebabefacedbaddecaf888", - "feedfacedeadbeeffeedfacedeadbeefabaddad2", - "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", - "42831ec2217774244b7221b784d0d49ce3aa212f2c02a4e035c17e2329aca12e21d514b25466931c7d8f6a5aac84aa051ba30b396a0aac973d58e091", - "5bc94fbc3221a5db94fae95ae7121a47", - }, - /* - { - "feffe9928665731c6d6a8f9467308308", - "cafebabefacedbad", - "feedfacedeadbeeffeedfacedeadbeefabaddad2", - "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", - "61353b4c2806934a777ff51fa22a4755699b2a714fcdc6f83766e5f97b6c742373806900e49f24b22b097544d4896b424989b5e1ebac0f07c23f4598", - "3612d2e79e3b0785561be14aaca2fccb", - }, - { - "feffe9928665731c6d6a8f9467308308", - "9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b", - "feedfacedeadbeeffeedfacedeadbeefabaddad2", - "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", - "8ce24998625615b603a033aca13fb894be9112a5c3a211a8ba262a3cca7e2ca701e4a9a4fba43c90ccdcb281d48c7c6fd62875d2aca417034c34aee5", - "619cc5aefffe0bfa462af43c1699d050", - }, - */ - { - "000000000000000000000000000000000000000000000000", - "000000000000000000000000", - "", - "", - "", - "cd33b28ac773f74ba00ed1f312572435", - }, - { - "000000000000000000000000000000000000000000000000", - "000000000000000000000000", - "", - "00000000000000000000000000000000", - "98e7247c07f0fe411c267e4384b0f600", - "2ff58d80033927ab8ef4d4587514f0fb", - }, - { - "feffe9928665731c6d6a8f9467308308feffe9928665731c", - "cafebabefacedbaddecaf888", - "", - "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255", - "3980ca0b3c00e841eb06fac4872a2757859e1ceaa6efd984628593b40ca1e19c7d773d00c144c525ac619d18c84a3f4718e2448b2fe324d9ccda2710acade256", - "9924a7c8587336bfb118024db8674a14", - }, - { - "feffe9928665731c6d6a8f9467308308feffe9928665731c", - "cafebabefacedbaddecaf888", - "feedfacedeadbeeffeedfacedeadbeefabaddad2", - "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", - "3980ca0b3c00e841eb06fac4872a2757859e1ceaa6efd984628593b40ca1e19c7d773d00c144c525ac619d18c84a3f4718e2448b2fe324d9ccda2710", - "2519498e80f1478f37ba55bd6d27618c", - }, - /* - { - "feffe9928665731c6d6a8f9467308308feffe9928665731c", 
- "cafebabefacedbad", - "feedfacedeadbeeffeedfacedeadbeefabaddad2", - "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", - "0f10f599ae14a154ed24b36e25324db8c566632ef2bbb34f8347280fc4507057fddc29df9a471f75c66541d4d4dad1c9e93a19a58e8b473fa0f062f7", - "65dcc57fcf623a24094fcca40d3533f8", - }, - { - "feffe9928665731c6d6a8f9467308308feffe9928665731c", - "9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b", - "feedfacedeadbeeffeedfacedeadbeefabaddad2", - "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", - "d27e88681ce3243c4830165a8fdcf9ff1de9a1d8e6b447ef6ef7b79828666e4581e79012af34ddd9e2f037589b292db3e67c036745fa22e7e9b7373b", - "dcf566ff291c25bbb8568fc3d376a6d9", - }, - */ - { - "0000000000000000000000000000000000000000000000000000000000000000", - "000000000000000000000000", - "", - "", - "", - "530f8afbc74536b9a963b4f1c4cb738b", - }, - { - "0000000000000000000000000000000000000000000000000000000000000000", - "000000000000000000000000", - "", - "00000000000000000000000000000000", - "cea7403d4d606b6e074ec5d3baf39d18", - "d0d1c8a799996bf0265b98b5d48ab919", - }, - { - "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308", - "cafebabefacedbaddecaf888", - "", - "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255", - "522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662898015ad", - "b094dac5d93471bdec1a502270e3cc6c", - }, - { - "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308", - "cafebabefacedbaddecaf888", - "feedfacedeadbeeffeedfacedeadbeefabaddad2", - "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", - "522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662", - "76fc6ece0f4e1768cddf8853bb2d551b", - }, - /* - { - "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308", - "cafebabefacedbad", - "feedfacedeadbeeffeedfacedeadbeefabaddad2", - "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", - "c3762df1ca787d32ae47c13bf19844cbaf1ae14d0b976afac52ff7d79bba9de0feb582d33934a4f0954cc2363bc73f7862ac430e64abe499f47c9b1f", - "3a337dbf46a792c45e454913fe2ea8f2", - }, - { - "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308", - "9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b", - "feedfacedeadbeeffeedfacedeadbeefabaddad2", - "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", - "5a8def2f0c9e53f1f75d7853659e2a20eeb2b22aafde6419a058ab4f6f746bf40fc0c3b780f244452da3ebf1c5d82cdea2418997200ef82e44ae7e3f", - "a44a8266ee1c8eb0c8b5d4cf5ae9f19a", - }, - */ - } - for v, _ in test_vectors { - key, _ := hex.decode(transmute([]byte)(v.key), context.temp_allocator) - iv, _ := hex.decode(transmute([]byte)(v.iv), context.temp_allocator) - aad, _ := hex.decode(transmute([]byte)(v.aad), context.temp_allocator) - plaintext, _ := hex.decode(transmute([]byte)(v.plaintext), context.temp_allocator) - ciphertext, _ := hex.decode(transmute([]byte)(v.ciphertext), 
context.temp_allocator) - tag, _ := hex.decode(transmute([]byte)(v.tag), context.temp_allocator) - - tag_ := make([]byte, len(tag), context.temp_allocator) - dst := make([]byte, len(ciphertext), context.temp_allocator) - - ctx: aes.Context_GCM - aes.init_gcm(&ctx, key, impl) - - aes.seal_gcm(&ctx, dst, tag_, iv, aad, plaintext) - dst_str := string(hex.encode(dst[:], context.temp_allocator)) - tag_str := string(hex.encode(tag_[:], context.temp_allocator)) - - testing.expectf( - t, - dst_str == v.ciphertext && tag_str == v.tag, - "AES-GCM/%v: Expected: (%s, %s) for seal(%s, %s, %s, %s), but got (%s, %s) instead", - impl, - v.ciphertext, - v.tag, - v.key, - v.iv, - v.aad, - v.plaintext, - dst_str, - tag_str, - ) - - ok := aes.open_gcm(&ctx, dst, iv, aad, ciphertext, tag) - dst_str = string(hex.encode(dst[:], context.temp_allocator)) - - testing.expectf( - t, - ok && dst_str == v.plaintext, - "AES-GCM/%v: Expected: (%s, true) for open(%s, %s, %s, %s, %s), but got (%s, %v) instead", - impl, - v.plaintext, - v.key, - v.iv, - v.aad, - v.ciphertext, - v.tag, - dst_str, - ok, - ) - } -}
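
For reviewers unfamiliar with the new package, the sketch below shows the seal/open flow the AEAD tests above exercise. It is illustrative only, not part of the patch: the zero key and IV are placeholders, the 16-byte tag length is the Poly1305 tag size, the package name is invented, and it assumes the implementation selector (passed explicitly as `impl` in the tests) may be omitted for a default; consult core:crypto/aead for the exact signatures.

package aead_usage_sketch

import "core:crypto/aead"
import "core:fmt"

main :: proc() {
	// Placeholder key/IV; real callers must use a randomly generated key
	// and a unique IV per message.
	key: [32]byte
	iv: [24]byte // XChaCha20-Poly1305 takes a 24-byte IV
	aad := transmute([]byte)string("header")
	plaintext := transmute([]byte)string("a secret message")

	ciphertext := make([]byte, len(plaintext))
	tag := make([]byte, 16) // Poly1305 produces a 16-byte tag

	// Context-based calls, mirroring the seal_ctx/open_ctx test cases.
	ctx: aead.Context
	aead.init(&ctx, .XCHACHA20POLY1305, key[:]) // impl selector omitted (assumed optional)
	aead.seal(&ctx, ciphertext, tag, iv[:], aad, plaintext)

	recovered := make([]byte, len(ciphertext))
	if ok := aead.open(&ctx, recovered, iv[:], aad, ciphertext, tag); ok {
		fmt.println(string(recovered))
	} else {
		// Any tampering with the tag, AAD, or ciphertext makes open return
		// false, which is what the bad_tag/bad_aad/bad_ciphertext cases check.
		fmt.eprintln("authentication failed")
	}
}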