Mirror of https://github.com/odin-lang/Odin.git (synced 2026-04-19 13:00:28 +00:00)
base/runtime: Add chacha8rand as the default RNG
@@ -41,88 +41,3 @@ random_generator_reset_u64 :: proc(rg: Random_Generator, p: u64) {
		rg.procedure(rg.data, .Reset, ([^]byte)(&p)[:size_of(p)])
	}
}

Default_Random_State :: struct {
	state: u64,
	inc:   u64,
}

default_random_generator_proc :: proc(data: rawptr, mode: Random_Generator_Mode, p: []byte) {
	@(require_results)
	read_u64 :: proc "contextless" (r: ^Default_Random_State) -> u64 {
		old_state := r.state
		r.state = old_state * 6364136223846793005 + (r.inc|1)
		xor_shifted := (((old_state >> 59) + 5) ~ old_state) * 12605985483714917081
		rot := (old_state >> 59)
		return (xor_shifted >> rot) | (xor_shifted << ((-rot) & 63))
	}

	@(thread_local)
	global_rand_seed: Default_Random_State

	init :: proc "contextless" (r: ^Default_Random_State, seed: u64) {
		seed := seed
		if seed == 0 {
			seed = u64(intrinsics.read_cycle_counter())
		}
		r.state = 0
		r.inc = (seed << 1) | 1
		_ = read_u64(r)
		r.state += seed
		_ = read_u64(r)
	}

	r: ^Default_Random_State = ---
	if data == nil {
		r = &global_rand_seed
	} else {
		r = cast(^Default_Random_State)data
	}

	switch mode {
	case .Read:
		if r.state == 0 && r.inc == 0 {
			init(r, 0)
		}

		switch len(p) {
		case size_of(u64):
			// Fast path for a 64-bit destination.
			intrinsics.unaligned_store((^u64)(raw_data(p)), read_u64(r))
		case:
			// All other cases.
			pos := i8(0)
			val := u64(0)
			for &v in p {
				if pos == 0 {
					val = read_u64(r)
					pos = 8
				}
				v = byte(val)
				val >>= 8
				pos -= 1
			}
		}

	case .Reset:
		seed: u64
		mem_copy_non_overlapping(&seed, raw_data(p), min(size_of(seed), len(p)))
		init(r, seed)

	case .Query_Info:
		if len(p) != size_of(Random_Generator_Query_Info) {
			return
		}
		info := (^Random_Generator_Query_Info)(raw_data(p))
		info^ += {.Uniform, .Resettable}
	}
}

@(require_results)
default_random_generator :: proc "contextless" (state: ^Default_Random_State = nil) -> Random_Generator {
	return {
		procedure = default_random_generator_proc,
		data = state,
	}
}
base/runtime/random_generator_chacha8.odin (new file, 164 lines)
@@ -0,0 +1,164 @@
package runtime

import "base:intrinsics"

// This is an implementation of the ChaCha8Rand DRBG, as specified
// in https://github.com/C2SP/C2SP/blob/main/chacha8rand.md
//
// There is a tradeoff to be made between state size and performance,
// in terms of the amount of RNG output buffered.
//
// The sensible buffer sizes are:
// - 256 bytes:  128-bit SIMD with 16x vector registers (SSE2)
// - 512 bytes:  128-bit SIMD with 32x vector registers (ARMv8),
//               256-bit SIMD with 16x vector registers (AVX2)
// - 1024 bytes: AVX-512
//
// Notes:
// - Smaller than 256 bytes is possible but would require redundant
//   calls to the ChaCha8 function, which is prohibitively expensive.
// - Larger than 1024 bytes is possible but pointless, as the construct
//   is defined around 992 bytes of RNG output and 32 bytes of input
//   per iteration.
//
// This implementation opts for a 1024-byte buffer for simplicity,
// under the rationale that modern, extremely memory-constrained
// targets provide suitable functionality in hardware, and the
// language makes supporting the various SIMD flavors easy.

@(private = "file")
RNG_SEED_SIZE :: 32
@(private)
RNG_OUTPUT_PER_ITER :: 1024 - RNG_SEED_SIZE

@(private)
CHACHA_SIGMA_0: u32 : 0x61707865
@(private)
CHACHA_SIGMA_1: u32 : 0x3320646e
@(private)
CHACHA_SIGMA_2: u32 : 0x79622d32
@(private)
CHACHA_SIGMA_3: u32 : 0x6b206574
@(private)
CHACHA_ROUNDS :: 8

Default_Random_State :: struct {
	_buf:    [1024]byte,
	_off:    int,
	_seeded: bool,
}

@(require_results)
default_random_generator :: proc "contextless" (state: ^Default_Random_State = nil) -> Random_Generator {
	return {
		procedure = default_random_generator_proc,
		data = state,
	}
}

default_random_generator_proc :: proc(data: rawptr, mode: Random_Generator_Mode, p: []byte) {
	@(thread_local)
	state: Default_Random_State

	r: ^Default_Random_State = &state
	if data != nil {
		r = cast(^Default_Random_State)data
	}
	next_seed := r._buf[RNG_OUTPUT_PER_ITER:]

	switch mode {
	case .Read:
		if !r._seeded { // Unlikely.
			rand_bytes(next_seed)
			r._off = RNG_OUTPUT_PER_ITER // Force refill.
			r._seeded = true
		}

		assert(r._off <= RNG_OUTPUT_PER_ITER, "chacha8rand/BUG: outputted key material")
		if r._off >= RNG_OUTPUT_PER_ITER { // Unlikely.
			chacha8rand_refill(r)
		}

		// We are guaranteed to have at least some RNG output buffered.
		//
		// As an invariant, each read will consume a multiple of 8 bytes
		// of output at a time.
		assert(r._off <= RNG_OUTPUT_PER_ITER - 8, "chacha8rand/BUG: less than 8 bytes of output available")
		assert(r._off % 8 == 0, "chacha8rand/BUG: buffered output is not a multiple of 8 bytes")

		p_len := len(p)
		if p_len == size_of(u64) {
			#no_bounds_check {
				// Fast path for a 64-bit destination.
				src := (^u64)(raw_data(r._buf[r._off:]))
				intrinsics.unaligned_store((^u64)(raw_data(p)), src^)
				src^ = 0 // Erasure (backtrack resistance)
				r._off += 8
			}
			return
		}

		p_ := p
		for remaining := p_len; remaining > 0; {
			sz := min(remaining, RNG_OUTPUT_PER_ITER - r._off)
			#no_bounds_check {
				copy(p_[:sz], r._buf[r._off:])
				p_ = p_[sz:]
				remaining -= sz
			}
			rounded_sz := ((sz + 7) / 8) * 8
			new_off := r._off + rounded_sz
			#no_bounds_check if new_off < RNG_OUTPUT_PER_ITER {
				// Erasure (backtrack resistance)
				intrinsics.mem_zero(raw_data(r._buf[r._off:]), rounded_sz)
				r._off = new_off
			} else {
				// Can omit erasure since we are overwriting the entire
				// buffer.
				chacha8rand_refill(r)
			}
		}

	case .Reset:
		// If no seed is passed, the next call to .Read will attempt to
		// reseed from the system entropy source.
		if len(p) == 0 {
			r._seeded = false
			return
		}

		// The cryptographic security of the output depends entirely
		// on the quality of the entropy in the seed. We will allow
		// re-seeding (as it makes testing easier), but callers that
		// decide to provide arbitrary seeds are on their own as far
		// as ensuring high-quality entropy goes.
		intrinsics.mem_zero(raw_data(next_seed), RNG_SEED_SIZE)
		copy(next_seed, p)
		r._seeded = true
		r._off = RNG_OUTPUT_PER_ITER // Force a refill.

	case .Query_Info:
		if len(p) != size_of(Random_Generator_Query_Info) {
			return
		}
		info := (^Random_Generator_Query_Info)(raw_data(p))
		info^ += {.Uniform, .Cryptographic, .Resettable}
	}
}

@(private = "file")
chacha8rand_refill :: proc(r: ^Default_Random_State) {
	assert(r._seeded == true, "chacha8rand/BUG: unseeded refill")

	// i386 has insufficient vector registers to use the
	// accelerated path at the moment.
	when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
		chacha8rand_refill_simd256(r)
	} else when HAS_HARDWARE_SIMD && ODIN_ARCH != .i386 {
		chacha8rand_refill_simd128(r)
	} else {
		chacha8rand_refill_ref(r)
	}

	r._off = 0
}
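A note on the arithmetic above: each refill produces 16 ChaCha8 blocks of 64 bytes, i.e. 1024 bytes, of which the final RNG_SEED_SIZE = 32 bytes are reserved as the key for the next iteration, leaving 1024 - 32 = 992 bytes of consumable output, matching the chacha8rand definition referenced in the header comment.

A minimal sketch (not part of this diff) of driving the generator procedure directly, in the same style as the `create_u64` helper later in this commit; it assumes a system entropy source is available for the lazy first-use seeding:

	import "base:runtime"

	state: runtime.Default_Random_State
	buf: [13]byte
	runtime.default_random_generator_proc(&state, .Read, buf[:])
	// A 13-byte read consumes ((13 + 7) / 8) * 8 = 16 bytes of
	// buffered output, preserving the 8-byte invariant asserted
	// in the .Read path above.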
base/runtime/random_generator_chacha8_ref.odin (new file, 145 lines)
@@ -0,0 +1,145 @@
package runtime

import "base:intrinsics"

@(private)
chacha8rand_refill_ref :: proc(r: ^Default_Random_State) {
	// Initialize the base state.
	k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
	when ODIN_ENDIAN == .Little {
		s4 := k[0]
		s5 := k[1]
		s6 := k[2]
		s7 := k[3]
		s8 := k[4]
		s9 := k[5]
		s10 := k[6]
		s11 := k[7]
	} else {
		s4 := intrinsics.byte_swap(k[0])
		s5 := intrinsics.byte_swap(k[1])
		s6 := intrinsics.byte_swap(k[2])
		s7 := intrinsics.byte_swap(k[3])
		s8 := intrinsics.byte_swap(k[4])
		s9 := intrinsics.byte_swap(k[5])
		s10 := intrinsics.byte_swap(k[6])
		s11 := intrinsics.byte_swap(k[7])
	}
	s12: u32 // Counter starts at 0.
	s13, s14, s15: u32 // IV of all 0s.

	dst: [^]u32 = (^u32)(raw_data(r._buf[:]))

	// At least with LLVM 21, force_inline produces identical perf to
	// manual inlining, yay.
	quarter_round := #force_inline proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) {
		a, b, c, d := a, b, c, d

		a += b
		d ~= a
		d = rotl(d, 16)

		c += d
		b ~= c
		b = rotl(b, 12)

		a += b
		d ~= a
		d = rotl(d, 8)

		c += d
		b ~= c
		b = rotl(b, 7)

		return a, b, c, d
	}

	// Filippo Valsorda made the observation that only one of the
	// column quarter-rounds depends on the counter (s12), so the
	// others are worth precomputing and reusing across multiple
	// blocks. As far as I know, only Go's chacha implementation
	// does this.

	p1, p5, p9, p13 := quarter_round(CHACHA_SIGMA_1, s5, s9, s13)
	p2, p6, p10, p14 := quarter_round(CHACHA_SIGMA_2, s6, s10, s14)
	p3, p7, p11, p15 := quarter_round(CHACHA_SIGMA_3, s7, s11, s15)

	// 4 groups
	for g := 0; g < 4; g = g + 1 {
		// 4 blocks per group
		for n := 0; n < 4; n = n + 1 {
			// First column round that depends on the counter
			p0, p4, p8, p12 := quarter_round(CHACHA_SIGMA_0, s4, s8, s12)

			// First diagonal round
			x0, x5, x10, x15 := quarter_round(p0, p5, p10, p15)
			x1, x6, x11, x12 := quarter_round(p1, p6, p11, p12)
			x2, x7, x8, x13 := quarter_round(p2, p7, p8, p13)
			x3, x4, x9, x14 := quarter_round(p3, p4, p9, p14)

			for i := CHACHA_ROUNDS - 2; i > 0; i = i - 2 {
				x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
				x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
				x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
				x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)

				x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
				x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
				x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
				x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
			}

			// Interleave 4 blocks
			// NB: The additions of sigma and the counter are omitted
			STRIDE :: 4
			d_ := dst[n:]
			when ODIN_ENDIAN == .Little {
				d_[STRIDE*0] = x0
				d_[STRIDE*1] = x1
				d_[STRIDE*2] = x2
				d_[STRIDE*3] = x3
				d_[STRIDE*4] = x4 + s4
				d_[STRIDE*5] = x5 + s5
				d_[STRIDE*6] = x6 + s6
				d_[STRIDE*7] = x7 + s7
				d_[STRIDE*8] = x8 + s8
				d_[STRIDE*9] = x9 + s9
				d_[STRIDE*10] = x10 + s10
				d_[STRIDE*11] = x11 + s11
				d_[STRIDE*12] = x12
				d_[STRIDE*13] = x13 + s13
				d_[STRIDE*14] = x14 + s14
				d_[STRIDE*15] = x15 + s15
			} else {
				d_[STRIDE*0] = intrinsics.byte_swap(x0)
				d_[STRIDE*1] = intrinsics.byte_swap(x1)
				d_[STRIDE*2] = intrinsics.byte_swap(x2)
				d_[STRIDE*3] = intrinsics.byte_swap(x3)
				d_[STRIDE*4] = intrinsics.byte_swap(x4 + s4)
				d_[STRIDE*5] = intrinsics.byte_swap(x5 + s5)
				d_[STRIDE*6] = intrinsics.byte_swap(x6 + s6)
				d_[STRIDE*7] = intrinsics.byte_swap(x7 + s7)
				d_[STRIDE*8] = intrinsics.byte_swap(x8 + s8)
				d_[STRIDE*9] = intrinsics.byte_swap(x9 + s9)
				d_[STRIDE*10] = intrinsics.byte_swap(x10 + s10)
				d_[STRIDE*11] = intrinsics.byte_swap(x11 + s11)
				d_[STRIDE*12] = intrinsics.byte_swap(x12)
				d_[STRIDE*13] = intrinsics.byte_swap(x13 + s13)
				d_[STRIDE*14] = intrinsics.byte_swap(x14 + s14)
				d_[STRIDE*15] = intrinsics.byte_swap(x15 + s15)
			}

			s12 = s12 + 1 // Increment the counter
		}

		dst = dst[16*4:]
	}
}

// This replicates `rotate_left32` from `core:math/bits`, under the
// assumption that this will live in `base:runtime`.
@(require_results, private = "file")
rotl :: #force_inline proc "contextless" (x: u32, k: int) -> u32 {
	n :: 32
	s := uint(k) & (n-1)
	return x << s | x >> (n-s)
}
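To put the precomputation in numbers: ChaCha8 runs 4 double rounds of 8 quarter-rounds each, i.e. 32 quarter-rounds per block, or 16 x 32 = 512 per refill. Hoisting the 3 counter-independent column quarter-rounds out of the per-block loops computes them once instead of 16 times, saving 3 x 15 = 45 quarter-rounds, roughly 9% of the round work per refill.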
base/runtime/random_generator_chacha8_simd128.odin (new file, 290 lines)
@@ -0,0 +1,290 @@
#+build !i386
package runtime

import "base:intrinsics"

@(private = "file")
u32x4 :: #simd[4]u32

@(private = "file")
S0: u32x4 : {CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0}
@(private = "file")
S1: u32x4 : {CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1}
@(private = "file")
S2: u32x4 : {CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2}
@(private = "file")
S3: u32x4 : {CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3}

@(private = "file")
_ROT_7L: u32x4 : {7, 7, 7, 7}
@(private = "file")
_ROT_7R: u32x4 : {25, 25, 25, 25}
@(private = "file")
_ROT_12L: u32x4 : {12, 12, 12, 12}
@(private = "file")
_ROT_12R: u32x4 : {20, 20, 20, 20}
@(private = "file")
_ROT_8L: u32x4 : {8, 8, 8, 8}
@(private = "file")
_ROT_8R: u32x4 : {24, 24, 24, 24}
@(private = "file")
_ROT_16: u32x4 : {16, 16, 16, 16}
@(private = "file")
_CTR_INC_4: u32x4 : {4, 4, 4, 4}
@(private = "file")
_CTR_INC_8: u32x4 : {8, 8, 8, 8}

when ODIN_ENDIAN == .Big {
	@(private = "file")
	_byteswap_u32x4 :: #force_inline proc "contextless" (v: u32x4) -> u32x4 {
		u8x16 :: #simd[16]u8
		return(
			transmute(u32x4)intrinsics.simd_shuffle(
				transmute(u8x16)v,
				transmute(u8x16)v,
				3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
			)
		)
	}
}

@(private)
chacha8rand_refill_simd128 :: proc(r: ^Default_Random_State) {
	// Initialize the base state.
	k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
	when ODIN_ENDIAN == .Little {
		s4_ := k[0]
		s5_ := k[1]
		s6_ := k[2]
		s7_ := k[3]
		s8_ := k[4]
		s9_ := k[5]
		s10_ := k[6]
		s11_ := k[7]
	} else {
		s4_ := intrinsics.byte_swap(k[0])
		s5_ := intrinsics.byte_swap(k[1])
		s6_ := intrinsics.byte_swap(k[2])
		s7_ := intrinsics.byte_swap(k[3])
		s8_ := intrinsics.byte_swap(k[4])
		s9_ := intrinsics.byte_swap(k[5])
		s10_ := intrinsics.byte_swap(k[6])
		s11_ := intrinsics.byte_swap(k[7])
	}

	// 4-lane ChaCha8.
	s4 := u32x4{s4_, s4_, s4_, s4_}
	s5 := u32x4{s5_, s5_, s5_, s5_}
	s6 := u32x4{s6_, s6_, s6_, s6_}
	s7 := u32x4{s7_, s7_, s7_, s7_}
	s8 := u32x4{s8_, s8_, s8_, s8_}
	s9 := u32x4{s9_, s9_, s9_, s9_}
	s10 := u32x4{s10_, s10_, s10_, s10_}
	s11 := u32x4{s11_, s11_, s11_, s11_}
	s12 := u32x4{0, 1, 2, 3}
	s13, s14, s15: u32x4

	dst: [^]u32x4 = (^u32x4)(raw_data(r._buf[:]))

	quarter_round := #force_inline proc "contextless" (a, b, c, d: u32x4) -> (u32x4, u32x4, u32x4, u32x4) {
		a, b, c, d := a, b, c, d

		a = intrinsics.simd_add(a, b)
		d = intrinsics.simd_bit_xor(d, a)
		d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_16), intrinsics.simd_shr(d, _ROT_16))

		c = intrinsics.simd_add(c, d)
		b = intrinsics.simd_bit_xor(b, c)
		b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_12L), intrinsics.simd_shr(b, _ROT_12R))

		a = intrinsics.simd_add(a, b)
		d = intrinsics.simd_bit_xor(d, a)
		d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_8L), intrinsics.simd_shr(d, _ROT_8R))

		c = intrinsics.simd_add(c, d)
		b = intrinsics.simd_bit_xor(b, c)
		b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_7L), intrinsics.simd_shr(b, _ROT_7R))

		return a, b, c, d
	}

	// 8 blocks at a time.
	//
	// Note:
	// This uses a ton of registers, so it is only worth it on targets
	// that have something like 32 128-bit registers. This is currently
	// all ARMv8 targets, and RISC-V Zvl128b (`V` application profile)
	// targets.
	//
	// While our current definition of `.arm32` is 32-bit ARMv8, this
	// may change in the future (ARMv7 is still relevant), and things
	// like the Cortex-A8/A9 do "pretend" 128-bit SIMD 64 bits at a
	// time, and thus need benchmarking.
	when ODIN_ARCH == .arm64 || ODIN_ARCH == .riscv64 {
		for _ in 0..<2 {
			x0_0, x1_0, x2_0, x3_0 := S0, S1, S2, S3
			x4_0, x5_0, x6_0, x7_0 := s4, s5, s6, s7
			x8_0, x9_0, x10_0, x11_0 := s8, s9, s10, s11
			x12_0, x13_0, x14_0, x15_0 := s12, s13, s14, s15

			x0_1, x1_1, x2_1, x3_1 := S0, S1, S2, S3
			x4_1, x5_1, x6_1, x7_1 := s4, s5, s6, s7
			x8_1, x9_1, x10_1, x11_1 := s8, s9, s10, s11
			x12_1 := intrinsics.simd_add(s12, _CTR_INC_4)
			x13_1, x14_1, x15_1 := s13, s14, s15

			for i := CHACHA_ROUNDS; i > 0; i = i - 2 {
				x0_0, x4_0, x8_0, x12_0 = quarter_round(x0_0, x4_0, x8_0, x12_0)
				x0_1, x4_1, x8_1, x12_1 = quarter_round(x0_1, x4_1, x8_1, x12_1)
				x1_0, x5_0, x9_0, x13_0 = quarter_round(x1_0, x5_0, x9_0, x13_0)
				x1_1, x5_1, x9_1, x13_1 = quarter_round(x1_1, x5_1, x9_1, x13_1)
				x2_0, x6_0, x10_0, x14_0 = quarter_round(x2_0, x6_0, x10_0, x14_0)
				x2_1, x6_1, x10_1, x14_1 = quarter_round(x2_1, x6_1, x10_1, x14_1)
				x3_0, x7_0, x11_0, x15_0 = quarter_round(x3_0, x7_0, x11_0, x15_0)
				x3_1, x7_1, x11_1, x15_1 = quarter_round(x3_1, x7_1, x11_1, x15_1)

				x0_0, x5_0, x10_0, x15_0 = quarter_round(x0_0, x5_0, x10_0, x15_0)
				x0_1, x5_1, x10_1, x15_1 = quarter_round(x0_1, x5_1, x10_1, x15_1)
				x1_0, x6_0, x11_0, x12_0 = quarter_round(x1_0, x6_0, x11_0, x12_0)
				x1_1, x6_1, x11_1, x12_1 = quarter_round(x1_1, x6_1, x11_1, x12_1)
				x2_0, x7_0, x8_0, x13_0 = quarter_round(x2_0, x7_0, x8_0, x13_0)
				x2_1, x7_1, x8_1, x13_1 = quarter_round(x2_1, x7_1, x8_1, x13_1)
				x3_0, x4_0, x9_0, x14_0 = quarter_round(x3_0, x4_0, x9_0, x14_0)
				x3_1, x4_1, x9_1, x14_1 = quarter_round(x3_1, x4_1, x9_1, x14_1)
			}

			when ODIN_ENDIAN == .Little {
				intrinsics.unaligned_store((^u32x4)(dst[0:]), x0_0)
				intrinsics.unaligned_store((^u32x4)(dst[1:]), x1_0)
				intrinsics.unaligned_store((^u32x4)(dst[2:]), x2_0)
				intrinsics.unaligned_store((^u32x4)(dst[3:]), x3_0)
				intrinsics.unaligned_store((^u32x4)(dst[4:]), intrinsics.simd_add(x4_0, s4))
				intrinsics.unaligned_store((^u32x4)(dst[5:]), intrinsics.simd_add(x5_0, s5))
				intrinsics.unaligned_store((^u32x4)(dst[6:]), intrinsics.simd_add(x6_0, s6))
				intrinsics.unaligned_store((^u32x4)(dst[7:]), intrinsics.simd_add(x7_0, s7))
				intrinsics.unaligned_store((^u32x4)(dst[8:]), intrinsics.simd_add(x8_0, s8))
				intrinsics.unaligned_store((^u32x4)(dst[9:]), intrinsics.simd_add(x9_0, s9))
				intrinsics.unaligned_store((^u32x4)(dst[10:]), intrinsics.simd_add(x10_0, s10))
				intrinsics.unaligned_store((^u32x4)(dst[11:]), intrinsics.simd_add(x11_0, s11))
				intrinsics.unaligned_store((^u32x4)(dst[12:]), x12_0)
				intrinsics.unaligned_store((^u32x4)(dst[13:]), intrinsics.simd_add(x13_0, s13))
				intrinsics.unaligned_store((^u32x4)(dst[14:]), intrinsics.simd_add(x14_0, s14))
				intrinsics.unaligned_store((^u32x4)(dst[15:]), intrinsics.simd_add(x15_0, s15))

				intrinsics.unaligned_store((^u32x4)(dst[16:]), x0_1)
				intrinsics.unaligned_store((^u32x4)(dst[17:]), x1_1)
				intrinsics.unaligned_store((^u32x4)(dst[18:]), x2_1)
				intrinsics.unaligned_store((^u32x4)(dst[19:]), x3_1)
				intrinsics.unaligned_store((^u32x4)(dst[20:]), intrinsics.simd_add(x4_1, s4))
				intrinsics.unaligned_store((^u32x4)(dst[21:]), intrinsics.simd_add(x5_1, s5))
				intrinsics.unaligned_store((^u32x4)(dst[22:]), intrinsics.simd_add(x6_1, s6))
				intrinsics.unaligned_store((^u32x4)(dst[23:]), intrinsics.simd_add(x7_1, s7))
				intrinsics.unaligned_store((^u32x4)(dst[24:]), intrinsics.simd_add(x8_1, s8))
				intrinsics.unaligned_store((^u32x4)(dst[25:]), intrinsics.simd_add(x9_1, s9))
				intrinsics.unaligned_store((^u32x4)(dst[26:]), intrinsics.simd_add(x10_1, s10))
				intrinsics.unaligned_store((^u32x4)(dst[27:]), intrinsics.simd_add(x11_1, s11))
				intrinsics.unaligned_store((^u32x4)(dst[28:]), x12_1)
				intrinsics.unaligned_store((^u32x4)(dst[29:]), intrinsics.simd_add(x13_1, s13))
				intrinsics.unaligned_store((^u32x4)(dst[30:]), intrinsics.simd_add(x14_1, s14))
				intrinsics.unaligned_store((^u32x4)(dst[31:]), intrinsics.simd_add(x15_1, s15))
			} else {
				intrinsics.unaligned_store((^u32x4)(dst[0:]), _byteswap_u32x4(x0_0))
				intrinsics.unaligned_store((^u32x4)(dst[1:]), _byteswap_u32x4(x1_0))
				intrinsics.unaligned_store((^u32x4)(dst[2:]), _byteswap_u32x4(x2_0))
				intrinsics.unaligned_store((^u32x4)(dst[3:]), _byteswap_u32x4(x3_0))
				intrinsics.unaligned_store((^u32x4)(dst[4:]), _byteswap_u32x4(intrinsics.simd_add(x4_0, s4)))
				intrinsics.unaligned_store((^u32x4)(dst[5:]), _byteswap_u32x4(intrinsics.simd_add(x5_0, s5)))
				intrinsics.unaligned_store((^u32x4)(dst[6:]), _byteswap_u32x4(intrinsics.simd_add(x6_0, s6)))
				intrinsics.unaligned_store((^u32x4)(dst[7:]), _byteswap_u32x4(intrinsics.simd_add(x7_0, s7)))
				intrinsics.unaligned_store((^u32x4)(dst[8:]), _byteswap_u32x4(intrinsics.simd_add(x8_0, s8)))
				intrinsics.unaligned_store((^u32x4)(dst[9:]), _byteswap_u32x4(intrinsics.simd_add(x9_0, s9)))
				intrinsics.unaligned_store((^u32x4)(dst[10:]), _byteswap_u32x4(intrinsics.simd_add(x10_0, s10)))
				intrinsics.unaligned_store((^u32x4)(dst[11:]), _byteswap_u32x4(intrinsics.simd_add(x11_0, s11)))
				intrinsics.unaligned_store((^u32x4)(dst[12:]), _byteswap_u32x4(x12_0))
				intrinsics.unaligned_store((^u32x4)(dst[13:]), _byteswap_u32x4(intrinsics.simd_add(x13_0, s13)))
				intrinsics.unaligned_store((^u32x4)(dst[14:]), _byteswap_u32x4(intrinsics.simd_add(x14_0, s14)))
				intrinsics.unaligned_store((^u32x4)(dst[15:]), _byteswap_u32x4(intrinsics.simd_add(x15_0, s15)))

				intrinsics.unaligned_store((^u32x4)(dst[16:]), _byteswap_u32x4(x0_1))
				intrinsics.unaligned_store((^u32x4)(dst[17:]), _byteswap_u32x4(x1_1))
				intrinsics.unaligned_store((^u32x4)(dst[18:]), _byteswap_u32x4(x2_1))
				intrinsics.unaligned_store((^u32x4)(dst[19:]), _byteswap_u32x4(x3_1))
				intrinsics.unaligned_store((^u32x4)(dst[20:]), _byteswap_u32x4(intrinsics.simd_add(x4_1, s4)))
				intrinsics.unaligned_store((^u32x4)(dst[21:]), _byteswap_u32x4(intrinsics.simd_add(x5_1, s5)))
				intrinsics.unaligned_store((^u32x4)(dst[22:]), _byteswap_u32x4(intrinsics.simd_add(x6_1, s6)))
				intrinsics.unaligned_store((^u32x4)(dst[23:]), _byteswap_u32x4(intrinsics.simd_add(x7_1, s7)))
				intrinsics.unaligned_store((^u32x4)(dst[24:]), _byteswap_u32x4(intrinsics.simd_add(x8_1, s8)))
				intrinsics.unaligned_store((^u32x4)(dst[25:]), _byteswap_u32x4(intrinsics.simd_add(x9_1, s9)))
				intrinsics.unaligned_store((^u32x4)(dst[26:]), _byteswap_u32x4(intrinsics.simd_add(x10_1, s10)))
				intrinsics.unaligned_store((^u32x4)(dst[27:]), _byteswap_u32x4(intrinsics.simd_add(x11_1, s11)))
				intrinsics.unaligned_store((^u32x4)(dst[28:]), _byteswap_u32x4(x12_1))
				intrinsics.unaligned_store((^u32x4)(dst[29:]), _byteswap_u32x4(intrinsics.simd_add(x13_1, s13)))
				intrinsics.unaligned_store((^u32x4)(dst[30:]), _byteswap_u32x4(intrinsics.simd_add(x14_1, s14)))
				intrinsics.unaligned_store((^u32x4)(dst[31:]), _byteswap_u32x4(intrinsics.simd_add(x15_1, s15)))
			}

			s12 = intrinsics.simd_add(s12, _CTR_INC_8)

			dst = dst[32:]
		}
	} else {
		for _ in 0..<4 {
			x0, x1, x2, x3 := S0, S1, S2, S3
			x4, x5, x6, x7 := s4, s5, s6, s7
			x8, x9, x10, x11 := s8, s9, s10, s11
			x12, x13, x14, x15 := s12, s13, s14, s15

			for i := CHACHA_ROUNDS; i > 0; i = i - 2 {
				x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
				x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
				x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
				x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)

				x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
				x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
				x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
				x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
			}

			when ODIN_ENDIAN == .Little {
				intrinsics.unaligned_store((^u32x4)(dst[0:]), x0)
				intrinsics.unaligned_store((^u32x4)(dst[1:]), x1)
				intrinsics.unaligned_store((^u32x4)(dst[2:]), x2)
				intrinsics.unaligned_store((^u32x4)(dst[3:]), x3)
				intrinsics.unaligned_store((^u32x4)(dst[4:]), intrinsics.simd_add(x4, s4))
				intrinsics.unaligned_store((^u32x4)(dst[5:]), intrinsics.simd_add(x5, s5))
				intrinsics.unaligned_store((^u32x4)(dst[6:]), intrinsics.simd_add(x6, s6))
				intrinsics.unaligned_store((^u32x4)(dst[7:]), intrinsics.simd_add(x7, s7))
				intrinsics.unaligned_store((^u32x4)(dst[8:]), intrinsics.simd_add(x8, s8))
				intrinsics.unaligned_store((^u32x4)(dst[9:]), intrinsics.simd_add(x9, s9))
				intrinsics.unaligned_store((^u32x4)(dst[10:]), intrinsics.simd_add(x10, s10))
				intrinsics.unaligned_store((^u32x4)(dst[11:]), intrinsics.simd_add(x11, s11))
				intrinsics.unaligned_store((^u32x4)(dst[12:]), x12)
				intrinsics.unaligned_store((^u32x4)(dst[13:]), intrinsics.simd_add(x13, s13))
				intrinsics.unaligned_store((^u32x4)(dst[14:]), intrinsics.simd_add(x14, s14))
				intrinsics.unaligned_store((^u32x4)(dst[15:]), intrinsics.simd_add(x15, s15))
			} else {
				intrinsics.unaligned_store((^u32x4)(dst[0:]), _byteswap_u32x4(x0))
				intrinsics.unaligned_store((^u32x4)(dst[1:]), _byteswap_u32x4(x1))
				intrinsics.unaligned_store((^u32x4)(dst[2:]), _byteswap_u32x4(x2))
				intrinsics.unaligned_store((^u32x4)(dst[3:]), _byteswap_u32x4(x3))
				intrinsics.unaligned_store((^u32x4)(dst[4:]), _byteswap_u32x4(intrinsics.simd_add(x4, s4)))
				intrinsics.unaligned_store((^u32x4)(dst[5:]), _byteswap_u32x4(intrinsics.simd_add(x5, s5)))
				intrinsics.unaligned_store((^u32x4)(dst[6:]), _byteswap_u32x4(intrinsics.simd_add(x6, s6)))
				intrinsics.unaligned_store((^u32x4)(dst[7:]), _byteswap_u32x4(intrinsics.simd_add(x7, s7)))
				intrinsics.unaligned_store((^u32x4)(dst[8:]), _byteswap_u32x4(intrinsics.simd_add(x8, s8)))
				intrinsics.unaligned_store((^u32x4)(dst[9:]), _byteswap_u32x4(intrinsics.simd_add(x9, s9)))
				intrinsics.unaligned_store((^u32x4)(dst[10:]), _byteswap_u32x4(intrinsics.simd_add(x10, s10)))
				intrinsics.unaligned_store((^u32x4)(dst[11:]), _byteswap_u32x4(intrinsics.simd_add(x11, s11)))
				intrinsics.unaligned_store((^u32x4)(dst[12:]), _byteswap_u32x4(x12))
				intrinsics.unaligned_store((^u32x4)(dst[13:]), _byteswap_u32x4(intrinsics.simd_add(x13, s13)))
				intrinsics.unaligned_store((^u32x4)(dst[14:]), _byteswap_u32x4(intrinsics.simd_add(x14, s14)))
				intrinsics.unaligned_store((^u32x4)(dst[15:]), _byteswap_u32x4(intrinsics.simd_add(x15, s15)))
			}

			s12 = intrinsics.simd_add(s12, _CTR_INC_4)

			dst = dst[16:]
		}
	}
}
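A scalar model (not part of this diff) of the shift-pair rotation in `quarter_round` above: with no portable SIMD rotate available, each rotl(x, k) is built from a left shift by k and a right shift by 32 - k, and combining with XOR is equivalent to OR because the two shifted halves have disjoint set bits:

	// Per-lane equivalent of the _ROT_12L/_ROT_12R pair.
	rotl12 :: proc "contextless" (x: u32) -> u32 {
		return (x << 12) ~ (x >> 20) // same as (x << 12) | (x >> 20)
	}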
base/runtime/random_generator_chacha8_simd256.odin (new file, 197 lines)
@@ -0,0 +1,197 @@
#+build amd64
package runtime

import "base:intrinsics"

#assert(ODIN_ENDIAN == .Little)

@(private = "file")
u32x8 :: #simd[8]u32
@(private = "file")
u32x4 :: #simd[4]u32

@(private = "file")
S0: u32x8 : {
	CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0,
	CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0,
}
@(private = "file")
S1: u32x8 : {
	CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1,
	CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1,
}
@(private = "file")
S2: u32x8 : {
	CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2,
	CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2,
}
@(private = "file")
S3: u32x8 : {
	CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3,
	CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3,
}

@(private = "file")
_ROT_7L: u32x8 : {7, 7, 7, 7, 7, 7, 7, 7}
@(private = "file")
_ROT_7R: u32x8 : {25, 25, 25, 25, 25, 25, 25, 25}
@(private = "file")
_ROT_12L: u32x8 : {12, 12, 12, 12, 12, 12, 12, 12}
@(private = "file")
_ROT_12R: u32x8 : {20, 20, 20, 20, 20, 20, 20, 20}
@(private = "file")
_ROT_8L: u32x8 : {8, 8, 8, 8, 8, 8, 8, 8}
@(private = "file")
_ROT_8R: u32x8 : {24, 24, 24, 24, 24, 24, 24, 24}
@(private = "file")
_ROT_16: u32x8 : {16, 16, 16, 16, 16, 16, 16, 16}
@(private = "file")
_CTR_INC_8: u32x8 : {8, 8, 8, 8, 8, 8, 8, 8}

// To the best of my knowledge this is only really useful on
// modern x86-64, as most ARM silicon is missing support for SVE2.

@(private, enable_target_feature = "avx,avx2")
chacha8rand_refill_simd256 :: proc(r: ^Default_Random_State) {
	// Initialize the base state.
	k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
	s4_ := k[0]
	s5_ := k[1]
	s6_ := k[2]
	s7_ := k[3]
	s8_ := k[4]
	s9_ := k[5]
	s10_ := k[6]
	s11_ := k[7]

	// 8-lane ChaCha8.
	s4 := u32x8{s4_, s4_, s4_, s4_, s4_, s4_, s4_, s4_}
	s5 := u32x8{s5_, s5_, s5_, s5_, s5_, s5_, s5_, s5_}
	s6 := u32x8{s6_, s6_, s6_, s6_, s6_, s6_, s6_, s6_}
	s7 := u32x8{s7_, s7_, s7_, s7_, s7_, s7_, s7_, s7_}
	s8 := u32x8{s8_, s8_, s8_, s8_, s8_, s8_, s8_, s8_}
	s9 := u32x8{s9_, s9_, s9_, s9_, s9_, s9_, s9_, s9_}
	s10 := u32x8{s10_, s10_, s10_, s10_, s10_, s10_, s10_, s10_}
	s11 := u32x8{s11_, s11_, s11_, s11_, s11_, s11_, s11_, s11_}
	s12 := u32x8{0, 1, 2, 3, 4, 5, 6, 7}
	s13, s14, s15: u32x8

	u32x4 :: #simd[4]u32
	dst: [^]u32x4 = (^u32x4)(raw_data(r._buf[:]))

	quarter_round := #force_inline proc "contextless" (a, b, c, d: u32x8) -> (u32x8, u32x8, u32x8, u32x8) {
		a, b, c, d := a, b, c, d

		a = intrinsics.simd_add(a, b)
		d = intrinsics.simd_bit_xor(d, a)
		d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_16), intrinsics.simd_shr(d, _ROT_16))

		c = intrinsics.simd_add(c, d)
		b = intrinsics.simd_bit_xor(b, c)
		b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_12L), intrinsics.simd_shr(b, _ROT_12R))

		a = intrinsics.simd_add(a, b)
		d = intrinsics.simd_bit_xor(d, a)
		d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_8L), intrinsics.simd_shr(d, _ROT_8R))

		c = intrinsics.simd_add(c, d)
		b = intrinsics.simd_bit_xor(b, c)
		b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_7L), intrinsics.simd_shr(b, _ROT_7R))

		return a, b, c, d
	}

	for _ in 0..<2 {
		x0, x1, x2, x3 := S0, S1, S2, S3
		x4, x5, x6, x7 := s4, s5, s6, s7
		x8, x9, x10, x11 := s8, s9, s10, s11
		x12, x13, x14, x15 := s12, s13, s14, s15

		for i := CHACHA_ROUNDS; i > 0; i = i - 2 {
			x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
			x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
			x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
			x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)

			x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
			x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
			x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
			x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
		}

		x4 = intrinsics.simd_add(x4, s4)
		x5 = intrinsics.simd_add(x5, s5)
		x6 = intrinsics.simd_add(x6, s6)
		x7 = intrinsics.simd_add(x7, s7)
		x8 = intrinsics.simd_add(x8, s8)
		x9 = intrinsics.simd_add(x9, s9)
		x10 = intrinsics.simd_add(x10, s10)
		x11 = intrinsics.simd_add(x11, s11)
		x13 = intrinsics.simd_add(x13, s13)
		x14 = intrinsics.simd_add(x14, s14)
		x15 = intrinsics.simd_add(x15, s15)

		// Ok, now we have x0->x15 with 8 lanes, but we need to
		// output the first 4 blocks, then the second 4 blocks.
		//
		// LLVM appears not to consider that an instruction may be
		// totally awful on the given microarchitecture, which leads
		// to `VPCOMPRESSD` being generated iff AVX-512 support is
		// enabled for `intrinsics.simd_masked_compress_store`.
		// On Zen 4, this leads to a 50% performance regression vs
		// the 128-bit SIMD code.
		//
		// The fake intrinsic (because LLVM doesn't appear to have
		// an amd64-specific one) doesn't generate `VEXTRACTI128`,
		// but instead does cleverness without horrible regressions.

		intrinsics.unaligned_store((^u32x4)(dst[0:]), _mm_mm256_extracti128_si256(x0, 0))
		intrinsics.unaligned_store((^u32x4)(dst[1:]), _mm_mm256_extracti128_si256(x1, 0))
		intrinsics.unaligned_store((^u32x4)(dst[2:]), _mm_mm256_extracti128_si256(x2, 0))
		intrinsics.unaligned_store((^u32x4)(dst[3:]), _mm_mm256_extracti128_si256(x3, 0))
		intrinsics.unaligned_store((^u32x4)(dst[4:]), _mm_mm256_extracti128_si256(x4, 0))
		intrinsics.unaligned_store((^u32x4)(dst[5:]), _mm_mm256_extracti128_si256(x5, 0))
		intrinsics.unaligned_store((^u32x4)(dst[6:]), _mm_mm256_extracti128_si256(x6, 0))
		intrinsics.unaligned_store((^u32x4)(dst[7:]), _mm_mm256_extracti128_si256(x7, 0))
		intrinsics.unaligned_store((^u32x4)(dst[8:]), _mm_mm256_extracti128_si256(x8, 0))
		intrinsics.unaligned_store((^u32x4)(dst[9:]), _mm_mm256_extracti128_si256(x9, 0))
		intrinsics.unaligned_store((^u32x4)(dst[10:]), _mm_mm256_extracti128_si256(x10, 0))
		intrinsics.unaligned_store((^u32x4)(dst[11:]), _mm_mm256_extracti128_si256(x11, 0))
		intrinsics.unaligned_store((^u32x4)(dst[12:]), _mm_mm256_extracti128_si256(x12, 0))
		intrinsics.unaligned_store((^u32x4)(dst[13:]), _mm_mm256_extracti128_si256(x13, 0))
		intrinsics.unaligned_store((^u32x4)(dst[14:]), _mm_mm256_extracti128_si256(x14, 0))
		intrinsics.unaligned_store((^u32x4)(dst[15:]), _mm_mm256_extracti128_si256(x15, 0))

		intrinsics.unaligned_store((^u32x4)(dst[16:]), _mm_mm256_extracti128_si256(x0, 1))
		intrinsics.unaligned_store((^u32x4)(dst[17:]), _mm_mm256_extracti128_si256(x1, 1))
		intrinsics.unaligned_store((^u32x4)(dst[18:]), _mm_mm256_extracti128_si256(x2, 1))
		intrinsics.unaligned_store((^u32x4)(dst[19:]), _mm_mm256_extracti128_si256(x3, 1))
		intrinsics.unaligned_store((^u32x4)(dst[20:]), _mm_mm256_extracti128_si256(x4, 1))
		intrinsics.unaligned_store((^u32x4)(dst[21:]), _mm_mm256_extracti128_si256(x5, 1))
		intrinsics.unaligned_store((^u32x4)(dst[22:]), _mm_mm256_extracti128_si256(x6, 1))
		intrinsics.unaligned_store((^u32x4)(dst[23:]), _mm_mm256_extracti128_si256(x7, 1))
		intrinsics.unaligned_store((^u32x4)(dst[24:]), _mm_mm256_extracti128_si256(x8, 1))
		intrinsics.unaligned_store((^u32x4)(dst[25:]), _mm_mm256_extracti128_si256(x9, 1))
		intrinsics.unaligned_store((^u32x4)(dst[26:]), _mm_mm256_extracti128_si256(x10, 1))
		intrinsics.unaligned_store((^u32x4)(dst[27:]), _mm_mm256_extracti128_si256(x11, 1))
		intrinsics.unaligned_store((^u32x4)(dst[28:]), _mm_mm256_extracti128_si256(x12, 1))
		intrinsics.unaligned_store((^u32x4)(dst[29:]), _mm_mm256_extracti128_si256(x13, 1))
		intrinsics.unaligned_store((^u32x4)(dst[30:]), _mm_mm256_extracti128_si256(x14, 1))
		intrinsics.unaligned_store((^u32x4)(dst[31:]), _mm_mm256_extracti128_si256(x15, 1))

		s12 = intrinsics.simd_add(s12, _CTR_INC_8)

		dst = dst[32:]
	}
}

@(private = "file", require_results, enable_target_feature="avx2")
_mm_mm256_extracti128_si256 :: #force_inline proc "c" (a: u32x8, $OFFSET: int) -> u32x4 {
	when OFFSET == 0 {
		return intrinsics.simd_shuffle(a, a, 0, 1, 2, 3)
	} else when OFFSET == 1 {
		return intrinsics.simd_shuffle(a, a, 4, 5, 6, 7)
	} else {
		#panic("chacha8rand: invalid offset")
	}
}
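A worked view of the deinterleave (not part of this diff): lane n of vector x_i holds word i of block n, so storing x_i contiguously at u32 offset i*4 places word i of block n at offset i*4 + n, which is exactly the STRIDE = 4 interleave the reference implementation writes out explicitly. Extracting the low 128-bit halves therefore emits blocks 0-3 as one interleaved 4-block group, and the high halves emit blocks 4-7 as the next.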
@@ -11,15 +11,50 @@ Generator :: runtime.Random_Generator
Generator_Query_Info :: runtime.Random_Generator_Query_Info

Default_Random_State :: runtime.Default_Random_State

/*
Returns an instance of the runtime pseudorandom generator. If no
initial state is provided, the PRNG will be lazily initialized with
entropy from the system entropy source on first use.

The cryptographic security of the returned random number generator
is directly dependent on the quality of the initialization entropy.
Calling `reset`/`create` SHOULD be done with no seed/state, or with
32 bytes of high-quality entropy.

WARNING:
- The lazy initialization will panic if there is no system entropy
  source available.
- While the generator is cryptographically secure, developers SHOULD
  prefer `crypto.random_generator()` for cryptographic use cases such
  as key generation.

Inputs:
- state: Optional initial PRNG state.

Returns:
- A `Generator` instance.
*/
default_random_generator :: runtime.default_random_generator

@(require_results)
create :: proc(seed: u64) -> (state: Default_Random_State) {
create_u64 :: proc(seed: u64) -> (state: Default_Random_State) {
	seed := seed
	runtime.default_random_generator_proc(&state, .Reset, ([^]byte)(&seed)[:size_of(seed)])
	return
}

@(require_results)
create_bytes :: proc(seed: []byte) -> (state: Default_Random_State) {
	runtime.default_random_generator_proc(&state, .Reset, seed)
	return
}

create :: proc {
	create_u64,
	create_bytes,
}

/*
Reset the seed used by the context.random_generator.

@@ -39,10 +74,14 @@ Possible Output:

	10
*/
reset :: proc(seed: u64, gen := context.random_generator) {
	runtime.random_generator_reset_u64(gen, seed)
reset :: proc {
	reset_u64,
	reset_bytes,
}

reset_u64 :: proc(seed: u64, gen := context.random_generator) {
	runtime.random_generator_reset_u64(gen, seed)
}

reset_bytes :: proc(bytes: []byte, gen := context.random_generator) {
	runtime.random_generator_reset_bytes(gen, bytes)
}
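An end-to-end sketch (not part of this diff) of the new API surface in this hunk; `rand.uint64` is assumed to be the existing reader in `core:math/rand`:

	import "core:math/rand"

	seed: [32]byte
	// ... fill `seed` with 32 bytes of high-quality entropy ...
	state := rand.create(seed[:]) // dispatches to create_bytes
	context.random_generator = rand.default_random_generator(&state)
	n := rand.uint64()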