package runtime

import "base:intrinsics"

@(private)
chacha8rand_refill_ref :: proc(r: ^Default_Random_State) {
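	// ChaCha's working state is a 4x4 matrix of u32 words:
	//
	//   s0   s1   s2   s3     sigma ("expand 32-byte k")
	//   s4   s5   s6   s7     key words 0..3
	//   s8   s9   s10  s11    key words 4..7
	//   s12  s13  s14  s15    counter, IV
	//
	// s0..s3 are the fixed CHACHA_SIGMA_* constants, so only s4..s15 are
	// materialized below; the key is read from r._buf at offset
	// RNG_OUTPUT_PER_ITER.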
	// Initialize the base state.
	k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
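	// The key words are stored little-endian; byte-swap them on
	// big-endian targets.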
	when ODIN_ENDIAN == .Little {
		s4 := k[0]
		s5 := k[1]
		s6 := k[2]
		s7 := k[3]
		s8 := k[4]
		s9 := k[5]
		s10 := k[6]
		s11 := k[7]
	} else {
		s4 := intrinsics.byte_swap(k[0])
		s5 := intrinsics.byte_swap(k[1])
		s6 := intrinsics.byte_swap(k[2])
		s7 := intrinsics.byte_swap(k[3])
		s8 := intrinsics.byte_swap(k[4])
		s9 := intrinsics.byte_swap(k[5])
		s10 := intrinsics.byte_swap(k[6])
		s11 := intrinsics.byte_swap(k[7])
	}
	s12: u32 // Counter starts at 0.
	s13, s14, s15: u32 // IV of all 0s.

	dst: [^]u32 = (^u32)(raw_data(r._buf[:])) // Output cursor into r._buf.

	// At least with LLVM 21, #force_inline produces perf identical to
	// manual inlining, yay.
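	// quarter_round is the standard ChaCha quarter round: four
	// add-rotate-xor steps applied to one column or diagonal of the state.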
	quarter_round := #force_inline proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) {
		a, b, c, d := a, b, c, d

		a += b
		d ~= a
		d = rotl(d, 16)

		c += d
		b ~= c
		b = rotl(b, 12)

		a += b
		d ~= a
		d = rotl(d, 8)

		c += d
		b ~= c
		b = rotl(b, 7)

		return a, b, c, d
	}

	// Filippo Valsorda made the observation that only one of the column
	// rounds depends on the counter (s12), so the other three are worth
	// precomputing and reusing across multiple blocks. As far as I know,
	// only Go's chacha implementation does this.
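	// Columns 1-3 mix words {1,5,9,13}, {2,6,10,14}, and {3,7,11,15};
	// none of those include the counter, so their first column round is
	// identical for every block.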

	p1, p5, p9, p13 := quarter_round(CHACHA_SIGMA_1, s5, s9, s13)
	p2, p6, p10, p14 := quarter_round(CHACHA_SIGMA_2, s6, s10, s14)
	p3, p7, p11, p15 := quarter_round(CHACHA_SIGMA_3, s7, s11, s15)

	// 4 groups
	for g := 0; g < 4; g = g + 1 {
		// 4 blocks per group
		for n := 0; n < 4; n = n + 1 {
			// First column round that depends on the counter
			p0, p4, p8, p12 := quarter_round(CHACHA_SIGMA_0, s4, s8, s12)

			// First diagonal round
			x0, x5, x10, x15 := quarter_round(p0, p5, p10, p15)
			x1, x6, x11, x12 := quarter_round(p1, p6, p11, p12)
			x2, x7, x8, x13 := quarter_round(p2, p7, p8, p13)
			x3, x4, x9, x14 := quarter_round(p3, p4, p9, p14)
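
			// The first column + diagonal round pair was done above, so
			// CHACHA_ROUNDS - 2 rounds remain, two per loop iteration.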
			for i := CHACHA_ROUNDS - 2; i > 0; i = i - 2 {
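				// Column round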
				x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
				x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
				x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
				x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)
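
				// Diagonal round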
				x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
				x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
				x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
				x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
			}

			// Interleave the 4 blocks: block n writes output words
			// n, n+4, ..., n+60
			// NB: The additions of sigma and the counter are omitted
			STRIDE :: 4
			d_ := dst[n:]
			when ODIN_ENDIAN == .Little {
				d_[STRIDE*0] = x0
				d_[STRIDE*1] = x1
				d_[STRIDE*2] = x2
				d_[STRIDE*3] = x3
				d_[STRIDE*4] = x4 + s4
				d_[STRIDE*5] = x5 + s5
				d_[STRIDE*6] = x6 + s6
				d_[STRIDE*7] = x7 + s7
				d_[STRIDE*8] = x8 + s8
				d_[STRIDE*9] = x9 + s9
				d_[STRIDE*10] = x10 + s10
				d_[STRIDE*11] = x11 + s11
				d_[STRIDE*12] = x12
				d_[STRIDE*13] = x13 + s13
				d_[STRIDE*14] = x14 + s14
				d_[STRIDE*15] = x15 + s15
			} else {
				d_[STRIDE*0] = intrinsics.byte_swap(x0)
				d_[STRIDE*1] = intrinsics.byte_swap(x1)
				d_[STRIDE*2] = intrinsics.byte_swap(x2)
				d_[STRIDE*3] = intrinsics.byte_swap(x3)
				d_[STRIDE*4] = intrinsics.byte_swap(x4 + s4)
				d_[STRIDE*5] = intrinsics.byte_swap(x5 + s5)
				d_[STRIDE*6] = intrinsics.byte_swap(x6 + s6)
				d_[STRIDE*7] = intrinsics.byte_swap(x7 + s7)
				d_[STRIDE*8] = intrinsics.byte_swap(x8 + s8)
				d_[STRIDE*9] = intrinsics.byte_swap(x9 + s9)
				d_[STRIDE*10] = intrinsics.byte_swap(x10 + s10)
				d_[STRIDE*11] = intrinsics.byte_swap(x11 + s11)
				d_[STRIDE*12] = intrinsics.byte_swap(x12)
				d_[STRIDE*13] = intrinsics.byte_swap(x13 + s13)
				d_[STRIDE*14] = intrinsics.byte_swap(x14 + s14)
				d_[STRIDE*15] = intrinsics.byte_swap(x15 + s15)
			}

			s12 = s12 + 1 // Increment the counter
		}

		dst = dst[16*4:] // Advance past the 16 u32s * 4 blocks just written
	}
}

// This replicates `rotate_left32` from `core:math/bits`, under the
// assumption that this will live in `base:runtime`.
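// e.g. rotl(0x8000_0001, 4) == 0x0000_0018 (high bits wrap to the low end).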
@(require_results, private = "file")
rotl :: #force_inline proc "contextless" (x: u32, k: int) -> u32 {
	n :: 32
	s := uint(k) & (n-1)
	return x << s | x >> (n-s)
}