Odin/base/runtime/random_generator_chacha8_ref.odin

package runtime
import "base:intrinsics"
@(private)
chacha8rand_refill_ref :: proc(r: ^Default_Random_State) {
	// Initialize the base state.
	k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
	when ODIN_ENDIAN == .Little {
		s4 := k[0]
		s5 := k[1]
		s6 := k[2]
		s7 := k[3]
		s8 := k[4]
		s9 := k[5]
		s10 := k[6]
		s11 := k[7]
	} else {
		s4 := intrinsics.byte_swap(k[0])
		s5 := intrinsics.byte_swap(k[1])
		s6 := intrinsics.byte_swap(k[2])
		s7 := intrinsics.byte_swap(k[3])
		s8 := intrinsics.byte_swap(k[4])
		s9 := intrinsics.byte_swap(k[5])
		s10 := intrinsics.byte_swap(k[6])
		s11 := intrinsics.byte_swap(k[7])
	}
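	// s4 .. s11 now hold the 8 key words (32 bytes); on big-endian targets they
	// are byte-swapped above so the state matches ChaCha's little-endian convention.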
	s12: u32           // Counter starts at 0.
	s13, s14, s15: u32 // IV of all 0s.

	dst: [^]u32 = (^u32)(raw_data(r._buf[:]))

	// At least with LLVM 21, force_inline produces identical perf to
	// manual inlining, yay.
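	// The quarter round below is the standard ChaCha ARX quarter round
	// (add, xor, rotate) with rotation amounts 16, 12, 8 and 7.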
	quarter_round := #force_inline proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) {
		a, b, c, d := a, b, c, d
		a += b
		d ~= a
		d = rotl(d, 16)
		c += d
		b ~= c
		b = rotl(b, 12)
		a += b
		d ~= a
		d = rotl(d, 8)
		c += d
		b ~= c
		b = rotl(b, 7)
		return a, b, c, d
	}

	// Filippo Valsorda made an observation that only one of the column
	// rounds depends on the counter (s12), so the other three are worth
	// precomputing and reusing across multiple blocks. As far as I know,
	// only Go's chacha implementation does this.
	p1, p5, p9, p13 := quarter_round(CHACHA_SIGMA_1, s5, s9, s13)
	p2, p6, p10, p14 := quarter_round(CHACHA_SIGMA_2, s6, s10, s14)
	p3, p7, p11, p15 := quarter_round(CHACHA_SIGMA_3, s7, s11, s15)
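
	// Only the remaining column (the one fed by s12) has to be recomputed for
	// each of the 16 blocks generated below; p1 .. p15 stay fixed for the
	// whole refill.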

	// 4 groups
	for g := 0; g < 4; g = g + 1 {
		// 4 blocks per group
		for n := 0; n < 4; n = n + 1 {
			// The one column of the first column round that depends on the counter (s12)
			p0, p4, p8, p12 := quarter_round(CHACHA_SIGMA_0, s4, s8, s12)

			// First diagonal round
			x0, x5, x10, x15 := quarter_round(p0, p5, p10, p15)
			x1, x6, x11, x12 := quarter_round(p1, p6, p11, p12)
			x2, x7, x8, x13 := quarter_round(p2, p7, p8, p13)
			x3, x4, x9, x14 := quarter_round(p3, p4, p9, p14)
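
			// Together with the precomputed columns above, this completes the
			// first column and diagonal rounds; the loop below runs the
			// remaining CHACHA_ROUNDS - 2 rounds, one column and one diagonal
			// round per iteration.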
			for i := CHACHA_ROUNDS - 2; i > 0; i = i - 2 {
				x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
				x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
				x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
				x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)
				x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
				x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
				x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
				x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
			}

			// Interleave 4 blocks
			// NB: The additions of sigma and the counter are omitted
			STRIDE :: 4
			d_ := dst[n:]
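
			// Word j of block n within this group lands at dst[j*STRIDE + n],
			// i.e. the 4 blocks of a group are interleaved word by word.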
			when ODIN_ENDIAN == .Little {
				d_[STRIDE*0] = x0
				d_[STRIDE*1] = x1
				d_[STRIDE*2] = x2
				d_[STRIDE*3] = x3
				d_[STRIDE*4] = x4 + s4
				d_[STRIDE*5] = x5 + s5
				d_[STRIDE*6] = x6 + s6
				d_[STRIDE*7] = x7 + s7
				d_[STRIDE*8] = x8 + s8
				d_[STRIDE*9] = x9 + s9
				d_[STRIDE*10] = x10 + s10
				d_[STRIDE*11] = x11 + s11
				d_[STRIDE*12] = x12
				d_[STRIDE*13] = x13 + s13
				d_[STRIDE*14] = x14 + s14
				d_[STRIDE*15] = x15 + s15
			} else {
				d_[STRIDE*0] = intrinsics.byte_swap(x0)
				d_[STRIDE*1] = intrinsics.byte_swap(x1)
				d_[STRIDE*2] = intrinsics.byte_swap(x2)
				d_[STRIDE*3] = intrinsics.byte_swap(x3)
				d_[STRIDE*4] = intrinsics.byte_swap(x4 + s4)
				d_[STRIDE*5] = intrinsics.byte_swap(x5 + s5)
				d_[STRIDE*6] = intrinsics.byte_swap(x6 + s6)
				d_[STRIDE*7] = intrinsics.byte_swap(x7 + s7)
				d_[STRIDE*8] = intrinsics.byte_swap(x8 + s8)
				d_[STRIDE*9] = intrinsics.byte_swap(x9 + s9)
				d_[STRIDE*10] = intrinsics.byte_swap(x10 + s10)
				d_[STRIDE*11] = intrinsics.byte_swap(x11 + s11)
				d_[STRIDE*12] = intrinsics.byte_swap(x12)
				d_[STRIDE*13] = intrinsics.byte_swap(x13 + s13)
				d_[STRIDE*14] = intrinsics.byte_swap(x14 + s14)
				d_[STRIDE*15] = intrinsics.byte_swap(x15 + s15)
			}

			s12 = s12 + 1 // Increment the counter
		}
		dst = dst[16*4:]
	}
}
// This replicates `rotate_left32` from `core:math/bits`, under the
// assumption that this will live in `base:runtime`.
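// For example, rotl(0x8000_0000, 1) == 1; masking with n-1 reduces the
// rotation amount mod 32.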
@(require_results, private = "file")
rotl :: #force_inline proc "contextless" (x: u32, k: int) -> u32 {
	n :: 32
	s := uint(k) & (n-1)
	return x << s | x >> (n-s)
}