#+build !i386
package runtime

import "base:intrinsics"

@(private = "file")
u32x4 :: #simd[4]u32
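
// The ChaCha "expand 32-byte k" sigma constants, splatted across all four
// lanes so a single vector feeds four blocks at once.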
@(private = "file")
S0: u32x4 : {CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0}
@(private = "file")
S1: u32x4 : {CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1}
@(private = "file")
S2: u32x4 : {CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2}
@(private = "file")
S3: u32x4 : {CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3}
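
// There is no vector rotate intrinsic, so rotl(x, n) is synthesized from a
// pair of shifts: the `L` constant holds n and the matching `R` constant
// holds 32 - n.  (`_ROT_16` needs no pair, as 16 is its own complement.)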
@(private = "file")
_ROT_7L: u32x4 : {7, 7, 7, 7}
@(private = "file")
_ROT_7R: u32x4 : {25, 25, 25, 25}
@(private = "file")
_ROT_12L: u32x4 : {12, 12, 12, 12}
@(private = "file")
_ROT_12R: u32x4 : {20, 20, 20, 20}
@(private = "file")
_ROT_8L: u32x4 : {8, 8, 8, 8}
@(private = "file")
_ROT_8R: u32x4 : {24, 24, 24, 24}
@(private = "file")
_ROT_16: u32x4 : {16, 16, 16, 16}
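
// Per-lane counter increments: each pass of the refill loop advances the
// block counter lanes by the number of blocks generated (8 or 4).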
@(private = "file")
_CTR_INC_4: u32x4 : {4, 4, 4, 4}
@(private = "file")
_CTR_INC_8: u32x4 : {8, 8, 8, 8}

when ODIN_ENDIAN == .Big {
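	// Swap the bytes within each 32-bit lane via a byte shuffle, so that
	// the output buffer always holds little-endian words.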
	@(private = "file")
	_byteswap_u32x4 :: #force_inline proc "contextless" (v: u32x4) -> u32x4 {
		u8x16 :: #simd[16]u8
		return(
			transmute(u32x4)intrinsics.simd_shuffle(
				transmute(u8x16)v,
				transmute(u8x16)v,
				3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
			)
		)
	}
}

@(private)
chacha8rand_refill_simd128 :: proc(r: ^Default_Random_State) {
	// Initialize the base state.
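	// The 256-bit key is stored immediately after the RNG output area in
	// `r._buf`, with its words interpreted little-endian.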
	k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:]))
	when ODIN_ENDIAN == .Little {
		s4_ := k[0]
		s5_ := k[1]
		s6_ := k[2]
		s7_ := k[3]
		s8_ := k[4]
		s9_ := k[5]
		s10_ := k[6]
		s11_ := k[7]
	} else {
		s4_ := intrinsics.byte_swap(k[0])
		s5_ := intrinsics.byte_swap(k[1])
		s6_ := intrinsics.byte_swap(k[2])
		s7_ := intrinsics.byte_swap(k[3])
		s8_ := intrinsics.byte_swap(k[4])
		s9_ := intrinsics.byte_swap(k[5])
		s10_ := intrinsics.byte_swap(k[6])
		s11_ := intrinsics.byte_swap(k[7])
	}
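
	// State words 0..3 are the sigma constants, 4..11 the key, 12 the
	// per-lane block counter, and 13..15 the (all-zero) nonce.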
	// 4-lane ChaCha8.
	s4 := u32x4{s4_, s4_, s4_, s4_}
	s5 := u32x4{s5_, s5_, s5_, s5_}
	s6 := u32x4{s6_, s6_, s6_, s6_}
	s7 := u32x4{s7_, s7_, s7_, s7_}
	s8 := u32x4{s8_, s8_, s8_, s8_}
	s9 := u32x4{s9_, s9_, s9_, s9_}
	s10 := u32x4{s10_, s10_, s10_, s10_}
	s11 := u32x4{s11_, s11_, s11_, s11_}
	s12 := u32x4{0, 1, 2, 3}
	s13, s14, s15: u32x4

	dst: [^]u32x4 = (^u32x4)(raw_data(r._buf[:]))
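
	// The ChaCha quarter round, applied lane-wise: one call advances the
	// same word of four independent blocks.  Each rotation is built from a
	// shift pair; the shifted halves are disjoint, so XOR combines them
	// exactly as OR would.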
	quarter_round :: #force_inline proc "contextless" (a, b, c, d: u32x4) -> (u32x4, u32x4, u32x4, u32x4) {
		a, b, c, d := a, b, c, d

		a = intrinsics.simd_add(a, b)
		d = intrinsics.simd_bit_xor(d, a)
		d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_16), intrinsics.simd_shr(d, _ROT_16))

		c = intrinsics.simd_add(c, d)
		b = intrinsics.simd_bit_xor(b, c)
		b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_12L), intrinsics.simd_shr(b, _ROT_12R))

		a = intrinsics.simd_add(a, b)
		d = intrinsics.simd_bit_xor(d, a)
		d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_8L), intrinsics.simd_shr(d, _ROT_8R))

		c = intrinsics.simd_add(c, d)
		b = intrinsics.simd_bit_xor(b, c)
		b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_7L), intrinsics.simd_shr(b, _ROT_7R))

		return a, b, c, d
	}

	// 8 blocks at a time.
	//
	// Note:
	// This uses a ton of registers, so it is only worth it on targets
	// that have something like 32 128-bit registers.  This is currently
	// all ARMv8 targets, and RISC-V Zvl128b (`V` application profile)
	// targets.
	//
	// While our current definition of `.arm32` is 32-bit ARMv8, this
	// may change in the future (ARMv7 is still relevant), and cores
	// like the Cortex-A8/A9 "pretend" to do 128-bit SIMD 64 bits at a
	// time, so this path needs benchmarking there.
	when ODIN_ARCH == .arm64 || ODIN_ARCH == .riscv64 {
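		// Two independent 4-lane groups run per iteration: group 0 covers
		// blocks n..n+3, group 1 covers blocks n+4..n+7 (hence `x12_1`
		// starting `_CTR_INC_4` ahead of `s12`).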
		for _ in 0..<2 {
			x0_0, x1_0, x2_0, x3_0 := S0, S1, S2, S3
			x4_0, x5_0, x6_0, x7_0 := s4, s5, s6, s7
			x8_0, x9_0, x10_0, x11_0 := s8, s9, s10, s11
			x12_0, x13_0, x14_0, x15_0 := s12, s13, s14, s15

			x0_1, x1_1, x2_1, x3_1 := S0, S1, S2, S3
			x4_1, x5_1, x6_1, x7_1 := s4, s5, s6, s7
			x8_1, x9_1, x10_1, x11_1 := s8, s9, s10, s11
			x12_1 := intrinsics.simd_add(s12, _CTR_INC_4)
			x13_1, x14_1, x15_1 := s13, s14, s15

			for i := CHACHA_ROUNDS; i > 0; i = i - 2 {
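				// One double round: a column round followed by a diagonal
				// round, with the two block groups interleaved so their
				// dependency chains can overlap.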
				x0_0, x4_0, x8_0, x12_0 = quarter_round(x0_0, x4_0, x8_0, x12_0)
				x0_1, x4_1, x8_1, x12_1 = quarter_round(x0_1, x4_1, x8_1, x12_1)
				x1_0, x5_0, x9_0, x13_0 = quarter_round(x1_0, x5_0, x9_0, x13_0)
				x1_1, x5_1, x9_1, x13_1 = quarter_round(x1_1, x5_1, x9_1, x13_1)
				x2_0, x6_0, x10_0, x14_0 = quarter_round(x2_0, x6_0, x10_0, x14_0)
				x2_1, x6_1, x10_1, x14_1 = quarter_round(x2_1, x6_1, x10_1, x14_1)
				x3_0, x7_0, x11_0, x15_0 = quarter_round(x3_0, x7_0, x11_0, x15_0)
				x3_1, x7_1, x11_1, x15_1 = quarter_round(x3_1, x7_1, x11_1, x15_1)

				x0_0, x5_0, x10_0, x15_0 = quarter_round(x0_0, x5_0, x10_0, x15_0)
				x0_1, x5_1, x10_1, x15_1 = quarter_round(x0_1, x5_1, x10_1, x15_1)
				x1_0, x6_0, x11_0, x12_0 = quarter_round(x1_0, x6_0, x11_0, x12_0)
				x1_1, x6_1, x11_1, x12_1 = quarter_round(x1_1, x6_1, x11_1, x12_1)
				x2_0, x7_0, x8_0, x13_0 = quarter_round(x2_0, x7_0, x8_0, x13_0)
				x2_1, x7_1, x8_1, x13_1 = quarter_round(x2_1, x7_1, x8_1, x13_1)
				x3_0, x4_0, x9_0, x14_0 = quarter_round(x3_0, x4_0, x9_0, x14_0)
				x3_1, x4_1, x9_1, x14_1 = quarter_round(x3_1, x4_1, x9_1, x14_1)
			}
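
			// Feed-forward: only the key words (and the zero nonce words,
			// where the add is a no-op) are added back; the constants and
			// the counter lanes are stored raw.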
			when ODIN_ENDIAN == .Little {
				intrinsics.unaligned_store((^u32x4)(dst[0:]), x0_0)
				intrinsics.unaligned_store((^u32x4)(dst[1:]), x1_0)
				intrinsics.unaligned_store((^u32x4)(dst[2:]), x2_0)
				intrinsics.unaligned_store((^u32x4)(dst[3:]), x3_0)
				intrinsics.unaligned_store((^u32x4)(dst[4:]), intrinsics.simd_add(x4_0, s4))
				intrinsics.unaligned_store((^u32x4)(dst[5:]), intrinsics.simd_add(x5_0, s5))
				intrinsics.unaligned_store((^u32x4)(dst[6:]), intrinsics.simd_add(x6_0, s6))
				intrinsics.unaligned_store((^u32x4)(dst[7:]), intrinsics.simd_add(x7_0, s7))
				intrinsics.unaligned_store((^u32x4)(dst[8:]), intrinsics.simd_add(x8_0, s8))
				intrinsics.unaligned_store((^u32x4)(dst[9:]), intrinsics.simd_add(x9_0, s9))
				intrinsics.unaligned_store((^u32x4)(dst[10:]), intrinsics.simd_add(x10_0, s10))
				intrinsics.unaligned_store((^u32x4)(dst[11:]), intrinsics.simd_add(x11_0, s11))
				intrinsics.unaligned_store((^u32x4)(dst[12:]), x12_0)
				intrinsics.unaligned_store((^u32x4)(dst[13:]), intrinsics.simd_add(x13_0, s13))
				intrinsics.unaligned_store((^u32x4)(dst[14:]), intrinsics.simd_add(x14_0, s14))
				intrinsics.unaligned_store((^u32x4)(dst[15:]), intrinsics.simd_add(x15_0, s15))

				intrinsics.unaligned_store((^u32x4)(dst[16:]), x0_1)
				intrinsics.unaligned_store((^u32x4)(dst[17:]), x1_1)
				intrinsics.unaligned_store((^u32x4)(dst[18:]), x2_1)
				intrinsics.unaligned_store((^u32x4)(dst[19:]), x3_1)
				intrinsics.unaligned_store((^u32x4)(dst[20:]), intrinsics.simd_add(x4_1, s4))
				intrinsics.unaligned_store((^u32x4)(dst[21:]), intrinsics.simd_add(x5_1, s5))
				intrinsics.unaligned_store((^u32x4)(dst[22:]), intrinsics.simd_add(x6_1, s6))
				intrinsics.unaligned_store((^u32x4)(dst[23:]), intrinsics.simd_add(x7_1, s7))
				intrinsics.unaligned_store((^u32x4)(dst[24:]), intrinsics.simd_add(x8_1, s8))
				intrinsics.unaligned_store((^u32x4)(dst[25:]), intrinsics.simd_add(x9_1, s9))
				intrinsics.unaligned_store((^u32x4)(dst[26:]), intrinsics.simd_add(x10_1, s10))
				intrinsics.unaligned_store((^u32x4)(dst[27:]), intrinsics.simd_add(x11_1, s11))
				intrinsics.unaligned_store((^u32x4)(dst[28:]), x12_1)
				intrinsics.unaligned_store((^u32x4)(dst[29:]), intrinsics.simd_add(x13_1, s13))
				intrinsics.unaligned_store((^u32x4)(dst[30:]), intrinsics.simd_add(x14_1, s14))
				intrinsics.unaligned_store((^u32x4)(dst[31:]), intrinsics.simd_add(x15_1, s15))
			} else {
				intrinsics.unaligned_store((^u32x4)(dst[0:]), _byteswap_u32x4(x0_0))
				intrinsics.unaligned_store((^u32x4)(dst[1:]), _byteswap_u32x4(x1_0))
				intrinsics.unaligned_store((^u32x4)(dst[2:]), _byteswap_u32x4(x2_0))
				intrinsics.unaligned_store((^u32x4)(dst[3:]), _byteswap_u32x4(x3_0))
				intrinsics.unaligned_store((^u32x4)(dst[4:]), _byteswap_u32x4(intrinsics.simd_add(x4_0, s4)))
				intrinsics.unaligned_store((^u32x4)(dst[5:]), _byteswap_u32x4(intrinsics.simd_add(x5_0, s5)))
				intrinsics.unaligned_store((^u32x4)(dst[6:]), _byteswap_u32x4(intrinsics.simd_add(x6_0, s6)))
				intrinsics.unaligned_store((^u32x4)(dst[7:]), _byteswap_u32x4(intrinsics.simd_add(x7_0, s7)))
				intrinsics.unaligned_store((^u32x4)(dst[8:]), _byteswap_u32x4(intrinsics.simd_add(x8_0, s8)))
				intrinsics.unaligned_store((^u32x4)(dst[9:]), _byteswap_u32x4(intrinsics.simd_add(x9_0, s9)))
				intrinsics.unaligned_store((^u32x4)(dst[10:]), _byteswap_u32x4(intrinsics.simd_add(x10_0, s10)))
				intrinsics.unaligned_store((^u32x4)(dst[11:]), _byteswap_u32x4(intrinsics.simd_add(x11_0, s11)))
				intrinsics.unaligned_store((^u32x4)(dst[12:]), _byteswap_u32x4(x12_0))
				intrinsics.unaligned_store((^u32x4)(dst[13:]), _byteswap_u32x4(intrinsics.simd_add(x13_0, s13)))
				intrinsics.unaligned_store((^u32x4)(dst[14:]), _byteswap_u32x4(intrinsics.simd_add(x14_0, s14)))
				intrinsics.unaligned_store((^u32x4)(dst[15:]), _byteswap_u32x4(intrinsics.simd_add(x15_0, s15)))

				intrinsics.unaligned_store((^u32x4)(dst[16:]), _byteswap_u32x4(x0_1))
				intrinsics.unaligned_store((^u32x4)(dst[17:]), _byteswap_u32x4(x1_1))
				intrinsics.unaligned_store((^u32x4)(dst[18:]), _byteswap_u32x4(x2_1))
				intrinsics.unaligned_store((^u32x4)(dst[19:]), _byteswap_u32x4(x3_1))
				intrinsics.unaligned_store((^u32x4)(dst[20:]), _byteswap_u32x4(intrinsics.simd_add(x4_1, s4)))
				intrinsics.unaligned_store((^u32x4)(dst[21:]), _byteswap_u32x4(intrinsics.simd_add(x5_1, s5)))
				intrinsics.unaligned_store((^u32x4)(dst[22:]), _byteswap_u32x4(intrinsics.simd_add(x6_1, s6)))
				intrinsics.unaligned_store((^u32x4)(dst[23:]), _byteswap_u32x4(intrinsics.simd_add(x7_1, s7)))
				intrinsics.unaligned_store((^u32x4)(dst[24:]), _byteswap_u32x4(intrinsics.simd_add(x8_1, s8)))
				intrinsics.unaligned_store((^u32x4)(dst[25:]), _byteswap_u32x4(intrinsics.simd_add(x9_1, s9)))
				intrinsics.unaligned_store((^u32x4)(dst[26:]), _byteswap_u32x4(intrinsics.simd_add(x10_1, s10)))
				intrinsics.unaligned_store((^u32x4)(dst[27:]), _byteswap_u32x4(intrinsics.simd_add(x11_1, s11)))
				intrinsics.unaligned_store((^u32x4)(dst[28:]), _byteswap_u32x4(x12_1))
				intrinsics.unaligned_store((^u32x4)(dst[29:]), _byteswap_u32x4(intrinsics.simd_add(x13_1, s13)))
				intrinsics.unaligned_store((^u32x4)(dst[30:]), _byteswap_u32x4(intrinsics.simd_add(x14_1, s14)))
				intrinsics.unaligned_store((^u32x4)(dst[31:]), _byteswap_u32x4(intrinsics.simd_add(x15_1, s15)))
			}

			s12 = intrinsics.simd_add(s12, _CTR_INC_8)

			dst = dst[32:]
		}
	} else {
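		// 4 blocks at a time, for targets with fewer vector registers.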
		for _ in 0..<4 {
			x0, x1, x2, x3 := S0, S1, S2, S3
			x4, x5, x6, x7 := s4, s5, s6, s7
			x8, x9, x10, x11 := s8, s9, s10, s11
			x12, x13, x14, x15 := s12, s13, s14, s15

			for i := CHACHA_ROUNDS; i > 0; i = i - 2 {
				x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12)
				x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13)
				x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14)
				x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15)

				x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15)
				x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12)
				x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13)
				x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14)
			}

			when ODIN_ENDIAN == .Little {
				intrinsics.unaligned_store((^u32x4)(dst[0:]), x0)
				intrinsics.unaligned_store((^u32x4)(dst[1:]), x1)
				intrinsics.unaligned_store((^u32x4)(dst[2:]), x2)
				intrinsics.unaligned_store((^u32x4)(dst[3:]), x3)
				intrinsics.unaligned_store((^u32x4)(dst[4:]), intrinsics.simd_add(x4, s4))
				intrinsics.unaligned_store((^u32x4)(dst[5:]), intrinsics.simd_add(x5, s5))
				intrinsics.unaligned_store((^u32x4)(dst[6:]), intrinsics.simd_add(x6, s6))
				intrinsics.unaligned_store((^u32x4)(dst[7:]), intrinsics.simd_add(x7, s7))
				intrinsics.unaligned_store((^u32x4)(dst[8:]), intrinsics.simd_add(x8, s8))
				intrinsics.unaligned_store((^u32x4)(dst[9:]), intrinsics.simd_add(x9, s9))
				intrinsics.unaligned_store((^u32x4)(dst[10:]), intrinsics.simd_add(x10, s10))
				intrinsics.unaligned_store((^u32x4)(dst[11:]), intrinsics.simd_add(x11, s11))
				intrinsics.unaligned_store((^u32x4)(dst[12:]), x12)
				intrinsics.unaligned_store((^u32x4)(dst[13:]), intrinsics.simd_add(x13, s13))
				intrinsics.unaligned_store((^u32x4)(dst[14:]), intrinsics.simd_add(x14, s14))
				intrinsics.unaligned_store((^u32x4)(dst[15:]), intrinsics.simd_add(x15, s15))
			} else {
				intrinsics.unaligned_store((^u32x4)(dst[0:]), _byteswap_u32x4(x0))
				intrinsics.unaligned_store((^u32x4)(dst[1:]), _byteswap_u32x4(x1))
				intrinsics.unaligned_store((^u32x4)(dst[2:]), _byteswap_u32x4(x2))
				intrinsics.unaligned_store((^u32x4)(dst[3:]), _byteswap_u32x4(x3))
				intrinsics.unaligned_store((^u32x4)(dst[4:]), _byteswap_u32x4(intrinsics.simd_add(x4, s4)))
				intrinsics.unaligned_store((^u32x4)(dst[5:]), _byteswap_u32x4(intrinsics.simd_add(x5, s5)))
				intrinsics.unaligned_store((^u32x4)(dst[6:]), _byteswap_u32x4(intrinsics.simd_add(x6, s6)))
				intrinsics.unaligned_store((^u32x4)(dst[7:]), _byteswap_u32x4(intrinsics.simd_add(x7, s7)))
				intrinsics.unaligned_store((^u32x4)(dst[8:]), _byteswap_u32x4(intrinsics.simd_add(x8, s8)))
				intrinsics.unaligned_store((^u32x4)(dst[9:]), _byteswap_u32x4(intrinsics.simd_add(x9, s9)))
				intrinsics.unaligned_store((^u32x4)(dst[10:]), _byteswap_u32x4(intrinsics.simd_add(x10, s10)))
				intrinsics.unaligned_store((^u32x4)(dst[11:]), _byteswap_u32x4(intrinsics.simd_add(x11, s11)))
				intrinsics.unaligned_store((^u32x4)(dst[12:]), _byteswap_u32x4(x12))
				intrinsics.unaligned_store((^u32x4)(dst[13:]), _byteswap_u32x4(intrinsics.simd_add(x13, s13)))
				intrinsics.unaligned_store((^u32x4)(dst[14:]), _byteswap_u32x4(intrinsics.simd_add(x14, s14)))
				intrinsics.unaligned_store((^u32x4)(dst[15:]), _byteswap_u32x4(intrinsics.simd_add(x15, s15)))
			}

			s12 = intrinsics.simd_add(s12, _CTR_INC_4)

			dst = dst[16:]
		}
	}
}