core/math/rand: Disable LLVM vectorization of xoshiro on Intel

LLVM tries really hard to vectorize this when `-microarch:native` is specified on modern processors, even though doing so is a terrible idea. Preventing the vectorization yields a 2.5x to 3x performance gain.
2026-06-15 14:53:43 +00:00 · 2026-03-18 05:41:50 +09:00
parent 62d78d61fa
commit a171746a15
1 changed file with 15 additions and 2 deletions
--- a/core/math/rand/rand_xoshiro256.odin
+++ b/core/math/rand/rand_xoshiro256.odin
@@ -3,6 +3,18 @@ package rand
 import "base:intrinsics"
 import "base:runtime"

+when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+	// LLVM thinks that using SIMD for read_u64 is a good
+	// idea, when in fact it causes a ~3x performance
+	// regression.  As far as I can tell, this behavior
+	// is limited to Intel.
+	@(private = "file")
+	TARGET_FEATURES :: "-sse,-avx,-avx2"
+} else {
+	@(private = "file")
+	TARGET_FEATURES :: ""
+}
+
 /*
 The state for a xoshiro256** pseudorandom generator.
 */
@@ -10,8 +22,9 @@ Xoshiro256_Random_State :: struct {
 	s: [4]u64,
 }

+@(enable_target_feature = TARGET_FEATURES)
 xoshiro256_random_generator_proc :: proc(data: rawptr, mode: runtime.Random_Generator_Mode, p: []byte) {
-	@(require_results)
+	@(require_results, enable_target_feature = TARGET_FEATURES)
 	read_u64 :: proc "contextless" (r: ^Xoshiro256_Random_State) -> u64 {
 		// xoshiro256** output function and state transition

@@ -27,7 +40,7 @@ xoshiro256_random_generator_proc :: proc(data: rawptr, mode: runtime.Random_Gene

 		return result

-		rotate_left64 :: proc "contextless" (x: u64, k: int) -> u64 {
+		rotate_left64 :: #force_inline proc "contextless" (x: u64, k: int) -> u64 {
 			n :: 64
 			s := uint(k) & (n-1)
 			return x << s | x >> (n-s)