From 9d40f371bebaca6c74b615fa03dd5574eb51e327 Mon Sep 17 00:00:00 2001
From: Barinzaya
Date: Wed, 30 Jul 2025 12:37:12 -0400
Subject: [PATCH 1/4] Add static SIMD support to XXH3 in core:hash/xxhash.

This uses compile-time features to decide how large a SIMD vector to use. It
currently checks amd64/i386 targets and sizes its vectors for SSE2/AVX2/AVX512
as appropriate.

The generalized SIMD functions could also be useful for multiversioning of the
hash procs, to allow run-time dispatch based on available CPU features.
---
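Note (illustration only, not part of the patch): the width selection boils
down to a #config constant that can be overridden with -define, plus `when`
branches that the compiler resolves statically, so the path that is not taken
is never compiled in. A minimal sketch of that pattern with made-up names;
only -define:XXH_MAX_WIDTH applies to the real code
(e.g. `odin build . -define:XXH_MAX_WIDTH=128`):

package width_example

import "base:intrinsics"

// Maximum SIMD width in bits; overridable with e.g. -define:EXAMPLE_MAX_WIDTH=128.
EXAMPLE_MAX_WIDTH :: #config(EXAMPLE_MAX_WIDTH, 512) / 64

// The patch derives 2/4/8 lanes from intrinsics.has_target_feature checks;
// this sketch simply caps the width at two u64 lanes.
EXAMPLE_WIDTH :: min(EXAMPLE_MAX_WIDTH, 2)

sum_u64 :: proc(values: []u64) -> (total: u64) {
	when EXAMPLE_WIDTH > 1 {
		W :: EXAMPLE_WIDTH
		acc: #simd[W]u64
		i := 0
		// Wide loop: accumulate W lanes at a time, then fold the lanes together.
		for ; i + W <= len(values); i += W {
			acc = intrinsics.simd_add(acc, intrinsics.unaligned_load((^#simd[W]u64)(&values[i])))
		}
		total = intrinsics.simd_reduce_add_ordered(acc)
		// Scalar tail for whatever does not fill a full vector.
		for ; i < len(values); i += 1 {
			total += values[i]
		}
	} else {
		// Scalar fallback; the other branch is discarded at compile time.
		for v in values {
			total += v
		}
	}
	return
}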
 core/hash/xxhash/common.odin         | 32 ++++
 core/hash/xxhash/xxhash_3.odin       | 88 ++++++++++++++++++++++++++--
 core/hash/xxhash/xxhash_3_intel.odin | 13 ++++
 core/hash/xxhash/xxhash_3_other.odin |  8 +++
 4 files changed, 137 insertions(+), 4 deletions(-)
 create mode 100644 core/hash/xxhash/xxhash_3_intel.odin
 create mode 100644 core/hash/xxhash/xxhash_3_other.odin

diff --git a/core/hash/xxhash/common.odin b/core/hash/xxhash/common.odin
index adfc1bac2..636393b52 100644
--- a/core/hash/xxhash/common.odin
+++ b/core/hash/xxhash/common.odin
@@ -101,3 +101,35 @@ XXH64_read64 :: #force_inline proc(buf: []u8, alignment := Alignment.Unaligned)
 		return u64(b)
 	}
 }
+
+XXH64_read64_simd :: #force_inline proc(buf: []$E, $W: uint, alignment := Alignment.Unaligned) -> (res: #simd[W]u64) {
+	if alignment == .Aligned {
+		res = (^#simd[W]u64)(raw_data(buf))^
+	} else {
+		res = intrinsics.unaligned_load((^#simd[W]u64)(raw_data(buf)))
+	}
+
+	when ODIN_ENDIAN == .Big {
+		bytes := transmute(#simd[W*8]u8)res
+		bytes = intrinsics.simd_lanes_reverse(bytes)
+		res = transmute(#simd[W]u64)bytes
+		res = intrinsics.simd_lanes_reverse(res)
+	}
+	return
+}
+
+XXH64_write64_simd :: #force_inline proc(buf: []$E, value: $V/#simd[$W]u64, alignment := Alignment.Unaligned) {
+	value := value
+	when ODIN_ENDIAN == .Big {
+		bytes := transmute(#simd[W*8]u8)value
+		bytes = intrinsics.simd_lanes_reverse(bytes)
+		value = transmute(#simd[W]u64)bytes
+		value = intrinsics.simd_lanes_reverse(value)
+	}
+
+	if alignment == .Aligned {
+		(^V)(raw_data(buf))^ = value
+	} else {
+		intrinsics.unaligned_store((^V)(raw_data(buf)), value)
+	}
+}
diff --git a/core/hash/xxhash/xxhash_3.odin b/core/hash/xxhash/xxhash_3.odin
index 293e98528..8e88d4a90 100644
--- a/core/hash/xxhash/xxhash_3.odin
+++ b/core/hash/xxhash/xxhash_3.odin
@@ -52,6 +52,7 @@ XXH3_SECRET_SIZE_MIN :: 136
 #assert(len(XXH3_kSecret) == 192 && len(XXH3_kSecret) > XXH3_SECRET_SIZE_MIN)
 
 XXH_ACC_ALIGN :: 8 /* scalar */
+XXH_MAX_WIDTH :: #config(XXH_MAX_WIDTH, 512) / 64
 
 /*
 	This is the optimal update size for incremental hashing.
@@ -733,10 +734,6 @@ XXH3_accumulate_512_f :: #type proc(acc: []xxh_u64, input: []u8, secret:
 XXH3_scramble_accumulator_f :: #type proc(acc: []xxh_u64, secret: []u8)
 XXH3_init_custom_secret_f :: #type proc(custom_secret: []u8, seed64: xxh_u64)
 
-XXH3_accumulate_512 : XXH3_accumulate_512_f = XXH3_accumulate_512_scalar
-XXH3_scramble_accumulator : XXH3_scramble_accumulator_f = XXH3_scramble_accumulator_scalar
-XXH3_init_custom_secret : XXH3_init_custom_secret_f = XXH3_init_custom_secret_scalar
-
 /* scalar variants - universal */
 @(optimization_mode="favor_size")
 XXH3_accumulate_512_scalar :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) {
@@ -785,6 +782,89 @@ XXH3_init_custom_secret_scalar :: #force_inline proc(custom_secret: []u8, seed64
 	}
 }
 
+/* generalized SIMD variants */
+@(optimization_mode="favor_size")
+XXH3_accumulate_512_simd_generic :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8, $W: uint) {
+	u32xW :: #simd[W]u32
+	u64xW :: #simd[W]u64
+
+	#no_bounds_check for i in uint(0)..
+
+XXH3_accumulate_512 :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) {
+	when XXH_NATIVE_WIDTH > 1 {
+		XXH3_accumulate_512_simd_generic(acc, input, secret, XXH_NATIVE_WIDTH)
+	} else {
+		XXH3_accumulate_512_scalar(acc, input, secret)
+	}
+}
+
+XXH3_scramble_accumulator :: #force_inline proc(acc: []xxh_u64, secret: []u8) {
+	when XXH_NATIVE_WIDTH > 1 {
+		XXH3_scramble_accumulator_simd_generic(acc, secret, XXH_NATIVE_WIDTH)
+	} else {
+		XXH3_scramble_accumulator_scalar(acc, secret)
+	}
+}
+
+XXH3_init_custom_secret :: #force_inline proc(custom_secret: []u8, seed64: xxh_u64) {
+	when XXH_NATIVE_WIDTH > 1 {
+		XXH3_init_custom_secret_simd_generic(custom_secret, seed64, XXH_NATIVE_WIDTH)
+	} else {
+		XXH3_init_custom_secret_scalar(custom_secret, seed64)
+	}
+}
+
 XXH_PREFETCH_DIST :: 320
 
 /*
diff --git a/core/hash/xxhash/xxhash_3_intel.odin b/core/hash/xxhash/xxhash_3_intel.odin
new file mode 100644
index 000000000..3397fcef5
--- /dev/null
+++ b/core/hash/xxhash/xxhash_3_intel.odin
@@ -0,0 +1,13 @@
+#+build amd64, i386
+package xxhash
+
+import "base:intrinsics"
+
+@(private="file") SSE2_FEATURES :: "sse2"
+@(private="file") AVX2_FEATURES :: "avx2"
+@(private="file") AVX512_FEATURES :: "avx512dq,evex512"
+
+XXH_NATIVE_WIDTH :: min(XXH_MAX_WIDTH,
+	8 when intrinsics.has_target_feature(AVX512_FEATURES) else
+	4 when intrinsics.has_target_feature(AVX2_FEATURES) else
+	2 when intrinsics.has_target_feature(SSE2_FEATURES) else 1)
diff --git a/core/hash/xxhash/xxhash_3_other.odin b/core/hash/xxhash/xxhash_3_other.odin
new file mode 100644
index 000000000..e1a5d0474
--- /dev/null
+++ b/core/hash/xxhash/xxhash_3_other.odin
@@ -0,0 +1,8 @@
+#+build !amd64
+#+build !i386
+package xxhash
+
+import "base:runtime"
+
+XXH_NATIVE_WIDTH :: min(XXH_MAX_WIDTH,
+	2 when runtime.HAS_HARDWARE_SIMD else 1)

From 2f8b390c1922f096daf56c2400b212457fd37dca Mon Sep 17 00:00:00 2001
From: Barinzaya
Date: Wed, 30 Jul 2025 14:29:44 -0400
Subject: [PATCH 2/4] Various minor changes in XXH3.

This includes various minor things that didn't seem right or could be
improved, including:

- XXH3_state is documented to have a strict alignment requirement of 64
  bytes, and thus came with a disclaimer not to use `new` because it wouldn't
  be aligned correctly. It now has an `#align(64)` so that it will.
- An `_internal` proc being marked #force_no_inline (every other one is
  #force_inline).
- Unnecessarily casting the product of two u32s through u128 (and ultimately
  truncating to u64 anyway).
---
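Note (illustration only, not part of the patch): with #align(64) on the type,
plain `new` and stack variables satisfy the documented alignment requirement;
only a custom allocator that ignores the requested alignment still needs care.
A quick way to check this on a given setup, using a hypothetical stand-in type
rather than XXH3_state itself:

package align_example

import "core:fmt"

// Stand-in for XXH3_state; only the alignment attribute matters here.
Aligned_State :: struct #align(64) {
	acc: [8]u64,
}

main :: proc() {
	// The default allocator honours the type's alignment, so `new` is enough.
	s := new(Aligned_State)
	defer free(s)
	fmt.println("heap aligned: ", uintptr(rawptr(s)) % 64 == 0)

	// Stack allocation respects the alignment as well.
	local: Aligned_State
	fmt.println("stack aligned:", uintptr(rawptr(&local)) % 64 == 0)
}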
 core/hash/xxhash/xxhash_3.odin | 36 ++++++++++++++++------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/core/hash/xxhash/xxhash_3.odin b/core/hash/xxhash/xxhash_3.odin
index 8e88d4a90..555390bc5 100644
--- a/core/hash/xxhash/xxhash_3.odin
+++ b/core/hash/xxhash/xxhash_3.odin
@@ -63,10 +63,11 @@ XXH3_INTERNAL_BUFFER_SIZE :: 256
 	Streaming state.
 
 	IMPORTANT: This structure has a strict alignment requirement of 64 bytes!! **
-	Do not allocate this with `make()` or `new`, it will not be sufficiently aligned.
-	Use`XXH3_create_state` and `XXH3_destroy_state, or stack allocation.
+	Default allocators will align it correctly if created via `new`, as will
+	placing this struct on the cache, but if using a custom allocator make sure
+	that it handles the alignment correctly!
 */
-XXH3_state :: struct {
+XXH3_state :: struct #align(64) {
 	acc: [8]u64,
 	custom_secret: [XXH_SECRET_DEFAULT_SIZE]u8,
 	buffer: [XXH3_INTERNAL_BUFFER_SIZE]u8,
@@ -478,7 +479,7 @@ XXH3_128bits_internal :: #force_inline proc(
 
 /* === Public XXH128 API === */
 @(optimization_mode="favor_size")
 XXH3_128_default :: proc(input: []u8) -> (hash: XXH3_128_hash) {
-	return XXH3_128bits_internal(input, 0, XXH3_kSecret[:], XXH3_hashLong_128b_withSeed)
+	return XXH3_128bits_internal(input, 0, XXH3_kSecret[:], XXH3_hashLong_128b_default)
 }
@@ -748,7 +749,7 @@ XXH3_accumulate_512_scalar :: #force_inline proc(acc: []xxh_u64, input: []u8, se
 		sec := XXH64_read64(xsecret[8 * i:])
 		data_key := data_val ~ sec
 		xacc[i ~ 1] += data_val /* swap adjacent lanes */
-		xacc[i    ] += u64(u128(u32(data_key)) * u128(u64(data_key >> 32)))
+		xacc[i    ] += u64(u32(data_key)) * u64(data_key >> 32)
 	}
 }
@@ -966,19 +967,8 @@ XXH3_hashLong_64b_default :: #force_no_inline proc(input: []u8, seed64: xxh_u64,
 	return XXH3_hashLong_64b_internal(input, XXH3_kSecret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
 
-/*
-	XXH3_hashLong_64b_withSeed():
-	Generate a custom key based on alteration of default XXH3_kSecret with the seed,
-	and then use this key for long mode hashing.
-
-	This operation is decently fast but nonetheless costs a little bit of time.
-	Try to avoid it whenever possible (typically when seed==0).
-
-	It's important for performance that XXH3_hashLong is not inlined. Not sure
-	why (uop cache maybe?), but the difference is large and easily measurable.
-*/
 @(optimization_mode="favor_size")
-XXH3_hashLong_64b_withSeed_internal :: #force_no_inline proc(
+XXH3_hashLong_64b_withSeed_internal :: #force_inline proc(
 	input: []u8,
 	seed: xxh_u64,
 	f_acc512: XXH3_accumulate_512_f,
@@ -995,7 +985,15 @@ XXH3_hashLong_64b_withSeed_internal :: #force_no_inline proc(
 }
 
 /*
-	It's important for performance that XXH3_hashLong is not inlined.
+	XXH3_hashLong_64b_withSeed():
+	Generate a custom key based on alteration of default XXH3_kSecret with the seed,
+	and then use this key for long mode hashing.
+
+	This operation is decently fast but nonetheless costs a little bit of time.
+	Try to avoid it whenever possible (typically when seed==0).
+
+	It's important for performance that XXH3_hashLong is not inlined. Not sure
+	why (uop cache maybe?), but the difference is large and easily measurable.
 */
 @(optimization_mode="favor_size")
 XXH3_hashLong_64b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
@@ -1006,7 +1004,7 @@ XXH3_hashLong_64b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64,
 XXH3_hashLong64_f :: #type proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: xxh_u64)
 
 @(optimization_mode="favor_size")
-XXH3_64bits_internal :: proc(input: []u8, seed: xxh_u64, secret: []u8, f_hashLong: XXH3_hashLong64_f) -> (hash: xxh_u64) {
+XXH3_64bits_internal :: #force_inline proc(input: []u8, seed: xxh_u64, secret: []u8, f_hashLong: XXH3_hashLong64_f) -> (hash: xxh_u64) {
 	assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
 	/*
 		If an action is to be taken if len(secret) condition is not respected, it should be done here.

From f61dc7d071a95b1cdaaaed6f1451c7e1ddf384c5 Mon Sep 17 00:00:00 2001
From: Barinzaya
Date: Wed, 30 Jul 2025 16:47:06 -0400
Subject: [PATCH 3/4] Remove favor_size attributes inhibiting SIMD optimizations.

This makes a tremendous difference (2x with SSE2, 3x with AVX2) on big
datasets on my system, but the gain may be hardware-dependent (e.g.
instruction cache sizes). Naturally, this also results in somewhat larger
code for the large-data case (~75% larger).
---
 core/hash/xxhash/xxhash_3.odin | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/core/hash/xxhash/xxhash_3.odin b/core/hash/xxhash/xxhash_3.odin
index 555390bc5..bd5534f23 100644
--- a/core/hash/xxhash/xxhash_3.odin
+++ b/core/hash/xxhash/xxhash_3.odin
@@ -382,7 +382,6 @@ XXH3_INIT_ACC :: [XXH_ACC_NB]xxh_u64{
 
 XXH_SECRET_MERGEACCS_START :: 11
 
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_internal :: #force_inline proc(
 	input: []u8,
 	secret: []u8,
@@ -410,7 +409,6 @@ XXH3_hashLong_128b_internal :: #force_inline proc(
 /*
  * It's important for performance that XXH3_hashLong is not inlined.
  */
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_default :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
 	return XXH3_hashLong_128b_internal(input, XXH3_kSecret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
@@ -418,12 +416,10 @@ XXH3_hashLong_128b_default :: #force_no_inline proc(input: []u8, seed: xxh_u64,
 /*
  * It's important for performance that XXH3_hashLong is not inlined.
  */
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_withSecret :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
 	return XXH3_hashLong_128b_internal(input, secret, XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
 
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_withSeed_internal :: #force_inline proc(
 	input: []u8, seed: xxh_u64, secret: []u8,
 	f_acc512: XXH3_accumulate_512_f,
@@ -444,7 +440,6 @@ XXH3_hashLong_128b_withSeed_internal :: #force_inline proc(
 /*
  * It's important for performance that XXH3_hashLong is not inlined.
  */
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
 	return XXH3_hashLong_128b_withSeed_internal(input, seed, secret, XXH3_accumulate_512, XXH3_scramble_accumulator , XXH3_init_custom_secret)
 }
@@ -784,7 +779,6 @@ XXH3_init_custom_secret_scalar :: #force_inline proc(custom_secret: []u8, seed64
 	}
 }
 
 /* generalized SIMD variants */
-@(optimization_mode="favor_size")
 XXH3_accumulate_512_simd_generic :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8, $W: uint) {
 	u32xW :: #simd[W]u32
 	u64xW :: #simd[W]u64
@@ -824,7 +818,6 @@ XXH3_scramble_accumulator_simd_generic :: #force_inline proc(acc: []xxh_u64, sec
 	}
 }
 
-@(optimization_mode="favor_size")
 XXH3_init_custom_secret_simd_generic :: #force_inline proc(custom_secret: []u8, seed64: xxh_u64, $W: uint) {
 	u64xW :: #simd[W]u64
 
@@ -950,7 +943,6 @@ XXH3_hashLong_64b_internal :: #force_inline proc(input: []u8, secret: []u8,
 /*
 	It's important for performance that XXH3_hashLong is not inlined.
 */
-@(optimization_mode="favor_size")
 XXH3_hashLong_64b_withSecret :: #force_no_inline proc(input: []u8, seed64: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
 	return XXH3_hashLong_64b_internal(input, secret, XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
@@ -962,12 +954,10 @@ XXH3_hashLong_64b_withSecret :: #force_no_inline proc(input: []u8, seed64: xxh_u
 	This variant enforces that the compiler can detect that, and uses this
 	opportunity to streamline the generated code for better performance.
 */
-@(optimization_mode="favor_size")
 XXH3_hashLong_64b_default :: #force_no_inline proc(input: []u8, seed64: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
 	return XXH3_hashLong_64b_internal(input, XXH3_kSecret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
 
-@(optimization_mode="favor_size")
 XXH3_hashLong_64b_withSeed_internal :: #force_inline proc(
 	input: []u8,
 	seed: xxh_u64,
@@ -995,7 +985,6 @@ XXH3_hashLong_64b_withSeed_internal :: #force_inline proc(
 	It's important for performance that XXH3_hashLong is not inlined. Not sure
 	why (uop cache maybe?), but the difference is large and easily measurable.
 */
-@(optimization_mode="favor_size")
 XXH3_hashLong_64b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
 	return XXH3_hashLong_64b_withSeed_internal(input, seed, XXH3_accumulate_512, XXH3_scramble_accumulator, XXH3_init_custom_secret)
 }

From 4ef7ed1cbdf675ce62f7f305b6edb9fd76084c6c Mon Sep 17 00:00:00 2001
From: Barinzaya
Date: Thu, 31 Jul 2025 16:51:42 -0400
Subject: [PATCH 4/4] Skip bounds checking on the inner accumulate loop.

This helps performance with SSE (somewhat) and AVX-512 (quite a bit), but not
with AVX2, for some reason.
---
 core/hash/xxhash/xxhash_3.odin | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/hash/xxhash/xxhash_3.odin b/core/hash/xxhash/xxhash_3.odin
index bd5534f23..fe92f16d9 100644
--- a/core/hash/xxhash/xxhash_3.odin
+++ b/core/hash/xxhash/xxhash_3.odin
@@ -64,7 +64,7 @@ XXH3_INTERNAL_BUFFER_SIZE :: 256
 
 	IMPORTANT: This structure has a strict alignment requirement of 64 bytes!! **
 	Default allocators will align it correctly if created via `new`, as will
-	placing this struct on the cache, but if using a custom allocator make sure
+	placing this struct on the stack, but if using a custom allocator make sure
 	that it handles the alignment correctly!
 */
 XXH3_state :: struct #align(64) {
@@ -870,7 +870,7 @@ XXH_PREFETCH_DIST :: 320
 XXH3_accumulate :: #force_inline proc(
 	acc: []xxh_u64, input: []u8, secret: []u8, nbStripes: uint,
 	f_acc512: XXH3_accumulate_512_f) {
-	for n := uint(0); n < nbStripes; n += 1 {
+	#no_bounds_check for n := uint(0); n < nbStripes; n += 1 {
 		when !XXH_DISABLE_PREFETCH {
 			in_ptr := &input[n * XXH_STRIPE_LEN]
 			prefetch(in_ptr, XXH_PREFETCH_DIST)
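A quick way to compare the dispatched SIMD path against the scalar one on a
given machine, assuming the public XXH3_128_default entry point shown in
patch 2: a normal build uses the widest vectors the target features allow,
while building with -define:XXH_MAX_WIDTH=64 should pin XXH_NATIVE_WIDTH to 1
and force the scalar variants.

package xxh3_simd_check

import "core:fmt"
import "core:hash/xxhash"
import "core:time"

main :: proc() {
	// 16 MiB of deterministic input, large enough to hit the long-input path.
	data := make([]u8, 16 * 1024 * 1024)
	defer delete(data)
	for &b, i in data {
		b = u8(i * 7 + 3)
	}

	start := time.now()
	hash := xxhash.XXH3_128_default(data)
	elapsed := time.since(start)

	fmt.println("XXH3-128 =", hash)
	fmt.println("took     =", elapsed)
}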