From 9d40f371bebaca6c74b615fa03dd5574eb51e327 Mon Sep 17 00:00:00 2001
From: Barinzaya
Date: Wed, 30 Jul 2025 12:37:12 -0400
Subject: [PATCH 1/4] Add static SIMD support to XXH3 in core:hash/xxhash.

This uses compile-time features to decide how large a SIMD vector to use. It
currently checks amd64/i386 targets and sizes its vectors for SSE2/AVX2/AVX512
as appropriate.

The generalized SIMD functions could also be useful for multiversioning of the
hash procs, to allow run-time dispatch based on available CPU features.
---
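Note (illustration only, not part of the patch): the width selection boils
down to a #config constant that can be overridden with -define, plus `when`
branches that the compiler resolves statically, so the path that is not taken
is never compiled in. A minimal sketch of that pattern with made-up names;
only -define:XXH_MAX_WIDTH applies to the real code
(e.g. `odin build . -define:XXH_MAX_WIDTH=128`):

package width_example

import "base:intrinsics"

// Maximum SIMD width in bits; overridable with e.g. -define:EXAMPLE_MAX_WIDTH=128.
EXAMPLE_MAX_WIDTH :: #config(EXAMPLE_MAX_WIDTH, 512) / 64

// The patch derives 2/4/8 lanes from intrinsics.has_target_feature checks;
// this sketch simply caps the width at two u64 lanes.
EXAMPLE_WIDTH :: min(EXAMPLE_MAX_WIDTH, 2)

sum_u64 :: proc(values: []u64) -> (total: u64) {
	when EXAMPLE_WIDTH > 1 {
		W :: EXAMPLE_WIDTH
		acc: #simd[W]u64
		i := 0
		// Wide loop: accumulate W lanes at a time, then fold the lanes together.
		for ; i + W <= len(values); i += W {
			acc = intrinsics.simd_add(acc, intrinsics.unaligned_load((^#simd[W]u64)(&values[i])))
		}
		total = intrinsics.simd_reduce_add_ordered(acc)
		// Scalar tail for whatever does not fill a full vector.
		for ; i < len(values); i += 1 {
			total += values[i]
		}
	} else {
		// Scalar fallback; the other branch is discarded at compile time.
		for v in values {
			total += v
		}
	}
	return
}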
 core/hash/xxhash/common.odin         | 32 ++++
 core/hash/xxhash/xxhash_3.odin       | 88 ++++++++++++++++++++++++++--
 core/hash/xxhash/xxhash_3_intel.odin | 13 ++++
 core/hash/xxhash/xxhash_3_other.odin |  8 +++
 4 files changed, 137 insertions(+), 4 deletions(-)
 create mode 100644 core/hash/xxhash/xxhash_3_intel.odin
 create mode 100644 core/hash/xxhash/xxhash_3_other.odin

diff --git a/core/hash/xxhash/common.odin b/core/hash/xxhash/common.odin
index adfc1bac2..636393b52 100644
--- a/core/hash/xxhash/common.odin
+++ b/core/hash/xxhash/common.odin
@@ -101,3 +101,35 @@ XXH64_read64 :: #force_inline proc(buf: []u8, alignment := Alignment.Unaligned)
 		return u64(b)
 	}
 }
+
+XXH64_read64_simd :: #force_inline proc(buf: []$E, $W: uint, alignment := Alignment.Unaligned) -> (res: #simd[W]u64) {
+	if alignment == .Aligned {
+		res = (^#simd[W]u64)(raw_data(buf))^
+	} else {
+		res = intrinsics.unaligned_load((^#simd[W]u64)(raw_data(buf)))
+	}
+
+	when ODIN_ENDIAN == .Big {
+		bytes := transmute(#simd[W*8]u8)res
+		bytes = intrinsics.simd_lanes_reverse(bytes)
+		res = transmute(#simd[W]u64)bytes
+		res = intrinsics.simd_lanes_reverse(res)
+	}
+	return
+}
+
+XXH64_write64_simd :: #force_inline proc(buf: []$E, value: $V/#simd[$W]u64, alignment := Alignment.Unaligned) {
+	value := value
+	when ODIN_ENDIAN == .Big {
+		bytes := transmute(#simd[W*8]u8)value
+		bytes = intrinsics.simd_lanes_reverse(bytes)
+		value = transmute(#simd[W]u64)bytes
+		value = intrinsics.simd_lanes_reverse(value)
+	}
+
+	if alignment == .Aligned {
+		(^V)(raw_data(buf))^ = value
+	} else {
+		intrinsics.unaligned_store((^V)(raw_data(buf)), value)
+	}
+}
diff --git a/core/hash/xxhash/xxhash_3.odin b/core/hash/xxhash/xxhash_3.odin
index 293e98528..8e88d4a90 100644
--- a/core/hash/xxhash/xxhash_3.odin
+++ b/core/hash/xxhash/xxhash_3.odin
@@ -52,6 +52,7 @@ XXH3_SECRET_SIZE_MIN :: 136
 #assert(len(XXH3_kSecret) == 192 && len(XXH3_kSecret) > XXH3_SECRET_SIZE_MIN)
 
 XXH_ACC_ALIGN :: 8 /* scalar */
+XXH_MAX_WIDTH :: #config(XXH_MAX_WIDTH, 512) / 64
 
 /*
 	This is the optimal update size for incremental hashing.
@@ -733,10 +734,6 @@ XXH3_accumulate_512_f :: #type proc(acc: []xxh_u64, input: []u8, secret:
 XXH3_scramble_accumulator_f :: #type proc(acc: []xxh_u64, secret: []u8)
 XXH3_init_custom_secret_f :: #type proc(custom_secret: []u8, seed64: xxh_u64)
 
-XXH3_accumulate_512 : XXH3_accumulate_512_f = XXH3_accumulate_512_scalar
-XXH3_scramble_accumulator : XXH3_scramble_accumulator_f = XXH3_scramble_accumulator_scalar
-XXH3_init_custom_secret : XXH3_init_custom_secret_f = XXH3_init_custom_secret_scalar
-
 /* scalar variants - universal */
 @(optimization_mode="favor_size")
 XXH3_accumulate_512_scalar :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) {
@@ -785,6 +782,89 @@ XXH3_init_custom_secret_scalar :: #force_inline proc(custom_secret: []u8, seed64
 	}
 }
 
+/* generalized SIMD variants */
+@(optimization_mode="favor_size")
+XXH3_accumulate_512_simd_generic :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8, $W: uint) {
+	u32xW :: #simd[W]u32
+	u64xW :: #simd[W]u64
+
+	#no_bounds_check for i in uint(0)..
+
+XXH3_accumulate_512 :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) {
+	when XXH_NATIVE_WIDTH > 1 {
+		XXH3_accumulate_512_simd_generic(acc, input, secret, XXH_NATIVE_WIDTH)
+	} else {
+		XXH3_accumulate_512_scalar(acc, input, secret)
+	}
+}
+
+XXH3_scramble_accumulator :: #force_inline proc(acc: []xxh_u64, secret: []u8) {
+	when XXH_NATIVE_WIDTH > 1 {
+		XXH3_scramble_accumulator_simd_generic(acc, secret, XXH_NATIVE_WIDTH)
+	} else {
+		XXH3_scramble_accumulator_scalar(acc, secret)
+	}
+}
+
+XXH3_init_custom_secret :: #force_inline proc(custom_secret: []u8, seed64: xxh_u64) {
+	when XXH_NATIVE_WIDTH > 1 {
+		XXH3_init_custom_secret_simd_generic(custom_secret, seed64, XXH_NATIVE_WIDTH)
+	} else {
+		XXH3_init_custom_secret_scalar(custom_secret, seed64)
+	}
+}
+
 XXH_PREFETCH_DIST :: 320
 
 /*
diff --git a/core/hash/xxhash/xxhash_3_intel.odin b/core/hash/xxhash/xxhash_3_intel.odin
new file mode 100644
index 000000000..3397fcef5
--- /dev/null
+++ b/core/hash/xxhash/xxhash_3_intel.odin
@@ -0,0 +1,13 @@
+#+build amd64, i386
+package xxhash
+
+import "base:intrinsics"
+
+@(private="file") SSE2_FEATURES :: "sse2"
+@(private="file") AVX2_FEATURES :: "avx2"
+@(private="file") AVX512_FEATURES :: "avx512dq,evex512"
+
+XXH_NATIVE_WIDTH :: min(XXH_MAX_WIDTH,
+	8 when intrinsics.has_target_feature(AVX512_FEATURES) else
+	4 when intrinsics.has_target_feature(AVX2_FEATURES) else
+	2 when intrinsics.has_target_feature(SSE2_FEATURES) else 1)
diff --git a/core/hash/xxhash/xxhash_3_other.odin b/core/hash/xxhash/xxhash_3_other.odin
new file mode 100644
index 000000000..e1a5d0474
--- /dev/null
+++ b/core/hash/xxhash/xxhash_3_other.odin
@@ -0,0 +1,8 @@
+#+build !amd64
+#+build !i386
+package xxhash
+
+import "base:runtime"
+
+XXH_NATIVE_WIDTH :: min(XXH_MAX_WIDTH,
+	2 when runtime.HAS_HARDWARE_SIMD else 1)

From 2f8b390c1922f096daf56c2400b212457fd37dca Mon Sep 17 00:00:00 2001
From: Barinzaya
Date: Wed, 30 Jul 2025 14:29:44 -0400
Subject: [PATCH 2/4] Various minor changes in XXH3.

This includes various minor things that didn't seem right or could be
improved, including:

- XXH3_state is documented to have a strict alignment requirement of 64
  bytes, and thus came with a disclaimer not to use `new` because it wouldn't
  be aligned correctly. It now has an `#align(64)` so that it will.
- An `_internal` proc being marked #force_no_inline (every other one is
  #force_inline).
- Unnecessarily casting the product of two u32s through u128 (and ultimately
  truncating to u64 anyway).
---
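Note (illustration only, not part of the patch): with #align(64) on the type,
plain `new` and stack variables satisfy the documented alignment requirement;
only a custom allocator that ignores the requested alignment still needs care.
A quick way to check this on a given setup, using a hypothetical stand-in type
rather than XXH3_state itself:

package align_example

import "core:fmt"

// Stand-in for XXH3_state; only the alignment attribute matters here.
Aligned_State :: struct #align(64) {
	acc: [8]u64,
}

main :: proc() {
	// The default allocator honours the type's alignment, so `new` is enough.
	s := new(Aligned_State)
	defer free(s)
	fmt.println("heap aligned: ", uintptr(rawptr(s)) % 64 == 0)

	// Stack allocation respects the alignment as well.
	local: Aligned_State
	fmt.println("stack aligned:", uintptr(rawptr(&local)) % 64 == 0)
}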
 core/hash/xxhash/xxhash_3.odin | 36 ++++++++++++++++------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/core/hash/xxhash/xxhash_3.odin b/core/hash/xxhash/xxhash_3.odin
index 8e88d4a90..555390bc5 100644
--- a/core/hash/xxhash/xxhash_3.odin
+++ b/core/hash/xxhash/xxhash_3.odin
@@ -63,10 +63,11 @@ XXH3_INTERNAL_BUFFER_SIZE :: 256
 	Streaming state.
 
 	IMPORTANT: This structure has a strict alignment requirement of 64 bytes!! **
-	Do not allocate this with `make()` or `new`, it will not be sufficiently aligned.
-	Use`XXH3_create_state` and `XXH3_destroy_state, or stack allocation.
+	Default allocators will align it correctly if created via `new`, as will
+	placing this struct on the cache, but if using a custom allocator make sure
+	that it handles the alignment correctly!
 */
-XXH3_state :: struct {
+XXH3_state :: struct #align(64) {
 	acc: [8]u64,
 	custom_secret: [XXH_SECRET_DEFAULT_SIZE]u8,
 	buffer: [XXH3_INTERNAL_BUFFER_SIZE]u8,
@@ -478,7 +479,7 @@ XXH3_128bits_internal :: #force_inline proc(
 
 /* === Public XXH128 API === */
 @(optimization_mode="favor_size")
 XXH3_128_default :: proc(input: []u8) -> (hash: XXH3_128_hash) {
-	return XXH3_128bits_internal(input, 0, XXH3_kSecret[:], XXH3_hashLong_128b_withSeed)
+	return XXH3_128bits_internal(input, 0, XXH3_kSecret[:], XXH3_hashLong_128b_default)
 }
@@ -748,7 +749,7 @@ XXH3_accumulate_512_scalar :: #force_inline proc(acc: []xxh_u64, input: []u8, se
 		sec := XXH64_read64(xsecret[8 * i:])
 		data_key := data_val ~ sec
 		xacc[i ~ 1] += data_val /* swap adjacent lanes */
-		xacc[i    ] += u64(u128(u32(data_key)) * u128(u64(data_key >> 32)))
+		xacc[i    ] += u64(u32(data_key)) * u64(data_key >> 32)
 	}
 }
@@ -966,19 +967,8 @@ XXH3_hashLong_64b_default :: #force_no_inline proc(input: []u8, seed64: xxh_u64,
 	return XXH3_hashLong_64b_internal(input, XXH3_kSecret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
 
-/*
-	XXH3_hashLong_64b_withSeed():
-	Generate a custom key based on alteration of default XXH3_kSecret with the seed,
-	and then use this key for long mode hashing.
-
-	This operation is decently fast but nonetheless costs a little bit of time.
-	Try to avoid it whenever possible (typically when seed==0).
-
-	It's important for performance that XXH3_hashLong is not inlined. Not sure
-	why (uop cache maybe?), but the difference is large and easily measurable.
-*/
 @(optimization_mode="favor_size")
-XXH3_hashLong_64b_withSeed_internal :: #force_no_inline proc(
+XXH3_hashLong_64b_withSeed_internal :: #force_inline proc(
 	input: []u8,
 	seed: xxh_u64,
 	f_acc512: XXH3_accumulate_512_f,
@@ -995,7 +985,15 @@ XXH3_hashLong_64b_withSeed_internal :: #force_no_inline proc(
 }
 
 /*
-	It's important for performance that XXH3_hashLong is not inlined.
+	XXH3_hashLong_64b_withSeed():
+	Generate a custom key based on alteration of default XXH3_kSecret with the seed,
+	and then use this key for long mode hashing.
+
+	This operation is decently fast but nonetheless costs a little bit of time.
+	Try to avoid it whenever possible (typically when seed==0).
+
+	It's important for performance that XXH3_hashLong is not inlined. Not sure
+	why (uop cache maybe?), but the difference is large and easily measurable.
 */
 @(optimization_mode="favor_size")
 XXH3_hashLong_64b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
@@ -1006,7 +1004,7 @@ XXH3_hashLong_64b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64,
 XXH3_hashLong64_f :: #type proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: xxh_u64)
 
 @(optimization_mode="favor_size")
-XXH3_64bits_internal :: proc(input: []u8, seed: xxh_u64, secret: []u8, f_hashLong: XXH3_hashLong64_f) -> (hash: xxh_u64) {
+XXH3_64bits_internal :: #force_inline proc(input: []u8, seed: xxh_u64, secret: []u8, f_hashLong: XXH3_hashLong64_f) -> (hash: xxh_u64) {
 	assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
 	/*
 		If an action is to be taken if len(secret) condition is not respected, it should be done here.

From f61dc7d071a95b1cdaaaed6f1451c7e1ddf384c5 Mon Sep 17 00:00:00 2001
From: Barinzaya
Date: Wed, 30 Jul 2025 16:47:06 -0400
Subject: [PATCH 3/4] Remove favor_size attributes inhibiting SIMD optimizations.

This makes a tremendous difference (2x with SSE2, 3x with AVX2) on big
datasets on my system, but the gain may be hardware-dependent (e.g.
instruction cache sizes). Naturally, this also results in somewhat larger
code for the large-data case (~75% larger).
---
 core/hash/xxhash/xxhash_3.odin | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/core/hash/xxhash/xxhash_3.odin b/core/hash/xxhash/xxhash_3.odin
index 555390bc5..bd5534f23 100644
--- a/core/hash/xxhash/xxhash_3.odin
+++ b/core/hash/xxhash/xxhash_3.odin
@@ -382,7 +382,6 @@ XXH3_INIT_ACC :: [XXH_ACC_NB]xxh_u64{
 
 XXH_SECRET_MERGEACCS_START :: 11
 
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_internal :: #force_inline proc(
 	input: []u8,
 	secret: []u8,
@@ -410,7 +409,6 @@ XXH3_hashLong_128b_internal :: #force_inline proc(
 /*
  * It's important for performance that XXH3_hashLong is not inlined.
  */
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_default :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
 	return XXH3_hashLong_128b_internal(input, XXH3_kSecret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
@@ -418,12 +416,10 @@ XXH3_hashLong_128b_default :: #force_no_inline proc(input: []u8, seed: xxh_u64,
 /*
  * It's important for performance that XXH3_hashLong is not inlined.
  */
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_withSecret :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
 	return XXH3_hashLong_128b_internal(input, secret, XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
 
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_withSeed_internal :: #force_inline proc(
 	input: []u8, seed: xxh_u64, secret: []u8,
 	f_acc512: XXH3_accumulate_512_f,
@@ -444,7 +440,6 @@ XXH3_hashLong_128b_withSeed_internal :: #force_inline proc(
 /*
  * It's important for performance that XXH3_hashLong is not inlined.
  */
-@(optimization_mode="favor_size")
 XXH3_hashLong_128b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
 	return XXH3_hashLong_128b_withSeed_internal(input, seed, secret, XXH3_accumulate_512, XXH3_scramble_accumulator , XXH3_init_custom_secret)
 }
@@ -784,7 +779,6 @@ XXH3_init_custom_secret_scalar :: #force_inline proc(custom_secret: []u8, seed64
 	}
 }
 
 /* generalized SIMD variants */
-@(optimization_mode="favor_size")
 XXH3_accumulate_512_simd_generic :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8, $W: uint) {
 	u32xW :: #simd[W]u32
 	u64xW :: #simd[W]u64
@@ -824,7 +818,6 @@ XXH3_scramble_accumulator_simd_generic :: #force_inline proc(acc: []xxh_u64, sec
 	}
 }
 
-@(optimization_mode="favor_size")
 XXH3_init_custom_secret_simd_generic :: #force_inline proc(custom_secret: []u8, seed64: xxh_u64, $W: uint) {
 	u64xW :: #simd[W]u64
 
@@ -950,7 +943,6 @@ XXH3_hashLong_64b_internal :: #force_inline proc(input: []u8, secret: []u8,
 /*
 	It's important for performance that XXH3_hashLong is not inlined.
 */
-@(optimization_mode="favor_size")
 XXH3_hashLong_64b_withSecret :: #force_no_inline proc(input: []u8, seed64: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
 	return XXH3_hashLong_64b_internal(input, secret, XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
@@ -962,12 +954,10 @@ XXH3_hashLong_64b_withSecret :: #force_no_inline proc(input: []u8, seed64: xxh_u
 	This variant enforces that the compiler can detect that, and uses this
 	opportunity to streamline the generated code for better performance.
 */
-@(optimization_mode="favor_size")
 XXH3_hashLong_64b_default :: #force_no_inline proc(input: []u8, seed64: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
 	return XXH3_hashLong_64b_internal(input, XXH3_kSecret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
 }
 
-@(optimization_mode="favor_size")
 XXH3_hashLong_64b_withSeed_internal :: #force_inline proc(
 	input: []u8,
 	seed: xxh_u64,
@@ -995,7 +985,6 @@ XXH3_hashLong_64b_withSeed_internal :: #force_inline proc(
 	It's important for performance that XXH3_hashLong is not inlined. Not sure
 	why (uop cache maybe?), but the difference is large and easily measurable.
 */
-@(optimization_mode="favor_size")
 XXH3_hashLong_64b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (hash: xxh_u64) {
 	return XXH3_hashLong_64b_withSeed_internal(input, seed, XXH3_accumulate_512, XXH3_scramble_accumulator, XXH3_init_custom_secret)
 }

From 4ef7ed1cbdf675ce62f7f305b6edb9fd76084c6c Mon Sep 17 00:00:00 2001
From: Barinzaya
Date: Thu, 31 Jul 2025 16:51:42 -0400
Subject: [PATCH 4/4] Skip bounds checking on the inner accumulate loop.

This helps performance with SSE (somewhat) and AVX-512 (quite a bit), but not
with AVX2, for some reason.
---
 core/hash/xxhash/xxhash_3.odin | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/hash/xxhash/xxhash_3.odin b/core/hash/xxhash/xxhash_3.odin
index bd5534f23..fe92f16d9 100644
--- a/core/hash/xxhash/xxhash_3.odin
+++ b/core/hash/xxhash/xxhash_3.odin
@@ -64,7 +64,7 @@ XXH3_INTERNAL_BUFFER_SIZE :: 256
 
 	IMPORTANT: This structure has a strict alignment requirement of 64 bytes!! **
 	Default allocators will align it correctly if created via `new`, as will
-	placing this struct on the cache, but if using a custom allocator make sure
+	placing this struct on the stack, but if using a custom allocator make sure
 	that it handles the alignment correctly!
 */
 XXH3_state :: struct #align(64) {
@@ -870,7 +870,7 @@ XXH_PREFETCH_DIST :: 320
 XXH3_accumulate :: #force_inline proc(
 	acc: []xxh_u64, input: []u8, secret: []u8, nbStripes: uint,
 	f_acc512: XXH3_accumulate_512_f) {
-	for n := uint(0); n < nbStripes; n += 1 {
+	#no_bounds_check for n := uint(0); n < nbStripes; n += 1 {
 		when !XXH_DISABLE_PREFETCH {
 			in_ptr := &input[n * XXH_STRIPE_LEN]
 			prefetch(in_ptr, XXH_PREFETCH_DIST)
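A quick way to compare the dispatched SIMD path against the scalar one on a
given machine, assuming the public XXH3_128_default entry point shown in
patch 2: a normal build uses the widest vectors the target features allow,
while building with -define:XXH_MAX_WIDTH=64 should pin XXH_NATIVE_WIDTH to 1
and force the scalar variants.

package xxh3_simd_check

import "core:fmt"
import "core:hash/xxhash"
import "core:time"

main :: proc() {
	// 16 MiB of deterministic input, large enough to hit the long-input path.
	data := make([]u8, 16 * 1024 * 1024)
	defer delete(data)
	for &b, i in data {
		b = u8(i * 7 + 3)
	}

	start := time.now()
	hash := xxhash.XXH3_128_default(data)
	elapsed := time.since(start)

	fmt.println("XXH3-128 =", hash)
	fmt.println("took     =", elapsed)
}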