Mirror of https://github.com/odin-lang/Odin.git (synced 2025-12-28 17:04:34 +00:00)

Commit: xxhash: Add XXH3_128 + test vectors.
@@ -11,7 +11,9 @@ package xxhash
 
 import "core:intrinsics"
 import "core:runtime"
-mem_copy :: runtime.mem_copy
+import "core:sys/llvm"
+mem_copy  :: runtime.mem_copy
+byte_swap :: intrinsics.byte_swap
 
 /*
 	Version definition
@@ -43,6 +45,22 @@ Error :: enum {
 	Error,
 }
 
+XXH_DISABLE_PREFETCH :: #config(XXH_DISABLE_PREFETCH, false)
+
+when !XXH_DISABLE_PREFETCH {
+	prefetch_address :: #force_inline proc(address: rawptr) {
+		llvm.prefetch(address, .Read, .High, .Data)
+	}
+	prefetch_offset :: #force_inline proc(address: rawptr, auto_cast offset: uintptr) {
+		ptr := rawptr(uintptr(address) + offset)
+		prefetch_address(ptr)
+	}
+} else {
+	prefetch_address :: #force_inline proc(address: rawptr) {}
+	prefetch_offset  :: #force_inline proc(address: rawptr, auto_cast offset: uintptr) {}
+}
+prefetch :: proc { prefetch_address, prefetch_offset, }
+
 @(optimization_mode="speed")
 XXH_rotl32 :: #force_inline proc(x, r: u32) -> (res: u32) {
 	return ((x << r) | (x >> (32 - r)))
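(Editor's note: `#config` constants such as `XXH_DISABLE_PREFETCH` can be overridden at build time. A usage sketch, assuming a standard Odin toolchain:

	odin build . -define:XXH_DISABLE_PREFETCH=true
)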
@@ -54,7 +72,7 @@ XXH_rotl64 :: #force_inline proc(x, r: u64) -> (res: u64) {
 }
 
 @(optimization_mode="speed")
-XXH32_read32 :: #force_inline proc(buf: []u8, alignment: Alignment) -> (res: u32) {
+XXH32_read32 :: #force_inline proc(buf: []u8, alignment := Alignment.Unaligned) -> (res: u32) {
 	if XXH_FORCE_MEMORY_ACCESS == 2 || alignment == .Aligned {
 		#no_bounds_check b := (^u32le)(&buf[0])^
 		return u32(b)
@@ -66,7 +84,7 @@ XXH32_read32 :: #force_inline proc(buf: []u8, alignment: Alignment) -> (res: u32
 }
 
 @(optimization_mode="speed")
-XXH64_read64 :: #force_inline proc(buf: []u8, alignment: Alignment) -> (res: u64) {
+XXH64_read64 :: #force_inline proc(buf: []u8, alignment := Alignment.Unaligned) -> (res: u64) {
 	if XXH_FORCE_MEMORY_ACCESS == 2 || alignment == .Aligned {
 		#no_bounds_check b := (^u64le)(&buf[0])^
 		return u64(b)
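(Editor's note: the signature change gives `alignment` a default, so call sites may drop the argument. A minimal sketch, assuming a `buf: []u8` in scope:

	b1 := XXH32_read32(buf)           // same as XXH32_read32(buf, .Unaligned)
	b2 := XXH32_read32(buf, .Aligned) // explicit aligned read, as before
)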
core/hash/xxhash/xxhash_3.odin (new file, 914 lines)
@@ -0,0 +1,914 @@
/*
	An implementation of Yann Collet's [xxhash Fast Hash Algorithm](https://cyan4973.github.io/xxHash/).
	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.

	Made available under Odin's BSD-3 license, based on the original C code.

	List of contributors:
		Jeroen van Rijn: Initial implementation.
*/
package xxhash

import "core:intrinsics"

/* *********************************************************************
* XXH3
* New generation hash designed for speed on small keys and vectorization
************************************************************************
* One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
* remaining a true 64-bit/128-bit hash function.
*
* This is done by prioritizing a subset of 64-bit operations that can be
* emulated without too many steps on the average 32-bit machine.
*
* For example, these two lines seem similar, and run equally fast on 64-bit:
*
*   xxh_u64 x;
*   x ^= (x >> 47); // good
*   x ^= (x >> 13); // bad
*
* However, to a 32-bit machine, there is a major difference.
*
* x ^= (x >> 47) looks like this:
*
*   x.lo ^= (x.hi >> (47 - 32));
*
* while x ^= (x >> 13) looks like this:
*
*   // note: funnel shifts are not usually cheap.
*   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
*   x.hi ^= (x.hi >> 13);
*
* The first one is significantly faster than the second, simply because the
* shift is larger than 32. This means:
*  - All the bits we need are in the upper 32 bits, so we can ignore the lower
*    32 bits in the shift.
*  - The shift result will always fit in the lower 32 bits, and therefore,
*    we can ignore the upper 32 bits in the xor.
*
* Thanks to this optimization, XXH3 only requires these features to be efficient:
*
*  - Usable unaligned access
*  - A 32-bit or 64-bit ALU
*  - If 32-bit, a decent ADC instruction
*  - A 32 or 64-bit multiply with a 64-bit result
*  - For the 128-bit variant, a decent byteswap helps short inputs.
*
* The first two are already required by XXH32, and almost all 32-bit and 64-bit
* platforms which can run XXH32 can run XXH3 efficiently.
*
* Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
* notable exception.
*
* First of all, Thumb-1 lacks support for the UMULL instruction which
* performs the important long multiply. This means numerous __aeabi_lmul
* calls.
*
* Second of all, the 8 functional registers are just not enough.
* Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
* Lo registers, and this shuffling results in thousands more MOVs than A32.
*
* A32 and T32 don't have this limitation. They can access all 14 registers,
* do a 32->64 multiply with UMULL, and the flexible operand allowing free
* shifts is helpful, too.
*
* Therefore, we do a quick sanity check.
*
* If compiling Thumb-1 for a target which supports ARM instructions, we will
* emit a warning, as it is not a "sane" platform to compile for.
*
* Usually, if this happens, it is because of an accident and you probably need
* to specify -march, as you likely meant to compile for a newer architecture.
*
* Credit: large sections of the vectorial and asm source code paths
*         have been contributed by @easyaspi314
*/

XXH_ACC_ALIGN :: 8 /* scalar */

/* ==========================================
 * XXH3 default settings
 * ========================================== */

XXH3_SECRET_SIZE_MIN    :: 136
XXH_SECRET_DEFAULT_SIZE :: max(XXH3_SECRET_SIZE_MIN, #config(XXH_SECRET_DEFAULT_SIZE, 192))

XXH3_kSecret :: [?]u8{
	0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
	0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
	0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
	0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
	0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
	0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
	0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
	0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
	0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
	0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
	0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
	0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
}
#assert(size_of(XXH3_kSecret) == 192)

/************************************************************************
*  XXH3 128-bit variant
************************************************************************/

/*
	Stored in little endian order, although the fields themselves are in native endianness.
*/
xxh_u128              :: u128
XXH3_128_hash         :: u128
XXH3_128_DEFAULT_SEED :: xxh_u64(0)

XXH128_hash_t :: struct #raw_union {
	using raw: struct {
		low:  XXH64_hash, /*!< `value & 0xFFFFFFFFFFFFFFFF` */
		high: XXH64_hash, /*!< `value >> 64` */
	},
	h: xxh_u128,
}
#assert(size_of(xxh_u128) == size_of(XXH128_hash_t))

@(optimization_mode="speed")
XXH_mul_32_to_64 :: #force_inline proc(x, y: xxh_u32) -> (res: xxh_u64) {
	return u64(x) * u64(y)
}

@(optimization_mode="speed")
XXH_mul_64_to_128 :: #force_inline proc(lhs, rhs: xxh_u64) -> (res: xxh_u128) {
	return xxh_u128(lhs) * xxh_u128(rhs)
}

/*
	The reason for the separate function is to prevent passing too many structs
	around by value. This will hopefully inline the multiply, but we don't force it.

	@param lhs, rhs The 64-bit integers to multiply
	@return The low 64 bits of the product XOR'd by the high 64 bits.
*/
@(optimization_mode="speed")
XXH_mul_64_to_128_fold_64 :: #force_inline proc(lhs, rhs: xxh_u64) -> (res: xxh_u64) {
	t := XXH128_hash_t{}
	t.h = #force_inline XXH_mul_64_to_128(lhs, rhs)
	return t.low ~ t.high
}
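
// Editor's sanity check of the fold (illustrative only, not part of the commit):
//   lhs = rhs = 1 << 32  ->  128-bit product = 1 << 64
//   t.low = 0, t.high = 1  ->  fold = 0 ~ 1 = 1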

@(optimization_mode="speed")
XXH_xorshift_64 :: #force_inline proc(v: xxh_u64, auto_cast shift: uint) -> (res: xxh_u64) {
	return v ~ (v >> shift)
}

/*
	This is a fast avalanche stage, suitable when input bits are already partially mixed.
*/
@(optimization_mode="speed")
XXH3_avalanche :: #force_inline proc(h64: xxh_u64) -> (res: xxh_u64) {
	res  = XXH_xorshift_64(h64, 37)
	res *= 0x165667919E3779F9
	res  = XXH_xorshift_64(res, 32)
	return
}

/*
	This is a stronger avalanche, inspired by Pelle Evensen's rrmxmx,
	preferable when input has not been previously mixed.
*/
@(optimization_mode="speed")
XXH3_rrmxmx :: #force_inline proc(h64, length: xxh_u64) -> (res: xxh_u64) {
	/* this mix is inspired by Pelle Evensen's rrmxmx */
	res  = h64
	res ~= XXH_rotl64(res, 49) ~ XXH_rotl64(res, 24)
	res *= 0x9FB21C651E98DF25
	res ~= (res >> 35) + length
	res *= 0x9FB21C651E98DF25
	return XXH_xorshift_64(res, 28)
}

/*
	==========================================
	XXH3 128 bits (a.k.a XXH128)
	==========================================
	XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
	even without counting the significantly larger output size.

	For example, extra steps are taken to avoid the seed-dependent collisions
	in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).

	This strength naturally comes at the cost of some speed, especially on short
	lengths. Note that longer hashes are about as fast as the 64-bit version
	due to it using only a slight modification of the 64-bit loop.

	XXH128 is also more oriented towards 64-bit machines. It is still extremely
	fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
*/

@(optimization_mode="speed")
XXH3_len_1to3_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
	/* A doubled version of 1to3_64b with different constants. */
	length := len(input)
	/*
	 * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
	 * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
	 * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
	 */
	#no_bounds_check {
		c1 := input[0]
		c2 := input[length >> 1]
		c3 := input[length - 1]
		combinedl := (u32(c1) << 16) | (u32(c2) << 24) | (u32(c3) << 0) | (u32(length) << 8)
		combinedh := XXH_rotl32(byte_swap(combinedl), 13)
		bitflipl  := u64(XXH32_read32(secret[0:]) ~ XXH32_read32(secret[4: ])) + seed
		bitfliph  := u64(XXH32_read32(secret[8:]) ~ XXH32_read32(secret[12:])) - seed
		keyed_lo  := u64(combinedl) ~ bitflipl
		keyed_hi  := u64(combinedh) ~ bitfliph

		return xxh_u128(XXH64_avalanche(keyed_lo)) | xxh_u128(XXH64_avalanche(keyed_hi)) << 64
	}
}
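
// Editor's worked example of the byte packing above (not part of the commit):
// for a 1-byte input { 0xAB }, c1 = c2 = c3 = 0xAB, so
//   combinedl = 0xAB<<16 | 0xAB<<24 | 0xAB<<0 | 1<<8 = 0xABAB01AB,
// whose little-endian bytes are { 0xAB, 0x01, 0xAB, 0xAB },
// matching the `len = 1` row of the comment.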

@(optimization_mode="speed")
XXH3_len_4to8_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
	length := len(input)
	seed   := seed

	seed ~= u64(byte_swap(u32(seed))) << 32
	#no_bounds_check {
		input_lo := u64(XXH32_read32(input[0:]))
		input_hi := u64(XXH32_read32(input[length - 4:]))
		input_64 := u64(input_lo) + u64(input_hi) << 32
		bitflip  := (XXH64_read64(secret[16:]) ~ XXH64_read64(secret[24:])) + seed
		keyed    := input_64 ~ bitflip

		/* Shift len to the left to ensure it is even, this avoids even multiplies. */
		m128 := XXH128_hash_t{
			h = XXH_mul_64_to_128(keyed, u64(XXH_PRIME64_1) + (u64(length) << 2)),
		}
		m128.high += (m128.low << 1)
		m128.low  ~= (m128.high >> 3)

		m128.low   = XXH_xorshift_64(m128.low, 35)
		m128.low  *= 0x9FB21C651E98DF25
		m128.low   = XXH_xorshift_64(m128.low, 28)
		m128.high  = XXH3_avalanche(m128.high)

		return m128.h
	}
}

@(optimization_mode="speed")
XXH3_len_9to16_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
	length := len(input)

	#no_bounds_check {
		bitflipl := (XXH64_read64(secret[32:]) ~ XXH64_read64(secret[40:])) - seed
		bitfliph := (XXH64_read64(secret[48:]) ~ XXH64_read64(secret[56:])) + seed
		input_lo := XXH64_read64(input[0:])
		input_hi := XXH64_read64(input[length - 8:])
		m128 := XXH128_hash_t{
			h = XXH_mul_64_to_128(input_lo ~ input_hi ~ bitflipl, XXH_PRIME64_1),
		}
		/*
		 * Put len in the middle of m128 to ensure that the length gets mixed to
		 * both the low and high bits in the 128x64 multiply below.
		 */
		m128.low += u64(length - 1) << 54
		input_hi ~= bitfliph
		/*
		 * Add the high 32 bits of input_hi to the high 32 bits of m128, then
		 * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
		 * the high 64 bits of m128.
		 *
		 * The best approach to this operation is different on 32-bit and 64-bit.
		 */
		when size_of(rawptr) == 4 { /* 32-bit */
			/*
			 * 32-bit optimized version, which is more readable.
			 *
			 * On 32-bit, it removes an ADC and delays a dependency between the two
			 * halves of m128.high64, but it generates an extra mask on 64-bit.
			 */
			m128.high += (input_hi & 0xFFFFFFFF00000000) + XXH_mul_32_to_64(u32(input_hi), XXH_PRIME32_2)
		} else {
			/*
			 * 64-bit optimized (albeit more confusing) version.
			 *
			 * Uses some properties of addition and multiplication to remove the mask:
			 *
			 * Let:
			 *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
			 *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
			 *    c = XXH_PRIME32_2
			 *
			 *    a + (b * c)
			 * Inverse Property: x + y - x == y
			 *    a + (b * (1 + c - 1))
			 * Distributive Property: x * (y + z) == (x * y) + (x * z)
			 *    a + (b * 1) + (b * (c - 1))
			 * Identity Property: x * 1 == x
			 *    a + b + (b * (c - 1))
			 *
			 * Substitute a, b, and c:
			 *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
			 *
			 * Since input_hi.hi + input_hi.lo == input_hi, we get this:
			 *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
			 */
			m128.high += input_hi + XXH_mul_32_to_64(u32(input_hi), XXH_PRIME32_2 - 1)
		}
		/* m128 ^= XXH_swap64(m128 >> 64); */
		m128.low ~= byte_swap(m128.high)

		{ /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
			h128 := XXH128_hash_t{
				h = XXH_mul_64_to_128(m128.low, XXH_PRIME64_2),
			}
			h128.high += m128.high * XXH_PRIME64_2
			h128.low   = XXH3_avalanche(h128.low)
			h128.high  = XXH3_avalanche(h128.high)
			return h128.h
		}
	}
}

/*
	Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
*/
@(optimization_mode="speed")
XXH3_len_0to16_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
	length := len(input)

	switch {
	case length > 8:  return XXH3_len_9to16_128b(input, secret, seed)
	case length >= 4: return XXH3_len_4to8_128b (input, secret, seed)
	case length > 0:  return XXH3_len_1to3_128b (input, secret, seed)
	case:
		#no_bounds_check bitflipl := XXH64_read64(secret[64:]) ~ XXH64_read64(secret[72:])
		#no_bounds_check bitfliph := XXH64_read64(secret[80:]) ~ XXH64_read64(secret[88:])
		return xxh_u128(XXH64_avalanche(seed ~ bitflipl)) | xxh_u128(XXH64_avalanche(seed ~ bitfliph)) << 64
	}
}

/*
	A bit slower than XXH3_mix16B, but handles multiply by zero better.
*/
@(optimization_mode="speed")
XXH128_mix32B :: #force_inline proc(acc: xxh_u128, input_1: []u8, input_2: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
	acc128 := XXH128_hash_t{
		h = acc,
	}
	#no_bounds_check {
		acc128.low  += XXH3_mix16B (input_1, secret[0:], seed)
		acc128.low  ~= XXH64_read64(input_2[0:]) + XXH64_read64(input_2[8:])
		acc128.high += XXH3_mix16B (input_2, secret[16:], seed)
		acc128.high ~= XXH64_read64(input_1) + XXH64_read64(input_1[8:])
		return acc128.h
	}
}

@(optimization_mode="speed")
XXH3_len_17to128_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
	length := len(input)

	acc := XXH128_hash_t{}
	acc.low = xxh_u64(length) * XXH_PRIME64_1

	switch {
	case length > 96:
		#no_bounds_check acc.h = XXH128_mix32B(acc.h, input[48:], input[length - 64:], secret[96:], seed)
		fallthrough
	case length > 64:
		#no_bounds_check acc.h = XXH128_mix32B(acc.h, input[32:], input[length - 48:], secret[64:], seed)
		fallthrough
	case length > 32:
		#no_bounds_check acc.h = XXH128_mix32B(acc.h, input[16:], input[length - 32:], secret[32:], seed)
		fallthrough
	case:
		#no_bounds_check acc.h = XXH128_mix32B(acc.h, input, input[length - 16:], secret, seed)

		h128 := XXH128_hash_t{}
		h128.low  = acc.low + acc.high
		h128.high = (acc.low * XXH_PRIME64_1) + (acc.high * XXH_PRIME64_4) + ((u64(length) - seed) * XXH_PRIME64_2)
		h128.low  = XXH3_avalanche(h128.low)
		h128.high = u64(i64(0) - i64(XXH3_avalanche(h128.high)))
		return h128.h
	}
	unreachable()
}

@(optimization_mode="speed")
XXH3_len_129to240_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
	length := len(input)

	#no_bounds_check {
		acc := XXH128_hash_t{}
		acc.low = u64(length) * XXH_PRIME64_1

		nbRounds := length / 32

		i: int
		#no_bounds_check for i = 0; i < 4; i += 1 {
			acc.h = XXH128_mix32B(acc.h,
				input [32 * i:],
				input [32 * i + 16:],
				secret[32 * i:],
				seed)
		}
		acc.low  = XXH3_avalanche(acc.low)
		acc.high = XXH3_avalanche(acc.high)

		#no_bounds_check for i = 4; i < nbRounds; i += 1 {
			acc.h = XXH128_mix32B(acc.h,
				input[32 * i:], input[32 * i + 16:],
				secret[XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)):],
				seed)
		}
		/* last bytes */
		#no_bounds_check acc.h = XXH128_mix32B(acc.h,
			input[length - 16:],
			input[length - 32:],
			secret[XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16:],
			u64(i64(0) - i64(seed)))

		#no_bounds_check {
			h128 := XXH128_hash_t{}
			h128.low  = acc.low + acc.high
			h128.high = u64(
				u128(acc.low * XXH_PRIME64_1)    \
				+ u128(acc.high * XXH_PRIME64_4) \
				+ u128((u64(length) - seed) * XXH_PRIME64_2))
			h128.low  = XXH3_avalanche(h128.low)
			h128.high = u64(i64(0) - i64(XXH3_avalanche(h128.high)))
			return h128.h
		}
	}
	unreachable()
}

XXH3_INIT_ACC :: [XXH_ACC_NB]xxh_u64{
	XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3,
	XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1,
}

XXH_SECRET_MERGEACCS_START :: 11

@(optimization_mode="speed")
XXH3_hashLong_128b_internal :: #force_inline proc(
	input: []u8,
	secret: []u8,
	f_acc512: XXH3_accumulate_512_f,
	f_scramble: XXH3_scramble_accumulator_f) -> (res: XXH3_128_hash) {

	acc := XXH3_INIT_ACC
	#assert(size_of(acc) == 64)

	XXH3_hashLong_internal_loop(acc[:], input, secret, f_acc512, f_scramble)

	/* converge into final hash */
	{
		length      := len(input)
		secret_size := len(secret)

		h128 := XXH128_hash_t{}
		h128.low  = XXH3_mergeAccs(acc[:], secret[XXH_SECRET_MERGEACCS_START:], u64(length) * XXH_PRIME64_1)
		h128.high = XXH3_mergeAccs(acc[:], secret[secret_size - size_of(acc) - XXH_SECRET_MERGEACCS_START:],
			~(u64(length) * XXH_PRIME64_2))
		return h128.h
	}
}

/*
	It's important for performance that XXH3_hashLong is not inlined.
*/
XXH3_hashLong_128b_default :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
	k_secret := XXH3_kSecret
	return XXH3_hashLong_128b_internal(input, k_secret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
}

/*
	It's important for performance that XXH3_hashLong is not inlined.
*/
XXH3_hashLong_128b_withSecret :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
	return XXH3_hashLong_128b_internal(input, secret, XXH3_accumulate_512, XXH3_scramble_accumulator)
}

XXH3_hashLong_128b_withSeed_internal :: #force_inline proc(
	input: []u8, seed: xxh_u64, secret: []u8,
	f_acc512: XXH3_accumulate_512_f,
	f_scramble: XXH3_scramble_accumulator_f,
	f_initSec: XXH3_init_custom_secret_f) -> (res: XXH3_128_hash) {

	if seed == 0 {
		k := XXH3_kSecret
		return XXH3_hashLong_128b_internal(input, k[:], f_acc512, f_scramble)
	}

	{
		secret := [XXH_SECRET_DEFAULT_SIZE]u8{}
		f_initSec(secret[:], seed)
		return XXH3_hashLong_128b_internal(input, secret[:], f_acc512, f_scramble)
	}
}

/*
	It's important for performance that XXH3_hashLong is not inlined.
*/
XXH3_hashLong_128b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
	return XXH3_hashLong_128b_withSeed_internal(input, seed, secret, XXH3_accumulate_512, XXH3_scramble_accumulator, XXH3_init_custom_secret)
}

XXH3_hashLong128_f :: #type proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash)

XXH3_128bits_internal :: #force_inline proc(
	input: []u8, seed: xxh_u64, secret: []u8, f_hl128: XXH3_hashLong128_f) -> (res: XXH3_128_hash) {

	assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
	/*
	 * If an action is to be taken if `secret` conditions are not respected,
	 * it should be done here.
	 * For now, it's a contract pre-condition.
	 * Adding a check and a branch here would cost performance at every hash.
	 */
	length := len(input)

	switch {
	case length <= 16:
		return XXH3_len_0to16_128b(input, secret, seed)
	case length <= 128:
		return XXH3_len_17to128_128b(input, secret, seed)
	case length <= XXH3_MIDSIZE_MAX:
		return XXH3_len_129to240_128b(input, secret, seed)
	case:
		return f_hl128(input, seed, secret)
	}
}

/* === Public XXH128 API === */

XXH3_128bits :: proc(input: []u8) -> (hash: XXH3_128_hash) {
	k := XXH3_kSecret
	return XXH3_128bits_internal(input, XXH3_128_DEFAULT_SEED, k[:], XXH3_hashLong_128b_default)
}
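
// Editor's usage sketch of the new public entry point (not part of the commit;
// the expected digest is taken from the benchmark expectations in this commit's tests):
//
//	package xxh3_demo
//
//	import "core:fmt"
//	import "core:hash/xxhash"
//
//	main :: proc() {
//		buf := make([]u8, 100) // zero-initialized
//		defer delete(buf)
//
//		h := xxhash.XXH3_128bits(buf)
//		fmt.printf("XXH3_128(100 zero bytes) = %32x\n", h)
//		assert(h == 0x6ba30a4e9dffe1ff801fedc74ccd608c)
//	}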

/*
	==========================================
	Short keys
	==========================================
	One of the shortcomings of XXH32 and XXH64 was that their performance was
	sub-optimal on short lengths. It used an iterative algorithm which strongly
	favored lengths that were a multiple of 4 or 8.

	Instead of iterating over individual inputs, we use a set of single shot
	functions which piece together a range of lengths and operate in constant time.
	Additionally, the number of multiplies has been significantly reduced. This
	reduces latency, especially when emulating 64-bit multiplies on 32-bit.

	Depending on the platform, this may or may not be faster than XXH32, but it
	is almost guaranteed to be faster than XXH64.
*/

/*
	At very short lengths, there isn't enough input to fully hide secrets, or use the entire secret.

	There is also only a limited amount of mixing we can do before significantly impacting performance.

	Therefore, we use different sections of the secret and always mix two secret samples with an XOR.
	This should have no effect on performance on the seedless or withSeed variants because everything
	_should_ be constant folded by modern compilers.

	The XOR mixing hides individual parts of the secret and increases entropy.
	This adds an extra layer of strength for custom secrets.
*/
@(optimization_mode="speed")
XXH3_len_1to3_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
	length := u32(len(input))
	assert(input != nil)
	assert(1 <= length && length <= 3)
	assert(secret != nil)
	/*
		len = 1: combined = { input[0], 0x01, input[0], input[0] }
		len = 2: combined = { input[1], 0x02, input[0], input[1] }
		len = 3: combined = { input[2], 0x03, input[0], input[1] }
	*/
	#no_bounds_check {
		c1 := u32(input[0])
		c2 := u32(input[length >> 1])
		c3 := u32(input[length - 1])

		combined := c1 << 16 | c2 << 24 | c3 << 0 | length << 8
		bitflip  := (u64(XXH32_read32(secret)) ~ u64(XXH32_read32(secret[4:]))) + seed
		keyed    := u64(combined) ~ bitflip
		return XXH64_avalanche(keyed)
	}
}

@(optimization_mode="speed")
XXH3_len_4to8_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
	length := u32(len(input))
	assert(input != nil)
	assert(4 <= length && length <= 8)
	assert(secret != nil)
	seed := seed

	/* Note: shift applied after the byte swap, matching the 128-bit variant. */
	seed ~= u64(byte_swap(u32(seed))) << 32
	#no_bounds_check {
		input1  := XXH32_read32(input)
		input2  := XXH32_read32(input[length - 4:])
		bitflip := (XXH64_read64(secret[8:]) ~ XXH64_read64(secret[16:])) - seed
		input64 := u64(input2) + (u64(input1) << 32)
		keyed   := input64 ~ bitflip
		return XXH3_rrmxmx(keyed, u64(length))
	}
}

@(optimization_mode="speed")
XXH3_len_9to16_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
	length := u64(len(input))
	assert(input != nil)
	assert(9 <= length && length <= 16)
	assert(secret != nil)
	#no_bounds_check {
		bitflip1 := (XXH64_read64(secret[24:]) ~ XXH64_read64(secret[32:])) + seed
		bitflip2 := (XXH64_read64(secret[40:]) ~ XXH64_read64(secret[48:])) - seed
		input_lo := XXH64_read64(input) ~ bitflip1
		input_hi := XXH64_read64(input[length - 8:]) ~ bitflip2
		acc := length + byte_swap(input_lo) + input_hi \
			+ XXH_mul_64_to_128_fold_64(input_lo, input_hi)
		return XXH3_avalanche(acc)
	}
}

@(optimization_mode="speed")
XXH3_len_0to16_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
	length := u64(len(input))
	assert(input != nil)
	assert(length <= 16)
	#no_bounds_check {
		switch {
		case length > 8:  return #force_inline XXH3_len_9to16_64b(input, secret, seed)
		case length >= 4: return #force_inline XXH3_len_4to8_64b (input, secret, seed)
		case length > 0:  return #force_inline XXH3_len_1to3_64b (input, secret, seed)
		case:
			return #force_inline XXH64_avalanche(seed ~ (XXH64_read64(secret[56:]) ~ XXH64_read64(secret[64:])))
		}
	}
}

/*
	DISCLAIMER: There are known *seed-dependent* multicollisions here due to
	multiplication by zero, affecting hashes of lengths 17 to 240.

	However, they are very unlikely.

	Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
	unseeded non-cryptographic hashes, it does not attempt to defend itself
	against specially crafted inputs, only random inputs.

	Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
	cancelling out the secret is taken an arbitrary number of times (addressed
	in XXH3_accumulate_512), this collision is very unlikely with random inputs
	and/or proper seeding:

	This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
	function that is only called up to 16 times per hash with up to 240 bytes of
	input.

	This is not too bad for a non-cryptographic hash function, especially with
	only 64 bit outputs.

	The 128-bit variant (which trades some speed for strength) is NOT affected
	by this, although it is always a good idea to use a proper seed if you care
	about strength.
*/
@(optimization_mode="speed")
XXH3_mix16B :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
	input_lo := XXH64_read64(input[0:])
	input_hi := XXH64_read64(input[8:])

	input_lo ~= (XXH64_read64(secret[0:]) + seed)
	input_hi ~= (XXH64_read64(secret[8:]) - seed)
	return XXH_mul_64_to_128_fold_64(input_lo, input_hi)
}

/* For mid range keys, XXH3 uses a Mum-hash variant. */
@(optimization_mode="speed")
XXH3_len_17to128_64b :: proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
	assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
	length := len(input)
	assert(16 < length && length <= 128)

	#no_bounds_check {
		acc := u64(length) * XXH_PRIME64_1
		switch {
		case length > 96:
			acc += XXH3_mix16B(input[48:          ], secret[96: ], seed)
			acc += XXH3_mix16B(input[length - 64:], secret[112:], seed)
			fallthrough
		case length > 64:
			acc += XXH3_mix16B(input[32:          ], secret[64: ], seed)
			acc += XXH3_mix16B(input[length - 48:], secret[80: ], seed)
			fallthrough
		case length > 32:
			acc += XXH3_mix16B(input[16:          ], secret[32: ], seed)
			acc += XXH3_mix16B(input[length - 32:], secret[48: ], seed)
			fallthrough
		case:
			acc += XXH3_mix16B(input[0:           ], secret[0:  ], seed)
			acc += XXH3_mix16B(input[length - 16:], secret[16: ], seed)
		}
		return XXH3_avalanche(acc)
	}
}

XXH3_MIDSIZE_MAX         :: 240
XXH3_MIDSIZE_STARTOFFSET :: 3
XXH3_MIDSIZE_LASTOFFSET  :: 17

@(optimization_mode="speed")
XXH3_len_129to240_64b :: proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
	assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
	length := len(input)
	assert(128 < length && length <= XXH3_MIDSIZE_MAX)

	#no_bounds_check {
		acc := u64(length) * XXH_PRIME64_1
		nbRounds := length / 16

		i: int
		for i = 0; i < 8; i += 1 {
			acc += XXH3_mix16B(input[16 * i:], secret[16 * i:], seed)
		}

		acc = XXH3_avalanche(acc)
		assert(nbRounds >= 8)

		for i = 8; i < nbRounds; i += 1 {
			acc += XXH3_mix16B(input[16 * i:], secret[(16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET:], seed)
		}
		/* last bytes */
		acc += XXH3_mix16B(input[length - 16:], secret[XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET:], seed)
		return XXH3_avalanche(acc)
	}
}

/* ======= Long Keys ======= */

XXH_STRIPE_LEN          :: 64
XXH_SECRET_CONSUME_RATE :: 8 /* nb of secret bytes consumed at each accumulation */
XXH_ACC_NB              :: (XXH_STRIPE_LEN / size_of(xxh_u64))

@(optimization_mode="speed")
XXH_writeLE64 :: #force_inline proc(dst: []u8, v64: u64le) {
	v := v64
	mem_copy(raw_data(dst), &v, size_of(v64))
}

/*
 * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
 *
 * It is a hardened version of UMAC, based off of FARSH's implementation.
 *
 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
 * implementations, and it is ridiculously fast.
 *
 * We harden it by mixing the original input to the accumulators as well as the product.
 *
 * This means that in the (relatively likely) case of a multiply by zero, the
 * original input is preserved.
 *
 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
 * cross-pollination, as otherwise the upper and lower halves would be
 * essentially independent.
 *
 * This doesn't matter on 64-bit hashes since they all get merged together in
 * the end, so we skip the extra step.
 *
 * Both XXH3_64bits and XXH3_128bits use this subroutine.
 */
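
// Editor's illustration (not part of the commit): the "swap 64-bit pairs" above is
// done in the scalar path below with an index XOR, `xacc[i ~ 1]`, which pairs lanes:
//   i:     0 1 2 3 4 5 6 7
//   i ~ 1: 1 0 3 2 5 4 7 6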

XXH3_accumulate_512_f       :: #type proc(acc: []xxh_u64, input: []u8, secret: []u8)
XXH3_scramble_accumulator_f :: #type proc(acc: []xxh_u64, secret: []u8)
XXH3_init_custom_secret_f   :: #type proc(custom_secret: []u8, seed64: xxh_u64)

XXH3_accumulate_512       : XXH3_accumulate_512_f       = XXH3_accumulate_512_scalar
XXH3_scramble_accumulator : XXH3_scramble_accumulator_f = XXH3_scramble_accumulator_scalar
XXH3_init_custom_secret   : XXH3_init_custom_secret_f   = XXH3_init_custom_secret_scalar

/* scalar variants - universal */
@(optimization_mode="speed")
XXH3_accumulate_512_scalar :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) {
	xacc    := acc    /* presumed aligned */
	xinput  := input  /* no alignment restriction */
	xsecret := secret /* no alignment restriction */

	assert(uintptr(raw_data(acc)) & uintptr(XXH_ACC_ALIGN - 1) == 0)

	#no_bounds_check for i := uint(0); i < XXH_ACC_NB; i += 1 {
		data_val := XXH64_read64(xinput[8 * i:])
		data_key := data_val ~ XXH64_read64(xsecret[8 * i:])
		xacc[i ~ 1] += data_val /* swap adjacent lanes */
		xacc[i    ] += XXH_mul_32_to_64(u32(data_key & 0xFFFFFFFF), u32(data_key >> 32))
	}
}

@(optimization_mode="speed")
XXH3_scramble_accumulator_scalar :: #force_inline proc(acc: []xxh_u64, secret: []u8) {
	xacc    := acc    /* presumed aligned */
	xsecret := secret /* no alignment restriction */

	assert(uintptr(raw_data(acc)) & uintptr(XXH_ACC_ALIGN - 1) == 0)

	#no_bounds_check for i := uint(0); i < XXH_ACC_NB; i += 1 {
		key64 := XXH64_read64(xsecret[8 * i:])
		acc64 := xacc[i]
		acc64  = XXH_xorshift_64(acc64, 47)
		acc64 ~= key64
		acc64 *= u64(XXH_PRIME32_1)
		xacc[i] = acc64
	}
}

@(optimization_mode="speed")
XXH3_init_custom_secret_scalar :: #force_inline proc(custom_secret: []u8, seed64: xxh_u64) {
	#assert((XXH_SECRET_DEFAULT_SIZE & 15) == 0)

	kSecretPtr := XXH3_kSecret

	nbRounds := XXH_SECRET_DEFAULT_SIZE / 16
	#no_bounds_check for i := 0; i < nbRounds; i += 1 {
		lo := XXH64_read64(kSecretPtr[16 * i:    ]) + seed64
		hi := XXH64_read64(kSecretPtr[16 * i + 8:]) - seed64
		XXH_writeLE64(custom_secret[16 * i:    ], u64le(lo))
		XXH_writeLE64(custom_secret[16 * i + 8:], u64le(hi))
	}
}

XXH_PREFETCH_DIST :: 320

/*
 * XXH3_accumulate()
 * Loops over XXH3_accumulate_512().
 * Assumption: nbStripes will not overflow the secret size
 */
@(optimization_mode="speed")
XXH3_accumulate :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8, nbStripes: uint,
	f_acc512: XXH3_accumulate_512_f) {

	for n := uint(0); n < nbStripes; n += 1 {
		when !XXH_DISABLE_PREFETCH {
			in_ptr := &input[n * XXH_STRIPE_LEN]
			prefetch(in_ptr, XXH_PREFETCH_DIST)
		}
		f_acc512(acc, input[n * XXH_STRIPE_LEN:], secret[n * XXH_SECRET_CONSUME_RATE:])
	}
}

@(optimization_mode="speed")
XXH3_hashLong_internal_loop :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8,
	f_acc512: XXH3_accumulate_512_f, f_scramble: XXH3_scramble_accumulator_f) {

	length            := uint(len(input))
	secret_size       := uint(len(secret))
	stripes_per_block := (secret_size - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE

	block_len := XXH_STRIPE_LEN * stripes_per_block
	blocks    := (length - 1) / block_len

	#no_bounds_check for n := uint(0); n < blocks; n += 1 {
		XXH3_accumulate(acc, input[n * block_len:], secret, stripes_per_block, f_acc512)
		f_scramble(acc, secret[secret_size - XXH_STRIPE_LEN:])
	}

	/* last partial block */
	#no_bounds_check {
		stripes := ((length - 1) - (block_len * blocks)) / XXH_STRIPE_LEN

		XXH3_accumulate(acc, input[blocks * block_len:], secret, stripes, f_acc512)

		/* last stripe */
		#no_bounds_check {
			p := input[length - XXH_STRIPE_LEN:]
			XXH_SECRET_LASTACC_START :: 7 /* not aligned on 8, last secret is different from acc & scrambler */
			f_acc512(acc, p, secret[secret_size - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START:])
		}
	}
}

@(optimization_mode="speed")
XXH3_mix2Accs :: #force_inline proc(acc: []xxh_u64, secret: []u8) -> (res: xxh_u64) {
	return XXH_mul_64_to_128_fold_64(
		acc[0] ~ XXH64_read64(secret),
		acc[1] ~ XXH64_read64(secret[8:]))
}

@(optimization_mode="speed")
XXH3_mergeAccs :: #force_inline proc(acc: []xxh_u64, secret: []u8, start: xxh_u64) -> (res: xxh_u64) {
	result64 := start
	#no_bounds_check for i := 0; i < 4; i += 1 {
		result64 += XXH3_mix2Accs(acc[2 * i:], secret[16 * i:])
	}
	return XXH3_avalanche(result64)
}
@@ -15,6 +15,7 @@ import "core:intrinsics"
 	32-bit hash functions
 */
 XXH32_hash :: u32
+xxh_u32    :: u32
 XXH32_DEFAULT_SEED :: XXH32_hash(0)
 
 XXH32_state :: struct {
@@ -153,7 +154,7 @@ XXH32_endian_align :: #force_inline proc(input: []u8, seed := XXH32_DEFAULT_SEED
 	v3 := seed + 0
 	v4 := seed - XXH_PRIME32_1
 
-	for len(buf) >= 15 {
+	for len(buf) >= 16 {
 		#no_bounds_check v1 = XXH32_round(v1, XXH32_read32(buf, alignment)); buf = buf[4:]
 		#no_bounds_check v2 = XXH32_round(v2, XXH32_read32(buf, alignment)); buf = buf[4:]
 		#no_bounds_check v3 = XXH32_round(v3, XXH32_read32(buf, alignment)); buf = buf[4:]
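(Editor's note: this is a bounds fix. Each iteration of this loop consumes 16 bytes via four 4-byte reads, so the guard must require at least 16 remaining bytes; `>= 15` could read past the end of `buf`.)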
@@ -1,4 +1,4 @@
-package test_core_image
+package test_core_hash
 
 import "core:hash/xxhash"
 import "core:time"
@@ -31,6 +31,7 @@ when ODIN_TEST {
 main :: proc() {
 	t := testing.T{}
 	test_benchmark_runner(&t)
+	test_xxhash_vectors(&t)
 	fmt.printf("%v/%v tests successful.\n", TEST_count - TEST_fail, TEST_count)
 }
@@ -52,7 +53,7 @@ teardown_xxhash :: proc(options: ^time.Benchmark_Options, allocator := context.a
 	return nil
 }
 
-benchmark_xxhash32 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+benchmark_xxh32 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
 	buf := options.input
 
 	h: u32
@@ -65,7 +66,7 @@ benchmark_xxhash32 :: proc(options: ^time.Benchmark_Options, allocator := contex
 	return nil
 }
 
-benchmark_xxhash64 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+benchmark_xxh64 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
 	buf := options.input
 
 	h: u64
@@ -78,6 +79,19 @@ benchmark_xxhash64 :: proc(options: ^time.Benchmark_Options, allocator := contex
 	return nil
 }
 
+benchmark_xxh3_128 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+	buf := options.input
+
+	h: u128
+	for _ in 0..=options.rounds {
+		h = xxhash.XXH3_128bits(buf)
+	}
+	options.count     = options.rounds
+	options.processed = options.rounds * options.bytes
+	options.hash      = h
+	return nil
+}
+
 benchmark_print :: proc(name: string, options: ^time.Benchmark_Options) {
 	fmt.printf("\t[%v] %v rounds, %v bytes processed in %v ns\n\t\t%5.3f rounds/s, %5.3f MiB/s\n",
 		name,
@@ -93,12 +107,12 @@ benchmark_print :: proc(name: string, options: ^time.Benchmark_Options) {
 test_benchmark_runner :: proc(t: ^testing.T) {
 	fmt.println("Starting benchmarks:")
 
-	name := "xxhash32 100 zero bytes"
+	name := "XXH32 100 zero bytes"
 	options := &time.Benchmark_Options{
 		rounds   = 1_000,
 		bytes    = 100,
 		setup    = setup_xxhash,
-		bench    = benchmark_xxhash32,
+		bench    = benchmark_xxh32,
 		teardown = teardown_xxhash,
 	}
 
@@ -107,25 +121,65 @@ test_benchmark_runner :: proc(t: ^testing.T) {
 	expect(t, options.hash == 0x85f6413c, name)
 	benchmark_print(name, options)
 
-	name = "xxhash32 1 MiB zero bytes"
+	name = "XXH32 1 MiB zero bytes"
 	options.bytes = 1_048_576
 	err = time.benchmark(options, context.allocator)
 	expect(t, err == nil, name)
 	expect(t, options.hash == 0x9430f97f, name)
 	benchmark_print(name, options)
 
-	name = "xxhash64 100 zero bytes"
+	name = "XXH64 100 zero bytes"
 	options.bytes = 100
-	options.bench = benchmark_xxhash64
+	options.bench = benchmark_xxh64
 	err = time.benchmark(options, context.allocator)
 	expect(t, err == nil, name)
 	expect(t, options.hash == 0x17bb1103c92c502f, name)
 	benchmark_print(name, options)
 
-	name = "xxhash64 1 MiB zero bytes"
+	name = "XXH64 1 MiB zero bytes"
 	options.bytes = 1_048_576
 	err = time.benchmark(options, context.allocator)
 	expect(t, err == nil, name)
 	expect(t, options.hash == 0x87d2a1b6e1163ef1, name)
 	benchmark_print(name, options)
+
+	name = "XXH3_128 100 zero bytes"
+	options.bytes = 100
+	options.bench = benchmark_xxh3_128
+	err = time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	expect(t, options.hash == 0x6ba30a4e9dffe1ff801fedc74ccd608c, name)
+	benchmark_print(name, options)
+
+	name = "XXH3_128 1 MiB zero bytes"
+	options.bytes = 1_048_576
+	err = time.benchmark(options, context.allocator)
+	expect(t, err == nil, name)
+	expect(t, options.hash == 0xb6ef17a3448492b6918780b90550bf34, name)
+	benchmark_print(name, options)
 }
+
+@test
+test_xxhash_vectors :: proc(t: ^testing.T) {
+	fmt.println("Verifying against XXHASH_TEST_VECTOR_ZERO:")
+
+	buf := make([]u8, 256)
+	defer delete(buf)
+
+	for v, i in XXHASH_TEST_VECTOR_ZERO[:] {
+		b := buf[:i]
+
+		xxh32    := xxhash.XXH32(b)
+		xxh64    := xxhash.XXH64(b)
+		xxh3_128 := xxhash.XXH3_128bits(b)
+
+		xxh32_error    := fmt.tprintf("[   XXH32(%03d)] Expected: %08x. Got: %08x.", i, v.xxh_32, xxh32)
+		xxh64_error    := fmt.tprintf("[   XXH64(%03d)] Expected: %16x. Got: %16x.", i, v.xxh_64, xxh64)
+		xxh3_128_error := fmt.tprintf("[XXH3_128(%03d)] Expected: %32x. Got: %32x.", i, v.xxh3_128, xxh3_128)
+
+		expect(t, xxh32 == v.xxh_32, xxh32_error)
+		expect(t, xxh64 == v.xxh_64, xxh64_error)
+		expect(t, xxh3_128 == v.xxh3_128, xxh3_128_error)
+	}
+
+}
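
(Editor's note: the vector table lives in tests/core/hash/test_vectors_xxhash.odin, whose diff is suppressed below for size. From the field accesses above, each entry pairs the three digests for a given input length; a hedged sketch of the shape, with names inferred from usage rather than taken from the actual file:

	// Hypothetical shape, inferred from v.xxh_32 / v.xxh_64 / v.xxh3_128 above:
	XXH_Test_Vector :: struct {
		xxh_32:   u32,
		xxh_64:   u64,
		xxh3_128: u128,
	}
)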

tests/core/hash/test_vectors_xxhash.odin (new file, 1568 lines)
File diff suppressed because it is too large.