xxhash: Add XXH3_128 + test vectors.

Jeroen van Rijn
2021-09-11 15:27:51 +02:00
parent 1f1d8ef884
commit eaefbc43cb
5 changed files with 2568 additions and 13 deletions

View File

@@ -11,7 +11,9 @@ package xxhash
import "core:intrinsics"
import "core:runtime"
-mem_copy :: runtime.mem_copy
+import "core:sys/llvm"
+mem_copy :: runtime.mem_copy
+byte_swap :: intrinsics.byte_swap
/*
Version definition
@@ -43,6 +45,22 @@ Error :: enum {
Error,
}
XXH_DISABLE_PREFETCH :: #config(XXH_DISABLE_PREFETCH, false)
when !XXH_DISABLE_PREFETCH {
prefetch_address :: #force_inline proc(address: rawptr) {
llvm.prefetch(address, .Read, .High, .Data)
}
prefetch_offset :: #force_inline proc(address: rawptr, auto_cast offset: uintptr) {
ptr := rawptr(uintptr(address) + offset)
prefetch_address(ptr)
}
} else {
prefetch_address :: #force_inline proc(address: rawptr) {}
prefetch_offset :: #force_inline proc(address: rawptr, auto_cast offset: uintptr) {}
}
prefetch :: proc { prefetch_address, prefetch_offset, }
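/*
Usage sketch (illustrative, for some `buf: []u8`): both calls resolve through
the `prefetch` proc group above, and both compile to empty inlined stubs when
XXH_DISABLE_PREFETCH is set.

prefetch(raw_data(buf))      // hint a read at the start of `buf`
prefetch(raw_data(buf), 64)  // hint a read 64 bytes past the start
*/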
@(optimization_mode="speed")
XXH_rotl32 :: #force_inline proc(x, r: u32) -> (res: u32) {
return ((x << r) | (x >> (32 - r)))
@@ -54,7 +72,7 @@ XXH_rotl64 :: #force_inline proc(x, r: u64) -> (res: u64) {
}
@(optimization_mode="speed")
-XXH32_read32 :: #force_inline proc(buf: []u8, alignment: Alignment) -> (res: u32) {
+XXH32_read32 :: #force_inline proc(buf: []u8, alignment := Alignment.Unaligned) -> (res: u32) {
if XXH_FORCE_MEMORY_ACCESS == 2 || alignment == .Aligned {
#no_bounds_check b := (^u32le)(&buf[0])^
return u32(b)
@@ -66,7 +84,7 @@ XXH32_read32 :: #force_inline proc(buf: []u8, alignment: Alignment) -> (res: u32
}
@(optimization_mode="speed")
-XXH64_read64 :: #force_inline proc(buf: []u8, alignment: Alignment) -> (res: u64) {
+XXH64_read64 :: #force_inline proc(buf: []u8, alignment := Alignment.Unaligned) -> (res: u64) {
if XXH_FORCE_MEMORY_ACCESS == 2 || alignment == .Aligned {
#no_bounds_check b := (^u64le)(&buf[0])^
return u64(b)

View File

@@ -0,0 +1,914 @@
/*
An implementation of Yann Collet's [xxhash Fast Hash Algorithm](https://cyan4973.github.io/xxHash/).
Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license, based on the original C code.
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
package xxhash
import "core:intrinsics"
/* *********************************************************************
* XXH3
* New generation hash designed for speed on small keys and vectorization
************************************************************************
* One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
* remaining a true 64-bit/128-bit hash function.
*
* This is done by prioritizing a subset of 64-bit operations that can be
* emulated without too many steps on the average 32-bit machine.
*
* For example, these two lines seem similar, and run equally fast on 64-bit:
*
* xxh_u64 x;
* x ^= (x >> 47); // good
* x ^= (x >> 13); // bad
*
* However, to a 32-bit machine, there is a major difference.
*
* x ^= (x >> 47) looks like this:
*
* x.lo ^= (x.hi >> (47 - 32));
*
* while x ^= (x >> 13) looks like this:
*
* // note: funnel shifts are not usually cheap.
* x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
* x.hi ^= (x.hi >> 13);
*
* The first one is significantly faster than the second, simply because the
* shift is larger than 32. This means:
* - All the bits we need are in the upper 32 bits, so we can ignore the lower
* 32 bits in the shift.
* - The shift result will always fit in the lower 32 bits, and therefore,
* we can ignore the upper 32 bits in the xor.
*
* Thanks to this optimization, XXH3 only requires these features to be efficient:
*
* - Usable unaligned access
* - A 32-bit or 64-bit ALU
* - If 32-bit, a decent ADC instruction
* - A 32 or 64-bit multiply with a 64-bit result
* - For the 128-bit variant, a decent byteswap helps short inputs.
*
* The first two are already required by XXH32, and almost all 32-bit and 64-bit
* platforms which can run XXH32 can run XXH3 efficiently.
*
* Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
* notable exception.
*
* First of all, Thumb-1 lacks support for the UMULL instruction which
* performs the important long multiply. This means numerous __aeabi_lmul
* calls.
*
* Second of all, the 8 functional registers are just not enough.
* Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
* Lo registers, and this shuffling results in thousands more MOVs than A32.
*
* A32 and T32 don't have this limitation. They can access all 14 registers,
* do a 32->64 multiply with UMULL, and the flexible operand allowing free
* shifts is helpful, too.
*
* Therefore, we do a quick sanity check.
*
* If compiling Thumb-1 for a target which supports ARM instructions, we will
* emit a warning, as it is not a "sane" platform to compile for.
*
* Usually, if this happens, it is because of an accident and you probably need
* to specify -march, as you likely meant to compile for a newer architecture.
*
* Credit: large sections of the vectorial and asm source code paths
* have been contributed by @easyaspi314
*/
XXH_ACC_ALIGN :: 8 /* scalar */
/* ==========================================
* XXH3 default settings
* ========================================== */
XXH3_SECRET_SIZE_MIN :: 136
XXH_SECRET_DEFAULT_SIZE :: max(XXH3_SECRET_SIZE_MIN, #config(XXH_SECRET_DEFAULT_SIZE, 192))
XXH3_kSecret :: [?]u8{
0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
}
#assert(size_of(XXH3_kSecret) == 192)
/************************************************************************
* XXH3 128-bit variant
************************************************************************/
/*
Stored in little endian order, although the fields themselves are in native endianness.
*/
xxh_u128 :: u128
XXH3_128_hash :: u128
XXH3_128_DEFAULT_SEED :: xxh_u64(0)
XXH128_hash_t :: struct #raw_union {
using raw: struct {
low: XXH64_hash, /*!< `value & 0xFFFFFFFFFFFFFFFF` */
high: XXH64_hash, /*!< `value >> 64` */
},
h: xxh_u128,
}
#assert(size_of(xxh_u128) == size_of(XXH128_hash_t))
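/*
Illustrative: the raw_union overlays both views on the same 128 bits,
so on a little-endian target:

v := XXH128_hash_t{ h = 0x0123456789abcdef_fedcba9876543210 }
// v.low  == 0xfedcba9876543210  (value & 0xFFFFFFFFFFFFFFFF)
// v.high == 0x0123456789abcdef  (value >> 64)
*/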
@(optimization_mode="speed")
XXH_mul_32_to_64 :: #force_inline proc(x, y: xxh_u32) -> (res: xxh_u64) {
return u64(x) * u64(y)
}
@(optimization_mode="speed")
XXH_mul_64_to_128 :: #force_inline proc(lhs, rhs: xxh_u64) -> (res: xxh_u128) {
return xxh_u128(lhs) * xxh_u128(rhs)
}
/*
The reason for the separate function is to prevent passing too many structs
around by value. The multiply itself is force-inlined at the call site.
@param lhs, rhs The 64-bit integers to multiply
@return The low 64 bits of the product XOR'd by the high 64 bits.
*/
@(optimization_mode="speed")
XXH_mul_64_to_128_fold_64 :: #force_inline proc(lhs, rhs: xxh_u64) -> (res: xxh_u64) {
t := XXH128_hash_t{}
t.h = #force_inline XXH_mul_64_to_128(lhs, rhs)
return t.low ~ t.high
}
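/*
Worked example (illustrative): 0xFFFFFFFF * 0xFFFFFFFF = 0xFFFFFFFE_00000001,
which fits entirely in the low 64 bits. The high half is therefore 0, and the
fold returns 0xFFFFFFFE_00000001 unchanged.
*/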
@(optimization_mode="speed")
XXH_xorshift_64 :: #force_inline proc(v: xxh_u64, auto_cast shift: uint) -> (res: xxh_u64) {
return v ~ (v >> shift)
}
/*
This is a fast avalanche stage, suitable when input bits are already partially mixed
*/
@(optimization_mode="speed")
XXH3_avalanche :: #force_inline proc(h64: xxh_u64) -> (res: xxh_u64) {
res = XXH_xorshift_64(h64, 37)
res *= 0x165667919E3779F9
res = XXH_xorshift_64(res, 32)
return
}
/*
This is a stronger avalanche, inspired by Pelle Evensen's rrmxmx
preferable when input has not been previously mixed
*/
@(optimization_mode="speed")
XXH3_rrmxmx :: #force_inline proc(h64, length: xxh_u64) -> (res: xxh_u64) {
/* this mix is inspired by Pelle Evensen's rrmxmx */
res = h64
res ~= XXH_rotl64(res, 49) ~ XXH_rotl64(res, 24)
res *= 0x9FB21C651E98DF25
res ~= (res >> 35) + length
res *= 0x9FB21C651E98DF25
return XXH_xorshift_64(res, 28)
}
/*
==========================================
XXH3 128 bits (a.k.a XXH128)
==========================================
XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
even without counting the significantly larger output size.
For example, extra steps are taken to avoid the seed-dependent collisions
in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
This strength naturally comes at the cost of some speed, especially on short
lengths. Note that hashing longer inputs is about as fast as the 64-bit version,
since the long-input path is only a slight modification of the 64-bit loop.
XXH128 is also more oriented towards 64-bit machines. It is still extremely
fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
*/
@(optimization_mode="speed")
XXH3_len_1to3_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
/* A doubled version of 1to3_64b with different constants. */
length := len(input)
/*
* len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
* len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
* len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
*/
#no_bounds_check {
c1 := input[ 0]
c2 := input[length >> 1]
c3 := input[length - 1]
combinedl := (u32(c1) << 16) | (u32(c2) << 24) | (u32(c3) << 0) | (u32(length) << 8)
combinedh := XXH_rotl32(byte_swap(combinedl), 13)
bitflipl := u64(XXH32_read32(secret[0:]) ~ XXH32_read32(secret[4: ])) + seed
bitfliph := u64(XXH32_read32(secret[8:]) ~ XXH32_read32(secret[12:])) - seed
keyed_lo := u64(combinedl) ~ bitflipl
keyed_hi := u64(combinedh) ~ bitfliph
return xxh_u128(XXH64_avalanche(keyed_lo)) | xxh_u128(XXH64_avalanche(keyed_hi)) << 64
}
}
@(optimization_mode="speed")
XXH3_len_4to8_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
length := len(input)
seed := seed
seed ~= u64(byte_swap(u32(seed))) << 32
#no_bounds_check {
input_lo := u64(XXH32_read32(input[0:]))
input_hi := u64(XXH32_read32(input[length - 4:]))
input_64 := u64(input_lo) + u64(input_hi) << 32
bitflip := (XXH64_read64(secret[16:]) ~ XXH64_read64(secret[24:])) + seed
keyed := input_64 ~ bitflip
/* Shift len to the left to ensure it is even, this avoids even multiplies. */
m128 := XXH128_hash_t{
h = XXH_mul_64_to_128(keyed, u64(XXH_PRIME64_1) + (u64(length) << 2)),
}
m128.high += (m128.low << 1)
m128.low ~= (m128.high >> 3)
m128.low = XXH_xorshift_64(m128.low, 35)
m128.low *= 0x9FB21C651E98DF25
m128.low = XXH_xorshift_64(m128.low, 28)
m128.high = XXH3_avalanche(m128.high)
return m128.h
}
}
@(optimization_mode="speed")
XXH3_len_9to16_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
length := len(input)
#no_bounds_check {
bitflipl := (XXH64_read64(secret[32:]) ~ XXH64_read64(secret[40:])) - seed
bitfliph := (XXH64_read64(secret[48:]) ~ XXH64_read64(secret[56:])) + seed
input_lo := XXH64_read64(input[0:])
input_hi := XXH64_read64(input[length - 8:])
m128 := XXH128_hash_t{
h = XXH_mul_64_to_128(input_lo ~ input_hi ~ bitflipl, XXH_PRIME64_1),
}
/*
* Put len in the middle of m128 to ensure that the length gets mixed to
* both the low and high bits in the 128x64 multiply below.
*/
m128.low += u64(length - 1) << 54
input_hi ~= bitfliph
/*
* Add the high 32 bits of input_hi to the high 32 bits of m128, then
* add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
* the high 64 bits of m128.
*
* The best approach to this operation is different on 32-bit and 64-bit.
*/
when size_of(rawptr) == 4 { /* 32-bit */
/*
* 32-bit optimized version, which is more readable.
*
* On 32-bit, it removes an ADC and delays a dependency between the two
* halves of m128.high64, but it generates an extra mask on 64-bit.
*/
m128.high += (input_hi & 0xFFFFFFFF00000000) + XXH_mul_32_to_64(u32(input_hi), XXH_PRIME32_2)
} else {
/*
* 64-bit optimized (albeit more confusing) version.
*
* Uses some properties of addition and multiplication to remove the mask:
*
* Let:
* a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
* b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
* c = XXH_PRIME32_2
*
* a + (b * c)
* Inverse Property: x + y - x == y
* a + (b * (1 + c - 1))
* Distributive Property: x * (y + z) == (x * y) + (x * z)
* a + (b * 1) + (b * (c - 1))
* Identity Property: x * 1 == x
* a + b + (b * (c - 1))
*
* Substitute a, b, and c:
* input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
*
* Since input_hi.hi + input_hi.lo == input_hi, we get this:
* input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
*/
m128.high += input_hi + XXH_mul_32_to_64(u32(input_hi), XXH_PRIME32_2 - 1)
}
/* m128 ^= XXH_swap64(m128 >> 64); */
m128.low ~= byte_swap(m128.high)
{ /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
h128 := XXH128_hash_t{
h = XXH_mul_64_to_128(m128.low, XXH_PRIME64_2),
}
h128.high += m128.high * XXH_PRIME64_2
h128.low = XXH3_avalanche(h128.low)
h128.high = XXH3_avalanche(h128.high)
return h128.h
}
}
}
/*
Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
*/
@(optimization_mode="speed")
XXH3_len_0to16_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
length := len(input)
switch {
case length > 8: return XXH3_len_9to16_128b(input, secret, seed)
case length >= 4: return XXH3_len_4to8_128b (input, secret, seed)
case length > 0: return XXH3_len_1to3_128b (input, secret, seed)
case:
#no_bounds_check bitflipl := XXH64_read64(secret[64:]) ~ XXH64_read64(secret[72:])
#no_bounds_check bitfliph := XXH64_read64(secret[80:]) ~ XXH64_read64(secret[88:])
return xxh_u128(XXH64_avalanche(seed ~ bitflipl)) | xxh_u128(XXH64_avalanche(seed ~ bitfliph)) << 64
}
}
/*
A bit slower than XXH3_mix16B, but handles multiply by zero better.
*/
@(optimization_mode="speed")
XXH128_mix32B :: #force_inline proc(acc: xxh_u128, input_1: []u8, input_2: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
acc128 := XXH128_hash_t{
h = acc,
}
#no_bounds_check {
acc128.low += XXH3_mix16B (input_1, secret[0:], seed)
acc128.low ~= XXH64_read64(input_2[0:]) + XXH64_read64(input_2[8:])
acc128.high += XXH3_mix16B (input_2, secret[16:], seed)
acc128.high ~= XXH64_read64(input_1) + XXH64_read64(input_1[8:])
return acc128.h
}
}
@(optimization_mode="speed")
XXH3_len_17to128_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
length := len(input)
acc := XXH128_hash_t{}
acc.low = xxh_u64(length) * XXH_PRIME64_1
switch {
case length > 96:
#no_bounds_check acc.h = XXH128_mix32B(acc.h, input[48:], input[length - 64:], secret[96:], seed)
fallthrough
case length > 64:
#no_bounds_check acc.h = XXH128_mix32B(acc.h, input[32:], input[length - 48:], secret[64:], seed)
fallthrough
case length > 32:
#no_bounds_check acc.h = XXH128_mix32B(acc.h, input[16:], input[length - 32:], secret[32:], seed)
fallthrough
case:
#no_bounds_check acc.h = XXH128_mix32B(acc.h, input, input[length - 16:], secret, seed)
h128 := XXH128_hash_t{}
h128.low = acc.low + acc.high
h128.high = (acc.low * XXH_PRIME64_1) + (acc.high * XXH_PRIME64_4) + ((u64(length) - seed) * XXH_PRIME64_2)
h128.low = XXH3_avalanche(h128.low)
h128.high = u64(i64(0) - i64(XXH3_avalanche(h128.high)))
return h128.h
}
unreachable()
}
@(optimization_mode="speed")
XXH3_len_129to240_128b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u128) {
length := len(input)
#no_bounds_check {
acc := XXH128_hash_t{}
acc.low = u64(length) * XXH_PRIME64_1
nbRounds := length / 32
i: int
#no_bounds_check for i = 0; i < 4; i += 1 {
acc.h = XXH128_mix32B(acc.h,
input[32 * i:],
input [32 * i + 16:],
secret[32 * i:],
seed)
}
acc.low = XXH3_avalanche(acc.low)
acc.high = XXH3_avalanche(acc.high)
#no_bounds_check for i = 4; i < nbRounds; i += 1 {
acc.h = XXH128_mix32B(acc.h,
input[32 * i:], input[32 * i + 16:],
secret[XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)):],
seed)
}
/* last bytes */
#no_bounds_check acc.h = XXH128_mix32B(acc.h,
input[length - 16:],
input[length - 32:],
secret[XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16:],
u64(i64(0) - i64(seed)))
#no_bounds_check {
h128 := XXH128_hash_t{}
h128.low = acc.low + acc.high
h128.high = u64(
	u128(acc.low * XXH_PRIME64_1) +
	u128(acc.high * XXH_PRIME64_4) +
	u128((u64(length) - seed) * XXH_PRIME64_2))
h128.low = XXH3_avalanche(h128.low)
h128.high = u64(i64(0) - i64(XXH3_avalanche(h128.high)))
return h128.h
}
}
unreachable()
}
XXH3_INIT_ACC :: [XXH_ACC_NB]xxh_u64{
XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3,
XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1,
}
XXH_SECRET_MERGEACCS_START :: 11
@(optimization_mode="speed")
XXH3_hashLong_128b_internal :: #force_inline proc(
input: []u8,
secret: []u8,
f_acc512: XXH3_accumulate_512_f,
f_scramble: XXH3_scramble_accumulator_f) -> (res: XXH3_128_hash) {
acc := XXH3_INIT_ACC
#assert(size_of(acc) == 64)
XXH3_hashLong_internal_loop(acc[:], input, secret, f_acc512, f_scramble)
/* converge into final hash */
{
length := len(input)
secret_size := len(secret)
h128 := XXH128_hash_t{}
h128.low = XXH3_mergeAccs(acc[:], secret[XXH_SECRET_MERGEACCS_START:], u64(length) * XXH_PRIME64_1)
h128.high = XXH3_mergeAccs(acc[:], secret[secret_size - size_of(acc) - XXH_SECRET_MERGEACCS_START:],
~(u64(length) * XXH_PRIME64_2))
return h128.h
}
}
/*
* It's important for performance that XXH3_hashLong is not inlined.
*/
XXH3_hashLong_128b_default :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
k_secret := XXH3_kSecret
return XXH3_hashLong_128b_internal(input, k_secret[:], XXH3_accumulate_512, XXH3_scramble_accumulator)
}
/*
* It's important for performance that XXH3_hashLong is not inlined.
*/
XXH3_hashLong_128b_withSecret :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
return XXH3_hashLong_128b_internal(input, secret, XXH3_accumulate_512, XXH3_scramble_accumulator)
}
XXH3_hashLong_128b_withSeed_internal :: #force_inline proc(
input: []u8, seed: xxh_u64, secret: []u8,
f_acc512: XXH3_accumulate_512_f,
f_scramble: XXH3_scramble_accumulator_f,
f_initSec: XXH3_init_custom_secret_f) -> (res: XXH3_128_hash) {
if seed == 0 {
k := XXH3_kSecret
return XXH3_hashLong_128b_internal(input, k[:], f_acc512, f_scramble)
}
{
secret := [XXH_SECRET_DEFAULT_SIZE]u8{}
f_initSec(secret[:], seed)
return XXH3_hashLong_128b_internal(input, secret[:], f_acc512, f_scramble)
}
}
/*
* It's important for performance that XXH3_hashLong is not inlined.
*/
XXH3_hashLong_128b_withSeed :: #force_no_inline proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash) {
return XXH3_hashLong_128b_withSeed_internal(input, seed, secret, XXH3_accumulate_512, XXH3_scramble_accumulator , XXH3_init_custom_secret)
}
XXH3_hashLong128_f :: #type proc(input: []u8, seed: xxh_u64, secret: []u8) -> (res: XXH3_128_hash)
XXH3_128bits_internal :: #force_inline proc(
input: []u8, seed: xxh_u64, secret: []u8, f_hl128: XXH3_hashLong128_f) -> (res: XXH3_128_hash) {
assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
/*
* If an action is to be taken if `secret` conditions are not respected,
* it should be done here.
* For now, it's a contract pre-condition.
* Adding a check and a branch here would cost performance at every hash.
*/
length := len(input)
switch {
case length <= 16:
return XXH3_len_0to16_128b(input, secret, seed)
case length <= 128:
return XXH3_len_17to128_128b(input, secret, seed)
case length <= XXH3_MIDSIZE_MAX:
return XXH3_len_129to240_128b(input, secret, seed)
case:
return f_hl128(input, seed, secret)
}
}
/* === Public XXH128 API === */
XXH3_128bits :: proc(input: []u8) -> (hash: XXH3_128_hash) {
k := XXH3_kSecret
return XXH3_128bits_internal(input, XXH3_128_DEFAULT_SEED, k[:], XXH3_hashLong_128b_default)
}
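/*
Usage sketch (assumes the caller imports `core:fmt`):

data   := transmute([]u8)string("Hello, XXH3!")
digest := XXH3_128bits(data) // one-shot, default seed and built-in secret
fmt.printf("%32x\n", digest)
*/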
/*
==========================================
Short keys
==========================================
One of the shortcomings of XXH32 and XXH64 was that their performance was
sub-optimal on short lengths. They used an iterative algorithm which strongly
favored lengths that were a multiple of 4 or 8.
Instead of iterating over individual inputs, we use a set of single shot
functions which piece together a range of lengths and operate in constant time.
Additionally, the number of multiplies has been significantly reduced. This
reduces latency, especially when emulating 64-bit multiplies on 32-bit.
Depending on the platform, this may or may not be faster than XXH32, but it
is almost guaranteed to be faster than XXH64.
*/
/*
At very short lengths, there isn't enough input to fully hide secrets, or use the entire secret.
There is also only a limited amount of mixing we can do before significantly impacting performance.
Therefore, we use different sections of the secret and always mix two secret samples with an XOR.
This should have no effect on performance on the seedless or withSeed variants because everything
_should_ be constant folded by modern compilers.
The XOR mixing hides individual parts of the secret and increases entropy.
This adds an extra layer of strength for custom secrets.
*/
@(optimization_mode="speed")
XXH3_len_1to3_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
length := u32(len(input))
assert(input != nil)
assert(1 <= length && length <= 3)
assert(secret != nil)
/*
len = 1: combined = { input[0], 0x01, input[0], input[0] }
len = 2: combined = { input[1], 0x02, input[0], input[1] }
len = 3: combined = { input[2], 0x03, input[0], input[1] }
*/
#no_bounds_check {
c1 := u32(input[0 ])
c2 := u32(input[length >> 1])
c3 := u32(input[length - 1])
combined := c1 << 16 | c2 << 24 | c3 << 0 | length << 8
bitflip := (u64(XXH32_read32(secret)) ~ u64(XXH32_read32(secret[4:]))) + seed
keyed := u64(combined) ~ bitflip
return XXH64_avalanche(keyed)
}
}
@(optimization_mode="speed")
XXH3_len_4to8_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
length := u32(len(input))
assert(input != nil)
assert(4 <= length && length <= 8)
assert(secret != nil)
seed := seed
seed ~= u64(byte_swap(u32(seed))) << 32
#no_bounds_check {
input1 := XXH32_read32(input)
input2 := XXH32_read32(input[length - 4:])
bitflip := (XXH64_read64(secret[8:]) ~ XXH64_read64(secret[16:])) - seed
input64 := u64(input2) + (u64(input1) << 32)
keyed := input64 ~ bitflip
return XXH3_rrmxmx(keyed, u64(length))
}
}
@(optimization_mode="speed")
XXH3_len_9to16_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
length := u64(len(input))
assert(input != nil)
assert(9 <= length && length <= 16)
assert(secret != nil)
#no_bounds_check {
bitflip1 := (XXH64_read64(secret[24:]) ~ XXH64_read64(secret[32:])) + seed
bitflip2 := (XXH64_read64(secret[40:]) ~ XXH64_read64(secret[48:])) - seed
input_lo := XXH64_read64(input) ~ bitflip1
input_hi := XXH64_read64(input[length - 8:]) ~ bitflip2
acc := length + byte_swap(input_lo) + input_hi +
XXH_mul_64_to_128_fold_64(input_lo, input_hi)
return XXH3_avalanche(acc)
}
}
@(optimization_mode="speed")
XXH3_len_0to16_64b :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
length := u64(len(input))
assert(input != nil)
assert(length <= 16)
#no_bounds_check {
switch {
case length > 8: return #force_inline XXH3_len_9to16_64b(input, secret, seed)
case length >= 4: return #force_inline XXH3_len_4to8_64b (input, secret, seed)
case length > 0: return #force_inline XXH3_len_1to3_64b (input, secret, seed)
case:
return #force_inline XXH64_avalanche(seed ~ (XXH64_read64(secret[56:]) ~ XXH64_read64(secret[64:])))
}
}
}
/*
DISCLAIMER: There are known *seed-dependent* multicollisions here due to
multiplication by zero, affecting hashes of lengths 17 to 240.
However, they are very unlikely.
Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
unseeded non-cryptographic hashes, it does not attempt to defend itself
against specially crafted inputs, only random inputs.
Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
cancelling out the secret is taken an arbitrary number of times (addressed
in XXH3_accumulate_512), this collision is very unlikely with random inputs
and/or proper seeding:
This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
function that is only called up to 16 times per hash with up to 240 bytes of
input.
This is not too bad for a non-cryptographic hash function, especially with
only 64 bit outputs.
The 128-bit variant (which trades some speed for strength) is NOT affected
by this, although it is always a good idea to use a proper seed if you care
about strength.
*/
@(optimization_mode="speed")
XXH3_mix16B :: #force_inline proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
input_lo := XXH64_read64(input[0:])
input_hi := XXH64_read64(input[8:])
input_lo ~= (XXH64_read64(secret[0:]) + seed)
input_hi ~= (XXH64_read64(secret[8:]) - seed)
return XXH_mul_64_to_128_fold_64(input_lo, input_hi)
}
/* For mid range keys, XXH3 uses a Mum-hash variant. */
@(optimization_mode="speed")
XXH3_len_17to128_64b :: proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
length := len(input)
assert(16 < length && length <= 128)
#no_bounds_check {
acc := u64(length) * XXH_PRIME64_1
switch {
case length > 96:
acc += XXH3_mix16B(input[48: ], secret[96: ], seed)
acc += XXH3_mix16B(input[length - 64:], secret[112:], seed)
fallthrough
case length > 64:
acc += XXH3_mix16B(input[32: ], secret[64: ], seed)
acc += XXH3_mix16B(input[length - 48:], secret[80: ], seed)
fallthrough
case length > 32:
acc += XXH3_mix16B(input[16: ], secret[32: ], seed)
acc += XXH3_mix16B(input[length - 32:], secret[48: ], seed)
fallthrough
case:
acc += XXH3_mix16B(input[0: ], secret[0: ], seed)
acc += XXH3_mix16B(input[length - 16:], secret[16: ], seed)
}
return XXH3_avalanche(acc)
}
}
XXH3_MIDSIZE_MAX :: 240
XXH3_MIDSIZE_STARTOFFSET :: 3
XXH3_MIDSIZE_LASTOFFSET :: 17
@(optimization_mode="speed")
XXH3_len_129to240_64b :: proc(input: []u8, secret: []u8, seed: xxh_u64) -> (res: xxh_u64) {
assert(len(secret) >= XXH3_SECRET_SIZE_MIN)
length := len(input)
assert(128 < length && length <= XXH3_MIDSIZE_MAX)
#no_bounds_check {
acc := u64(length) * XXH_PRIME64_1
nbRounds := length / 16
i: int
for i = 0; i < 8; i += 1 {
acc += XXH3_mix16B(input[16 * i:], secret[16 * i:], seed)
}
acc = XXH3_avalanche(acc)
assert(nbRounds >= 8)
for i = 8; i < nbRounds; i += 1 {
acc += XXH3_mix16B(input[16 * i:], secret[(16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET:], seed)
}
/* last bytes */
acc += XXH3_mix16B(input[length - 16:], secret[XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET:], seed)
return XXH3_avalanche(acc)
}
}
/* ======= Long Keys ======= */
XXH_STRIPE_LEN :: 64
XXH_SECRET_CONSUME_RATE :: 8 /* nb of secret bytes consumed at each accumulation */
XXH_ACC_NB :: (XXH_STRIPE_LEN / size_of(xxh_u64))
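/* A 64-byte stripe split into 8-byte u64 lanes gives 64 / 8 = 8 accumulators. */
#assert(XXH_ACC_NB == 8)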
@(optimization_mode="speed")
XXH_writeLE64 :: #force_inline proc(dst: []u8, v64: u64le) {
v := v64
mem_copy(raw_data(dst), &v, size_of(v64))
}
/*
* XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
*
* It is a hardened version of UMAC, based off of FARSH's implementation.
*
* This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
* implementations, and it is ridiculously fast.
*
* We harden it by mixing the original input to the accumulators as well as the product.
*
* This means that in the (relatively likely) case of a multiply by zero, the
* original input is preserved.
*
* On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
* cross-pollination, as otherwise the upper and lower halves would be
* essentially independent.
*
* This doesn't matter on 64-bit hashes since they all get merged together in
* the end, so we skip the extra step.
*
* Both XXH3_64bits and XXH3_128bits use this subroutine.
*/
XXH3_accumulate_512_f :: #type proc(acc: []xxh_u64, input: []u8, secret: []u8)
XXH3_scramble_accumulator_f :: #type proc(acc: []xxh_u64, secret: []u8)
XXH3_init_custom_secret_f :: #type proc(custom_secret: []u8, seed64: xxh_u64)
XXH3_accumulate_512 : XXH3_accumulate_512_f = XXH3_accumulate_512_scalar
XXH3_scramble_accumulator : XXH3_scramble_accumulator_f = XXH3_scramble_accumulator_scalar
XXH3_init_custom_secret : XXH3_init_custom_secret_f = XXH3_init_custom_secret_scalar
/* scalar variants - universal */
@(optimization_mode="speed")
XXH3_accumulate_512_scalar :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8) {
xacc := acc /* presumed aligned */
xinput := input /* no alignment restriction */
xsecret := secret /* no alignment restriction */
assert(uintptr(raw_data(acc)) & uintptr(XXH_ACC_ALIGN - 1) == 0)
#no_bounds_check for i := uint(0); i < XXH_ACC_NB; i += 1 {
data_val := XXH64_read64(xinput[8 * i:])
data_key := data_val ~ XXH64_read64(xsecret[8 * i:])
xacc[i ~ 1] += data_val /* swap adjacent lanes */
xacc[i ] += XXH_mul_32_to_64(u32(data_key & 0xFFFFFFFF), u32(data_key >> 32))
}
}
@(optimization_mode="speed")
XXH3_scramble_accumulator_scalar :: #force_inline proc(acc: []xxh_u64, secret: []u8) {
xacc := acc /* presumed aligned */
xsecret := secret /* no alignment restriction */
assert(uintptr(raw_data(acc)) & uintptr(XXH_ACC_ALIGN - 1) == 0)
#no_bounds_check for i := uint(0); i < XXH_ACC_NB; i += 1 {
key64 := XXH64_read64(xsecret[8 * i:])
acc64 := xacc[i]
acc64 = XXH_xorshift_64(acc64, 47)
acc64 ~= key64
acc64 *= u64(XXH_PRIME32_1)
xacc[i] = acc64
}
}
@(optimization_mode="speed")
XXH3_init_custom_secret_scalar :: #force_inline proc(custom_secret: []u8, seed64: xxh_u64) {
#assert((XXH_SECRET_DEFAULT_SIZE & 15) == 0)
kSecretPtr := XXH3_kSecret
nbRounds := XXH_SECRET_DEFAULT_SIZE / 16
#no_bounds_check for i := 0; i < nbRounds; i += 1 {
lo := XXH64_read64(kSecretPtr[16 * i: ]) + seed64
hi := XXH64_read64(kSecretPtr[16 * i + 8:]) - seed64
XXH_writeLE64(custom_secret[16 * i: ], u64le(lo))
XXH_writeLE64(custom_secret[16 * i + 8:], u64le(hi))
}
}
XXH_PREFETCH_DIST :: 320
/*
* XXH3_accumulate()
* Loops over XXH3_accumulate_512().
* Assumption: nbStripes will not overflow the secret size
*/
@(optimization_mode="speed")
XXH3_accumulate :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8, nbStripes: uint,
f_acc512: XXH3_accumulate_512_f) {
for n := uint(0); n < nbStripes; n += 1 {
when !XXH_DISABLE_PREFETCH {
in_ptr := &input[n * XXH_STRIPE_LEN]
prefetch(in_ptr, XXH_PREFETCH_DIST)
}
f_acc512(acc, input[n * XXH_STRIPE_LEN:], secret[n * XXH_SECRET_CONSUME_RATE:])
}
}
@(optimization_mode="speed")
XXH3_hashLong_internal_loop :: #force_inline proc(acc: []xxh_u64, input: []u8, secret: []u8,
f_acc512: XXH3_accumulate_512_f, f_scramble: XXH3_scramble_accumulator_f) {
length := uint(len(input))
secret_size := uint(len(secret))
stripes_per_block := (secret_size - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE
block_len := XXH_STRIPE_LEN * stripes_per_block
blocks := (length - 1) / block_len
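/* e.g. with the 192-byte default secret: stripes_per_block = (192 - 64) / 8 = 16, so block_len = 1024. */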
#no_bounds_check for n := uint(0); n < blocks; n += 1 {
XXH3_accumulate(acc, input[n * block_len:], secret, stripes_per_block, f_acc512)
f_scramble(acc, secret[secret_size - XXH_STRIPE_LEN:])
}
/* last partial block */
#no_bounds_check {
stripes := ((length - 1) - (block_len * blocks)) / XXH_STRIPE_LEN
XXH3_accumulate(acc, input[blocks * block_len:], secret, stripes, f_acc512)
/* last stripe */
#no_bounds_check {
p := input[length - XXH_STRIPE_LEN:]
XXH_SECRET_LASTACC_START :: 7 /* not aligned on 8, last secret is different from acc & scrambler */
f_acc512(acc, p, secret[secret_size - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START:])
}
}
}
@(optimization_mode="speed")
XXH3_mix2Accs :: #force_inline proc(acc: []xxh_u64, secret: []u8) -> (res: xxh_u64) {
return XXH_mul_64_to_128_fold_64(
acc[0] ~ XXH64_read64(secret),
acc[1] ~ XXH64_read64(secret[8:]))
}
@(optimization_mode="speed")
XXH3_mergeAccs :: #force_inline proc(acc: []xxh_u64, secret: []u8, start: xxh_u64) -> (res: xxh_u64) {
result64 := start
#no_bounds_check for i := 0; i < 4; i += 1 {
result64 += XXH3_mix2Accs(acc[2 * i:], secret[16 * i:])
}
return XXH3_avalanche(result64)
}

View File

@@ -15,6 +15,7 @@ import "core:intrinsics"
32-bit hash functions
*/
XXH32_hash :: u32
xxh_u32 :: u32
XXH32_DEFAULT_SEED :: XXH32_hash(0)
XXH32_state :: struct {
@@ -153,7 +154,7 @@ XXH32_endian_align :: #force_inline proc(input: []u8, seed := XXH32_DEFAULT_SEED
v3 := seed + 0
v4 := seed - XXH_PRIME32_1
-for len(buf) >= 15 {
+for len(buf) >= 16 {
#no_bounds_check v1 = XXH32_round(v1, XXH32_read32(buf, alignment)); buf = buf[4:]
#no_bounds_check v2 = XXH32_round(v2, XXH32_read32(buf, alignment)); buf = buf[4:]
#no_bounds_check v3 = XXH32_round(v3, XXH32_read32(buf, alignment)); buf = buf[4:]

View File

@@ -1,4 +1,4 @@
-package test_core_image
+package test_core_hash
import "core:hash/xxhash"
import "core:time"
@@ -31,6 +31,7 @@ when ODIN_TEST {
main :: proc() {
t := testing.T{}
test_benchmark_runner(&t)
test_xxhash_vectors(&t)
fmt.printf("%v/%v tests successful.\n", TEST_count - TEST_fail, TEST_count)
}
@@ -52,7 +53,7 @@ teardown_xxhash :: proc(options: ^time.Benchmark_Options, allocator := context.a
return nil
}
-benchmark_xxhash32 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+benchmark_xxh32 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
buf := options.input
h: u32
@@ -65,7 +66,7 @@ benchmark_xxhash32 :: proc(options: ^time.Benchmark_Options, allocator := contex
return nil
}
-benchmark_xxhash64 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
+benchmark_xxh64 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
buf := options.input
h: u64
@@ -78,6 +79,19 @@ benchmark_xxhash64 :: proc(options: ^time.Benchmark_Options, allocator := contex
return nil
}
benchmark_xxh3_128 :: proc(options: ^time.Benchmark_Options, allocator := context.allocator) -> (err: time.Benchmark_Error) {
buf := options.input
h: u128
for _ in 0..=options.rounds {
h = xxhash.XXH3_128bits(buf)
}
options.count = options.rounds
options.processed = options.rounds * options.bytes
options.hash = h
return nil
}
benchmark_print :: proc(name: string, options: ^time.Benchmark_Options) {
fmt.printf("\t[%v] %v rounds, %v bytes processed in %v ns\n\t\t%5.3f rounds/s, %5.3f MiB/s\n",
name,
@@ -93,12 +107,12 @@ benchmark_print :: proc(name: string, options: ^time.Benchmark_Options) {
test_benchmark_runner :: proc(t: ^testing.T) {
fmt.println("Starting benchmarks:")
name := "xxhash32 100 zero bytes"
name := "XXH32 100 zero bytes"
options := &time.Benchmark_Options{
rounds = 1_000,
bytes = 100,
setup = setup_xxhash,
-bench = benchmark_xxhash32,
+bench = benchmark_xxh32,
teardown = teardown_xxhash,
}
@@ -107,25 +121,65 @@ test_benchmark_runner :: proc(t: ^testing.T) {
expect(t, options.hash == 0x85f6413c, name)
benchmark_print(name, options)
name = "xxhash32 1 MiB zero bytes"
name = "XXH32 1 MiB zero bytes"
options.bytes = 1_048_576
err = time.benchmark(options, context.allocator)
expect(t, err == nil, name)
expect(t, options.hash == 0x9430f97f, name)
benchmark_print(name, options)
name = "xxhash64 100 zero bytes"
name = "XXH64 100 zero bytes"
options.bytes = 100
-options.bench = benchmark_xxhash64
+options.bench = benchmark_xxh64
err = time.benchmark(options, context.allocator)
expect(t, err == nil, name)
expect(t, options.hash == 0x17bb1103c92c502f, name)
benchmark_print(name, options)
name = "xxhash64 1 MiB zero bytes"
name = "XXH64 1 MiB zero bytes"
options.bytes = 1_048_576
err = time.benchmark(options, context.allocator)
expect(t, err == nil, name)
expect(t, options.hash == 0x87d2a1b6e1163ef1, name)
benchmark_print(name, options)
name = "XXH3_128 100 zero bytes"
options.bytes = 100
options.bench = benchmark_xxh3_128
err = time.benchmark(options, context.allocator)
expect(t, err == nil, name)
expect(t, options.hash == 0x6ba30a4e9dffe1ff801fedc74ccd608c, name)
benchmark_print(name, options)
name = "XXH3_128 1 MiB zero bytes"
options.bytes = 1_048_576
err = time.benchmark(options, context.allocator)
expect(t, err == nil, name)
expect(t, options.hash == 0xb6ef17a3448492b6918780b90550bf34, name)
benchmark_print(name, options)
}
@test
test_xxhash_vectors :: proc(t: ^testing.T) {
fmt.println("Verifying against XXHASH_TEST_VECTOR_ZERO:")
buf := make([]u8, 256)
defer delete(buf)
for v, i in XXHASH_TEST_VECTOR_ZERO[:] {
b := buf[:i]
xxh32 := xxhash.XXH32(b)
xxh64 := xxhash.XXH64(b)
xxh3_128 := xxhash.XXH3_128bits(b)
xxh32_error := fmt.tprintf("[ XXH32(%03d)] Expected: %08x. Got: %08x.", i, v.xxh_32, xxh32)
xxh64_error := fmt.tprintf("[ XXH64(%03d)] Expected: %16x. Got: %16x.", i, v.xxh_64, xxh64)
xxh3_128_error := fmt.tprintf("[XXH3_128(%03d)] Expected: %32x. Got: %32x.", i, v.xxh3_128, xxh3_128)
expect(t, xxh32 == v.xxh_32, xxh32_error)
expect(t, xxh64 == v.xxh_64, xxh64_error)
expect(t, xxh3_128 == v.xxh3_128, xxh3_128_error)
}
}

File diff suppressed because it is too large.