@@ -8,26 +8,24 @@
 // package simd_util implements compositions of SIMD operations for optimizing
 // the core library where available.
 
-//+build i386, amd64
 package simd_util
 
 import "base:intrinsics"
-import "core:simd/x86"
 
 @private SCAN_REGISTER_SIZE :: 16
-@private SCAN_REGISTERS :: 4
-@private SCAN_WIDTH :: SCAN_REGISTERS * SCAN_REGISTER_SIZE
+@private SCAN_WIDTH :: 32
 
 // How long should a string be before using any of the `index_*` procedures in
 // this package.
 RECOMMENDED_SCAN_SIZE :: SCAN_REGISTER_SIZE
 
+@(private, rodata)
+simd_scanner_indices := #simd[SCAN_WIDTH]u8 {
+	0, 1, 2, 3, 4, 5, 6, 7,
+	8, 9, 10, 11, 12, 13, 14, 15,
+	16, 17, 18, 19, 20, 21, 22, 23,
+	24, 25, 26, 27, 28, 29, 30, 31,
+}
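+
+// Illustrative usage (the caller and the byte searched for are hypothetical,
+// not part of this package): short inputs are cheaper to scan with a plain
+// loop, so a caller might branch on RECOMMENDED_SCAN_SIZE before reaching for
+// `index_byte`:
+//
+//	idx := -1
+//	if len(s) >= RECOMMENDED_SCAN_SIZE {
+//		idx = index_byte(s, ':')
+//	} else {
+//		for b, i in s {
+//			if b == ':' {
+//				idx = i
+//				break
+//			}
+//		}
+//	}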
 
 /*
 Scan a slice of bytes for a specific byte.
 
-This procedure safely handles padding out slices of any length, including empty
-slices.
+This procedure safely handles slices of any length, including empty slices.
 
 Inputs:
 - data: A slice of bytes.
@@ -36,83 +34,54 @@ Inputs:
 
 Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
-@(enable_target_feature="sse2")
 index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
-	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
-	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
-	i: int
 	length := len(data)
-	full_chunks_length := length - length % SCAN_WIDTH
+	i := 0
 
-	for /**/; i < full_chunks_length; i += SCAN_WIDTH {
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
-
-		#unroll for j in 0..<SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			// NOTE(Feoramund): I experimented with ORing all the masks onto a
-			// 128-bit integer before performing the `mask != 0` check to see
-			// if that might be faster. However, the cost to avoid 3
-			// compares resulted in a marginally slower runtime on my machine.
-			//
-			// Simpler won out here.
-
-			if mask != 0 {
-				ctz := intrinsics.count_trailing_zeros(mask)
-				return i + j * SCAN_REGISTER_SIZE + cast(int)ctz
+	// Guard against small strings.
+	if length < SCAN_WIDTH {
+		for /**/; i < length; i += 1 {
+			if data[i] == c {
+				return i
 			}
 		}
+		return -1
 	}
 
+	ptr := cast(int)cast(uintptr)raw_data(data)
+	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
+
+	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
+	//
+	// This way, every load in the vector loop will be aligned, which should be
+	// the fastest possible scenario.
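+	//
+	// As a worked example (illustrative numbers): with SCAN_WIDTH = 32 and a
+	// slice whose backing pointer is 0x1005, ptr % SCAN_WIDTH is 5, so
+	// alignment_start = (32 - 5) % 32 = 27. The loop below checks data[0..<27]
+	// one byte at a time, and the vector loop then starts on a 32-byte-aligned
+	// address. If the data is already aligned, alignment_start is 0 and this
+	// loop is skipped.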
+	for /**/; i < alignment_start; i += 1 {
+		if data[i] == c {
+			return i
+		}
+	}
 
-	if i < length {
-		// The data is not exactly divisible by SCAN_WIDTH, and we haven't found
-		// what we're looking for yet, so we must pad out the end, then run our
-		// algorithm on it.
-		padded_data_end: [SCAN_WIDTH]u8 = ---
-		remnant_length := length % SCAN_WIDTH
-		intrinsics.mem_copy_non_overlapping(
-			&padded_data_end[0],
-			&raw_data(data)[full_chunks_length],
-			remnant_length,
-		)
 
+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+	scanner: #simd[SCAN_WIDTH]u8 = c
+	tail := length - (length - alignment_start) % SCAN_WIDTH
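+	//
+	// `tail` marks the end of the last full SCAN_WIDTH chunk that begins at an
+	// aligned offset. Continuing the illustrative numbers above: with
+	// length = 100 and alignment_start = 27, (100 - 27) % 32 = 9, so tail = 91
+	// and the vector loop covers data[27..<91]; the final 9 bytes are left to
+	// the scalar loop at the end of the procedure.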
 
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
-
-		#unroll for j in 0..<SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			// Because this data is padded out, it's possible that we could
-			// match on uninitialized memory, so we must guard against that.
-
-			// Create a relevancy mask: (Example)
-			//
-			// max(u64) = 0xFFFF_FFFF_FFFF_FFFF
-			//
-			// Convert an integer into a stream of on-bits by using the
-			// shifted negation of the maximum. The subtraction selects which
-			// section of the overall mask we should apply.
-			//
-			// << 17 - (1 * SCAN_REGISTER_SIZE)
-			// = 0xFFFF_FFFF_FFFF_FFFE
-			//
-			submask := max(u64) << u64(remnant_length - (j * SCAN_REGISTER_SIZE))
-			//
-			// ~submask = 0x0000_0000_0000_0001
-			// (submask >> 63) = 0x0000_0000_0000_0001
-			//
-			// The multiplication is a guard against zero.
-			//
-			submask = ~submask * (submask >> 63)
-			//
-			// Finally, mask out any irrelevant bits with the submask.
-			mask &= i32(submask)
-
-			if mask != 0 {
-				ctz := int(intrinsics.count_trailing_zeros(mask))
-				return i + j * SCAN_REGISTER_SIZE + ctz
-			}
 
+	for /**/; i < tail; i += SCAN_WIDTH {
+		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
+		comparison := intrinsics.simd_lanes_eq(load, scanner)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match > 0 {
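+			// At least one lane matched. To recover which one: keep each
+			// matching lane's own index (taken from `simd_scanner_indices`),
+			// substitute the 0xFF sentinel for every other lane, and reduce
+			// with `min` to get the first matching byte within this chunk.
+			// For example, if only lanes 3 and 20 matched, the selected vector
+			// is {0xFF, 0xFF, 0xFF, 3, 0xFF, ..., 20, 0xFF, ...} and the
+			// reduction yields 3.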
+			sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := intrinsics.simd_reduce_min(index_select)
+			return i + cast(int)index_reduce
 		}
 	}
 
+	// Iterate as a scalar over the remaining unaligned portion.
+	for /**/; i < length; i += 1 {
+		if data[i] == c {
+			return i
+		}
+	}
@@ -123,8 +92,7 @@ index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
 Scan a slice of bytes for a specific byte, starting from the end and working
 backwards to the start.
 
-This procedure safely handles padding out slices of any length, including empty
-slices.
+This procedure safely handles slices of any length, including empty slices.
 
 Inputs:
 - data: A slice of bytes.
@@ -133,54 +101,58 @@ Inputs:
 
 Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
-@(enable_target_feature="sse2")
 last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check {
-	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
-	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
+	length := len(data)
+	i := length - 1
 
-	i := len(data) - SCAN_WIDTH
 
-	for /**/; i >= 0; i -= SCAN_WIDTH {
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
-
-		// There is no #reverse #unroll at the time of this writing, so we use
-		// `j` to count down by subtraction.
-		#unroll for j in 1..=SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			if mask != 0 {
-				// CLZ is used instead to get the on-bit from the other end.
-				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
-				return i + SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
+	// Guard against small strings.
+	if length < SCAN_WIDTH {
+		for /**/; i >= 0; i -= 1 {
+			if data[i] == c {
+				return i
 			}
 		}
+		return -1
 	}
 
+	ptr := cast(int)cast(uintptr)raw_data(data)
+	tail := length - (ptr + length) % SCAN_WIDTH
+
+	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
+	//
+	// This way, every load in the vector loop will be aligned, which should be
+	// the fastest possible scenario.
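+	//
+	// Everything from `tail` to the end of the slice cannot form a full,
+	// aligned SCAN_WIDTH chunk, so it is scanned byte-by-byte first. As an
+	// illustrative example: with a backing pointer of 0x1005 and length = 100,
+	// (5 + 100) % 32 = 9, so tail = 91 and the loop below checks indices 99
+	// down through 91 before the vector loop takes over.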
+	for /**/; i >= tail; i -= 1 {
+		if data[i] == c {
+			return i
+		}
+	}
 
-	if i < 0 {
-		padded_data_end: [SCAN_WIDTH]u8 = ---
-		remnant_length := len(data) % SCAN_WIDTH
-		intrinsics.mem_copy_non_overlapping(
-			&padded_data_end[0],
-			&raw_data(data)[0],
-			remnant_length,
-		)
 
+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+	scanner: #simd[SCAN_WIDTH]u8 = c
+	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
 
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
 
+	i -= SCAN_WIDTH - 1
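+	//
+	// The scalar loop above left `i` just below `tail`; stepping back another
+	// SCAN_WIDTH - 1 bytes points `i` at the first byte of the aligned
+	// 32-byte chunk that ends there, which is what the reverse vector loop
+	// below loads.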
 
-		#unroll for j in 1..=SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
 
+	for /**/; i >= alignment_start; i -= SCAN_WIDTH {
+		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
+		comparison := intrinsics.simd_lanes_eq(load, scanner)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match > 0 {
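+			// Mirror image of the forward scan: matching lanes keep their
+			// index from `simd_scanner_indices`, all other lanes fall back to
+			// the zero-initialized sentinel, and the reduction takes the
+			// maximum so the last matching byte in the chunk wins. A zero
+			// sentinel is safe here because at least one lane is known to
+			// have matched.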
+			sentinel: #simd[SCAN_WIDTH]u8
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := intrinsics.simd_reduce_max(index_select)
+			return i + cast(int)index_reduce
+		}
+	}
 
-			submask := max(u64) << u64(remnant_length - (SCAN_REGISTERS-j) * SCAN_REGISTER_SIZE)
-			submask = ~submask * (submask >> 63)
-
-			mask &= i32(submask)
-
-			if mask != 0 {
-				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
-				return SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
-			}
 
+	// Iterate as a scalar over the remaining unaligned portion.
+	i += SCAN_WIDTH - 1
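+	//
+	// Undo the SCAN_WIDTH - 1 adjustment from above: when the vector loop
+	// exits, `i` sits one full chunk below `alignment_start`, so adding
+	// SCAN_WIDTH - 1 back points it at the last byte that has not yet been
+	// scanned (alignment_start - 1).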
 
+	for /**/; i >= 0; i -= 1 {
+		if data[i] == c {
+			return i
 		}
 	}