Simplify and make simd_util cross-platform

The new algorithm uses a Scalar->Vector->Scalar iteration loop, which requires
no masking of incomplete data chunks.

The scan width was also reduced from 64 bytes to 32, as I found the 32-byte
version to be about as fast as the previous 64-byte x86 implementation.
Feoramund
2024-08-09 17:39:19 -04:00
parent 793811b219
commit 12dd0cb72a
5 changed files with 101 additions and 151 deletions
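
For reference, here is a minimal sketch of the Scalar->Vector->Scalar shape described above, assuming the 32-byte scan width introduced in the diff below. The procedure name is hypothetical, and a match found by the vector loop is located here with a plain scalar rescan of the chunk; the actual implementation extracts the lane index with SIMD, as shown further down.

package simd_scan_sketch

import "base:intrinsics"

SCAN_WIDTH :: 32

sketch_index_byte :: proc(data: []u8, c: byte) -> int {
	length := len(data)
	i: int

	// Small inputs never reach the vector loop.
	if length < SCAN_WIDTH {
		for /**/; i < length; i += 1 {
			if data[i] == c {
				return i
			}
		}
		return -1
	}

	// Scalar: walk forward until `&data[i]` sits on a SCAN_WIDTH boundary.
	ptr := cast(int)cast(uintptr)raw_data(data)
	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
	for /**/; i < alignment_start; i += 1 {
		if data[i] == c {
			return i
		}
	}

	// Vector: compare SCAN_WIDTH bytes per iteration over the aligned middle.
	scanner: #simd[SCAN_WIDTH]u8 = c
	tail := length - (length - alignment_start) % SCAN_WIDTH
	for /**/; i < tail; i += SCAN_WIDTH {
		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
		if intrinsics.simd_reduce_or(intrinsics.simd_lanes_eq(load, scanner)) > 0 {
			// The match is somewhere in data[i:i+SCAN_WIDTH]; rescan it as scalars.
			for j in 0..<SCAN_WIDTH {
				if data[i+j] == c {
					return i + j
				}
			}
		}
	}

	// Scalar: finish the unaligned remainder; no chunk ever needs masking.
	for /**/; i < length; i += 1 {
		if data[i] == c {
			return i
		}
	}
	return -1
}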

View File

@@ -309,14 +309,8 @@ index_byte :: proc(s: []byte, c: byte) -> int {
// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
// significant speedup when compiling in either Size or Speed mode.
// The SIMD version is usually 2-3x slower without optimizations on.
when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
// SIMD's benefits are noticeable only past a certain threshold of data.
// For small data, use the plain old algorithm.
if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
return simd_util.index_byte(s, c)
} else {
return _index_byte(s, c)
}
when ODIN_OPTIMIZATION_MODE > .Minimal {
return #force_inline simd_util.index_byte(s, c)
} else {
return _index_byte(s, c)
}
@@ -333,12 +327,8 @@ last_index_byte :: proc(s: []byte, c: byte) -> int {
return -1
}
when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
return simd_util.last_index_byte(s, c)
} else {
return _last_index_byte(s, c)
}
when ODIN_OPTIMIZATION_MODE > .Minimal {
return #force_inline simd_util.last_index_byte(s, c)
} else {
return _last_index_byte(s, c)
}

View File

@@ -8,26 +8,24 @@
// package simd_util implements compositions of SIMD operations for optimizing
// the core library where available.
//+build i386, amd64
package simd_util
import "base:intrinsics"
import "core:simd/x86"
@private SCAN_REGISTER_SIZE :: 16
@private SCAN_REGISTERS :: 4
@private SCAN_WIDTH :: SCAN_REGISTERS * SCAN_REGISTER_SIZE
@private SCAN_WIDTH :: 32
// How long should a string be before using any of the `index_*` procedures in
// this package.
RECOMMENDED_SCAN_SIZE :: SCAN_REGISTER_SIZE
@(private, rodata)
simd_scanner_indices := #simd[SCAN_WIDTH]u8 {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
}
/*
Scan a slice of bytes for a specific byte.
This procedure safely handles padding out slices of any length, including empty
slices.
This procedure safely handles slices of any length, including empty slices.
Inputs:
- data: A slice of bytes.
@@ -36,83 +34,54 @@ Inputs:
Returns:
- index: The index of the byte `c`, or -1 if it was not found.
*/
@(enable_target_feature="sse2")
index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
scanner_data: [SCAN_REGISTER_SIZE]u8 = c
scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
i: int
length := len(data)
full_chunks_length := length - length % SCAN_WIDTH
i := 0
for /**/; i < full_chunks_length; i += SCAN_WIDTH {
simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
#unroll for j in 0..<SCAN_REGISTERS {
cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
mask := x86._mm_movemask_epi8(cmp)
// NOTE(Feoramund): I experimented with ORing all the masks onto a
// 128-bit integer before performing the `mask != 0` check to see
// if that might be faster. However, the cost to avoid 3
// compares resulted in a marginally slower runtime on my machine.
//
// Simpler won out here.
if mask != 0 {
ctz := intrinsics.count_trailing_zeros(mask)
return i + j * SCAN_REGISTER_SIZE + cast(int)ctz
// Guard against small strings.
if length < SCAN_WIDTH {
for /**/; i < length; i += 1 {
if data[i] == c {
return i
}
}
return -1
}
ptr := cast(int)cast(uintptr)raw_data(data)
alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
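// For example, if the base pointer ends in 0x..0B (ptr % 32 == 11), then
// alignment_start is (32 - 11) % 32 == 21, and `&data[21]` is 32-byte aligned.
// If the pointer is already aligned, the outer `% SCAN_WIDTH` keeps this at 0.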
// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
//
// This way, every load in the vector loop will be aligned, which should be
// the fastest possible scenario.
for /**/; i < alignment_start; i += 1 {
if data[i] == c {
return i
}
}
if i < length {
// The data is not exactly divisible by SCAN_WIDTH, and we haven't found
// what we're looking for yet, so we must pad out the end, then run our
// algorithm on it.
padded_data_end: [SCAN_WIDTH]u8 = ---
remnant_length := length % SCAN_WIDTH
intrinsics.mem_copy_non_overlapping(
&padded_data_end[0],
&raw_data(data)[full_chunks_length],
remnant_length,
)
// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
scanner: #simd[SCAN_WIDTH]u8 = c
tail := length - (length - alignment_start) % SCAN_WIDTH
simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
#unroll for j in 0..<SCAN_REGISTERS {
cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
mask := x86._mm_movemask_epi8(cmp)
// Because this data is padded out, it's possible that we could
// match on uninitialized memory, so we must guard against that.
// Create a relevancy mask: (Example)
//
// max(u64) = 0xFFFF_FFFF_FFFF_FFFF
//
// Convert an integer into a stream of on-bits by using the
// shifted negation of the maximum. The subtraction selects which
// section of the overall mask we should apply.
//
// << 17 - (1 * SCAN_REGISTER_SIZE)
// = 0xFFFF_FFFF_FFFF_FFFE
//
submask := max(u64) << u64(remnant_length - (j * SCAN_REGISTER_SIZE))
//
// ~submask = 0x0000_0000_0000_0001
// (submask >> 63) = 0x0000_0000_0000_0001
//
// The multiplication is a guard against zero.
//
submask = ~submask * (submask >> 63)
//
// Finally, mask out any irrelevant bits with the submask.
mask &= i32(submask)
if mask != 0 {
ctz := int(intrinsics.count_trailing_zeros(mask))
return i + j * SCAN_REGISTER_SIZE + ctz
}
for /**/; i < tail; i += SCAN_WIDTH {
load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
comparison := intrinsics.simd_lanes_eq(load, scanner)
match := intrinsics.simd_reduce_or(comparison)
if match > 0 {
sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
index_reduce := intrinsics.simd_reduce_min(index_select)
return i + cast(int)index_reduce
}
}
// Iterate as a scalar over the remaining unaligned portion.
for /**/; i < length; i += 1 {
if data[i] == c {
return i
}
}
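
The lane-index extraction used in the vector loop above can be shown in isolation. A minimal sketch, assuming the same 32-lane vectors; `first_matching_lane` and `lane_indices` are hypothetical names standing in for the code above:

package lane_extraction_sketch

import "base:intrinsics"

SCAN_WIDTH :: 32

// Same idea as `simd_scanner_indices` above: lane i holds the value i.
lane_indices := #simd[SCAN_WIDTH]u8 {
	 0,  1,  2,  3,  4,  5,  6,  7,
	 8,  9, 10, 11, 12, 13, 14, 15,
	16, 17, 18, 19, 20, 21, 22, 23,
	24, 25, 26, 27, 28, 29, 30, 31,
}

// Lowest lane of `chunk` equal to `c`, or -1 if no lane matches.
first_matching_lane :: proc(chunk: #simd[SCAN_WIDTH]u8, c: byte) -> int {
	scanner: #simd[SCAN_WIDTH]u8 = c
	comparison := intrinsics.simd_lanes_eq(chunk, scanner) // all-ones where equal, zero elsewhere
	if intrinsics.simd_reduce_or(comparison) == 0 {
		return -1
	}
	// Matching lanes keep their index; non-matching lanes become 0xFF,
	// so the minimum over all lanes is the index of the first match.
	sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
	index_select := intrinsics.simd_select(comparison, lane_indices, sentinel)
	return cast(int)intrinsics.simd_reduce_min(index_select)
}

The forward scan takes the minimum so the earliest match wins; the backward scan flips both the sentinel and the reduction, as sketched below after `last_index_byte`.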
@@ -123,8 +92,7 @@ index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
Scan a slice of bytes for a specific byte, starting from the end and working
backwards to the start.
This procedure safely handles padding out slices of any length, including empty
slices.
This procedure safely handles slices of any length, including empty slices.
Inputs:
- data: A slice of bytes.
@@ -133,54 +101,58 @@ Inputs:
Returns:
- index: The index of the byte `c`, or -1 if it was not found.
*/
@(enable_target_feature="sse2")
last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check {
scanner_data: [SCAN_REGISTER_SIZE]u8 = c
scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
length := len(data)
i := length - 1
i := len(data) - SCAN_WIDTH
for /**/; i >= 0; i -= SCAN_WIDTH {
simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
// There is no #reverse #unroll at the time of this writing, so we use
// `j` to count down by subtraction.
#unroll for j in 1..=SCAN_REGISTERS {
cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
mask := x86._mm_movemask_epi8(cmp)
if mask != 0 {
// CLZ is used instead to get the on-bit from the other end.
clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
return i + SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
// Guard against small strings.
if length < SCAN_WIDTH {
for /**/; i >= 0; i -= 1 {
if data[i] == c {
return i
}
}
return -1
}
ptr := cast(int)cast(uintptr)raw_data(data)
tail := length - (ptr + length) % SCAN_WIDTH
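// `ptr + length` is the address just past the end of the data, so `tail` is
// the largest index at which `&data[tail]` is SCAN_WIDTH-aligned; the bytes at
// and above it are handled by the scalar loop below.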
// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
//
// This way, every load in the vector loop will be aligned, which should be
// the fastest possible scenario.
for /**/; i >= tail; i -= 1 {
if data[i] == c {
return i
}
}
if i < 0 {
padded_data_end: [SCAN_WIDTH]u8 = ---
remnant_length := len(data) % SCAN_WIDTH
intrinsics.mem_copy_non_overlapping(
&padded_data_end[0],
&raw_data(data)[0],
remnant_length,
)
// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
scanner: #simd[SCAN_WIDTH]u8 = c
alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
i -= SCAN_WIDTH - 1
#unroll for j in 1..=SCAN_REGISTERS {
cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
mask := x86._mm_movemask_epi8(cmp)
for /**/; i >= alignment_start; i -= SCAN_WIDTH {
load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
comparison := intrinsics.simd_lanes_eq(load, scanner)
match := intrinsics.simd_reduce_or(comparison)
if match > 0 {
sentinel: #simd[SCAN_WIDTH]u8
index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
index_reduce := intrinsics.simd_reduce_max(index_select)
return i + cast(int)index_reduce
}
}
submask := max(u64) << u64(remnant_length - (SCAN_REGISTERS-j) * SCAN_REGISTER_SIZE)
submask = ~submask * (submask >> 63)
mask &= i32(submask)
if mask != 0 {
clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
return SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
}
// Iterate as a scalar over the remaining unaligned portion.
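// The vector loop left `i` pointing SCAN_WIDTH - 1 bytes below the last
// unchecked byte; step back up first.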
i += SCAN_WIDTH - 1
for /**/; i >= 0; i -= 1 {
if data[i] == c {
return i
}
}
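
For symmetry, the reverse lane extraction used above can be sketched the same way, reusing `SCAN_WIDTH`, `lane_indices`, and the `base:intrinsics` import from the earlier sketch; `last_matching_lane` is again a hypothetical name. The sentinel is zero and the maximum is taken, so the highest matching lane wins:

// Highest lane of `chunk` equal to `c`, or -1 if no lane matches.
last_matching_lane :: proc(chunk: #simd[SCAN_WIDTH]u8, c: byte) -> int {
	scanner: #simd[SCAN_WIDTH]u8 = c
	comparison := intrinsics.simd_lanes_eq(chunk, scanner)
	if intrinsics.simd_reduce_or(comparison) == 0 {
		return -1
	}
	// Matching lanes keep their index; non-matching lanes become 0, so the
	// maximum over all lanes is the index of the last match. A result of 0 is
	// unambiguous because the reduce_or check above already proved that at
	// least one lane matched.
	sentinel: #simd[SCAN_WIDTH]u8 // zero-initialized
	index_select := intrinsics.simd_select(comparison, lane_indices, sentinel)
	return cast(int)intrinsics.simd_reduce_max(index_select)
}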

View File

@@ -1438,14 +1438,8 @@ index_byte :: proc(s: string, c: byte) -> (res: int) {
// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
// significant speedup when compiling in either Size or Speed mode.
// The SIMD version is usually 2-3x slower without optimizations on.
when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
// SIMD's benefits are noticeable only past a certain threshold of data.
// For small data, use the plain old algorithm.
if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
return simd_util.index_byte(transmute([]u8)s, c)
} else {
return _index_byte(s, c)
}
when ODIN_OPTIMIZATION_MODE > .Minimal {
return #force_inline simd_util.index_byte(transmute([]u8)s, c)
} else {
return _index_byte(s, c)
}
@@ -1492,12 +1486,8 @@ last_index_byte :: proc(s: string, c: byte) -> (res: int) {
return -1
}
when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
return simd_util.last_index_byte(transmute([]u8)s, c)
} else {
return _last_index_byte(s, c)
}
when ODIN_OPTIMIZATION_MODE > .Minimal {
return #force_inline simd_util.last_index_byte(transmute([]u8)s, c)
} else {
return _last_index_byte(s, c)
}

View File

@@ -1,4 +1,3 @@
//+build i386, amd64
package benchmark_simd_util
import "core:fmt"

View File

@@ -1,4 +1,3 @@
//+build i386, amd64
package test_core_simd_util
import simd_util "core:simd/util"