Simplify and make simd_util cross-platform

This new algorithm uses a Scalar->Vector->Scalar iteration loop, which
requires no masking of incomplete data chunks.

Also, the width was reduced from 64 bytes to 32, as I found this
to be about as fast as the previous 64-byte x86 version.
This commit is contained in:
Feoramund
2024-08-09 17:39:19 -04:00
parent 793811b219
commit 12dd0cb72a
5 changed files with 101 additions and 151 deletions

View File

@@ -309,14 +309,8 @@ index_byte :: proc(s: []byte, c: byte) -> int {
// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
// significant speedup when compiling in either Size or Speed mode.
// The SIMD version is usually 2-3x slower without optimizations on.
when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
// SIMD's benefits are noticeable only past a certain threshold of data.
// For small data, use the plain old algorithm.
if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
return simd_util.index_byte(s, c)
} else {
return _index_byte(s, c)
}
when ODIN_OPTIMIZATION_MODE > .Minimal {
return #force_inline simd_util.index_byte(s, c)
} else {
return _index_byte(s, c)
}
@@ -333,12 +327,8 @@ last_index_byte :: proc(s: []byte, c: byte) -> int {
return -1
}
when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
return simd_util.last_index_byte(s, c)
} else {
return _last_index_byte(s, c)
}
when ODIN_OPTIMIZATION_MODE > .Minimal {
return #force_inline simd_util.last_index_byte(s, c)
} else {
return _last_index_byte(s, c)
}