Simplify and make simd_util cross-platform

This new algorithm uses a Scalar->Vector->Scalar iteration loop which requires no masking off of any incomplete data chunks. Also, the width was reduced to 32 bytes instead of 64, as I found this to be about as fast as the previous 64-byte x86 version.
2026-04-23 06:45:19 +00:00 · 2024-08-09 17:39:19 -04:00
parent 793811b219
commit 12dd0cb72a
5 changed files with 101 additions and 151 deletions
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -309,14 +309,8 @@ index_byte :: proc(s: []byte, c: byte) -> int {
 	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
 	// significant speedup when compiling in either Size or Speed mode.
 	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		// SIMD's benefits are noticeable only past a certain threshold of data.
-		// For small data, use the plain old algorithm.
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.index_byte(s, c)
-		} else {
-			return _index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.index_byte(s, c)
 	} else {
 		return _index_byte(s, c)
 	}
@@ -333,12 +327,8 @@ last_index_byte :: proc(s: []byte, c: byte) -> int {
 		return -1
 	}

-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.last_index_byte(s, c)
-		} else {
-			return _last_index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.last_index_byte(s, c)
 	} else {
 		return _last_index_byte(s, c)
 	}