From f66fcd9acb390b199452a125ed09899dffefde5d Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 15:58:56 -0400 Subject: [PATCH] Use vectorized `index_*` procs in `core` --- core/bytes/bytes.odin | 47 ++++++++++++++++++++++++++++++++------- core/strings/strings.odin | 47 ++++++++++++++++++++++++++++++++------- 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin index 7cbf092ac..dcd4931e2 100644 --- a/core/bytes/bytes.odin +++ b/core/bytes/bytes.odin @@ -1,6 +1,8 @@ package bytes +import "base:intrinsics" import "core:mem" +@require import simd_util "core:simd/util" import "core:unicode" import "core:unicode/utf8" @@ -295,22 +297,51 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) { index_byte :: proc(s: []byte, c: byte) -> int { - for i := 0; i < len(s); i += 1 { - if s[i] == c { - return i + _index_byte :: #force_inline proc(s: []byte, c: byte) -> int { + for i := 0; i < len(s); i += 1 { + if s[i] == c { + return i + } } + return -1 + } + + // NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a + // significant speedup when compiling in either Size or Speed mode. + // The SIMD version is usually 2-3x slower without optimizations on, so it is only compiled in when ODIN_OPTIMIZATION_MODE is above .Minimal and the "sse2" target-feature check passes. + when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") { + // SIMD's benefits are noticeable only past a certain threshold of data; the cutoff used here is simd_util.RECOMMENDED_SCAN_SIZE, compared against len(s). + // For smaller data, use the plain byte-by-byte loop in the nested _index_byte helper.
+ if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE { + return simd_util.index_byte(s, c) + } else { + return _index_byte(s, c) + } + } else { + return _index_byte(s, c) } - return -1 } // Returns -1 if c is not present last_index_byte :: proc(s: []byte, c: byte) -> int { - for i := len(s)-1; i >= 0; i -= 1 { - if s[i] == c { - return i + _last_index_byte :: #force_inline proc(s: []byte, c: byte) -> int { + for i := len(s)-1; i >= 0; i -= 1 { + if s[i] == c { + return i + } } + return -1 + } + + when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") { + if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE { + return simd_util.last_index_byte(s, c) + } else { + return _last_index_byte(s, c) + } + } else { + return _last_index_byte(s, c) } - return -1 } diff --git a/core/strings/strings.odin b/core/strings/strings.odin index e9b50bab0..9d3e88165 100644 --- a/core/strings/strings.odin +++ b/core/strings/strings.odin @@ -1,7 +1,9 @@ // Procedures to manipulate UTF-8 encoded strings package strings +import "base:intrinsics" import "core:io" +@require import simd_util "core:simd/util" import "core:mem" import "core:unicode" import "core:unicode/utf8" @@ -1424,12 +1426,29 @@ Output: */ index_byte :: proc(s: string, c: byte) -> (res: int) { - for i := 0; i < len(s); i += 1 { - if s[i] == c { - return i + _index_byte :: #force_inline proc(s: string, c: byte) -> int { + for i := 0; i < len(s); i += 1 { + if s[i] == c { + return i + } } + return -1 + } + + // NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a + // significant speedup when compiling in either Size or Speed mode. + // The SIMD version is usually 2-3x slower without optimizations on, so it is only compiled in when ODIN_OPTIMIZATION_MODE is above .Minimal and the "sse2" target-feature check passes. + when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") { + // SIMD's benefits are noticeable only past a certain threshold of data; the cutoff used here is simd_util.RECOMMENDED_SCAN_SIZE, compared against len(s). + // For smaller data, use the plain byte-by-byte loop in the nested _index_byte helper; the SIMD call receives the string transmuted to []u8.
+ if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE { + return simd_util.index_byte(transmute([]u8)s, c) + } else { + return _index_byte(s, c) + } + } else { + return _index_byte(s, c) } - return -1 } /* Returns the byte offset of the last byte `c` in the string `s`, -1 when not found. @@ -1464,12 +1483,24 @@ Output: */ last_index_byte :: proc(s: string, c: byte) -> (res: int) { - for i := len(s)-1; i >= 0; i -= 1 { - if s[i] == c { - return i + _last_index_byte :: #force_inline proc(s: string, c: byte) -> int { + for i := len(s)-1; i >= 0; i -= 1 { + if s[i] == c { + return i + } } + return -1 + } + + when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") { + if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE { + return simd_util.last_index_byte(transmute([]u8)s, c) + } else { + return _last_index_byte(s, c) + } + } else { + return _last_index_byte(s, c) } - return -1 } /* Returns the byte offset of the first rune `r` in the string `s` it finds, -1 when not found.