diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index dcd4931e2..136c98f6b 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -309,14 +309,8 @@ index_byte :: proc(s: []byte, c: byte) -> int {
 	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
 	// significant speedup when compiling in either Size or Speed mode.
 	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		// SIMD's benefits are noticeable only past a certain threshold of data.
-		// For small data, use the plain old algorithm.
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.index_byte(s, c)
-		} else {
-			return _index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.index_byte(s, c)
 	} else {
 		return _index_byte(s, c)
 	}
@@ -333,12 +327,8 @@ last_index_byte :: proc(s: []byte, c: byte) -> int {
 		return -1
 	}
 
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.last_index_byte(s, c)
-		} else {
-			return _last_index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.last_index_byte(s, c)
 	} else {
 		return _last_index_byte(s, c)
 	}
diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin
index ac523b42a..b209a44ea 100644
--- a/core/simd/util/util.odin
+++ b/core/simd/util/util.odin
@@ -8,26 +8,24 @@
 // package simd_util implements compositions of SIMD operations for optimizing
 // the core library where available.
-
-//+build i386, amd64
 package simd_util
 
 import "base:intrinsics"
-import "core:simd/x86"
 
-@private SCAN_REGISTER_SIZE :: 16
-@private SCAN_REGISTERS     :: 4
-@private SCAN_WIDTH         :: SCAN_REGISTERS * SCAN_REGISTER_SIZE
+@private SCAN_WIDTH :: 32
 
-// How long should a string be before using any of the `index_*` procedures in
-// this package.
-RECOMMENDED_SCAN_SIZE :: SCAN_REGISTER_SIZE
+@(private, rodata)
+simd_scanner_indices := #simd[SCAN_WIDTH]u8 {
+	 0,  1,  2,  3,  4,  5,  6,  7,
+	 8,  9, 10, 11, 12, 13, 14, 15,
+	16, 17, 18, 19, 20, 21, 22, 23,
+	24, 25, 26, 27, 28, 29, 30, 31,
+}
 
 /*
 Scan a slice of bytes for a specific byte.
 
-This procedure safely handles padding out slices of any length, including empty
-slices.
+This procedure safely handles slices of any length, including empty slices.
 
 Inputs:
 - data: A slice of bytes.
@@ -36,83 +34,54 @@ Inputs:
 
 Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
-@(enable_target_feature="sse2")
 index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
-	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
-	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
-
-	i: int
 	length := len(data)
-	full_chunks_length := length - length % SCAN_WIDTH
+	i := 0
 
-	for /**/; i < full_chunks_length; i += SCAN_WIDTH {
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
-
-		#unroll for j in 0..<SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			if mask != 0 {
-				ctz := int(intrinsics.count_trailing_zeros(mask))
-				return i + j * SCAN_REGISTER_SIZE + ctz
+	// Guard against small strings.
+	if length < SCAN_WIDTH {
+		for /**/; i < length; i += 1 {
+			if data[i] == c {
+				return i
 			}
 		}
+		return -1
 	}
 
-	if i < length {
-		padded_data: [SCAN_WIDTH]u8 = ---
-		remnant_length := length % SCAN_WIDTH
-		intrinsics.mem_copy_non_overlapping(
-			&padded_data[0],
-			&data[i],
-			remnant_length,
-		)
+	ptr := cast(int)cast(uintptr)raw_data(data)
+
+	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
+
+	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
+	//
+	// This way, every load in the vector loop will be aligned, which should be
+	// the fastest possible scenario.
+	for /**/; i < alignment_start; i += 1 {
+		if data[i] == c {
+			return i
+		}
+	}
 
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data[0])
+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+	scanner: #simd[SCAN_WIDTH]u8 = c
+	tail := length - (length - alignment_start) % SCAN_WIDTH
 
-		#unroll for j in 0..<SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			submask := max(u64) << u64(remnant_length - j * SCAN_REGISTER_SIZE)
-			//
-			// Create a submask to exclude any comparison bits that lie beyond
-			// the end of the remnant. For example, with a remnant length of 1:
-			//
-			//     submask         = 0xFFFF_FFFF_FFFF_FFFE
-			//     ~submask        = 0x0000_0000_0000_0001
-			//     (submask >> 63) = 0x0000_0000_0000_0001
-			//
-			// The multiplication is a guard against zero.
-			//
-			submask = ~submask * (submask >> 63)
-			//
-			// Finally, mask out any irrelevant bits with the submask.
-			mask &= i32(submask)
-
-			if mask != 0 {
-				ctz := int(intrinsics.count_trailing_zeros(mask))
-				return i + j * SCAN_REGISTER_SIZE + ctz
-			}
+	for /**/; i < tail; i += SCAN_WIDTH {
+		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
+		comparison := intrinsics.simd_lanes_eq(load, scanner)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match > 0 {
+			sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := intrinsics.simd_reduce_min(index_select)
+			return i + cast(int)index_reduce
+		}
+	}
+
+	// Iterate as a scalar over the remaining unaligned portion.
+	for /**/; i < length; i += 1 {
+		if data[i] == c {
+			return i
 		}
 	}
@@ -123,8 +92,7 @@ index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
 Scan a slice of bytes for a specific byte, starting from the end and working
 backwards to the start.
 
-This procedure safely handles padding out slices of any length, including empty
-slices.
+This procedure safely handles slices of any length, including empty slices.
 
 Inputs:
 - data: A slice of bytes.
@@ -133,54 +101,58 @@ Inputs:
 
 Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
-@(enable_target_feature="sse2")
 last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check {
-	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
-	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
+	length := len(data)
+	i := length - 1
 
-	i := len(data) - SCAN_WIDTH
-
-	for /**/; i >= 0; i -= SCAN_WIDTH {
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
-
-		// There is no #reverse #unroll at the time of this writing, so we use
-		// `j` to count down by subtraction.
-		#unroll for j in 1..=SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			if mask != 0 {
-				// CLZ is used instead to get the on-bit from the other end.
-				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
-				return i + SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
+	// Guard against small strings.
+	if length < SCAN_WIDTH {
+		for /**/; i >= 0; i -= 1 {
+			if data[i] == c {
+				return i
 			}
 		}
+		return -1
+	}
+
+	ptr := cast(int)cast(uintptr)raw_data(data)
+
+	tail := length - (ptr + length) % SCAN_WIDTH
+
+	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
+	//
+	// This way, every load in the vector loop will be aligned, which should be
+	// the fastest possible scenario.
+	for /**/; i >= tail; i -= 1 {
+		if data[i] == c {
+			return i
+		}
 	}
 
-	if i < 0 {
-		padded_data_end: [SCAN_WIDTH]u8 = ---
-		remnant_length := len(data) % SCAN_WIDTH
-		intrinsics.mem_copy_non_overlapping(
-			&padded_data_end[0],
-			&raw_data(data)[0],
-			remnant_length,
-		)
+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+	scanner: #simd[SCAN_WIDTH]u8 = c
+	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
 
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
+	i -= SCAN_WIDTH - 1
 
-		#unroll for j in 1..=SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
+	for /**/; i >= alignment_start; i -= SCAN_WIDTH {
+		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
+		comparison := intrinsics.simd_lanes_eq(load, scanner)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match > 0 {
+			sentinel: #simd[SCAN_WIDTH]u8
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := intrinsics.simd_reduce_max(index_select)
+			return i + cast(int)index_reduce
+		}
+	}
 
-			submask := max(u64) << u64(remnant_length - (SCAN_REGISTERS-j) * SCAN_REGISTER_SIZE)
-			submask = ~submask * (submask >> 63)
-
-			mask &= i32(submask)
-
-			if mask != 0 {
-				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
-				return SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
-			}
+	// Iterate as a scalar over the remaining unaligned portion.
+	i += SCAN_WIDTH - 1
+
+	for /**/; i >= 0; i -= 1 {
+		if data[i] == c {
+			return i
 		}
 	}
diff --git a/core/strings/strings.odin b/core/strings/strings.odin
index 9d3e88165..b8e43f90d 100644
--- a/core/strings/strings.odin
+++ b/core/strings/strings.odin
@@ -1438,14 +1438,8 @@ index_byte :: proc(s: string, c: byte) -> (res: int) {
 	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
 	// significant speedup when compiling in either Size or Speed mode.
 	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		// SIMD's benefits are noticeable only past a certain threshold of data.
-		// For small data, use the plain old algorithm.
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.index_byte(transmute([]u8)s, c)
-		} else {
-			return _index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.index_byte(transmute([]u8)s, c)
 	} else {
 		return _index_byte(s, c)
 	}
@@ -1492,12 +1486,8 @@ last_index_byte :: proc(s: string, c: byte) -> (res: int) {
 		return -1
 	}
 
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.last_index_byte(transmute([]u8)s, c)
-		} else {
-			return _last_index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.last_index_byte(transmute([]u8)s, c)
 	} else {
 		return _last_index_byte(s, c)
 	}
diff --git a/tests/benchmark/simd/util/benchmark_simd_util.odin b/tests/benchmark/simd/util/benchmark_simd_util.odin
index 4538c6612..18fa0a9e3 100644
--- a/tests/benchmark/simd/util/benchmark_simd_util.odin
+++ b/tests/benchmark/simd/util/benchmark_simd_util.odin
@@ -1,4 +1,3 @@
-//+build i386, amd64
 package benchmark_simd_util
 
 import "core:fmt"
diff --git a/tests/core/simd/util/test_core_simd_util.odin b/tests/core/simd/util/test_core_simd_util.odin
index 65bf566c0..ff7e1f9aa 100644
--- a/tests/core/simd/util/test_core_simd_util.odin
+++ b/tests/core/simd/util/test_core_simd_util.odin
@@ -1,4 +1,3 @@
-//+build i386, amd64
 package test_core_simd_util
 
 import simd_util "core:simd/util"
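Reviewer note, not part of the patch: the heart of the new implementation is the lane-index reduction that replaces SSE2's `_mm_movemask_epi8` plus ctz/clz bit tricks, which is what allows dropping `import "core:simd/x86"` and the `//+build i386, amd64` tags. The sketch below shows that technique in isolation, using only `base:intrinsics`; the names `WIDTH`, `lane_indices`, and `first_match` are illustrative, not identifiers from this patch.

```odin
package lane_scan_sketch

import "base:intrinsics"
import "core:fmt"

// Mirrors SCAN_WIDTH from the patch: one 32-byte chunk per vector load.
WIDTH :: 32

// Each lane holds its own index, so a masked horizontal reduction turns
// "which lanes matched" into "which index matched".
@(rodata)
lane_indices := #simd[WIDTH]u8 {
	 0,  1,  2,  3,  4,  5,  6,  7,
	 8,  9, 10, 11, 12, 13, 14, 15,
	16, 17, 18, 19, 20, 21, 22, 23,
	24, 25, 26, 27, 28, 29, 30, 31,
}

// first_match is a hypothetical helper: the index of the first lane of
// `chunk` equal to `c`, or -1 if no lane matches.
first_match :: proc(chunk: #simd[WIDTH]u8, c: byte) -> int {
	scanner: #simd[WIDTH]u8 = c
	// Matching lanes become all ones, everything else zero.
	comparison := intrinsics.simd_lanes_eq(chunk, scanner)
	if intrinsics.simd_reduce_or(comparison) == 0 {
		return -1
	}
	// Matching lanes keep their index; the rest are forced to 0xFF, so the
	// horizontal minimum is the earliest match. The backward scan in the
	// patch instead uses a zero sentinel with simd_reduce_max to pick the
	// latest match.
	sentinel: #simd[WIDTH]u8 = u8(0xFF)
	selected := intrinsics.simd_select(comparison, lane_indices, sentinel)
	return cast(int)intrinsics.simd_reduce_min(selected)
}

main :: proc() {
	buf: [WIDTH]u8
	buf[13] = '!'
	buf[20] = '!'
	// A stack array is not guaranteed 32-byte alignment, so load unaligned
	// here; the patch earns aligned loads via its scalar pre-loop instead.
	chunk := intrinsics.unaligned_load(cast(^#simd[WIDTH]u8)&buf[0])
	fmt.println(first_match(chunk, '!')) // prints 13
}
```

Because every step is a generic vector op, the same source compiles on any target Odin supports, with the backend free to lower it to SSE/AVX, NEON, or scalar code.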