diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin
new file mode 100644
index 000000000..ac523b42a
--- /dev/null
+++ b/core/simd/util/util.odin
@@ -0,0 +1,188 @@
+/*
+	(c) Copyright 2024 Feoramund.
+	Made available under Odin's BSD-3 license.
+
+	List of contributors:
+		Feoramund: `index_byte` procedures.
+*/
+
+// package simd_util implements compositions of SIMD operations for optimizing
+// the core library where available.
+
+//+build i386, amd64
+package simd_util
+
+import "base:intrinsics"
+import "core:simd/x86"
+
+@private SCAN_REGISTER_SIZE :: 16
+@private SCAN_REGISTERS     :: 4
+@private SCAN_WIDTH         :: SCAN_REGISTERS * SCAN_REGISTER_SIZE
+
+// How long a string should be before it is worth using any of the `index_*`
+// procedures in this package.
+RECOMMENDED_SCAN_SIZE :: SCAN_REGISTER_SIZE
+
+/*
+Scan a slice of bytes for a specific byte.
+
+This procedure safely handles slices of any length, including empty slices,
+by padding out any remnant that does not fill a full scan width.
+
+Inputs:
+- data: A slice of bytes.
+- c: The byte to search for.
+
+Returns:
+- index: The index of the byte `c`, or -1 if it was not found.
+*/
+@(enable_target_feature="sse2")
+index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
+	// Broadcast `c` across a full SSE2 register.
+	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
+	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
+
+	i: int
+	length := len(data)
+	full_chunks_length := length - length % SCAN_WIDTH
+
+	for /**/; i < full_chunks_length; i += SCAN_WIDTH {
+		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
+
+		#unroll for j in 0..<SCAN_REGISTERS {
+			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
+			mask := x86._mm_movemask_epi8(cmp)
+
+			if mask != 0 {
+				ctz := int(intrinsics.count_trailing_zeros(mask))
+				return i + j * SCAN_REGISTER_SIZE + ctz
+			}
+		}
+	}
+
+	// Handle any remnant shorter than the scan width by copying it into a
+	// padded buffer.
+	if i < length {
+		padded_data: [SCAN_WIDTH]u8 = ---
+		remnant_length := length - i
+		intrinsics.mem_copy_non_overlapping(
+			&padded_data[0],
+			&raw_data(data)[i],
+			remnant_length,
+		)
+
+		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data[0])
+
+		#unroll for j in 0..<SCAN_REGISTERS {
+			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
+			mask := x86._mm_movemask_epi8(cmp)
+
+			// Create a submask to filter out any false matches from the
+			// uninitialized padding bytes.
+			//
+			// For example, with one valid byte left in this register:
+			//
+			//          submask = max(u64) << 1 = 0xFFFF_FFFF_FFFF_FFFE
+			//         ~submask                 = 0x0000_0000_0000_0001
+			//         (submask >> 63)          = 0x0000_0000_0000_0001
+			//
+			// The multiplication is a guard against zero.
+			//
+			submask := max(u64) << u64(remnant_length - j * SCAN_REGISTER_SIZE)
+			submask = ~submask * (submask >> 63)
+			//
+			// Finally, mask out any irrelevant bits with the submask.
+			mask &= i32(submask)
+
+			if mask != 0 {
+				ctz := int(intrinsics.count_trailing_zeros(mask))
+				return i + j * SCAN_REGISTER_SIZE + ctz
+			}
+		}
+	}
+
+	return -1
+}
+
+/*
+Scan a slice of bytes for a specific byte, starting from the end and working
+backwards to the start.
+
+This procedure safely handles slices of any length, including empty slices,
+by padding out any remnant that does not fill a full scan width.
+
+Inputs:
+- data: A slice of bytes.
+- c: The byte to search for.
+
+Returns:
+- index: The index of the byte `c`, or -1 if it was not found.
+*/
+@(enable_target_feature="sse2")
+last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check {
+	// Broadcast `c` across a full SSE2 register.
+	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
+	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
+
+	i := len(data) - SCAN_WIDTH
+
+	for /**/; i >= 0; i -= SCAN_WIDTH {
+		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
+
+		// There is no #reverse #unroll at the time of this writing, so we use
+		// `j` to count down by subtraction.
+		#unroll for j in 1..=SCAN_REGISTERS {
+			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
+			mask := x86._mm_movemask_epi8(cmp)
+
+			if mask != 0 {
+				// CLZ is used instead of CTZ to find the matching bit
+				// nearest the other end of the register.
+				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
+				return i + SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
+			}
+		}
+	}
+
+	// Handle any remnant at the front of the data that is shorter than the
+	// scan width.
+	if i < 0 {
+		padded_data_end: [SCAN_WIDTH]u8 = ---
+		remnant_length := len(data) % SCAN_WIDTH
+		intrinsics.mem_copy_non_overlapping(
+			&padded_data_end[0],
+			&raw_data(data)[0],
+			remnant_length,
+		)
+
+		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
+
+		#unroll for j in 1..=SCAN_REGISTERS {
+			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
+			mask := x86._mm_movemask_epi8(cmp)
+
+			// Filter out any false matches from the uninitialized padding
+			// bytes, as in `index_byte` above.
+			submask := max(u64) << u64(remnant_length - (SCAN_REGISTERS-j) * SCAN_REGISTER_SIZE)
+			submask = ~submask * (submask >> 63)
+
+			mask &= i32(submask)
+
+			if mask != 0 {
+				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
+				return SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
+			}
+		}
+	}
+
+	return -1
+}
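
A note for reviewers: the sketch below shows one way a caller might dispatch to these procedures, gating on RECOMMENDED_SCAN_SIZE with a scalar fallback. The wrapper name `find_byte` and the dispatch policy are illustrative only, not part of this patch, and the sketch assumes an i386/amd64 build since the package is gated on those targets.

package main

import "core:fmt"
import "core:simd/util"

// Hypothetical wrapper, not part of this patch: use the SIMD scan only when
// the input is long enough to be worth the setup cost.
find_byte :: proc(data: []u8, c: byte) -> int {
	if len(data) >= simd_util.RECOMMENDED_SCAN_SIZE {
		return simd_util.index_byte(data, c)
	}
	for b, i in data {
		if b == c {
			return i
		}
	}
	return -1
}

main :: proc() {
	text := transmute([]u8)string("hello, world! a string long enough to scan with SIMD.")
	fmt.println(find_byte(text, ','))  // 5
	fmt.println(find_byte(text, 'q'))  // -1
}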
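
And a standalone sketch of the submask trick used in both remnant paths, since it is the least obvious part of the patch. `submask_for` is a hypothetical helper named here for illustration; it mirrors the two submask lines above and relies on Odin defining an out-of-range shift to produce zero.

package main

import "core:fmt"

SCAN_REGISTER_SIZE :: 16

// Hypothetical helper mirroring the submask computation in the patch: build
// a bitmask with one set bit per valid byte position in a 16-byte register
// that only holds `remnant - register_index * SCAN_REGISTER_SIZE` bytes.
submask_for :: proc(remnant, register_index: int) -> u64 {
	// When the register lies entirely past the remnant, the shift amount
	// wraps to >= 64 and Odin defines the result as 0; the multiplication
	// by (submask >> 63) then clears the mask instead of keeping ~0.
	submask := max(u64) << u64(remnant - register_index * SCAN_REGISTER_SIZE)
	return ~submask * (submask >> 63)
}

main :: proc() {
	fmt.printf("%x\n", submask_for(3, 0))  // 7: keep bits 0..2
	fmt.printf("%x\n", submask_for(3, 1))  // 0: register past the remnant
	fmt.printf("%x\n", submask_for(20, 1)) // f: bytes 16..19 of the remnant
}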