Mirror of https://github.com/odin-lang/Odin.git, synced 2026-01-07 13:33:13 +00:00
Merge pull request #4063 from Feoramund/simd-memory
Vectorize `base:runtime.memory_*`
@@ -16,6 +16,12 @@ RUNTIME_REQUIRE :: false // !ODIN_TILDE
 @(private)
 __float16 :: f16 when __ODIN_LLVM_F16_SUPPORTED else u16
 
+HAS_HARDWARE_SIMD :: false when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
+	false when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
+	false when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
+	false when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
+	true
+
 @(private)
 byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byte #no_bounds_check {
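HAS_HARDWARE_SIMD is a compile-time boolean: the chain of `when ... else` clauses collapses to `false` on any listed target whose baseline feature (SSE2, NEON, simd128, or the RISC-V "V" extension) is missing, and to `true` otherwise, so code guarded by it is discarded entirely on scalar-only targets. A minimal sketch of that dispatch pattern (the `contains_byte` procedure is hypothetical, not part of this patch):

package example

import "base:intrinsics"
import "base:runtime"

contains_byte :: proc "contextless" (data: []u8, c: byte) -> bool {
	when runtime.HAS_HARDWARE_SIMD {
		// Wide path: scan 16 bytes per iteration, then fall back to a
		// scalar loop for the tail.
		needle: #simd[16]u8 = c
		i := 0
		for ; i + 16 <= len(data); i += 16 {
			v  := intrinsics.unaligned_load(cast(^#simd[16]u8)&data[i])
			eq := intrinsics.simd_lanes_eq(v, needle)
			if intrinsics.simd_reduce_or(eq) != 0 {
				return true
			}
		}
		for ; i < len(data); i += 1 {
			if data[i] == c {
				return true
			}
		}
		return false
	} else {
		// Scalar-only target: the SIMD branch above is never compiled.
		for b in data {
			if b == c {
				return true
			}
		}
		return false
	}
}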
@@ -229,150 +235,173 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 	case n == 0: return true
 	case x == y: return true
 	}
-	a, b := ([^]byte)(x), ([^]byte)(y)
-	length := uint(n)
+	a, b := cast([^]byte)x, cast([^]byte)y
 
-	for i := uint(0); i < length; i += 1 {
+	n := uint(n)
+	i := uint(0)
+	m := uint(0)
+
+	if n >= 8 {
+		when HAS_HARDWARE_SIMD {
+			// Avoid using 256-bit SIMD on platforms where its emulation is
+			// likely to be less than ideal.
+			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+				m = n / 32 * 32
+				for /**/; i < m; i += 32 {
+					load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+					load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+					ne := intrinsics.simd_lanes_ne(load_a, load_b)
+					if intrinsics.simd_reduce_or(ne) != 0 {
+						return false
+					}
+				}
+			}
+		}
+
+		m = (n-i) / 16 * 16
+		for /**/; i < m; i += 16 {
+			load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+			load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+			ne := intrinsics.simd_lanes_ne(load_a, load_b)
+			if intrinsics.simd_reduce_or(ne) != 0 {
+				return false
+			}
+		}
+
+		m = (n-i) / 8 * 8
+		for /**/; i < m; i += 8 {
+			if intrinsics.unaligned_load(cast(^uintptr)&a[i]) != intrinsics.unaligned_load(cast(^uintptr)&b[i]) {
+				return false
+			}
+		}
+	}
+
+	for /**/; i < n; i += 1 {
 		if a[i] != b[i] {
 			return false
 		}
 	}
 	return true
-
-	/*
-	when size_of(uint) == 8 {
-		if word_length := length >> 3; word_length != 0 {
-			for _ in 0..<word_length {
-				if intrinsics.unaligned_load((^u64)(a)) != intrinsics.unaligned_load((^u64)(b)) {
-					return false
-				}
-				a = a[size_of(u64):]
-				b = b[size_of(u64):]
-			}
-		}
-
-		if length & 4 != 0 {
-			if intrinsics.unaligned_load((^u32)(a)) != intrinsics.unaligned_load((^u32)(b)) {
-				return false
-			}
-			a = a[size_of(u32):]
-			b = b[size_of(u32):]
-		}
-
-		if length & 2 != 0 {
-			if intrinsics.unaligned_load((^u16)(a)) != intrinsics.unaligned_load((^u16)(b)) {
-				return false
-			}
-			a = a[size_of(u16):]
-			b = b[size_of(u16):]
-		}
-
-		if length & 1 != 0 && a[0] != b[0] {
-			return false
-		}
-		return true
-	} else {
-		if word_length := length >> 2; word_length != 0 {
-			for _ in 0..<word_length {
-				if intrinsics.unaligned_load((^u32)(a)) != intrinsics.unaligned_load((^u32)(b)) {
-					return false
-				}
-				a = a[size_of(u32):]
-				b = b[size_of(u32):]
-			}
-		}
-
-		length &= 3
-
-		if length != 0 {
-			for i in 0..<length {
-				if a[i] != b[i] {
-					return false
-				}
-			}
-		}
-
-		return true
-	}
-	*/
 }
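The structure above is the whole strategy: for inputs of at least 8 bytes, compare in the widest chunks available (32 bytes with AVX2, then 16, then a `uintptr` at a time), with `m = (n-i) / W * W` rounding the remaining length down to a multiple of the chunk width, and finish the tail byte-by-byte. A usage sketch (hypothetical example program, not part of the patch):

package example

import "base:runtime"
import "core:fmt"

main :: proc() {
	x: [100]u8
	y: [100]u8
	fmt.println(runtime.memory_equal(&x[0], &y[0], 100)) // true

	// With 32-byte chunks, 100 = 3*32 + 4, so index 97 lands in the
	// byte-by-byte tail; the result is the same on every path.
	y[97] = 1
	fmt.println(runtime.memory_equal(&x[0], &y[0], 100)) // false
}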
-memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_check {
 	switch {
-	case a == b: return 0
-	case a == nil: return -1
-	case b == nil: return +1
+	case x == y: return 0
+	case x == nil: return -1
+	case y == nil: return +1
 	}
+	a, b := cast([^]byte)x, cast([^]byte)y
 
-	x := uintptr(a)
-	y := uintptr(b)
-	n := uintptr(n)
-
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
-
-	for /**/; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		vb := (^uintptr)(y + curr_block * size_of(uintptr))^
-		if va ~ vb != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				b := (^byte)(y+pos)^
-				if a ~ b != 0 {
-					return -1 if (int(a) - int(b)) < 0 else +1
-				}
-			}
-		}
-	}
-
-	for /**/; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		b := (^byte)(y+offset)^
-		if a ~ b != 0 {
-			return -1 if (int(a) - int(b)) < 0 else +1
-		}
-	}
-
+	n := uint(n)
+	i := uint(0)
+	m := uint(0)
+
+	when HAS_HARDWARE_SIMD {
+		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+			m = n / 32 * 32
+			for /**/; i < m; i += 32 {
+				load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+				load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+				comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+				if intrinsics.simd_reduce_or(comparison) != 0 {
+					sentinel: #simd[32]u8 = u8(0xFF)
+					indices := intrinsics.simd_indices(#simd[32]u8)
+					index_select := intrinsics.simd_select(comparison, indices, sentinel)
+					index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+					return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
+				}
+			}
+		}
+	}
+
+	m = (n-i) / 16 * 16
+	for /**/; i < m; i += 16 {
+		load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+		load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		if intrinsics.simd_reduce_or(comparison) != 0 {
+			sentinel: #simd[16]u8 = u8(0xFF)
+			indices := intrinsics.simd_indices(#simd[16]u8)
+			index_select := intrinsics.simd_select(comparison, indices, sentinel)
+			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
+		}
+	}
+
+	// 64-bit SIMD is faster than using a `uintptr` to detect a difference then
+	// re-iterating with the byte-by-byte loop, at least on AMD64.
+	m = (n-i) / 8 * 8
+	for /**/; i < m; i += 8 {
+		load_a := intrinsics.unaligned_load(cast(^#simd[8]u8)&a[i])
+		load_b := intrinsics.unaligned_load(cast(^#simd[8]u8)&b[i])
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		if intrinsics.simd_reduce_or(comparison) != 0 {
+			sentinel: #simd[8]u8 = u8(0xFF)
+			indices := intrinsics.simd_indices(#simd[8]u8)
+			index_select := intrinsics.simd_select(comparison, indices, sentinel)
+			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
+		}
+	}
+
+	for /**/; i < n; i += 1 {
+		if a[i] ~ b[i] != 0 {
+			return -1 if int(a[i]) - int(b[i]) < 0 else +1
+		}
+	}
 	return 0
 }
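Unlike memory_equal, memory_compare must report where the first difference lies, not just that one exists. That is what the sentinel dance does: `simd_lanes_ne` yields an all-ones lane wherever the bytes differ, `simd_select` keeps the lane's own index where they differ and a 0xFF sentinel (larger than any valid lane index) where they match, and `simd_reduce_min` then produces the index of the earliest mismatch. A standalone sketch of the trick (hypothetical `first_mismatch`, built from the same intrinsics the patch uses):

package example

import "base:intrinsics"
import "core:fmt"

// Index of the first differing byte within one 16-byte block, if any.
first_mismatch :: proc "contextless" (a, b: ^[16]u8) -> (uint, bool) {
	load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)a)
	load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)b)
	ne := intrinsics.simd_lanes_ne(load_a, load_b) // 0xFF where lanes differ, 0x00 where equal.
	if intrinsics.simd_reduce_or(ne) == 0 {
		return 0, false
	}
	sentinel: #simd[16]u8 = u8(0xFF)                // Greater than any lane index (0..15).
	indices := intrinsics.simd_indices(#simd[16]u8) // {0, 1, 2, ..., 15}
	selected := intrinsics.simd_select(ne, indices, sentinel)
	return cast(uint)intrinsics.simd_reduce_min(selected), true
}

main :: proc() {
	x: [16]u8
	y: [16]u8
	y[5] = 1 // Differences at indices 5 and 9; the minimum survives.
	y[9] = 2
	index, found := first_mismatch(&x, &y)
	fmt.println(index, found) // 5 true
}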
 memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
-	x := uintptr(a)
-	n := uintptr(n)
+	n := uint(n)
+	i := uint(0)
+	m := uint(0)
 
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
-
-	for /**/; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		if va ~ 0 != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				if a ~ 0 != 0 {
-					return -1 if int(a) < 0 else +1
-				}
-			}
-		}
-	}
-
-	for /**/; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		if a ~ 0 != 0 {
-			return -1 if int(a) < 0 else +1
-		}
-	}
-
+	// Because we're comparing against zero, we never return -1, as that would
+	// indicate the compared value is less than zero.
+	//
+	// Note that a zero return value here means equality.
+
+	bytes := ([^]u8)(a)
+
+	if n >= 8 {
+		when HAS_HARDWARE_SIMD {
+			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+				scanner32: #simd[32]u8
+				m = n / 32 * 32
+				for /**/; i < m; i += 32 {
+					load := intrinsics.unaligned_load(cast(^#simd[32]u8)&bytes[i])
+					ne := intrinsics.simd_lanes_ne(scanner32, load)
+					if intrinsics.simd_reduce_or(ne) > 0 {
+						return 1
+					}
+				}
+			}
+		}
+
+		scanner16: #simd[16]u8
+		m = (n-i) / 16 * 16
+		for /**/; i < m; i += 16 {
+			load := intrinsics.unaligned_load(cast(^#simd[16]u8)&bytes[i])
+			ne := intrinsics.simd_lanes_ne(scanner16, load)
+			if intrinsics.simd_reduce_or(ne) != 0 {
+				return 1
+			}
+		}
+
+		m = (n-i) / 8 * 8
+		for /**/; i < m; i += 8 {
+			if intrinsics.unaligned_load(cast(^uintptr)&bytes[i]) != 0 {
+				return 1
+			}
+		}
+	}
+
+	for /**/; i < n; i += 1 {
+		if bytes[i] != 0 {
+			return 1
+		}
+	}
+
 	return 0
 }
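As the new comment notes, a zero scan has only two outcomes: 0 when every byte is zero, and +1 when any byte is not, since no byte can compare below zero. A usage sketch (hypothetical example program, not part of the patch):

package example

import "base:runtime"
import "core:fmt"

main :: proc() {
	buf: [64]u8
	fmt.println(runtime.memory_compare_zero(&buf[0], len(buf))) // 0: all bytes are zero

	buf[40] = 0xFF
	fmt.println(runtime.memory_compare_zero(&buf[0], len(buf))) // 1: a nonzero byte exists
}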
@@ -350,7 +350,7 @@ index_byte :: proc "contextless" (s: []byte, c: byte) -> (index: int) #no_bounds
 	}
 
 	c_vec: simd.u8x16 = c
-	when !simd.IS_EMULATED {
+	when simd.HAS_HARDWARE_SIMD {
 		// Note: While this is something that could also logically take
 		// advantage of AVX512, the various downclocking and power
 		// consumption related woes make it premature to have a dedicated
@@ -485,7 +485,7 @@ last_index_byte :: proc "contextless" (s: []byte, c: byte) -> int #no_bounds_che
 	}
 
 	c_vec: simd.u8x16 = c
-	when !simd.IS_EMULATED {
+	when simd.HAS_HARDWARE_SIMD {
 		// Note: While this is something that could also logically take
 		// advantage of AVX512, the various downclocking and power
 		// consumption related woes make it premature to have a dedicated
@@ -39,7 +39,7 @@ when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
 
 	// Some targets lack runtime feature detection, and will flat out refuse
 	// to load binaries that have unknown instructions. This is distinct from
-	// `simd.IS_EMULATED` as actually good designs support runtime feature
+	// `simd.HAS_HARDWARE_SIMD` as actually good designs support runtime feature
 	// detection and that constant establishes a baseline.
 	//
 	// See:
@@ -21,20 +21,17 @@ package simd
 import "base:builtin"
 import "base:intrinsics"
+import "base:runtime"
 
 /*
 Check if SIMD is software-emulated on a target platform.
 
-This value is `false`, when the compile-time target has the hardware support for
-at 128-bit (or wider) SIMD. If the compile-time target lacks the hardware support
-for 128-bit SIMD, this value is `true`, and all SIMD operations will likely be
+This value is `true`, when the compile-time target has the hardware support for
+at least 128-bit (or wider) SIMD. If the compile-time target lacks the hardware support
+for 128-bit SIMD, this value is `false`, and all SIMD operations will likely be
 emulated.
 */
-IS_EMULATED :: true when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
-	true when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
-	true when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
-	true when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
-	false
+HAS_HARDWARE_SIMD :: runtime.HAS_HARDWARE_SIMD
 
 /*
 Vector of 16 `u8` lanes (128 bits).
@@ -54,14 +54,15 @@ run_trial_size :: proc(p: proc "contextless" ([]u8, byte) -> int, size: int, idx
 
 	accumulator: int
 
-	for _ in 0..<runs {
-		start := time.now()
-		accumulator += p(data, 'z')
-		done := time.since(start)
-		timing += done
-	}
+	watch: time.Stopwatch
 
-	timing /= time.Duration(runs)
+	time.stopwatch_start(&watch)
+	for _ in 0..<runs {
+		accumulator += p(data, 'z')
+	}
+	time.stopwatch_stop(&watch)
+
+	timing = time.stopwatch_duration(watch)
 
 	log.debug(accumulator)
 	return
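The reworked trial brackets the whole batch with one stopwatch instead of calling time.now and time.since once per iteration, so clock-read overhead no longer pollutes the figures. The pattern in isolation (hypothetical example program, not part of the patch):

package example

import "core:fmt"
import "core:time"

main :: proc() {
	watch: time.Stopwatch

	acc := 0
	time.stopwatch_start(&watch)
	for i in 0..<1_000_000 {
		acc += i // The work under measurement; the clock is read only twice overall.
	}
	time.stopwatch_stop(&watch)

	fmt.println(time.stopwatch_duration(watch), acc)
}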
tests/benchmark/runtime/benchmark_runtime.odin (new file, 227 lines)
@@ -0,0 +1,227 @@
package benchmark_runtime

import "base:runtime"
import "core:fmt"
import "core:log"
import "core:testing"
import "core:strings"
import "core:text/table"
import "core:time"

RUNS_PER_SIZE :: 2500

sizes := [?]int {
	7, 8, 9,
	15, 16, 17,
	31, 32, 33,
	63, 64, 65,
	95, 96, 97,
	128,
	256,
	512,
	1024,
	4096,
	1024 * 1024,
}

// These are the normal, unoptimized algorithms.

plain_memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
	switch {
	case n == 0: return true
	case x == y: return true
	}
	a, b := ([^]byte)(x), ([^]byte)(y)
	length := uint(n)

	for i := uint(0); i < length; i += 1 {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}

plain_memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
	switch {
	case a == b: return 0
	case a == nil: return -1
	case b == nil: return +1
	}

	x := uintptr(a)
	y := uintptr(b)
	n := uintptr(n)

	SU :: size_of(uintptr)
	fast := n/SU + 1
	offset := (fast-1)*SU
	curr_block := uintptr(0)
	if n < SU {
		fast = 0
	}

	for /**/; curr_block < fast; curr_block += 1 {
		va := (^uintptr)(x + curr_block * size_of(uintptr))^
		vb := (^uintptr)(y + curr_block * size_of(uintptr))^
		if va ~ vb != 0 {
			for pos := curr_block*SU; pos < n; pos += 1 {
				a := (^byte)(x+pos)^
				b := (^byte)(y+pos)^
				if a ~ b != 0 {
					return -1 if (int(a) - int(b)) < 0 else +1
				}
			}
		}
	}

	for /**/; offset < n; offset += 1 {
		a := (^byte)(x+offset)^
		b := (^byte)(y+offset)^
		if a ~ b != 0 {
			return -1 if (int(a) - int(b)) < 0 else +1
		}
	}

	return 0
}

plain_memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
	x := uintptr(a)
	n := uintptr(n)

	SU :: size_of(uintptr)
	fast := n/SU + 1
	offset := (fast-1)*SU
	curr_block := uintptr(0)
	if n < SU {
		fast = 0
	}

	for /**/; curr_block < fast; curr_block += 1 {
		va := (^uintptr)(x + curr_block * size_of(uintptr))^
		if va ~ 0 != 0 {
			for pos := curr_block*SU; pos < n; pos += 1 {
				a := (^byte)(x+pos)^
				if a ~ 0 != 0 {
					return -1 if int(a) < 0 else +1
				}
			}
		}
	}

	for /**/; offset < n; offset += 1 {
		a := (^byte)(x+offset)^
		if a ~ 0 != 0 {
			return -1 if int(a) < 0 else +1
		}
	}

	return 0
}

run_trial_size_cmp :: proc(p: proc "contextless" (rawptr, rawptr, int) -> $R, size: int, idx: int, runs: int, loc := #caller_location) -> (timing: time.Duration) {
	left := make([]u8, size)
	right := make([]u8, size)
	defer {
		delete(left)
		delete(right)
	}

	right[idx] = 0x01

	accumulator: int

	watch: time.Stopwatch

	time.stopwatch_start(&watch)
	for _ in 0..<runs {
		result := p(&left[0], &right[0], size)
		when R == bool {
			assert(result == false, loc = loc)
			accumulator += 1
		} else when R == int {
			assert(result == -1, loc = loc)
			accumulator += result
		}
	}
	time.stopwatch_stop(&watch)
	timing = time.stopwatch_duration(watch)

	log.debug(accumulator)
	return
}

run_trial_size_zero :: proc(p: proc "contextless" (rawptr, int) -> int, size: int, idx: int, runs: int, loc := #caller_location) -> (timing: time.Duration) {
	data := make([]u8, size)
	defer delete(data)

	data[idx] = 0x01

	accumulator: int

	watch: time.Stopwatch

	time.stopwatch_start(&watch)
	for _ in 0..<runs {
		result := p(&data[0], size)
		assert(result == 1, loc = loc)
		accumulator += result
	}
	time.stopwatch_stop(&watch)
	timing = time.stopwatch_duration(watch)

	log.debug(accumulator)
	return
}

run_trial_size :: proc {
	run_trial_size_cmp,
	run_trial_size_zero,
}

bench_table :: proc(algo_name: string, plain, simd: $P) {
	string_buffer := strings.builder_make()
	defer strings.builder_destroy(&string_buffer)

	tbl: table.Table
	table.init(&tbl)
	defer table.destroy(&tbl)

	table.aligned_header_of_values(&tbl, .Right, "Algorithm", "Size", "Iterations", "Scalar", "SIMD", "SIMD Relative (%)", "SIMD Relative (x)")

	for size in sizes {
		// Place the non-zero byte somewhere in the middle.
		needle_index := size / 2

		plain_timing := run_trial_size(plain, size, needle_index, RUNS_PER_SIZE)
		simd_timing := run_trial_size(simd, size, needle_index, RUNS_PER_SIZE)

		_plain := fmt.tprintf("%8M", plain_timing)
		_simd := fmt.tprintf("%8M", simd_timing)
		_relp := fmt.tprintf("%.3f %%", f64(simd_timing) / f64(plain_timing) * 100.0)
		_relx := fmt.tprintf("%.3f x", 1 / (f64(simd_timing) / f64(plain_timing)))

		table.aligned_row_of_values(
			&tbl,
			.Right,
			algo_name,
			size, RUNS_PER_SIZE, _plain, _simd, _relp, _relx)
	}

	builder_writer := strings.to_writer(&string_buffer)

	fmt.sbprintln(&string_buffer)
	table.write_plain_table(builder_writer, &tbl)

	my_table_string := strings.to_string(string_buffer)
	log.info(my_table_string)
}

@test
benchmark_memory_procs :: proc(t: ^testing.T) {
	bench_table("memory_equal", plain_memory_equal, runtime.memory_equal)
	bench_table("memory_compare", plain_memory_compare, runtime.memory_compare)
	bench_table("memory_compare_zero", plain_memory_compare_zero, runtime.memory_compare_zero)
}
@@ -4,6 +4,7 @@ package test_core_runtime
 import "base:intrinsics"
 import "core:mem"
+import "base:runtime"
 import "core:slice"
 import "core:testing"
 
 // Tests that having space for the allocation, but not for the allocation and alignment
@@ -177,3 +178,78 @@ test_map_get :: proc(t: ^testing.T) {
 		check(t, m)
 	}
 }
+
+@(test)
+test_memory_equal :: proc(t: ^testing.T) {
+	data: [256]u8
+	cmp: [256]u8
+
+	slice.fill(data[:], 0xAA)
+	slice.fill(cmp[:], 0xAA)
+
+	for offset in 0..<len(data) {
+		subdata := data[offset:]
+		subcmp := cmp[offset:]
+		for idx in 0..<len(subdata) {
+			if !testing.expect_value(t, runtime.memory_equal(&subdata[0], &subcmp[0], len(subdata)), true) {
+				return
+			}
+
+			subcmp[idx] = 0x55
+			if !testing.expect_value(t, runtime.memory_equal(&subdata[0], &subcmp[0], len(subdata)), false) {
+				return
+			}
+			subcmp[idx] = 0xAA
+		}
+	}
+}
+
+@(test)
+test_memory_compare :: proc(t: ^testing.T) {
+	data: [256]u8
+	cmp: [256]u8
+
+	for offset in 0..<len(data) {
+		subdata := data[offset:]
+		subcmp := cmp[offset:]
+		for idx in 0..<len(subdata) {
+			if !testing.expect_value(t, runtime.memory_compare(&subdata[0], &subcmp[0], len(subdata)), 0) {
+				return
+			}
+
+			subdata[idx] = 0x7F
+			subcmp[idx] = 0xFF
+			if !testing.expect_value(t, runtime.memory_compare(&subdata[0], &subcmp[0], len(subdata)), -1) {
+				return
+			}
+
+			subdata[idx] = 0xFF
+			subcmp[idx] = 0x7F
+			if !testing.expect_value(t, runtime.memory_compare(&subdata[0], &subcmp[0], len(subdata)), 1) {
+				return
+			}
+
+			subdata[idx] = 0
+			subcmp[idx] = 0
+		}
+	}
+}
+
+@(test)
+test_memory_compare_zero :: proc(t: ^testing.T) {
+	data: [256]u8
+
+	for offset in 0..<len(data) {
+		subdata := data[offset:]
+		for idx in 0..<len(subdata) {
+			if !testing.expect_value(t, runtime.memory_compare_zero(&subdata[0], len(subdata)), 0) {
+				return
+			}
+			subdata[idx] = 0xFF
+			if !testing.expect_value(t, runtime.memory_compare_zero(&subdata[0], len(subdata)), 1) {
+				return
+			}
+			subdata[idx] = 0
+		}
+	}
+}
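The tests slide both the slice start (`offset`) and the mutated position (`idx`) across a 256-byte buffer, so every alignment and every chunk boundary (the 32-, 16-, and 8-byte paths plus the scalar tail) is exercised. Note the 0x7F/0xFF pairing: it confirms the comparison is unsigned and byte-wise, since a signed reading would order 0xFF first. A usage sketch of the asserted semantics (hypothetical example program, not part of the patch):

package example

import "base:runtime"
import "core:fmt"

main :: proc() {
	a := [8]u8{0, 0, 0, 0x7F, 0, 0, 0, 0}
	b := [8]u8{0, 0, 0, 0xFF, 0, 0, 0, 0}

	// Bytes compare unsigned: 0x7F < 0xFF, so `a` orders first.
	fmt.println(runtime.memory_compare(&a[0], &b[0], 8)) // -1
	fmt.println(runtime.memory_compare(&b[0], &a[0], 8)) // +1
	fmt.println(runtime.memory_compare(&a[0], &a[0], 8)) //  0
}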