diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index 37cc19ebd..385e24785 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -1759,7 +1759,7 @@ Returns:
 replace :: intrinsics.simd_replace
 
 /*
-Reduce a vector to a scalar by adding up all the lanes.
+Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
 
 This procedure returns a scalar that is the ordered sum of all lanes.
 The ordered sum may be important for accounting for precision errors in
@@ -2510,3 +2510,457 @@ Example:
 recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
 	return T(1) / v
 }
+
+/*
+Creates a vector where each lane contains the index of that lane.
+
+Inputs:
+- `V`: The type of the vector to create.
+
+Result:
+- A vector of the given type, where each lane contains the index of that lane.
+
+**Operation**:
+
+	for i in 0 ..< N {
+		res[i] = i
+	}
+*/
+indices :: #force_inline proc "contextless" ($V: typeid/#simd[$N]$E) -> V where intrinsics.type_is_numeric(E) {
+	when N == 1 {
+		return {0}
+	} else when N == 2 {
+		return {0, 1}
+	} else when N == 4 {
+		return {0, 1, 2, 3}
+	} else when N == 8 {
+		return {0, 1, 2, 3, 4, 5, 6, 7}
+	} else when N == 16 {
+		return {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+	} else when N == 32 {
+		return {
+			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+		}
+	} else when N == 64 {
+		return {
+			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+			32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+			48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+		}
+	} else {
+		#panic("Unsupported vector size!")
+	}
+}
+
+/*
+Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+adding each even-numbered element with the following odd-numbered element. This
+is repeated until only a single element remains. This order is supported by
+hardware instructions for some types/architectures (e.g. i16/i32/f32/f64 on x86
+SSE, i8/i16/i32/f32 on ARM NEON).
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] + a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	        +-----------------------+
+	     v: | v0  |  v1 |  v2 |  v3 |
+	        +-----------------------+
+	           |     |     |     |
+	           `>[+]<'     `>[+]<'
+	              |           |
+	              `--->[+]<---'
+	                    |
+	                    v
+	                 +-----+
+	        result:  | y0  |
+	                 +-----+
+*/
+reduce_add_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8  := v }
+	when N == 4  { v4  := v }
+	when N == 2  { v2  := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0, 2, 4, 6, 8, 10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30,
+			32, 34, 36, 38, 40, 42, 44, 46,
+			48, 50, 52, 54, 56, 58, 60, 62)
+		y32 := swizzle(v64,
+			1, 3, 5, 7, 9, 11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31,
+			33, 35, 37, 39, 41, 43, 45, 47,
+			49, 51, 53, 55, 57, 59, 61, 63)
+		// NOTE: this is a sum reduction; each step must add, not multiply.
+		v32 := x32 + y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0, 2, 4, 6, 8, 10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30)
+		y16 := swizzle(v32,
+			1, 3, 5, 7, 9, 11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31)
+		v16 := x16 + y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
+		y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
+		v8 := x8 + y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 2, 4, 6)
+		y4 := swizzle(v8, 1, 3, 5, 7)
+		v4 := x4 + y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 2)
+		y2 := swizzle(v4, 1, 3)
+		v2 := x2 + y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) + extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+
+/*
+Reduce a vector to a scalar by adding up all the lanes in a binary fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+splitting the vector in two parts and adding the two halves together
+element-wise. This is repeated until only a single element remains. This order
+will typically be faster to compute than the ordered sum for floats, as it can
+be better parallelized.
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] += a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	        +-----------------------+
+	        | v0  |  v1 |  v2 |  v3 |
+	        +-----------------------+
+	           |     |     |     |
+	          [+]<---|-----'     |
+	           |    [+]<---------'
+	           |     |
+	           `>[+]<'
+	              |
+	              v
+	           +-----+
+	   result: | y0  |
+	           +-----+
+*/
+reduce_add_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8  := v }
+	when N == 4  { v4  := v }
+	when N == 2  { v2  := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0, 1, 2, 3, 4, 5, 6, 7,
+			8, 9, 10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		y32 := swizzle(v64,
+			32, 33, 34, 35, 36, 37, 38, 39,
+			40, 41, 42, 43, 44, 45, 46, 47,
+			48, 49, 50, 51, 52, 53, 54, 55,
+			56, 57, 58, 59, 60, 61, 62, 63)
+		v32 := x32 + y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0, 1, 2, 3, 4, 5, 6, 7,
+			8, 9, 10, 11, 12, 13, 14, 15)
+		y16 := swizzle(v32,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		v16 := x16 + y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 1, 2, 3, 4, 5, 6, 7)
+		y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
+		v8 := x8 + y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 1, 2, 3)
+		y4 := swizzle(v8, 4, 5, 6, 7)
+		v4 := x4 + y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 1)
+		y2 := swizzle(v4, 2, 3)
+		v2 := x2 + y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) + extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+
+/*
+Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+multiplying each even-numbered element with the following odd-numbered element.
+This is repeated until only a single element remains. This order may be faster
+to compute than the ordered product for floats, as it can be better
+parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] * a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	        +-----------------------+
+	     v: | v0  |  v1 |  v2 |  v3 |
+	        +-----------------------+
+	           |     |     |     |
+	           `>[x]<'     `>[x]<'
+	              |           |
+	              `--->[x]<---'
+	                    |
+	                    v
+	                 +-----+
+	        result:  | y0  |
+	                 +-----+
+*/
+reduce_mul_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8  := v }
+	when N == 4  { v4  := v }
+	when N == 2  { v2  := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0, 2, 4, 6, 8, 10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30,
+			32, 34, 36, 38, 40, 42, 44, 46,
+			48, 50, 52, 54, 56, 58, 60, 62)
+		y32 := swizzle(v64,
+			1, 3, 5, 7, 9, 11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31,
+			33, 35, 37, 39, 41, 43, 45, 47,
+			49, 51, 53, 55, 57, 59, 61, 63)
+		v32 := x32 * y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0, 2, 4, 6, 8, 10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30)
+		y16 := swizzle(v32,
+			1, 3, 5, 7, 9, 11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31)
+		v16 := x16 * y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
+		y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
+		v8 := x8 * y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 2, 4, 6)
+		y4 := swizzle(v8, 1, 3, 5, 7)
+		v4 := x4 * y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 2)
+		y2 := swizzle(v4, 1, 3)
+		v2 := x2 * y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) * extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+
+/*
+Reduce a vector to a scalar by multiplying all the lanes in a binary fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+splitting the vector in two parts and multiplying the two halves together
+element-wise. This is repeated until only a single element remains. This order
+may be faster to compute than the ordered product for floats, as it can be
+better parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] *= a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	        +-----------------------+
+	        | v0  |  v1 |  v2 |  v3 |
+	        +-----------------------+
+	           |     |     |     |
+	          [x]<---|-----'     |
+	           |    [x]<---------'
+	           |     |
+	           `>[x]<'
+	              |
+	              v
+	           +-----+
+	   result: | y0  |
+	           +-----+
+*/
+reduce_mul_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8  := v }
+	when N == 4  { v4  := v }
+	when N == 2  { v2  := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0, 1, 2, 3, 4, 5, 6, 7,
+			8, 9, 10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		y32 := swizzle(v64,
+			32, 33, 34, 35, 36, 37, 38, 39,
+			40, 41, 42, 43, 44, 45, 46, 47,
+			48, 49, 50, 51, 52, 53, 54, 55,
+			56, 57, 58, 59, 60, 61, 62, 63)
+		v32 := x32 * y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0, 1, 2, 3, 4, 5, 6, 7,
+			8, 9, 10, 11, 12, 13, 14, 15)
+		y16 := swizzle(v32,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		v16 := x16 * y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 1, 2, 3, 4, 5, 6, 7)
+		y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
+		v8 := x8 * y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 1, 2, 3)
+		y4 := swizzle(v8, 4, 5, 6, 7)
+		v4 := x4 * y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 1)
+		y2 := swizzle(v4, 2, 3)
+		v2 := x2 * y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) * extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}