mirror of
https://github.com/odin-lang/Odin.git
synced 2026-04-21 05:45:19 +00:00
Merge pull request #5108 from Barinzaya/core-simd-indices-redadd-redmul
Alternate `reduce_add`/`reduce_mul` intrinsics
This commit is contained in:
@@ -274,8 +274,12 @@ simd_lanes_ge :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
|
||||
simd_extract :: proc(a: #simd[N]T, idx: uint) -> T ---
|
||||
simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T ---
|
||||
|
||||
simd_reduce_add_bisect :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
|
||||
simd_reduce_mul_bisect :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
|
||||
simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
|
||||
simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
|
||||
simd_reduce_add_pairs :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
|
||||
simd_reduce_mul_pairs :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
|
||||
simd_reduce_min :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
|
||||
simd_reduce_max :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
|
||||
simd_reduce_and :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
|
||||
|
||||
@@ -1759,7 +1759,103 @@ Returns:
|
||||
replace :: intrinsics.simd_replace
|
||||
|
||||
/*
|
||||
Reduce a vector to a scalar by adding up all the lanes.
|
||||
Reduce a vector to a scalar by adding up all the lanes in a bisecting fashion.
|
||||
|
||||
This procedure returns a scalar that is the sum of all lanes, calculated by
|
||||
bisecting the vector into two parts, where the first contains lanes [0, N/2)
|
||||
and the second contains lanes [N/2, N), and adding the two halves element-wise
|
||||
to produce N/2 values. This is repeated until only a single element remains.
|
||||
This order may be faster to compute than the ordered sum for floats, as it can
|
||||
often be better parallelized.
|
||||
|
||||
The order of the sum may be important for accounting for precision errors in
|
||||
floating-point computation, as floating-point addition is not associative, that
|
||||
is `(a+b)+c` may not be equal to `a+(b+c)`.
|
||||
|
||||
Inputs:
|
||||
- `v`: The vector to reduce.
|
||||
|
||||
Result:
|
||||
- Sum of all lanes, as a scalar.
|
||||
|
||||
**Operation**:
|
||||
|
||||
for n > 1 {
|
||||
n = n / 2
|
||||
for i in 0 ..< n {
|
||||
a[i] += a[i+n]
|
||||
}
|
||||
}
|
||||
res := a[0]
|
||||
|
||||
Graphical representation of the operation for N=4:
|
||||
|
||||
+-----------------------+
|
||||
| v0 | v1 | v2 | v3 |
|
||||
+-----------------------+
|
||||
| | | |
|
||||
[+]<-- | ---' |
|
||||
| [+]<--------'
|
||||
| |
|
||||
`>[+]<'
|
||||
|
|
||||
v
|
||||
+-----+
|
||||
result: | y0 |
|
||||
+-----+
|
||||
*/
|
||||
reduce_add_bisect :: intrinsics.simd_reduce_add_bisect
|
||||
|
||||
/*
|
||||
Reduce a vector to a scalar by multiplying all the lanes in a bisecting fashion.
|
||||
|
||||
This procedure returns a scalar that is the product of all lanes, calculated by
|
||||
bisecting the vector into two parts, where the first contains lanes [0, N/2)
|
||||
and the second contains lanes [N/2, N), and multiplying the two halves
|
||||
together element-wise to produce N/2 values. This is repeated until only a
|
||||
single element remains. This order may be faster to compute than the ordered
|
||||
product for floats, as it can often be better parallelized.
|
||||
|
||||
The order of the product may be important for accounting for precision errors
|
||||
in floating-point computation, as floating-point multiplication is not
|
||||
associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
|
||||
|
||||
Inputs:
|
||||
- `v`: The vector to reduce.
|
||||
|
||||
Result:
|
||||
- Product of all lanes, as a scalar.
|
||||
|
||||
**Operation**:
|
||||
|
||||
for n > 1 {
|
||||
n = n / 2
|
||||
for i in 0 ..< n {
|
||||
a[i] *= a[i+n]
|
||||
}
|
||||
}
|
||||
res := a[0]
|
||||
|
||||
Graphical representation of the operation for N=4:
|
||||
|
||||
+-----------------------+
|
||||
| v0 | v1 | v2 | v3 |
|
||||
+-----------------------+
|
||||
| | | |
|
||||
[x]<-- | ---' |
|
||||
| [x]<--------'
|
||||
| |
|
||||
`>[x]<'
|
||||
|
|
||||
v
|
||||
+-----+
|
||||
result: | y0 |
|
||||
+-----+
|
||||
*/
|
||||
reduce_mul_bisect :: intrinsics.simd_reduce_mul_bisect
|
||||
|
||||
/*
|
||||
Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
|
||||
|
||||
This procedure returns a scalar that is the ordered sum of all lanes. The
|
||||
ordered sum may be important for accounting for precision errors in
|
||||
@@ -1782,7 +1878,7 @@ Result:
|
||||
reduce_add_ordered :: intrinsics.simd_reduce_add_ordered
|
||||
|
||||
/*
|
||||
Reduce a vector to a scalar by multiplying all the lanes.
|
||||
Reduce a vector to a scalar by multiplying all the lanes in an ordered fashion.
|
||||
|
||||
This procedure returns a scalar that is the ordered product of all lanes.
|
||||
The ordered product may be important for accounting for precision errors in
|
||||
@@ -1804,6 +1900,100 @@ Result:
|
||||
*/
|
||||
reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered
|
||||
|
||||
/*
|
||||
Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
|
||||
|
||||
This procedure returns a scalar that is the sum of all lanes, calculated by
|
||||
adding each even-indexed element with the following odd-indexed element to
|
||||
produce N/2 values. This is repeated until only a single element remains. This
|
||||
order is supported by hardware instructions for some types/architectures (e.g.
|
||||
i16/i32/f32/f64 on x86 SSE, i8/i16/i32/f32 on ARM NEON).
|
||||
|
||||
The order of the sum may be important for accounting for precision errors in
|
||||
floating-point computation, as floating-point addition is not associative, that
|
||||
is `(a+b)+c` may not be equal to `a+(b+c)`.
|
||||
|
||||
Inputs:
|
||||
- `v`: The vector to reduce.
|
||||
|
||||
Result:
|
||||
- Sum of all lanes, as a scalar.
|
||||
|
||||
**Operation**:
|
||||
|
||||
for n > 1 {
|
||||
n = n / 2
|
||||
for i in 0 ..< n {
|
||||
a[i] = a[2*i+0] + a[2*i+1]
|
||||
}
|
||||
}
|
||||
res := a[0]
|
||||
|
||||
Graphical representation of the operation for N=4:
|
||||
|
||||
+-----------------------+
|
||||
v: | v0 | v1 | v2 | v3 |
|
||||
+-----------------------+
|
||||
| | | |
|
||||
`>[+]<' `>[+]<'
|
||||
| |
|
||||
`--->[+]<--'
|
||||
|
|
||||
v
|
||||
+-----+
|
||||
result: | y0 |
|
||||
+-----+
|
||||
*/
|
||||
reduce_add_pairs :: intrinsics.simd_reduce_add_pairs
|
||||
|
||||
/*
|
||||
Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
|
||||
|
||||
This procedure returns a scalar that is the product of all lanes, calculated by
|
||||
multiplying each even-indexed element with the following odd-indexed element to
|
||||
produce N/2 values. This is repeated until only a single element remains. This
|
||||
order may be faster to compute than the ordered product for floats, as it can
|
||||
often be better parallelized.
|
||||
|
||||
The order of the product may be important for accounting for precision errors
|
||||
in floating-point computation, as floating-point multiplication is not
|
||||
associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
|
||||
|
||||
Inputs:
|
||||
- `v`: The vector to reduce.
|
||||
|
||||
Result:
|
||||
- Product of all lanes, as a scalar.
|
||||
|
||||
**Operation**:
|
||||
|
||||
for n > 1 {
|
||||
n = n / 2
|
||||
for i in 0 ..< n {
|
||||
a[i] = a[2*i+0] * a[2*i+1]
|
||||
}
|
||||
}
|
||||
res := a[0]
|
||||
|
||||
Graphical representation of the operation for N=4:
|
||||
|
||||
+-----------------------+
|
||||
v: | v0 | v1 | v2 | v3 |
|
||||
+-----------------------+
|
||||
| | | |
|
||||
`>[x]<' `>[x]<'
|
||||
| |
|
||||
`--->[x]<--'
|
||||
|
|
||||
v
|
||||
+-----+
|
||||
result: | y0 |
|
||||
+-----+
|
||||
*/
|
||||
reduce_mul_pairs :: intrinsics.simd_reduce_mul_pairs
|
||||
|
||||
/*
|
||||
Reduce a vector to a scalar by finding the minimum value between all of the lanes.
|
||||
|
||||
|
||||
@@ -853,8 +853,12 @@ gb_internal bool check_builtin_simd_operation(CheckerContext *c, Operand *operan
|
||||
}
|
||||
break;
|
||||
|
||||
case BuiltinProc_simd_reduce_add_bisect:
|
||||
case BuiltinProc_simd_reduce_mul_bisect:
|
||||
case BuiltinProc_simd_reduce_add_ordered:
|
||||
case BuiltinProc_simd_reduce_mul_ordered:
|
||||
case BuiltinProc_simd_reduce_add_pairs:
|
||||
case BuiltinProc_simd_reduce_mul_pairs:
|
||||
case BuiltinProc_simd_reduce_min:
|
||||
case BuiltinProc_simd_reduce_max:
|
||||
{
|
||||
|
||||
@@ -170,8 +170,12 @@ BuiltinProc__simd_begin,
|
||||
BuiltinProc_simd_extract,
|
||||
BuiltinProc_simd_replace,
|
||||
|
||||
BuiltinProc_simd_reduce_add_bisect,
|
||||
BuiltinProc_simd_reduce_mul_bisect,
|
||||
BuiltinProc_simd_reduce_add_ordered,
|
||||
BuiltinProc_simd_reduce_mul_ordered,
|
||||
BuiltinProc_simd_reduce_add_pairs,
|
||||
BuiltinProc_simd_reduce_mul_pairs,
|
||||
BuiltinProc_simd_reduce_min,
|
||||
BuiltinProc_simd_reduce_max,
|
||||
BuiltinProc_simd_reduce_and,
|
||||
@@ -518,8 +522,12 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
|
||||
{STR_LIT("simd_extract"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
|
||||
{STR_LIT("simd_replace"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
|
||||
|
||||
{STR_LIT("simd_reduce_add_bisect"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
|
||||
{STR_LIT("simd_reduce_mul_bisect"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
|
||||
{STR_LIT("simd_reduce_add_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
|
||||
{STR_LIT("simd_reduce_mul_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
|
||||
{STR_LIT("simd_reduce_add_pairs"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
|
||||
{STR_LIT("simd_reduce_mul_pairs"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
|
||||
{STR_LIT("simd_reduce_min"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
|
||||
{STR_LIT("simd_reduce_max"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
|
||||
{STR_LIT("simd_reduce_and"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
|
||||
|
||||
@@ -1495,6 +1495,38 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
|
||||
res.value = LLVMBuildInsertElement(p->builder, arg0.value, arg2.value, arg1.value, "");
|
||||
return res;
|
||||
|
||||
case BuiltinProc_simd_reduce_add_bisect:
|
||||
case BuiltinProc_simd_reduce_mul_bisect:
|
||||
{
|
||||
GB_ASSERT(arg0.type->kind == Type_SimdVector);
|
||||
i64 num_elems = arg0.type->SimdVector.count;
|
||||
|
||||
LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems);
|
||||
for (i64 i = 0; i < num_elems; i++) {
|
||||
indices[i] = lb_const_int(m, t_uint, cast(u64)i).value;
|
||||
}
|
||||
|
||||
switch (builtin_id) {
|
||||
case BuiltinProc_simd_reduce_add_bisect: op_code = is_float ? LLVMFAdd : LLVMAdd; break;
|
||||
case BuiltinProc_simd_reduce_mul_bisect: op_code = is_float ? LLVMFMul : LLVMMul; break;
|
||||
}
|
||||
|
||||
LLVMValueRef remaining = arg0.value;
|
||||
i64 num_remaining = num_elems;
|
||||
|
||||
while (num_remaining > 1) {
|
||||
num_remaining /= 2;
|
||||
LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining);
|
||||
LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, "");
|
||||
LLVMValueRef right_indices = LLVMConstVector(&indices[num_remaining], cast(unsigned)num_remaining);
|
||||
LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, "");
|
||||
remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, "");
|
||||
}
|
||||
|
||||
res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], "");
|
||||
return res;
|
||||
}
|
||||
|
||||
case BuiltinProc_simd_reduce_add_ordered:
|
||||
case BuiltinProc_simd_reduce_mul_ordered:
|
||||
{
|
||||
@@ -1527,6 +1559,40 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
|
||||
res.value = lb_call_intrinsic(p, name, args, cast(unsigned)args_count, types, gb_count_of(types));
|
||||
return res;
|
||||
}
|
||||
|
||||
case BuiltinProc_simd_reduce_add_pairs:
|
||||
case BuiltinProc_simd_reduce_mul_pairs:
|
||||
{
|
||||
GB_ASSERT(arg0.type->kind == Type_SimdVector);
|
||||
i64 num_elems = arg0.type->SimdVector.count;
|
||||
|
||||
LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems);
|
||||
for (i64 i = 0; i < num_elems/2; i++) {
|
||||
indices[i] = lb_const_int(m, t_uint, cast(u64)(2*i)).value;
|
||||
indices[i+num_elems/2] = lb_const_int(m, t_uint, cast(u64)(2*i+1)).value;
|
||||
}
|
||||
|
||||
switch (builtin_id) {
|
||||
case BuiltinProc_simd_reduce_add_pairs: op_code = is_float ? LLVMFAdd : LLVMAdd; break;
|
||||
case BuiltinProc_simd_reduce_mul_pairs: op_code = is_float ? LLVMFMul : LLVMMul; break;
|
||||
}
|
||||
|
||||
LLVMValueRef remaining = arg0.value;
|
||||
i64 num_remaining = num_elems;
|
||||
|
||||
while (num_remaining > 1) {
|
||||
num_remaining /= 2;
|
||||
LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining);
|
||||
LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, "");
|
||||
LLVMValueRef right_indices = LLVMConstVector(&indices[num_elems/2], cast(unsigned)num_remaining);
|
||||
LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, "");
|
||||
remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, "");
|
||||
}
|
||||
|
||||
res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], "");
|
||||
return res;
|
||||
}
|
||||
|
||||
case BuiltinProc_simd_reduce_min:
|
||||
case BuiltinProc_simd_reduce_max:
|
||||
case BuiltinProc_simd_reduce_and:
|
||||
|
||||
Reference in New Issue
Block a user