Merge pull request #5108 from Barinzaya/core-simd-indices-redadd-redmul

Alternate `reduce_add`/`reduce_mul` intrinsics
This commit is contained in:
gingerBill
2025-05-06 15:46:49 +01:00
committed by GitHub
5 changed files with 274 additions and 2 deletions

View File

@@ -274,8 +274,12 @@ simd_lanes_ge :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
simd_extract :: proc(a: #simd[N]T, idx: uint) -> T ---
simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T ---
simd_reduce_add_bisect :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
simd_reduce_mul_bisect :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
simd_reduce_add_pairs :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
simd_reduce_mul_pairs :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
simd_reduce_min :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
simd_reduce_max :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
simd_reduce_and :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---

View File

@@ -1759,7 +1759,103 @@ Returns:
replace :: intrinsics.simd_replace
/*
Reduce a vector to a scalar by adding up all the lanes.
Reduce a vector to a scalar by adding up all the lanes in a bisecting fashion.
This procedure returns a scalar that is the sum of all lanes, calculated by
bisecting the vector into two parts, where the first contains lanes [0, N/2)
and the second contains lanes [N/2, N), and adding the two halves element-wise
to produce N/2 values. This is repeated until only a single element remains.
This order may be faster to compute than the ordered sum for floats, as it can
often be better parallelized.
The order of the sum may be important for accounting for precision errors in
floating-point computation, as floating-point addition is not associative, that
is `(a+b)+c` may not be equal to `a+(b+c)`.
Inputs:
- `v`: The vector to reduce.
Result:
- Sum of all lanes, as a scalar.
**Operation**:
for n > 1 {
n = n / 2
for i in 0 ..< n {
a[i] += a[i+n]
}
}
res := a[0]
Graphical representation of the operation for N=4:
+-----------------------+
v: | v0 | v1 | v2 | v3 |
+-----------------------+
| | | |
[+]<-- | ---' |
| [+]<--------'
| |
`>[+]<'
|
v
+-----+
result: | y0 |
+-----+
*/
reduce_add_bisect :: intrinsics.simd_reduce_add_bisect
/*
Reduce a vector to a scalar by multiplying together all the lanes in a bisecting fashion.
This procedure returns a scalar that is the product of all lanes, calculated by
bisecting the vector into two parts, where the first contains lanes [0, N/2)
and the second contains lanes [N/2, N), and multiplying the two halves
together element-wise to produce N/2 values. This is repeated until only a
single element remains. This order may be faster to compute than the ordered
product for floats, as it can often be better parallelized.
The order of the product may be important for accounting for precision errors
in floating-point computation, as floating-point multiplication is not
associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
Inputs:
- `v`: The vector to reduce.
Result:
- Product of all lanes, as a scalar.
**Operation**:
for n > 1 {
n = n / 2
for i in 0 ..< n {
a[i] *= a[i+n]
}
}
res := a[0]
Graphical representation of the operation for N=4:
+-----------------------+
v: | v0 | v1 | v2 | v3 |
+-----------------------+
| | | |
[x]<-- | ---' |
| [x]<--------'
| |
`>[x]<'
|
v
+-----+
result: | y0 |
+-----+
*/
reduce_mul_bisect :: intrinsics.simd_reduce_mul_bisect
/*
Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
This procedure returns a scalar that is the ordered sum of all lanes. The
ordered sum may be important for accounting for precision errors in
@@ -1782,7 +1878,7 @@ Result:
reduce_add_ordered :: intrinsics.simd_reduce_add_ordered
/*
Reduce a vector to a scalar by multiplying all the lanes.
Reduce a vector to a scalar by multiplying all the lanes in an ordered fashion.
This procedure returns a scalar that is the ordered product of all lanes.
The ordered product may be important for accounting for precision errors in
@@ -1804,6 +1900,100 @@ Result:
*/
reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered
/*
Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
This procedure returns a scalar that is the sum of all lanes, calculated by
adding each even-indexed element with the following odd-indexed element to
produce N/2 values. This is repeated until only a single element remains. This
order is supported by hardware instructions for some types/architectures (e.g.
i16/i32/f32/f64 on x86 SSE, i8/i16/i32/f32 on ARM NEON).
The order of the sum may be important for accounting for precision errors in
floating-point computation, as floating-point addition is not associative, that
is `(a+b)+c` may not be equal to `a+(b+c)`.
Inputs:
- `v`: The vector to reduce.
Result:
- Sum of all lanes, as a scalar.
**Operation**:
for n > 1 {
n = n / 2
for i in 0 ..< n {
a[i] = a[2*i+0] + a[2*i+1]
}
}
res := a[0]
Graphical representation of the operation for N=4:
+-----------------------+
v: | v0 | v1 | v2 | v3 |
+-----------------------+
| | | |
`>[+]<' `>[+]<'
| |
`--->[+]<--'
|
v
+-----+
result: | y0 |
+-----+
*/
reduce_add_pairs :: intrinsics.simd_reduce_add_pairs
/*
Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
This procedure returns a scalar that is the product of all lanes, calculated by
multiplying each even-indexed element with the following odd-indexed element to
produce N/2 values. This is repeated until only a single element remains. This
order may be faster to compute than the ordered product for floats, as it can
often be better parallelized.
The order of the product may be important for accounting for precision errors
in floating-point computation, as floating-point multiplication is not
associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
Inputs:
- `v`: The vector to reduce.
Result:
- Product of all lanes, as a scalar.
**Operation**:
for n > 1 {
n = n / 2
for i in 0 ..< n {
a[i] = a[2*i+0] * a[2*i+1]
}
}
res := a[0]
Graphical representation of the operation for N=4:
+-----------------------+
v: | v0 | v1 | v2 | v3 |
+-----------------------+
| | | |
`>[x]<' `>[x]<'
| |
`--->[x]<--'
|
v
+-----+
result: | y0 |
+-----+
*/
reduce_mul_pairs :: intrinsics.simd_reduce_mul_pairs
/*
Reduce a vector to a scalar by finding the minimum value between all of the lanes.

View File

@@ -853,8 +853,12 @@ gb_internal bool check_builtin_simd_operation(CheckerContext *c, Operand *operan
}
break;
case BuiltinProc_simd_reduce_add_bisect:
case BuiltinProc_simd_reduce_mul_bisect:
case BuiltinProc_simd_reduce_add_ordered:
case BuiltinProc_simd_reduce_mul_ordered:
case BuiltinProc_simd_reduce_add_pairs:
case BuiltinProc_simd_reduce_mul_pairs:
case BuiltinProc_simd_reduce_min:
case BuiltinProc_simd_reduce_max:
{

View File

@@ -170,8 +170,12 @@ BuiltinProc__simd_begin,
BuiltinProc_simd_extract,
BuiltinProc_simd_replace,
BuiltinProc_simd_reduce_add_bisect,
BuiltinProc_simd_reduce_mul_bisect,
BuiltinProc_simd_reduce_add_ordered,
BuiltinProc_simd_reduce_mul_ordered,
BuiltinProc_simd_reduce_add_pairs,
BuiltinProc_simd_reduce_mul_pairs,
BuiltinProc_simd_reduce_min,
BuiltinProc_simd_reduce_max,
BuiltinProc_simd_reduce_and,
@@ -518,8 +522,12 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
{STR_LIT("simd_extract"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_replace"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_reduce_add_bisect"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_reduce_mul_bisect"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_reduce_add_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_reduce_mul_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_reduce_add_pairs"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_reduce_mul_pairs"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_reduce_min"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_reduce_max"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
{STR_LIT("simd_reduce_and"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},

View File

@@ -1495,6 +1495,38 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
res.value = LLVMBuildInsertElement(p->builder, arg0.value, arg2.value, arg1.value, "");
return res;
case BuiltinProc_simd_reduce_add_bisect:
case BuiltinProc_simd_reduce_mul_bisect:
{
GB_ASSERT(arg0.type->kind == Type_SimdVector);
i64 num_elems = arg0.type->SimdVector.count;
LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems);
for (i64 i = 0; i < num_elems; i++) {
indices[i] = lb_const_int(m, t_uint, cast(u64)i).value;
}
switch (builtin_id) {
case BuiltinProc_simd_reduce_add_bisect: op_code = is_float ? LLVMFAdd : LLVMAdd; break;
case BuiltinProc_simd_reduce_mul_bisect: op_code = is_float ? LLVMFMul : LLVMMul; break;
}
LLVMValueRef remaining = arg0.value;
i64 num_remaining = num_elems;
while (num_remaining > 1) {
num_remaining /= 2;
LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining);
LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, "");
LLVMValueRef right_indices = LLVMConstVector(&indices[num_remaining], cast(unsigned)num_remaining);
LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, "");
remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, "");
}
res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], "");
return res;
}
case BuiltinProc_simd_reduce_add_ordered:
case BuiltinProc_simd_reduce_mul_ordered:
{
@@ -1527,6 +1559,40 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
res.value = lb_call_intrinsic(p, name, args, cast(unsigned)args_count, types, gb_count_of(types));
return res;
}
case BuiltinProc_simd_reduce_add_pairs:
case BuiltinProc_simd_reduce_mul_pairs:
{
GB_ASSERT(arg0.type->kind == Type_SimdVector);
i64 num_elems = arg0.type->SimdVector.count;
LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems);
for (i64 i = 0; i < num_elems/2; i++) {
indices[i] = lb_const_int(m, t_uint, cast(u64)(2*i)).value;
indices[i+num_elems/2] = lb_const_int(m, t_uint, cast(u64)(2*i+1)).value;
}
switch (builtin_id) {
case BuiltinProc_simd_reduce_add_pairs: op_code = is_float ? LLVMFAdd : LLVMAdd; break;
case BuiltinProc_simd_reduce_mul_pairs: op_code = is_float ? LLVMFMul : LLVMMul; break;
}
LLVMValueRef remaining = arg0.value;
i64 num_remaining = num_elems;
while (num_remaining > 1) {
num_remaining /= 2;
LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining);
LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, "");
LLVMValueRef right_indices = LLVMConstVector(&indices[num_elems/2], cast(unsigned)num_remaining);
LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, "");
remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, "");
}
res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], "");
return res;
}
case BuiltinProc_simd_reduce_min:
case BuiltinProc_simd_reduce_max:
case BuiltinProc_simd_reduce_and: