From b0f53a6eaf8fcfeaffce84c9077c6955df222788 Mon Sep 17 00:00:00 2001
From: Barinzaya <barinzaya@gmail.com>
Date: Sat, 3 May 2025 17:25:20 -0400
Subject: [PATCH] Implemented suggestions on `core:simd` helpers.

Adjusted documentation, and renamed the reduce_*_split procs to
reduce_*_bisect.
---
 core/simd/simd.odin | 44 ++++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index e11bdf816..a2fe22b4b 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -2512,7 +2512,7 @@ recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where int
 }
 
 /*
-Creates a vector where each lane contains the index of that lane.
+Create a vector where each lane contains the index of that lane.
 
 Inputs:
 - `V`: The type of the vector to create.
@@ -2558,10 +2558,10 @@ indices :: #force_inline proc "contextless" ($V: typeid/#simd[$N]$E) -> V where
 Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
 
 This procedure returns a scalar that is the sum of all lanes, calculated by
-adding each even-numbered element with the following odd-numbered element. This
-is repeated until only a single element remains. This order is supported by
-hardware instructions for some types/architectures (e.g. i16/i32/f32/f64 on x86
-SSE, i8/i16/i32/f32 on ARM NEON).
+adding each even-indexed element with the following odd-indexed element to
+produce N/2 values. This is repeated until only a single element remains. This
+order is supported by hardware instructions for some types/architectures (e.g.
+i16/i32/f32/f64 on x86 SSE, i8/i16/i32/f32 on ARM NEON).
 
 The order of the sum may be important for accounting for precision errors in
 floating-point computation, as floating-point addition is not associative, that
@@ -2657,13 +2657,14 @@ reduce_add_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
 }
 
 /*
-Reduce a vector to a scalar by adding up all the lanes in a binary fashion.
+Reduce a vector to a scalar by adding up all the lanes in a bisecting fashion.
 
 This procedure returns a scalar that is the sum of all lanes, calculated by
-splitting the vector in two parts and adding the two halves together
-element-wise. This is repeated until only a single element remains. This order
-will typically be faster to compute than the ordered sum for floats, as it can
-be better parallelized.
+bisecting the vector into two parts, where the first contains lanes [0, N/2)
+and the second contains lanes [N/2, N), and adding the two halves element-wise
+to produce N/2 values. This is repeated until only a single element remains.
+This order may be faster to compute than the ordered sum for floats, as it can
+often be better parallelized.
 
 The order of the sum may be important for accounting for precision errors in
 floating-point computation, as floating-point addition is not associative, that
@@ -2701,7 +2702,7 @@ Graphical representation of the operation for N=4:
 	result: | y0  |
 	        +-----+
 */
-reduce_add_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+reduce_add_bisect :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
 	where intrinsics.type_is_numeric(E) {
 	when N == 64 { v64 := v }
 	when N == 32 { v32 := v }
@@ -2763,10 +2764,12 @@ reduce_add_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
 Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
 
 This procedure returns a scalar that is the product of all lanes, calculated by
-multiplying each even-numbered element with the following odd-numbered element.
-This is repeated until only a single element remains. This order may be faster
-to compute than the ordered product for floats, as it can be better
-parallelized.
+multiplying each even-indexed element with the following odd-indexed element
+(i.e. lane 0 with lane 1, lane 2 with lane 3, and so on) to produce N/2
+values. This is repeated on the resulting values until only a single element
+remains. This order may be faster to compute than the ordered product for
+floats, as it can often be better parallelized than a strictly sequential
+product.
 
 The order of the product may be important for accounting for precision errors
 in floating-point computation, as floating-point multiplication is not
@@ -2862,13 +2865,14 @@ reduce_mul_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
 }
 
 /*
-Reduce a vector to a scalar by multiplying up all the lanes in a binary fashion.
+Reduce a vector to a scalar by multiplying all the lanes in a bisecting fashion.
 
 This procedure returns a scalar that is the product of all lanes, calculated by
-splitting the vector in two parts and multiplying the two halves together
-element-wise until only a single element remains. This is repeated until only a
+bisecting the vector into two parts, where the first contains lanes [0, N/2)
+and the second contains lanes [N/2, N), and multiplying the two halves
+together element-wise to produce N/2 values. This is repeated until only a
 single element remains. This order may be faster to compute than the ordered
-product for floats, as it can be better parallelized.
+product for floats, as it can often be better parallelized.
 
 The order of the product may be important for accounting for precision errors
 in floating-point computation, as floating-point multiplication is not
@@ -2906,7 +2910,7 @@ Graphical representation of the operation for N=4:
 	result: | y0  |
 	        +-----+
 */
-reduce_mul_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+reduce_mul_bisect :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
 	where intrinsics.type_is_numeric(E) {
 	when N == 64 { v64 := v }
 	when N == 32 { v32 := v }