diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index 37cc19ebd..385e24785 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -1759,7 +1759,7 @@ Returns:
 replace :: intrinsics.simd_replace
 
 /*
-Reduce a vector to a scalar by adding up all the lanes.
+Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
 
 This procedure returns a scalar that is the ordered sum of all lanes.
 The ordered sum may be important for accounting for precision errors in
@@ -2510,3 +2510,457 @@ Example:
 recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
 	return T(1) / v
 }
+
+/*
+Creates a vector where each lane contains the index of that lane.
+
+Inputs:
+- `V`: The type of the vector to create.
+
+Result:
+- A vector of the given type, where each lane contains the index of that lane.
+
+**Operation**:
+
+	for i in 0 ..< N {
+		res[i] = i
+	}
+*/
+indices :: #force_inline proc "contextless" ($V: typeid/#simd[$N]$E) -> V where intrinsics.type_is_numeric(E) {
+	when N == 1 {
+		return {0}
+	} else when N == 2 {
+		return {0, 1}
+	} else when N == 4 {
+		return {0, 1, 2, 3}
+	} else when N == 8 {
+		return {0, 1, 2, 3, 4, 5, 6, 7}
+	} else when N == 16 {
+		return {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+	} else when N == 32 {
+		return {
+			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+		}
+	} else when N == 64 {
+		return {
+			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+			32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+			48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+		}
+	} else {
+		#panic("Unsupported vector size!")
+	}
+}
+
+/*
+Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+adding each even-numbered element with the following odd-numbered element. This
+is repeated until only a single element remains. This order is supported by
+hardware instructions for some types/architectures (e.g. i16/i32/f32/f64 on x86
+SSE, i8/i16/i32/f32 on ARM NEON).
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] + a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	        +-----------------------+
+	     v: | v0  |  v1 |  v2 |  v3 |
+	        +-----------------------+
+	           |     |     |     |
+	           `>[+]<'     `>[+]<'
+	              |           |
+	              `--->[+]<---'
+	                    |
+	                    v
+	                 +-----+
+	        result:  | y0  |
+	                 +-----+
+*/
+reduce_add_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8  := v }
+	when N == 4  { v4  := v }
+	when N == 2  { v2  := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0, 2, 4, 6, 8, 10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30,
+			32, 34, 36, 38, 40, 42, 44, 46,
+			48, 50, 52, 54, 56, 58, 60, 62)
+		y32 := swizzle(v64,
+			1, 3, 5, 7, 9, 11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31,
+			33, 35, 37, 39, 41, 43, 45, 47,
+			49, 51, 53, 55, 57, 59, 61, 63)
+		// NOTE: this is a sum reduction; each step must add, not multiply.
+		v32 := x32 + y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0, 2, 4, 6, 8, 10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30)
+		y16 := swizzle(v32,
+			1, 3, 5, 7, 9, 11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31)
+		v16 := x16 + y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
+		y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
+		v8 := x8 + y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 2, 4, 6)
+		y4 := swizzle(v8, 1, 3, 5, 7)
+		v4 := x4 + y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 2)
+		y2 := swizzle(v4, 1, 3)
+		v2 := x2 + y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) + extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+
+/*
+Reduce a vector to a scalar by adding up all the lanes in a binary fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+splitting the vector in two parts and adding the two halves together
+element-wise. This is repeated until only a single element remains. This order
+will typically be faster to compute than the ordered sum for floats, as it can
+be better parallelized.
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] += a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	        +-----------------------+
+	        | v0  |  v1 |  v2 |  v3 |
+	        +-----------------------+
+	           |     |     |     |
+	          [+]<---|-----'     |
+	           |    [+]<---------'
+	           |     |
+	           `>[+]<'
+	              |
+	              v
+	           +-----+
+	   result: | y0  |
+	           +-----+
+*/
+reduce_add_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8  := v }
+	when N == 4  { v4  := v }
+	when N == 2  { v2  := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0, 1, 2, 3, 4, 5, 6, 7,
+			8, 9, 10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		y32 := swizzle(v64,
+			32, 33, 34, 35, 36, 37, 38, 39,
+			40, 41, 42, 43, 44, 45, 46, 47,
+			48, 49, 50, 51, 52, 53, 54, 55,
+			56, 57, 58, 59, 60, 61, 62, 63)
+		v32 := x32 + y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0, 1, 2, 3, 4, 5, 6, 7,
+			8, 9, 10, 11, 12, 13, 14, 15)
+		y16 := swizzle(v32,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		v16 := x16 + y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 1, 2, 3, 4, 5, 6, 7)
+		y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
+		v8 := x8 + y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 1, 2, 3)
+		y4 := swizzle(v8, 4, 5, 6, 7)
+		v4 := x4 + y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 1)
+		y2 := swizzle(v4, 2, 3)
+		v2 := x2 + y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) + extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+
+/*
+Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+multiplying each even-numbered element with the following odd-numbered element.
+This is repeated until only a single element remains. This order may be faster
+to compute than the ordered product for floats, as it can be better
+parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] * a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	        +-----------------------+
+	     v: | v0  |  v1 |  v2 |  v3 |
+	        +-----------------------+
+	           |     |     |     |
+	           `>[x]<'     `>[x]<'
+	              |           |
+	              `--->[x]<---'
+	                    |
+	                    v
+	                 +-----+
+	        result:  | y0  |
+	                 +-----+
+*/
+reduce_mul_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8  := v }
+	when N == 4  { v4  := v }
+	when N == 2  { v2  := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0, 2, 4, 6, 8, 10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30,
+			32, 34, 36, 38, 40, 42, 44, 46,
+			48, 50, 52, 54, 56, 58, 60, 62)
+		y32 := swizzle(v64,
+			1, 3, 5, 7, 9, 11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31,
+			33, 35, 37, 39, 41, 43, 45, 47,
+			49, 51, 53, 55, 57, 59, 61, 63)
+		v32 := x32 * y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0, 2, 4, 6, 8, 10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30)
+		y16 := swizzle(v32,
+			1, 3, 5, 7, 9, 11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31)
+		v16 := x16 * y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
+		y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
+		v8 := x8 * y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 2, 4, 6)
+		y4 := swizzle(v8, 1, 3, 5, 7)
+		v4 := x4 * y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 2)
+		y2 := swizzle(v4, 1, 3)
+		v2 := x2 * y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) * extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+
+/*
+Reduce a vector to a scalar by multiplying all the lanes in a binary fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+splitting the vector in two parts and multiplying the two halves together
+element-wise. This is repeated until only a single element remains. This order
+may be faster to compute than the ordered product for floats, as it can be
+better parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] *= a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	        +-----------------------+
+	        | v0  |  v1 |  v2 |  v3 |
+	        +-----------------------+
+	           |     |     |     |
+	          [x]<---|-----'     |
+	           |    [x]<---------'
+	           |     |
+	           `>[x]<'
+	              |
+	              v
+	           +-----+
+	   result: | y0  |
+	           +-----+
+*/
+reduce_mul_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8  := v }
+	when N == 4  { v4  := v }
+	when N == 2  { v2  := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0, 1, 2, 3, 4, 5, 6, 7,
+			8, 9, 10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		y32 := swizzle(v64,
+			32, 33, 34, 35, 36, 37, 38, 39,
+			40, 41, 42, 43, 44, 45, 46, 47,
+			48, 49, 50, 51, 52, 53, 54, 55,
+			56, 57, 58, 59, 60, 61, 62, 63)
+		v32 := x32 * y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0, 1, 2, 3, 4, 5, 6, 7,
+			8, 9, 10, 11, 12, 13, 14, 15)
+		y16 := swizzle(v32,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		v16 := x16 * y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 1, 2, 3, 4, 5, 6, 7)
+		y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
+		v8 := x8 * y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 1, 2, 3)
+		y4 := swizzle(v8, 4, 5, 6, 7)
+		v4 := x4 * y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 1)
+		y2 := swizzle(v4, 2, 3)
+		v2 := x2 * y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) * extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}