Merge pull request #6712 from kalsprite/runtime_swizzle_patch

Runtime swizzle patch
2026-07-17 21:21:04 +00:00 · 2026-05-21 10:51:04 +01:00
parent 844a828a94 98d6e0c26a
commit e950abf0c2
2 changed files with 105 additions and 79 deletions
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -2441,40 +2441,55 @@ Graphically, the operation looks as follows. The `t` and `f` represent the
 select :: intrinsics.simd_select

 /*
-Runtime Equivalent to Shuffle.
+Hardware-level dynamic swizzle / table lookup.

-Performs element-wise table lookups using runtime indices.
-Each element in the indices vector selects an element from the table vector.
-The indices are automatically masked to prevent out-of-bounds access.
-
-This operation is hardware-accelerated on most platforms when using 8-bit
-integer vectors. For other element types or unsupported vector sizes, it
-falls back to software emulation.
+For each output lane `i`, picks an element from `table` using `indices[i]`
+as the source position. Maps directly onto hardware table-lookup instructions
+where available (`tbl` on ARM, `pshufb` on x86, `swizzle` on WASM). This is
+the runtime counterpart of `simd.shuffle`.

 Inputs:
- `table`: The lookup table vector (should be power-of-2 size for correct masking).
- `indices`: The indices vector (automatically masked to valid range).
+- `table`:   a `#simd[N]T` lookup table, where `T` is an integer type and
+            `N` is a power of two (enforced by `#simd`).
+- `indices`: a `#simd[N]T` of source positions. Must have the exact same
+            type as `table`.

 Returns:
- A vector where `result[i] = table[indices[i] & (table_size-1)]`.
+- A `#simd[N]T`. For `indices[i]` in `[0, N-1]`, `result[i] = table[indices[i]]`
+  on ARM, WebAssembly, 16-byte x86 vectors, and the scalar emulation fallback.
+  On x86 vectors larger than 16 bytes, the operation is lane-local; see Notes.

-Operation:
+Operation (in-range indices, non-lane-local platforms):

-	for i in 0 ..< len(indices) {
-		masked_index := indices[i] & (len(table) - 1)
-		result[i] = table[masked_index]
+	for i in 0 ..< N {
+		result[i] = table[indices[i]]   // indices[i] assumed to be in [0, N-1]
 	}
-	return result
+
+Notes:
+- **Out-of-range indices** (`indices[i] >= N`) are platform-defined: ARM `tbl`
+  and WASM `swizzle` return 0, x86 `pshufb` returns 0 only when bit 7 is set,
+  and the scalar fallback wraps via `indices[i] & (N-1)`. Mask explicitly for portability.
+- **x86 wide vectors are lane-local.** `vpshufb` treats a 256-bit AVX2
+  register as two independent 128-bit lanes; `pshuf.b.512` treats a
+  512-bit AVX-512 register as four. An index in lane `k` can only address
+  table entries in that same lane. AVX2 example: lane 0 is [0..15], lane 1 is
+  [16..31], so `indices[20] = 5` gives `result[20] = table[21]`, not `table[5]`.
+  ARM `tbl` and the emulation fallback index across the full table.
+- **Hardware acceleration is conditional.** It requires 8-bit integer
+  elements, a vector size supported by the target, and the relevant
+  target features enabled (`ssse3` / `avx2` / `avx512f`+`avx512bw` on x86,
+  `neon` on ARM). Otherwise this falls back to scalar emulation. Non-byte
+  element types always use the scalar fallback.

 Implementation:

-	| Platform    | Lane Size                                 | Implementation      |
-	|-------------|-------------------------------------------|---------------------|
-	| x86-64      | pshufb (16B), vpshufb (32B), AVX512 (64B) | Single vector       |
-	| ARM64       | tbl1 (16B), tbl2 (32B), tbl4 (64B)        | Automatic splitting |
-	| ARM32       | vtbl1 (8B), vtbl2 (16B), vtbl4 (32B)      | Automatic splitting |
-	| WebAssembly | i8x16.swizzle (16B), Emulation (>16B)     | Mixed               |
-	| Other       | Emulation                                 | Software            |
+	| Platform    | Hardware path (8-bit elements)                  | Notes                         |
+	|-------------|-------------------------------------------------|-------------------------------|
+	| x86-64      | pshufb (16B), vpshufb (32B), pshuf.b.512 (64B)  | 32 / 64 B are lane-local      |
+	| ARM64       | tbl1 (16B), tbl2 (32B), tbl4 (64B)              | >16 B: split into 16-B chunks |
+	| ARM32       | vtbl1 (8B), vtbl2 (16B), vtbl4 (32B)            | >8 B: split into 8-B chunks   |
+	| WebAssembly | i8x16.swizzle (16B)                             | Other sizes: emulated         |
+	| Other       | Scalar emulation                                |                               |

 Example:

@@ -2482,12 +2497,11 @@ Example:
 	import "core:fmt"

 	runtime_swizzle_example :: proc() {
-		table := simd.u8x16{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+		table   := simd.u8x16{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
 		indices := simd.u8x16{15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}
-		result := simd.runtime_swizzle(table, indices)
-		fmt.println(result) // Expected: {15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}
+		result  := simd.runtime_swizzle(table, indices)
+		fmt.println(result) // {15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}
 	}
-
 */
 runtime_swizzle :: intrinsics.simd_runtime_swizzle

--- a/src/llvm_backend_proc.cpp
+++ b/src/llvm_backend_proc.cpp
@@ -2078,67 +2078,79 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
 						lb_add_attribute_to_proc_with_string(p->module, p->value, str_lit("target-features"), str_lit("+neon"));
 					}
 					
-					// Handle ARM's multi-swizzle intrinsics by splitting the src vector
-					if (build_context.metrics.arch == TargetArch_arm64 && count > 16) {
-						// ARM64 TBL2/TBL3/TBL4: Split src into multiple 16-byte vectors
-						int num_tables = cast(int)(count / 16);
-						GB_ASSERT_MSG(count % 16 == 0, "ARM64 src size must be multiple of 16 bytes, got %lld bytes", count);
-						GB_ASSERT_MSG(num_tables <= 4, "ARM64 NEON supports maximum 4 tables (tbl4), got %d tables for %lld-byte vector", num_tables, count);
-						
-						LLVMValueRef src_parts[4]; // Max 4 tables for tbl4
+					// Handle ARM's multi-swizzle intrinsics by splitting the src vector.
+					// tbl[N]/vtbl[N] only produce <16 x i8>/<8 x i8>, so we issue one call
+					// per output chunk and concat the results.
+					if ((build_context.metrics.arch == TargetArch_arm64 && count > 16) ||
+					    (build_context.metrics.arch == TargetArch_arm32 && count > 8)) {
+						bool is_arm64 = build_context.metrics.arch == TargetArch_arm64;
+						i64 lane = is_arm64 ? 16 : 8;
+						int num_tables = cast(int)(count / lane);
+						GB_ASSERT_MSG(count % lane == 0, "ARM src size must be multiple of %lld bytes, got %lld bytes", lane, count);
+						GB_ASSERT_MSG(num_tables <= 4, "ARM NEON supports maximum 4 tables, got %d tables for %lld-byte vector", num_tables, count);
+
+						LLVMTypeRef i32_type = LLVMInt32TypeInContext(p->module->ctx);
+						LLVMTypeRef i8_type  = LLVMInt8TypeInContext(p->module->ctx);
+						LLVMTypeRef vN_type  = LLVMVectorType(i8_type, cast(unsigned)lane);
+
+						// Split src/indices into N lane-sized chunks
+						LLVMValueRef src_parts[4];
+						LLVMValueRef idx_parts[4];
 						for (int i = 0; i < num_tables; i++) {
-							// Extract 16-byte slice from the larger src
-							LLVMValueRef indices_for_extract[16];
-							for (int j = 0; j < 16; j++) {
-								indices_for_extract[j] = LLVMConstInt(LLVMInt32TypeInContext(p->module->ctx), i * 16 + j, false);
+							LLVMValueRef mask[16];
+							for (int j = 0; j < lane; j++) {
+								mask[j] = LLVMConstInt(i32_type, cast(unsigned)(i * lane + j), false);
 							}
-							LLVMValueRef extract_mask = LLVMConstVector(indices_for_extract, 16);
-							src_parts[i] = LLVMBuildShuffleVector(p->builder, src, LLVMGetUndef(LLVMTypeOf(src)), extract_mask, "");
+							LLVMValueRef shuffle_mask = LLVMConstVector(mask, cast(unsigned)lane);
+							src_parts[i] = LLVMBuildShuffleVector(p->builder, src,     LLVMGetUndef(LLVMTypeOf(src)),     shuffle_mask, "");
+							idx_parts[i] = LLVMBuildShuffleVector(p->builder, indices, LLVMGetUndef(LLVMTypeOf(indices)), shuffle_mask, "");
 						}
-						
-						// Call appropriate ARM64 tbl intrinsic
-						if (count == 32) {
-							LLVMValueRef args[3] = { src_parts[0], src_parts[1], indices };
-							res.value = lb_call_intrinsic(p, intrinsic_name, args, 3, nullptr, 0);
-						} else if (count == 48) {
-							LLVMValueRef args[4] = { src_parts[0], src_parts[1], src_parts[2], indices };
-							res.value = lb_call_intrinsic(p, intrinsic_name, args, 4, nullptr, 0);
-						} else if (count == 64) {
-							LLVMValueRef args[5] = { src_parts[0], src_parts[1], src_parts[2], src_parts[3], indices };
-							res.value = lb_call_intrinsic(p, intrinsic_name, args, 5, nullptr, 0);
+
+						// One tbl/vtbl call per output chunk; same N tables, different indices.
+						// ARM64 tbl[N] is overloaded on result type; ARM32 vtbl[N] has fixed <8 x i8>.
+						LLVMTypeRef overload_types[1] = { vN_type };
+						LLVMTypeRef *overloads_arg = is_arm64 ? overload_types : nullptr;
+						unsigned overloads_count   = is_arm64 ? 1 : 0;
+						LLVMValueRef out_parts[4];
+						for (int c = 0; c < num_tables; c++) {
+							LLVMValueRef args[5];
+							for (int i = 0; i < num_tables; i++) args[i] = src_parts[i];
+							args[num_tables] = idx_parts[c];
+							out_parts[c] = lb_call_intrinsic(p, intrinsic_name, args, num_tables + 1, overloads_arg, overloads_count);
 						}
-					} else if (build_context.metrics.arch == TargetArch_arm32 && count > 8) {
-						// ARM32 VTBL2/VTBL3/VTBL4: Split src into multiple 8-byte vectors
-						int num_tables = cast(int)count / 8;
-						GB_ASSERT_MSG(count % 8 == 0, "ARM32 src size must be multiple of 8 bytes, got %lld bytes", count);
-						GB_ASSERT_MSG(num_tables <= 4, "ARM32 NEON supports maximum 4 tables (vtbl4), got %d tables for %lld-byte vector", num_tables, count);
-						
-						LLVMValueRef src_parts[4]; // Max 4 tables for vtbl4
-						for (int i = 0; i < num_tables; i++) {
-							// Extract 8-byte slice from the larger src
-							LLVMValueRef indices_for_extract[8];
-							for (int j = 0; j < 8; j++) {
-								indices_for_extract[j] = LLVMConstInt(LLVMInt32TypeInContext(p->module->ctx), i * 8 + j, false);
+
+						// Concat out_parts[0..num_tables) into <count x i8> by pair-wise
+						// shufflevector. shufflevector requires equal-sized operands, so we
+						// pad the right-hand chunk to acc_size with undef on each step.
+						LLVMValueRef acc = out_parts[0];
+						i64 acc_size = lane;
+						for (int c = 1; c < num_tables; c++) {
+							LLVMValueRef rhs = out_parts[c];
+							if (acc_size > lane) {
+								LLVMValueRef pad[64];
+								for (i64 k = 0; k < acc_size; k++) {
+									pad[k] = (k < lane) ? LLVMConstInt(i32_type, cast(unsigned)k, false) : LLVMGetUndef(i32_type);
+								}
+								rhs = LLVMBuildShuffleVector(p->builder, rhs, LLVMGetUndef(LLVMTypeOf(rhs)), LLVMConstVector(pad, cast(unsigned)acc_size), "");
 							}
-							LLVMValueRef extract_mask = LLVMConstVector(indices_for_extract, 8);
-							src_parts[i] = LLVMBuildShuffleVector(p->builder, src, LLVMGetUndef(LLVMTypeOf(src)), extract_mask, "");
-						}
-						
-						// Call appropriate ARM32 vtbl intrinsic
-						if (count == 16) {
-							LLVMValueRef args[3] = { src_parts[0], src_parts[1], indices };
-							res.value = lb_call_intrinsic(p, intrinsic_name, args, 3, nullptr, 0);
-						} else if (count == 24) {
-							LLVMValueRef args[4] = { src_parts[0], src_parts[1], src_parts[2], indices };
-							res.value = lb_call_intrinsic(p, intrinsic_name, args, 4, nullptr, 0);
-						} else if (count == 32) {
-							LLVMValueRef args[5] = { src_parts[0], src_parts[1], src_parts[2], src_parts[3], indices };
-							res.value = lb_call_intrinsic(p, intrinsic_name, args, 5, nullptr, 0);
+							i64 new_size = acc_size + lane;
+							LLVMValueRef concat[64];
+							for (i64 k = 0; k < acc_size; k++) concat[k]            = LLVMConstInt(i32_type, cast(unsigned)k,            false);
+							for (i64 k = 0; k < lane;     k++) concat[acc_size + k] = LLVMConstInt(i32_type, cast(unsigned)(acc_size+k), false);
+							acc = LLVMBuildShuffleVector(p->builder, acc, rhs, LLVMConstVector(concat, cast(unsigned)new_size), "");
+							acc_size = new_size;
 						}
+						res.value = acc;
 					} else {
 						// Single runtime swizzle case (x86, WebAssembly, ARM single-table)
 						LLVMValueRef args[2] = { src, indices };
-						res.value = lb_call_intrinsic(p, intrinsic_name, args, gb_count_of(args), nullptr, 0);
+						if (build_context.metrics.arch == TargetArch_arm64) {
+							// ARM64 tbl1 is overloaded on result type; others are fixed
+							LLVMTypeRef overload_types[1] = { LLVMTypeOf(indices) };
+							res.value = lb_call_intrinsic(p, intrinsic_name, args, gb_count_of(args), overload_types, 1);
+						} else {
+							res.value = lb_call_intrinsic(p, intrinsic_name, args, gb_count_of(args), nullptr, 0);
+						}
 					}
 					return res;
 				} else {