Merge pull request #6712 from kalsprite/runtime_swizzle_patch

Runtime swizzle patch
This commit is contained in:
gingerBill
2026-05-21 10:51:04 +01:00
committed by GitHub
2 changed files with 105 additions and 79 deletions

View File

@@ -2441,40 +2441,55 @@ Graphically, the operation looks as follows. The `t` and `f` represent the
select :: intrinsics.simd_select
/*
Runtime Equivalent to Shuffle.
Hardware-level dynamic swizzle / table lookup.
Performs element-wise table lookups using runtime indices.
Each element in the indices vector selects an element from the table vector.
The indices are automatically masked to prevent out-of-bounds access.
This operation is hardware-accelerated on most platforms when using 8-bit
integer vectors. For other element types or unsupported vector sizes, it
falls back to software emulation.
For each output lane `i`, picks an element from `table` using `indices[i]`
as the source position. Maps directly onto hardware table-lookup instructions
where available (`tbl` on ARM, `pshufb` on x86, `swizzle` on WASM). This is
the runtime counterpart of `simd.shuffle`.
Inputs:
- `table`: The lookup table vector (should be power-of-2 size for correct masking).
- `indices`: The indices vector (automatically masked to valid range).
- `table`: a `#simd[N]T` lookup table, where `T` is an integer type and
`N` is a power of two (enforced by `#simd`).
- `indices`: a `#simd[N]T` of source positions. Must have the exact same
type as `table`.
Returns:
- A vector where `result[i] = table[indices[i] & (table_size-1)]`.
- A `#simd[N]T`. For `indices[i]` in `[0, N-1]`, `result[i] = table[indices[i]]`
on ARM, on 16-byte x86 and WebAssembly vectors, and on the scalar emulation
fallback. On x86 vectors larger than 16 bytes, the operation is lane-local;
see Notes.
Operation:
Operation (in-range indices, non-lane-local platforms):
for i in 0 ..< len(indices) {
masked_index := indices[i] & (len(table) - 1)
result[i] = table[masked_index]
for i in 0 ..< N {
result[i] = table[indices[i]] // indices[i] assumed to be in [0, N-1]
}
return result
Notes:
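- **Out-of-range indices** (`indices[i] >= N`) are platform-defined:
ARM `tbl`/`vtbl` and WASM `i8x16.swizzle` return 0; x86 `pshufb`/`vpshufb`
return 0 only when bit 7 of the index is set and otherwise use its low
4 bits; the scalar fallback wraps via `indices[i] & (N-1)`. Mask explicitly
if you need portable behavior.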
- **x86 wide vectors are lane-local.** `vpshufb` treats a 256-bit AVX2
register as two independent 128-bit lanes and a 512-bit AVX-512BW register
as four. An index in lane `k` can only address table entries in that same
lane. AVX2 example: lane 0 holds elements [0..15], lane 1 holds [16..31].
With `indices[20] = 5`, `result[20]` is `table[21]`, not `table[5]`.
ARM `tbl` and the emulation fallback are cross-lane across the full table.
- **Hardware acceleration is conditional.** It requires 8-bit integer
elements, a vector size supported by the target, and the relevant
target features enabled (`ssse3` / `avx2` / `avx512f`+`avx512bw` on x86,
`neon` on ARM). Otherwise this falls back to scalar emulation. Non-byte
element types always use the scalar fallback.
Implementation:
| Platform | Lane Size | Implementation |
|-------------|-------------------------------------------|---------------------|
| x86-64 | pshufb (16B), vpshufb (32B), AVX512 (64B) | Single vector |
| ARM64 | tbl1 (16B), tbl2 (32B), tbl4 (64B) | Automatic splitting |
| ARM32 | vtbl1 (8B), vtbl2 (16B), vtbl4 (32B) | Automatic splitting |
| WebAssembly | i8x16.swizzle (16B), Emulation (>16B) | Mixed |
| Other | Emulation | Software |
| Platform | Hardware path (8-bit elements) | Notes |
|-------------|-------------------------------------------------|-------------------------------|
| x86-64 | pshufb (16B), vpshufb (32B), AVX-512BW vpshufb (64B) | 32 / 64 B are lane-local |
| ARM64 | tbl1 (16B), tbl2 (32B), tbl4 (64B) | >16 B: split into 16-B chunks |
| ARM32 | vtbl1 (8B), vtbl2 (16B), vtbl4 (32B) | >8 B: split into 8-B chunks |
| WebAssembly | i8x16.swizzle (16B) | Other sizes: emulated |
| Other | Scalar emulation | |
Example:
@@ -2482,12 +2497,11 @@ Example:
import "core:fmt"
runtime_swizzle_example :: proc() {
table := simd.u8x16{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
table := simd.u8x16{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
indices := simd.u8x16{15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}
result := simd.runtime_swizzle(table, indices)
fmt.println(result) // Expected: {15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}
result := simd.runtime_swizzle(table, indices)
fmt.println(result) // {15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}
}
*/
runtime_swizzle :: intrinsics.simd_runtime_swizzle

View File

@@ -2078,67 +2078,79 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
lb_add_attribute_to_proc_with_string(p->module, p->value, str_lit("target-features"), str_lit("+neon"));
}
// Handle ARM's multi-swizzle intrinsics by splitting the src vector
if (build_context.metrics.arch == TargetArch_arm64 && count > 16) {
// ARM64 TBL2/TBL3/TBL4: Split src into multiple 16-byte vectors
int num_tables = cast(int)(count / 16);
GB_ASSERT_MSG(count % 16 == 0, "ARM64 src size must be multiple of 16 bytes, got %lld bytes", count);
GB_ASSERT_MSG(num_tables <= 4, "ARM64 NEON supports maximum 4 tables (tbl4), got %d tables for %lld-byte vector", num_tables, count);
LLVMValueRef src_parts[4]; // Max 4 tables for tbl4
// Handle ARM's multi-swizzle intrinsics by splitting the src vector.
// tbl[N]/vtbl[N] only produce <16 x i8>/<8 x i8>, so we issue one call
// per output chunk and concat the results.
if ((build_context.metrics.arch == TargetArch_arm64 && count > 16) ||
(build_context.metrics.arch == TargetArch_arm32 && count > 8)) {
bool is_arm64 = build_context.metrics.arch == TargetArch_arm64;
i64 lane = is_arm64 ? 16 : 8;
int num_tables = cast(int)(count / lane);
GB_ASSERT_MSG(count % lane == 0, "ARM src size must be multiple of %lld bytes, got %lld bytes", lane, count);
GB_ASSERT_MSG(num_tables <= 4, "ARM NEON supports maximum 4 tables, got %d tables for %lld-byte vector", num_tables, count);
LLVMTypeRef i32_type = LLVMInt32TypeInContext(p->module->ctx);
LLVMTypeRef i8_type = LLVMInt8TypeInContext(p->module->ctx);
LLVMTypeRef vN_type = LLVMVectorType(i8_type, cast(unsigned)lane);
// Split src/indices into N lane-sized chunks
LLVMValueRef src_parts[4];
LLVMValueRef idx_parts[4];
for (int i = 0; i < num_tables; i++) {
// Extract 16-byte slice from the larger src
LLVMValueRef indices_for_extract[16];
for (int j = 0; j < 16; j++) {
indices_for_extract[j] = LLVMConstInt(LLVMInt32TypeInContext(p->module->ctx), i * 16 + j, false);
LLVMValueRef mask[16];
for (int j = 0; j < lane; j++) {
mask[j] = LLVMConstInt(i32_type, cast(unsigned)(i * lane + j), false);
}
LLVMValueRef extract_mask = LLVMConstVector(indices_for_extract, 16);
src_parts[i] = LLVMBuildShuffleVector(p->builder, src, LLVMGetUndef(LLVMTypeOf(src)), extract_mask, "");
LLVMValueRef shuffle_mask = LLVMConstVector(mask, cast(unsigned)lane);
src_parts[i] = LLVMBuildShuffleVector(p->builder, src, LLVMGetUndef(LLVMTypeOf(src)), shuffle_mask, "");
idx_parts[i] = LLVMBuildShuffleVector(p->builder, indices, LLVMGetUndef(LLVMTypeOf(indices)), shuffle_mask, "");
}
// Call appropriate ARM64 tbl intrinsic
if (count == 32) {
LLVMValueRef args[3] = { src_parts[0], src_parts[1], indices };
res.value = lb_call_intrinsic(p, intrinsic_name, args, 3, nullptr, 0);
} else if (count == 48) {
LLVMValueRef args[4] = { src_parts[0], src_parts[1], src_parts[2], indices };
res.value = lb_call_intrinsic(p, intrinsic_name, args, 4, nullptr, 0);
} else if (count == 64) {
LLVMValueRef args[5] = { src_parts[0], src_parts[1], src_parts[2], src_parts[3], indices };
res.value = lb_call_intrinsic(p, intrinsic_name, args, 5, nullptr, 0);
// One tbl/vtbl call per output chunk; same N tables, different indices.
// ARM64 tbl[N] is overloaded on result type; ARM32 vtbl[N] has fixed <8 x i8>.
LLVMTypeRef overload_types[1] = { vN_type };
LLVMTypeRef *overloads_arg = is_arm64 ? overload_types : nullptr;
unsigned overloads_count = is_arm64 ? 1 : 0;
LLVMValueRef out_parts[4];
for (int c = 0; c < num_tables; c++) {
LLVMValueRef args[5];
for (int i = 0; i < num_tables; i++) args[i] = src_parts[i];
args[num_tables] = idx_parts[c];
out_parts[c] = lb_call_intrinsic(p, intrinsic_name, args, num_tables + 1, overloads_arg, overloads_count);
}
} else if (build_context.metrics.arch == TargetArch_arm32 && count > 8) {
// ARM32 VTBL2/VTBL3/VTBL4: Split src into multiple 8-byte vectors
int num_tables = cast(int)count / 8;
GB_ASSERT_MSG(count % 8 == 0, "ARM32 src size must be multiple of 8 bytes, got %lld bytes", count);
GB_ASSERT_MSG(num_tables <= 4, "ARM32 NEON supports maximum 4 tables (vtbl4), got %d tables for %lld-byte vector", num_tables, count);
LLVMValueRef src_parts[4]; // Max 4 tables for vtbl4
for (int i = 0; i < num_tables; i++) {
// Extract 8-byte slice from the larger src
LLVMValueRef indices_for_extract[8];
for (int j = 0; j < 8; j++) {
indices_for_extract[j] = LLVMConstInt(LLVMInt32TypeInContext(p->module->ctx), i * 8 + j, false);
// Concat out_parts[0..num_tables) into <count x i8> by pair-wise
// shufflevector. shufflevector requires equal-sized operands, so we
// pad the right-hand chunk to acc_size with undef on each step.
LLVMValueRef acc = out_parts[0];
i64 acc_size = lane;
for (int c = 1; c < num_tables; c++) {
LLVMValueRef rhs = out_parts[c];
if (acc_size > lane) {
LLVMValueRef pad[64];
for (i64 k = 0; k < acc_size; k++) {
pad[k] = (k < lane) ? LLVMConstInt(i32_type, cast(unsigned)k, false) : LLVMGetUndef(i32_type);
}
rhs = LLVMBuildShuffleVector(p->builder, rhs, LLVMGetUndef(LLVMTypeOf(rhs)), LLVMConstVector(pad, cast(unsigned)acc_size), "");
}
LLVMValueRef extract_mask = LLVMConstVector(indices_for_extract, 8);
src_parts[i] = LLVMBuildShuffleVector(p->builder, src, LLVMGetUndef(LLVMTypeOf(src)), extract_mask, "");
}
// Call appropriate ARM32 vtbl intrinsic
if (count == 16) {
LLVMValueRef args[3] = { src_parts[0], src_parts[1], indices };
res.value = lb_call_intrinsic(p, intrinsic_name, args, 3, nullptr, 0);
} else if (count == 24) {
LLVMValueRef args[4] = { src_parts[0], src_parts[1], src_parts[2], indices };
res.value = lb_call_intrinsic(p, intrinsic_name, args, 4, nullptr, 0);
} else if (count == 32) {
LLVMValueRef args[5] = { src_parts[0], src_parts[1], src_parts[2], src_parts[3], indices };
res.value = lb_call_intrinsic(p, intrinsic_name, args, 5, nullptr, 0);
i64 new_size = acc_size + lane;
LLVMValueRef concat[64];
for (i64 k = 0; k < acc_size; k++) concat[k] = LLVMConstInt(i32_type, cast(unsigned)k, false);
for (i64 k = 0; k < lane; k++) concat[acc_size + k] = LLVMConstInt(i32_type, cast(unsigned)(acc_size+k), false);
acc = LLVMBuildShuffleVector(p->builder, acc, rhs, LLVMConstVector(concat, cast(unsigned)new_size), "");
acc_size = new_size;
}
res.value = acc;
} else {
// Single runtime swizzle case (x86, WebAssembly, ARM single-table)
LLVMValueRef args[2] = { src, indices };
res.value = lb_call_intrinsic(p, intrinsic_name, args, gb_count_of(args), nullptr, 0);
if (build_context.metrics.arch == TargetArch_arm64) {
// ARM64 tbl1 is overloaded on result type; others are fixed
LLVMTypeRef overload_types[1] = { LLVMTypeOf(indices) };
res.value = lb_call_intrinsic(p, intrinsic_name, args, gb_count_of(args), overload_types, 1);
} else {
res.value = lb_call_intrinsic(p, intrinsic_name, args, gb_count_of(args), nullptr, 0);
}
}
return res;
} else {