mirror of
https://github.com/odin-lang/Odin.git
synced 2026-05-28 06:35:11 +00:00
Merge pull request #6712 from kalsprite/runtime_swizzle_patch
Runtime swizzle patch
This commit is contained in:
@@ -2441,40 +2441,55 @@ Graphically, the operation looks as follows. The `t` and `f` represent the
|
||||
select :: intrinsics.simd_select
|
||||
|
||||
/*
|
||||
Runtime Equivalent to Shuffle.
|
||||
Hardware-level dynamic swizzle / table lookup.
|
||||
|
||||
Performs element-wise table lookups using runtime indices.
|
||||
Each element in the indices vector selects an element from the table vector.
|
||||
The indices are automatically masked to prevent out-of-bounds access.
|
||||
|
||||
This operation is hardware-accelerated on most platforms when using 8-bit
|
||||
integer vectors. For other element types or unsupported vector sizes, it
|
||||
falls back to software emulation.
|
||||
For each output lane `i`, picks an element from `table` using `indices[i]`
|
||||
as the source position. Maps directly onto hardware table-lookup instructions
|
||||
where available (`tbl` on ARM, `pshufb` on x86, `swizzle` on WASM). This is
|
||||
the runtime counterpart of `simd.shuffle`.
|
||||
|
||||
Inputs:
|
||||
- `table`: The lookup table vector (should be power-of-2 size for correct masking).
|
||||
- `indices`: The indices vector (automatically masked to valid range).
|
||||
- `table`: a `#simd[N]T` lookup table, where `T` is an integer type and
|
||||
`N` is a power of two (enforced by `#simd`).
|
||||
- `indices`: a `#simd[N]T` of source positions. Must have the exact same
|
||||
type as `table`.
|
||||
|
||||
Returns:
|
||||
- A vector where `result[i] = table[indices[i] & (table_size-1)]`.
|
||||
- A `#simd[N]T`. For `indices[i]` in `[0, N-1]`, `result[i] = table[indices[i]]`
|
||||
on ARM, on WebAssembly (16-byte), and on the scalar emulation fallback.
|
||||
On x86 vectors larger than 16 bytes, the operation is lane-local; see Notes.
|
||||
|
||||
Operation:
|
||||
Operation (in-range indices, non-lane-local platforms):
|
||||
|
||||
for i in 0 ..< len(indices) {
|
||||
masked_index := indices[i] & (len(table) - 1)
|
||||
result[i] = table[masked_index]
|
||||
for i in 0 ..< N {
|
||||
result[i] = table[indices[i]] // indices[i] assumed to be in [0, N-1]
|
||||
}
|
||||
return result
|
||||
|
||||
Notes:
|
||||
- **Out-of-range indices** (`indices[i] >= N`) are platform-defined:
|
||||
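ARM `tbl`/`vtbl` and WASM `swizzle` return 0; x86 `pshufb` returns 0 only when bit 7 of the index is set (otherwise it uses the low 4 bits); the scalar fallback wraps via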
|
||||
`indices[i] & (N-1)`. Mask explicitly if you need portable behavior.
|
||||
- **x86 wide vectors are lane-local.** `vpshufb` treats a 256-bit AVX2
|
||||
register as two independent 128-bit lanes; `pshuf.b.512` treats a
|
||||
512-bit AVX-512 register as four. An index in lane `k` can only address
|
||||
table entries in that same lane. AVX2 Example: Lane 0 is [0..15], Lane 1
|
||||
is [16..31]. With `indices[20] = 5`, `result[20]` is `table[21]`, not `table[5]`.
|
||||
ARM `tbl` and the emulation fallback are cross-lane across the full table.
|
||||
- **Hardware acceleration is conditional.** It requires 8-bit integer
|
||||
elements, a vector size supported by the target, and the relevant
|
||||
target features enabled (`ssse3` / `avx2` / `avx512f`+`avx512bw` on x86,
|
||||
`neon` on ARM). Otherwise this falls back to scalar emulation. Non-byte
|
||||
element types always use the scalar fallback.
|
||||
|
||||
Implementation:
|
||||
|
||||
| Platform | Lane Size | Implementation |
|
||||
|-------------|-------------------------------------------|---------------------|
|
||||
| x86-64 | pshufb (16B), vpshufb (32B), AVX512 (64B) | Single vector |
|
||||
| ARM64 | tbl1 (16B), tbl2 (32B), tbl4 (64B) | Automatic splitting |
|
||||
| ARM32 | vtbl1 (8B), vtbl2 (16B), vtbl4 (32B) | Automatic splitting |
|
||||
| WebAssembly | i8x16.swizzle (16B), Emulation (>16B) | Mixed |
|
||||
| Other | Emulation | Software |
|
||||
| Platform | Hardware path (8-bit elements) | Notes |
|
||||
|-------------|-------------------------------------------------|-------------------------------|
|
||||
| x86-64      | pshufb (16B), vpshufb (32B), pshuf.b.512 (64B)  | 32 / 64 B are lane-local      |
|
||||
| ARM64 | tbl1 (16B), tbl2 (32B), tbl4 (64B) | >16 B: split into 16-B chunks |
|
||||
| ARM32 | vtbl1 (8B), vtbl2 (16B), vtbl4 (32B) | >8 B: split into 8-B chunks |
|
||||
| WebAssembly | i8x16.swizzle (16B) | Other sizes: emulated |
|
||||
| Other | Scalar emulation | |
|
||||
|
||||
Example:
|
||||
|
||||
@@ -2482,12 +2497,11 @@ Example:
|
||||
import "core:fmt"
|
||||
|
||||
runtime_swizzle_example :: proc() {
|
||||
table := simd.u8x16{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
|
||||
table := simd.u8x16{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
|
||||
indices := simd.u8x16{15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}
|
||||
result := simd.runtime_swizzle(table, indices)
|
||||
fmt.println(result) // Expected: {15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}
|
||||
result := simd.runtime_swizzle(table, indices)
|
||||
fmt.println(result) // {15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}
|
||||
}
|
||||
|
||||
*/
|
||||
runtime_swizzle :: intrinsics.simd_runtime_swizzle
|
||||
|
||||
|
||||
@@ -2078,67 +2078,79 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
|
||||
lb_add_attribute_to_proc_with_string(p->module, p->value, str_lit("target-features"), str_lit("+neon"));
|
||||
}
|
||||
|
||||
// Handle ARM's multi-swizzle intrinsics by splitting the src vector
|
||||
if (build_context.metrics.arch == TargetArch_arm64 && count > 16) {
|
||||
// ARM64 TBL2/TBL3/TBL4: Split src into multiple 16-byte vectors
|
||||
int num_tables = cast(int)(count / 16);
|
||||
GB_ASSERT_MSG(count % 16 == 0, "ARM64 src size must be multiple of 16 bytes, got %lld bytes", count);
|
||||
GB_ASSERT_MSG(num_tables <= 4, "ARM64 NEON supports maximum 4 tables (tbl4), got %d tables for %lld-byte vector", num_tables, count);
|
||||
|
||||
LLVMValueRef src_parts[4]; // Max 4 tables for tbl4
|
||||
// Handle ARM's multi-swizzle intrinsics by splitting the src vector.
|
||||
// tbl[N]/vtbl[N] only produce <16 x i8>/<8 x i8>, so we issue one call
|
||||
// per output chunk and concat the results.
|
||||
if ((build_context.metrics.arch == TargetArch_arm64 && count > 16) ||
|
||||
(build_context.metrics.arch == TargetArch_arm32 && count > 8)) {
|
||||
bool is_arm64 = build_context.metrics.arch == TargetArch_arm64;
|
||||
i64 lane = is_arm64 ? 16 : 8;
|
||||
int num_tables = cast(int)(count / lane);
|
||||
GB_ASSERT_MSG(count % lane == 0, "ARM src size must be multiple of %lld bytes, got %lld bytes", lane, count);
|
||||
GB_ASSERT_MSG(num_tables <= 4, "ARM NEON supports maximum 4 tables, got %d tables for %lld-byte vector", num_tables, count);
|
||||
|
||||
LLVMTypeRef i32_type = LLVMInt32TypeInContext(p->module->ctx);
|
||||
LLVMTypeRef i8_type = LLVMInt8TypeInContext(p->module->ctx);
|
||||
LLVMTypeRef vN_type = LLVMVectorType(i8_type, cast(unsigned)lane);
|
||||
|
||||
// Split src/indices into N lane-sized chunks
|
||||
LLVMValueRef src_parts[4];
|
||||
LLVMValueRef idx_parts[4];
|
||||
for (int i = 0; i < num_tables; i++) {
|
||||
// Extract 16-byte slice from the larger src
|
||||
LLVMValueRef indices_for_extract[16];
|
||||
for (int j = 0; j < 16; j++) {
|
||||
indices_for_extract[j] = LLVMConstInt(LLVMInt32TypeInContext(p->module->ctx), i * 16 + j, false);
|
||||
LLVMValueRef mask[16];
|
||||
for (int j = 0; j < lane; j++) {
|
||||
mask[j] = LLVMConstInt(i32_type, cast(unsigned)(i * lane + j), false);
|
||||
}
|
||||
LLVMValueRef extract_mask = LLVMConstVector(indices_for_extract, 16);
|
||||
src_parts[i] = LLVMBuildShuffleVector(p->builder, src, LLVMGetUndef(LLVMTypeOf(src)), extract_mask, "");
|
||||
LLVMValueRef shuffle_mask = LLVMConstVector(mask, cast(unsigned)lane);
|
||||
src_parts[i] = LLVMBuildShuffleVector(p->builder, src, LLVMGetUndef(LLVMTypeOf(src)), shuffle_mask, "");
|
||||
idx_parts[i] = LLVMBuildShuffleVector(p->builder, indices, LLVMGetUndef(LLVMTypeOf(indices)), shuffle_mask, "");
|
||||
}
|
||||
|
||||
// Call appropriate ARM64 tbl intrinsic
|
||||
if (count == 32) {
|
||||
LLVMValueRef args[3] = { src_parts[0], src_parts[1], indices };
|
||||
res.value = lb_call_intrinsic(p, intrinsic_name, args, 3, nullptr, 0);
|
||||
} else if (count == 48) {
|
||||
LLVMValueRef args[4] = { src_parts[0], src_parts[1], src_parts[2], indices };
|
||||
res.value = lb_call_intrinsic(p, intrinsic_name, args, 4, nullptr, 0);
|
||||
} else if (count == 64) {
|
||||
LLVMValueRef args[5] = { src_parts[0], src_parts[1], src_parts[2], src_parts[3], indices };
|
||||
res.value = lb_call_intrinsic(p, intrinsic_name, args, 5, nullptr, 0);
|
||||
|
||||
// One tbl/vtbl call per output chunk; same N tables, different indices.
|
||||
// ARM64 tbl[N] is overloaded on result type; ARM32 vtbl[N] has fixed <8 x i8>.
|
||||
LLVMTypeRef overload_types[1] = { vN_type };
|
||||
LLVMTypeRef *overloads_arg = is_arm64 ? overload_types : nullptr;
|
||||
unsigned overloads_count = is_arm64 ? 1 : 0;
|
||||
LLVMValueRef out_parts[4];
|
||||
for (int c = 0; c < num_tables; c++) {
|
||||
LLVMValueRef args[5];
|
||||
for (int i = 0; i < num_tables; i++) args[i] = src_parts[i];
|
||||
args[num_tables] = idx_parts[c];
|
||||
out_parts[c] = lb_call_intrinsic(p, intrinsic_name, args, num_tables + 1, overloads_arg, overloads_count);
|
||||
}
|
||||
} else if (build_context.metrics.arch == TargetArch_arm32 && count > 8) {
|
||||
// ARM32 VTBL2/VTBL3/VTBL4: Split src into multiple 8-byte vectors
|
||||
int num_tables = cast(int)count / 8;
|
||||
GB_ASSERT_MSG(count % 8 == 0, "ARM32 src size must be multiple of 8 bytes, got %lld bytes", count);
|
||||
GB_ASSERT_MSG(num_tables <= 4, "ARM32 NEON supports maximum 4 tables (vtbl4), got %d tables for %lld-byte vector", num_tables, count);
|
||||
|
||||
LLVMValueRef src_parts[4]; // Max 4 tables for vtbl4
|
||||
for (int i = 0; i < num_tables; i++) {
|
||||
// Extract 8-byte slice from the larger src
|
||||
LLVMValueRef indices_for_extract[8];
|
||||
for (int j = 0; j < 8; j++) {
|
||||
indices_for_extract[j] = LLVMConstInt(LLVMInt32TypeInContext(p->module->ctx), i * 8 + j, false);
|
||||
|
||||
// Concat out_parts[0..num_tables) into <count x i8> by pair-wise
|
||||
// shufflevector. shufflevector requires equal-sized operands, so we
|
||||
// pad the right-hand chunk to acc_size with undef on each step.
|
||||
LLVMValueRef acc = out_parts[0];
|
||||
i64 acc_size = lane;
|
||||
for (int c = 1; c < num_tables; c++) {
|
||||
LLVMValueRef rhs = out_parts[c];
|
||||
if (acc_size > lane) {
|
||||
LLVMValueRef pad[64];
|
||||
for (i64 k = 0; k < acc_size; k++) {
|
||||
pad[k] = (k < lane) ? LLVMConstInt(i32_type, cast(unsigned)k, false) : LLVMGetUndef(i32_type);
|
||||
}
|
||||
rhs = LLVMBuildShuffleVector(p->builder, rhs, LLVMGetUndef(LLVMTypeOf(rhs)), LLVMConstVector(pad, cast(unsigned)acc_size), "");
|
||||
}
|
||||
LLVMValueRef extract_mask = LLVMConstVector(indices_for_extract, 8);
|
||||
src_parts[i] = LLVMBuildShuffleVector(p->builder, src, LLVMGetUndef(LLVMTypeOf(src)), extract_mask, "");
|
||||
}
|
||||
|
||||
// Call appropriate ARM32 vtbl intrinsic
|
||||
if (count == 16) {
|
||||
LLVMValueRef args[3] = { src_parts[0], src_parts[1], indices };
|
||||
res.value = lb_call_intrinsic(p, intrinsic_name, args, 3, nullptr, 0);
|
||||
} else if (count == 24) {
|
||||
LLVMValueRef args[4] = { src_parts[0], src_parts[1], src_parts[2], indices };
|
||||
res.value = lb_call_intrinsic(p, intrinsic_name, args, 4, nullptr, 0);
|
||||
} else if (count == 32) {
|
||||
LLVMValueRef args[5] = { src_parts[0], src_parts[1], src_parts[2], src_parts[3], indices };
|
||||
res.value = lb_call_intrinsic(p, intrinsic_name, args, 5, nullptr, 0);
|
||||
i64 new_size = acc_size + lane;
|
||||
LLVMValueRef concat[64];
|
||||
for (i64 k = 0; k < acc_size; k++) concat[k] = LLVMConstInt(i32_type, cast(unsigned)k, false);
|
||||
for (i64 k = 0; k < lane; k++) concat[acc_size + k] = LLVMConstInt(i32_type, cast(unsigned)(acc_size+k), false);
|
||||
acc = LLVMBuildShuffleVector(p->builder, acc, rhs, LLVMConstVector(concat, cast(unsigned)new_size), "");
|
||||
acc_size = new_size;
|
||||
}
|
||||
res.value = acc;
|
||||
} else {
|
||||
// Single runtime swizzle case (x86, WebAssembly, ARM single-table)
|
||||
LLVMValueRef args[2] = { src, indices };
|
||||
res.value = lb_call_intrinsic(p, intrinsic_name, args, gb_count_of(args), nullptr, 0);
|
||||
if (build_context.metrics.arch == TargetArch_arm64) {
|
||||
// ARM64 tbl1 is overloaded on result type; others are fixed
|
||||
LLVMTypeRef overload_types[1] = { LLVMTypeOf(indices) };
|
||||
res.value = lb_call_intrinsic(p, intrinsic_name, args, gb_count_of(args), overload_types, 1);
|
||||
} else {
|
||||
res.value = lb_call_intrinsic(p, intrinsic_name, args, gb_count_of(args), nullptr, 0);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
} else {
|
||||
|
||||
Reference in New Issue
Block a user