diff --git a/core/simd/simd.odin b/core/simd/simd.odin index 7313babcd..1631c4d23 100644 --- a/core/simd/simd.odin +++ b/core/simd/simd.odin @@ -2441,40 +2441,55 @@ Graphically, the operation looks as follows. The `t` and `f` represent the select :: intrinsics.simd_select /* -Runtime Equivalent to Shuffle. +Hardware-level dynamic swizzle / table lookup. -Performs element-wise table lookups using runtime indices. -Each element in the indices vector selects an element from the table vector. -The indices are automatically masked to prevent out-of-bounds access. - -This operation is hardware-accelerated on most platforms when using 8-bit -integer vectors. For other element types or unsupported vector sizes, it -falls back to software emulation. +For each output lane `i`, picks an element from `table` using `indices[i]` +as the source position. Maps directly onto hardware table-lookup instructions +where available (`tbl` on ARM, `pshufb` on x86, `swizzle` on WASM). This is +the runtime counterpart of `simd.shuffle`. Inputs: -- `table`: The lookup table vector (should be power-of-2 size for correct masking). -- `indices`: The indices vector (automatically masked to valid range). +- `table`: a `#simd[N]T` lookup table, where `T` is an integer type and + `N` is a power of two (enforced by `#simd`). +- `indices`: a `#simd[N]T` of source positions. Must have the exact same + type as `table`. Returns: -- A vector where `result[i] = table[indices[i] & (table_size-1)]`. +- A `#simd[N]T`. For `indices[i]` in `[0, N-1]`, `result[i] = table[indices[i]]` + on ARM, on WebAssembly (16-byte), and on the scalar emulation fallback. + On x86 vectors larger than 16 bytes, the operation is lane-local; see Notes. 
-Operation: +Operation (in-range indices, non-lane-local platforms): - for i in 0 ..< len(indices) { - masked_index := indices[i] & (len(table) - 1) - result[i] = table[masked_index] + for i in 0 ..< N { + result[i] = table[indices[i]] // indices[i] assumed to be in [0, N-1] } - return result + +Notes: +- **Out-of-range indices** (`indices[i] >= N`) are platform-defined: + ARM `tbl` and WASM `swizzle` return 0; x86 `pshufb` returns 0 only when bit 7 of the index is set and otherwise uses its low 4 bits; the scalar fallback wraps via + `indices[i] & (N-1)`. Mask explicitly if you need portable behavior. +- **x86 wide vectors are lane-local.** `vpshufb` treats a 256-bit AVX2 + register as two independent 128-bit lanes; its AVX-512BW form (LLVM `pshuf.b.512`) treats a + 512-bit AVX-512 register as four. An index in lane `k` can only address + table entries in that same lane. AVX2 example: lane 0 is [0..15], lane 1 + is [16..31]. With `indices[20] = 5`, `result[20]` is `table[21]`, not `table[5]`. + ARM `tbl` and the emulation fallback are cross-lane across the full table. +- **Hardware acceleration is conditional.** It requires 8-bit integer + elements, a vector size supported by the target, and the relevant + target features enabled (`ssse3` / `avx2` / `avx512f`+`avx512bw` on x86, + `neon` on ARM). Otherwise this falls back to scalar emulation. Non-byte + element types always use the scalar fallback. 
Implementation: - | Platform | Lane Size | Implementation | - |-------------|-------------------------------------------|---------------------| - | x86-64 | pshufb (16B), vpshufb (32B), AVX512 (64B) | Single vector | - | ARM64 | tbl1 (16B), tbl2 (32B), tbl4 (64B) | Automatic splitting | - | ARM32 | vtbl1 (8B), vtbl2 (16B), vtbl4 (32B) | Automatic splitting | - | WebAssembly | i8x16.swizzle (16B), Emulation (>16B) | Mixed | - | Other | Emulation | Software | + | Platform | Hardware path (8-bit elements) | Notes | + |-------------|-------------------------------------------------|-------------------------------| + | x86-64 | pshufb (16B), vpshufb (32B), pshuf.b.512 (64B) | 32 / 64 B are lane-local | + | ARM64 | tbl1 (16B), tbl2 (32B), tbl4 (64B) | >16 B: split into 16-B chunks | + | ARM32 | vtbl1 (8B), vtbl2 (16B), vtbl4 (32B) | >8 B: split into 8-B chunks | + | WebAssembly | i8x16.swizzle (16B) | Other sizes: emulated | + | Other | Scalar emulation | | Example: @@ -2482,12 +2497,11 @@ Example: import "core:fmt" runtime_swizzle_example :: proc() { - table := simd.u8x16{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} + table := simd.u8x16{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} indices := simd.u8x16{15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14} - result := simd.runtime_swizzle(table, indices) - fmt.println(result) // Expected: {15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14} + result := simd.runtime_swizzle(table, indices) + fmt.println(result) // {15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14} } - */ runtime_swizzle :: intrinsics.simd_runtime_swizzle diff --git a/src/llvm_backend_proc.cpp b/src/llvm_backend_proc.cpp index bed7ad6ac..63ca868ec 100644 --- a/src/llvm_backend_proc.cpp +++ b/src/llvm_backend_proc.cpp @@ -2078,67 +2078,79 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn lb_add_attribute_to_proc_with_string(p->module, p->value, str_lit("target-features"), str_lit("+neon")); } - // 
Handle ARM's multi-swizzle intrinsics by splitting the src vector - if (build_context.metrics.arch == TargetArch_arm64 && count > 16) { - // ARM64 TBL2/TBL3/TBL4: Split src into multiple 16-byte vectors - int num_tables = cast(int)(count / 16); - GB_ASSERT_MSG(count % 16 == 0, "ARM64 src size must be multiple of 16 bytes, got %lld bytes", count); - GB_ASSERT_MSG(num_tables <= 4, "ARM64 NEON supports maximum 4 tables (tbl4), got %d tables for %lld-byte vector", num_tables, count); - - LLVMValueRef src_parts[4]; // Max 4 tables for tbl4 + // Handle ARM's multi-swizzle intrinsics by splitting the src vector. + // tbl[N]/vtbl[N] only produce <16 x i8>/<8 x i8>, so we issue one call + // per output chunk and concat the results. + if ((build_context.metrics.arch == TargetArch_arm64 && count > 16) || + (build_context.metrics.arch == TargetArch_arm32 && count > 8)) { + bool is_arm64 = build_context.metrics.arch == TargetArch_arm64; + i64 lane = is_arm64 ? 16 : 8; + int num_tables = cast(int)(count / lane); + GB_ASSERT_MSG(count % lane == 0, "ARM src size must be multiple of %lld bytes, got %lld bytes", lane, count); + GB_ASSERT_MSG(num_tables <= 4, "ARM NEON supports maximum 4 tables, got %d tables for %lld-byte vector", num_tables, count); + + LLVMTypeRef i32_type = LLVMInt32TypeInContext(p->module->ctx); + LLVMTypeRef i8_type = LLVMInt8TypeInContext(p->module->ctx); + LLVMTypeRef vN_type = LLVMVectorType(i8_type, cast(unsigned)lane); + + // Split src/indices into N lane-sized chunks + LLVMValueRef src_parts[4]; + LLVMValueRef idx_parts[4]; for (int i = 0; i < num_tables; i++) { - // Extract 16-byte slice from the larger src - LLVMValueRef indices_for_extract[16]; - for (int j = 0; j < 16; j++) { - indices_for_extract[j] = LLVMConstInt(LLVMInt32TypeInContext(p->module->ctx), i * 16 + j, false); + LLVMValueRef mask[16]; + for (int j = 0; j < lane; j++) { + mask[j] = LLVMConstInt(i32_type, cast(unsigned)(i * lane + j), false); } - LLVMValueRef extract_mask = 
LLVMConstVector(indices_for_extract, 16); - src_parts[i] = LLVMBuildShuffleVector(p->builder, src, LLVMGetUndef(LLVMTypeOf(src)), extract_mask, ""); + LLVMValueRef shuffle_mask = LLVMConstVector(mask, cast(unsigned)lane); + src_parts[i] = LLVMBuildShuffleVector(p->builder, src, LLVMGetUndef(LLVMTypeOf(src)), shuffle_mask, ""); + idx_parts[i] = LLVMBuildShuffleVector(p->builder, indices, LLVMGetUndef(LLVMTypeOf(indices)), shuffle_mask, ""); } - - // Call appropriate ARM64 tbl intrinsic - if (count == 32) { - LLVMValueRef args[3] = { src_parts[0], src_parts[1], indices }; - res.value = lb_call_intrinsic(p, intrinsic_name, args, 3, nullptr, 0); - } else if (count == 48) { - LLVMValueRef args[4] = { src_parts[0], src_parts[1], src_parts[2], indices }; - res.value = lb_call_intrinsic(p, intrinsic_name, args, 4, nullptr, 0); - } else if (count == 64) { - LLVMValueRef args[5] = { src_parts[0], src_parts[1], src_parts[2], src_parts[3], indices }; - res.value = lb_call_intrinsic(p, intrinsic_name, args, 5, nullptr, 0); + + // One tbl/vtbl call per output chunk; same N tables, different indices. + // ARM64 tbl[N] is overloaded on result type; ARM32 vtbl[N] has fixed <8 x i8>. + LLVMTypeRef overload_types[1] = { vN_type }; + LLVMTypeRef *overloads_arg = is_arm64 ? overload_types : nullptr; + unsigned overloads_count = is_arm64 ? 
1 : 0; + LLVMValueRef out_parts[4]; + for (int c = 0; c < num_tables; c++) { + LLVMValueRef args[5]; + for (int i = 0; i < num_tables; i++) args[i] = src_parts[i]; + args[num_tables] = idx_parts[c]; + out_parts[c] = lb_call_intrinsic(p, intrinsic_name, args, num_tables + 1, overloads_arg, overloads_count); } - } else if (build_context.metrics.arch == TargetArch_arm32 && count > 8) { - // ARM32 VTBL2/VTBL3/VTBL4: Split src into multiple 8-byte vectors - int num_tables = cast(int)count / 8; - GB_ASSERT_MSG(count % 8 == 0, "ARM32 src size must be multiple of 8 bytes, got %lld bytes", count); - GB_ASSERT_MSG(num_tables <= 4, "ARM32 NEON supports maximum 4 tables (vtbl4), got %d tables for %lld-byte vector", num_tables, count); - - LLVMValueRef src_parts[4]; // Max 4 tables for vtbl4 - for (int i = 0; i < num_tables; i++) { - // Extract 8-byte slice from the larger src - LLVMValueRef indices_for_extract[8]; - for (int j = 0; j < 8; j++) { - indices_for_extract[j] = LLVMConstInt(LLVMInt32TypeInContext(p->module->ctx), i * 8 + j, false); + + // Concat out_parts[0..num_tables) into one count-element i8 vector by pair-wise + // shufflevector. shufflevector requires equal-sized operands, so we + // pad the right-hand chunk to acc_size with undef on each step. + LLVMValueRef acc = out_parts[0]; + i64 acc_size = lane; + for (int c = 1; c < num_tables; c++) { + LLVMValueRef rhs = out_parts[c]; + if (acc_size > lane) { + LLVMValueRef pad[64]; + for (i64 k = 0; k < acc_size; k++) { + pad[k] = (k < lane) ? 
LLVMConstInt(i32_type, cast(unsigned)k, false) : LLVMGetUndef(i32_type); + } + rhs = LLVMBuildShuffleVector(p->builder, rhs, LLVMGetUndef(LLVMTypeOf(rhs)), LLVMConstVector(pad, cast(unsigned)acc_size), ""); } - LLVMValueRef extract_mask = LLVMConstVector(indices_for_extract, 8); - src_parts[i] = LLVMBuildShuffleVector(p->builder, src, LLVMGetUndef(LLVMTypeOf(src)), extract_mask, ""); - } - - // Call appropriate ARM32 vtbl intrinsic - if (count == 16) { - LLVMValueRef args[3] = { src_parts[0], src_parts[1], indices }; - res.value = lb_call_intrinsic(p, intrinsic_name, args, 3, nullptr, 0); - } else if (count == 24) { - LLVMValueRef args[4] = { src_parts[0], src_parts[1], src_parts[2], indices }; - res.value = lb_call_intrinsic(p, intrinsic_name, args, 4, nullptr, 0); - } else if (count == 32) { - LLVMValueRef args[5] = { src_parts[0], src_parts[1], src_parts[2], src_parts[3], indices }; - res.value = lb_call_intrinsic(p, intrinsic_name, args, 5, nullptr, 0); + i64 new_size = acc_size + lane; + LLVMValueRef concat[64]; + for (i64 k = 0; k < acc_size; k++) concat[k] = LLVMConstInt(i32_type, cast(unsigned)k, false); + for (i64 k = 0; k < lane; k++) concat[acc_size + k] = LLVMConstInt(i32_type, cast(unsigned)(acc_size+k), false); + acc = LLVMBuildShuffleVector(p->builder, acc, rhs, LLVMConstVector(concat, cast(unsigned)new_size), ""); + acc_size = new_size; } + res.value = acc; } else { // Single runtime swizzle case (x86, WebAssembly, ARM single-table) LLVMValueRef args[2] = { src, indices }; - res.value = lb_call_intrinsic(p, intrinsic_name, args, gb_count_of(args), nullptr, 0); + if (build_context.metrics.arch == TargetArch_arm64) { + // ARM64 tbl1 is overloaded on result type; others are fixed + LLVMTypeRef overload_types[1] = { LLVMTypeOf(indices) }; + res.value = lb_call_intrinsic(p, intrinsic_name, args, gb_count_of(args), overload_types, 1); + } else { + res.value = lb_call_intrinsic(p, intrinsic_name, args, gb_count_of(args), nullptr, 0); + } } return res; } 
else {