diff --git a/core/simd/x86/avx.odin b/core/simd/x86/avx.odin new file mode 100644 index 000000000..c274f4903 --- /dev/null +++ b/core/simd/x86/avx.odin @@ -0,0 +1,1852 @@ +#+build i386, amd64 +package simd_x86 + +import "base:intrinsics" + +// Adds packed double-precision (64-bit) floating-point elements in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_add_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + return intrinsics.simd_add(a, b) +} + +// Adds packed single-precision (32-bit) floating-point elements in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_add_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + return intrinsics.simd_add(a, b) +} + +// Computes the bitwise AND of a packed double-precision (64-bit) floating-point elements in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_and_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + a := transmute(#simd[4]u64)a + b := transmute(#simd[4]u64)b + return transmute(__m256d)intrinsics.simd_bit_and(a, b) +} + +// Computes the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_and_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + a := transmute(#simd[8]u32)a + b := transmute(#simd[8]u32)b + return transmute(__m256)intrinsics.simd_bit_and(a, b) +} + +// Computes the bitwise OR packed double-precision (64-bit) floating-point elements in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_or_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + a := transmute(#simd[4]u64)a + b := transmute(#simd[4]u64)b + return transmute(__m256d)intrinsics.simd_bit_or(a, b) +} + +// Computes the bitwise OR packed single-precision (32-bit) floating-point elements in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_or_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + a := transmute(#simd[8]u32)a + b := transmute(#simd[8]u32)b + return transmute(__m256)intrinsics.simd_bit_or(a, b) +} + +// Shuffles double-precision (64-bit) floating-point elements within 128-bit lanes using the control in `imm8`. +@(require_results, enable_target_feature="avx") +_mm256_shuffle_pd :: #force_inline proc "c" (a, b: __m256d, $MASK: u8) -> __m256d { + return intrinsics.simd_shuffle( + a, + b, + MASK & 1, + ((MASK >> 1) & 1) + 4, + ((MASK >> 2) & 1) + 2, + ((MASK >> 3) & 1) + 6, + ) +} + + +// Shuffles single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using the control in `imm8`. +@(require_results, enable_target_feature="avx") +_mm256_shuffle_ps :: #force_inline proc "c" (a, b: __m256, $MASK: u8) -> __m256 { + return intrinsics.simd_shuffle( + a, + b, + MASK & 0b11, + (MASK >> 2) & 0b11, + ((MASK >> 4) & 0b11) + 8, + ((MASK >> 6) & 0b11) + 8, + (MASK & 0b11) + 4, + ((MASK >> 2) & 0b11) + 4, + ((MASK >> 4) & 0b11) + 12, + ((MASK >> 6) & 0b11) + 12, + ) +} + + + +// Computes the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`, and then AND with `b`. +@(require_results, enable_target_feature="avx") +_mm256_andnot_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + a := transmute(#simd[4]u64)a + b := transmute(#simd[4]u64)b + return transmute(__m256d)intrinsics.simd_bit_and(intrinsics.simd_bit_xor((#simd[4]u64)(0), a), b) +} + +// Computes the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`. +@(require_results, enable_target_feature="avx") +_mm256_andnot_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + a := transmute(#simd[8]u32)a + b := transmute(#simd[8]u32)b + return transmute(__m256)intrinsics.simd_bit_and(intrinsics.simd_bit_xor((#simd[8]u32)(0), a), b) +} + + + +// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and returns packed maximum values +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd) +@(require_results, enable_target_feature="avx") +_mm256_max_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + return llvm_vmaxpd(a, b) +} + +// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and returns packed maximum values +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps) +@(require_results, enable_target_feature="avx") +_mm256_max_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + return llvm_vmaxps(a, b) +} + +// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and returns packed minimum values +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd) +@(require_results, enable_target_feature="avx") +_mm256_min_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + return llvm_vminpd(a, b) +} + +// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and returns packed minimum values +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps) +@(require_results, enable_target_feature="avx") +_mm256_min_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + return llvm_vminps(a, b) +} + + + +// Multiplies packed double-precision (64-bit) floating-point elements in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_mul_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + return intrinsics.simd_mul(a, b) +} + +// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_mul_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + return intrinsics.simd_mul(a, b) +} + +// Alternatively adds and subtracts packed double-precision (64-bit) floating-point elements in `a` to/from packed elements in `b`. +@(require_results, enable_target_feature="avx") +_mm256_addsub_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + add := intrinsics.simd_add(a, b) + sub := intrinsics.simd_sub(a, b) + return intrinsics.simd_shuffle(add, sub, 4, 1, 6, 3) +} + + +// Alternatively adds and subtracts packed single-precision (32-bit) floating-point elements in `a` to/from packed elements in `b`. +@(require_results, enable_target_feature="avx") +_mm256_addsub_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + add := intrinsics.simd_add(a, b) + sub := intrinsics.simd_sub(a, b) + return intrinsics.simd_shuffle(add, sub, 8, 1, 10, 3, 12, 5, 14, 7) +} + + +// Subtracts packed double-precision (64-bit) floating-point elements in `b` +// from packed elements in `a`. +@(require_results, enable_target_feature="avx") +_mm256_sub_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + return intrinsics.simd_sub(a, b) +} + +// Subtracts packed single-precision (32-bit) floating-point elements in `b` +// from packed elements in `a`. +@(require_results, enable_target_feature="avx") +_mm256_sub_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + return intrinsics.simd_sub(a, b) +} + +// Computes the division of each of the 8 packed 32-bit floating-point elements +// in `a` by the corresponding packed elements in `b`. +@(require_results, enable_target_feature="avx") +_mm256_div_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + return intrinsics.simd_div(a, b) +} + +// Computes the division of each of the 4 packed 64-bit floating-point elements +// in `a` by the corresponding packed elements in `b`. +@(require_results, enable_target_feature="avx") +_mm256_div_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + return intrinsics.simd_div(a, b) +} + + + +// Rounds packed double-precision (64-bit) floating point elements in `a` +// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: +// +// - `0x00`: Round to the nearest whole number. +// - `0x01`: Round down, toward negative infinity. +// - `0x02`: Round up, toward positive infinity. +// - `0x03`: Truncate the values. +// +// For a complete list of options, check [the LLVM docs][llvm_docs]. +// +// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd) +@(require_results, enable_target_feature="avx") +_mm256_round_pd :: #force_inline proc "c" (a: __m256d, $ROUNDING: u8) -> __m256d where ROUNDING < 16 { + return llvm_roundpd256(a, ROUNDING) +} + +// Rounds packed double-precision (64-bit) floating point elements in `a` +// toward positive infinity. +@(require_results, enable_target_feature="avx") +_mm256_ceil_pd :: #force_inline proc "c" (a: __m256d) -> __m256d { + return intrinsics.simd_ceil(a) +} + +// Rounds packed double-precision (64-bit) floating point elements in `a` +// toward negative infinity. +@(require_results, enable_target_feature="avx") +_mm256_floor_pd :: #force_inline proc "c" (a: __m256d) -> __m256d { + return intrinsics.simd_floor(a) +} + + + +// Rounds packed single-precision (32-bit) floating point elements in `a` +// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: +// +// - `0x00`: Round to the nearest whole number. +// - `0x01`: Round down, toward negative infinity. +// - `0x02`: Round up, toward positive infinity. +// - `0x03`: Truncate the values. +// +// For a complete list of options, check [the LLVM docs][llvm_docs]. +// +// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps) +@(require_results, enable_target_feature="avx") +_mm256_round_ps :: #force_inline proc(a: __m256, $ROUNDING: u8) -> __m256 where ROUNDING < 16 { + return llvm_roundps256(a, u32(ROUNDING)) +} + +// Rounds packed single-precision (32-bit) floating point elements in `a` +// toward positive infinity. +@(require_results, enable_target_feature="avx") +_mm256_ceil_ps :: #force_inline proc "c" (a: __m256) -> __m256 { + return intrinsics.simd_ceil(a) +} + +// Rounds packed single-precision (32-bit) floating point elements in `a` +// toward negative infinity. +@(require_results, enable_target_feature="avx") +_mm256_floor_ps :: #force_inline proc "c" (a: __m256) -> __m256 { + return intrinsics.simd_floor(a) +} + +// Returns the square root of packed single-precision (32-bit) floating point elements in `a`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps) +@(require_results, enable_target_feature="avx") +_mm256_sqrt_ps :: #force_inline proc "c" (a: __m256) -> __m256 { + return intrinsics.sqrt(a) +} + +// Returns the square root of packed double-precision (64-bit) floating point elements in `a`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd) +@(require_results, enable_target_feature="avx") +_mm256_sqrt_pd :: #force_inline proc "c" (a: __m256d) -> __m256d { + return intrinsics.sqrt(a) +} + + + +// Blends packed double-precision (64-bit) floating-point elements from +// `a` and `b` using control mask `imm8`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd) +@(require_results, enable_target_feature="avx") +_mm256_blend_pd :: #force_inline proc "c" (a, b: __m256d, $IIM4: u32) -> __m256d where IMM4 < 16 { + return intrinsics.simd_shuffle( + a, + b, + ((IMM4 >> 0) & 1) * 4 + 0, + ((IMM4 >> 1) & 1) * 4 + 1, + ((IMM4 >> 2) & 1) * 4 + 2, + ((IMM4 >> 3) & 1) * 4 + 3, + ) +} + +// Blends packed single-precision (32-bit) floating-point elements from +// `a` and `b` using control mask `imm8`. +@(require_results, enable_target_feature="avx") +_mm256_blend_ps :: #force_inline proc "c" (a, b: __m256, $IMM8: u8) -> __m256 { + return intrinsics.simd_shuffle( + a, + b, + ((IMM8 >> 0) & 1) * 8 + 0, + ((IMM8 >> 1) & 1) * 8 + 1, + ((IMM8 >> 2) & 1) * 8 + 2, + ((IMM8 >> 3) & 1) * 8 + 3, + ((IMM8 >> 4) & 1) * 8 + 4, + ((IMM8 >> 5) & 1) * 8 + 5, + ((IMM8 >> 6) & 1) * 8 + 6, + ((IMM8 >> 7) & 1) * 8 + 7, + ) +} + + + +// Blends packed double-precision (64-bit) floating-point elements from +// `a` and `b` using `c` as a mask. +@(require_results, enable_target_feature="avx") +_mm256_blendv_pd :: #force_inline proc "c" (a, b: __m256d, c: __m256d) -> __m256d { + mask := intrinsics.simd_lanes_lt(transmute(#simd[4]i64)c, 0) + return intrinsics.simd_select(mask, b, a) +} + +// Blends packed single-precision (32-bit) floating-point elements from +// `a` and `b` using `c` as a mask. +@(require_results, enable_target_feature="avx") +_mm256_blendv_ps :: #force_inline proc "c" (a, b: __m256, c: __m256) -> __m256 { + mask := intrinsics.simd_lanes_lt(transmute(#simd[8]i32)c, 0) + return intrinsics.simd_select(mask, b, a) +} + + + +// Conditionally multiplies the packed single-precision (32-bit) floating-point elements in `a` and `b` using the high 4 bits in `imm8`, +// sum the four products, and conditionally return the sum +// using the low 4 bits of `imm8`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps) +@(require_results, enable_target_feature="avx") +_mm256_dp_ps :: #force_inline proc "c" (a, b: __m256, $IMM8: i8) -> __m256 { + return llvm_vdpps(a, b, IMM8) +} + +// Horizontal addition of adjacent pairs in the two packed vectors +// of 4 64-bit floating points `a` and `b`. +// In the result, sums of elements from `a` are returned in even locations, +// while sums of elements from `b` are returned in odd locations. +@(require_results, enable_target_feature="avx") +_mm256_hadd_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + even := intrinsics.simd_shuffle(a, b, 0, 4, 2, 6) + odd := intrinsics.simd_shuffle(a, b, 1, 5, 3, 7) + return intrinsics.simd_add(even, odd) +} + + + +// Horizontal addition of adjacent pairs in the two packed vectors +// of 8 32-bit floating points `a` and `b`. +// In the result, sums of elements from `a` are returned in locations of +// indices 0, 1, 4, 5; while sums of elements from `b` are locations +// 2, 3, 6, 7. +@(require_results, enable_target_feature="avx") +_mm256_hadd_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + even := intrinsics.simd_shuffle(a, b, 0, 2, 8, 10, 4, 6, 12, 14) + odd := intrinsics.simd_shuffle(a, b, 1, 3, 9, 11, 5, 7, 13, 15) + return intrinsics.simd_add(even, odd) +} + +// Horizontal subtraction of adjacent pairs in the two packed vectors +// of 4 64-bit floating points `a` and `b`. +// In the result, sums of elements from `a` are returned in even locations, +// while sums of elements from `b` are returned in odd locations. +@(require_results, enable_target_feature="avx") +_mm256_hsub_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + even := intrinsics.simd_shuffle(a, b, 0, 4, 2, 6) + odd := intrinsics.simd_shuffle(a, b, 1, 5, 3, 7) + return intrinsics.simd_sub(even, odd) +} + +// Horizontal subtraction of adjacent pairs in the two packed vectors +// of 8 32-bit floating points `a` and `b`. +// In the result, sums of elements from `a` are returned in locations of +// indices 0, 1, 4, 5; while sums of elements from `b` are locations +// 2, 3, 6, 7. +@(require_results, enable_target_feature="avx") +_mm256_hsub_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + even := intrinsics.simd_shuffle(a, b, 0, 2, 8, 10, 4, 6, 12, 14) + odd := intrinsics.simd_shuffle(a, b, 1, 3, 9, 11, 5, 7, 13, 15) + return intrinsics.simd_sub(even, odd) +} + +// Computes the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_xor_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + a := transmute(#simd[4]u64)a + b := transmute(#simd[4]u64)b + return transmute(__m256d)intrinsics.simd_bit_xor(a, b) +} + +// Computes the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_xor_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + a := transmute(#simd[8]u32)a + b := transmute(#simd[8]u32)b + return transmute(__m256)intrinsics.simd_bit_xor(a, b) +} + + + +_CMP_EQ_OQ :: 0x00 // Equal (ordered, non-signaling) +_CMP_LT_OS :: 0x01 // Less-than (ordered, signaling) +_CMP_LE_OS :: 0x02 // Less-than-or-equal (ordered, signaling) +_CMP_UNORD_Q :: 0x03 // Unordered (non-signaling) +_CMP_NEQ_UQ :: 0x04 // Not-equal (unordered, non-signaling) +_CMP_NLT_US :: 0x05 // Not-less-than (unordered, signaling) +_CMP_NLE_US :: 0x06 // Not-less-than-or-equal (unordered, signaling) +_CMP_ORD_Q :: 0x07 // Ordered (non-signaling) +_CMP_EQ_UQ :: 0x08 // Equal (unordered, non-signaling) +_CMP_NGE_US :: 0x09 // Not-greater-than-or-equal (unordered, signaling) +_CMP_NGT_US :: 0x0a // Not-greater-than (unordered, signaling) +_CMP_FALSE_OQ :: 0x0b // False (ordered, non-signaling) +_CMP_NEQ_OQ :: 0x0c // Not-equal (ordered, non-signaling) +_CMP_GE_OS :: 0x0d // Greater-than-or-equal (ordered, signaling) +_CMP_GT_OS :: 0x0e // Greater-than (ordered, signaling) +_CMP_TRUE_UQ :: 0x0f // True (unordered, non-signaling) +_CMP_EQ_OS :: 0x10 // Equal (ordered, signaling) +_CMP_LT_OQ :: 0x11 // Less-than (ordered, non-signaling) +_CMP_LE_OQ :: 0x12 // Less-than-or-equal (ordered, non-signaling) +_CMP_UNORD_S :: 0x13 // Unordered (signaling) +_CMP_NEQ_US :: 0x14 // Not-equal (unordered, signaling) +_CMP_NLT_UQ :: 0x15 // Not-less-than (unordered, non-signaling) +_CMP_NLE_UQ :: 0x16 // Not-less-than-or-equal (unordered, non-signaling) +_CMP_ORD_S :: 0x17 // Ordered (signaling) +_CMP_EQ_US :: 0x18 // Equal (unordered, signaling) +_CMP_NGE_UQ :: 0x19 // Not-greater-than-or-equal (unordered, non-signaling) +_CMP_NGT_UQ :: 0x1a // Not-greater-than (unordered, non-signaling) +_CMP_FALSE_OS :: 0x1b // False (ordered, signaling) +_CMP_NEQ_OS :: 0x1c // Not-equal (ordered, signaling) +_CMP_GE_OQ :: 0x1d // Greater-than-or-equal (ordered, non-signaling) +_CMP_GT_OQ :: 0x1e // Greater-than (ordered, non-signaling) +_CMP_TRUE_US :: 0x1f // True (unordered, signaling) + + + +// Compares packed double-precision (64-bit) floating-point elements in `a` and `b` based on the comparison operand specified by `IMM5`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd) +@(require_results, enable_target_feature="avx") +_mm_cmp_pd :: #force_inline proc "c" (a, b: __m128d, $IMM5: u8) -> __m128d where IMM5 < 32 { + return llvm_vcmppd(a, b, IMM5) +} + +// Compares packed double-precision (64-bit) floating-point elements in `a` and `b` based on the comparison operand specified by `IMM5`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd) +@(require_results, enable_target_feature="avx") +_mm256_cmp_pd :: #force_inline proc "c" (a, b: __m256d, $IMM5: u8) -> __m256d where IMM5 < 32 { + return llvm_vcmppd256(a, b, IMM5) +} + +// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` based on the comparison operand specified by `IMM5`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps) +@(require_results, enable_target_feature="avx") +_mm_cmp_ps :: #force_inline proc "c" (a: __m128, b: __m128, $IMM5: u8) -> __m128 where IMM5 < 32 { + return llvm_vcmpps(a, b, IMM5) +} + +// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` based on the comparison operand specified by `IMM5`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps) +@(require_results, enable_target_feature="avx") +_mm256_cmp_ps :: #force_inline proc "c" (a, b: __m256, $IMM5: u8) -> __m256 where IMM5 < 32 { + return llvm_vcmpps256(a, b, IMM5) +} + +// Compares the lower double-precision (64-bit) floating-point element in +// `a` and `b` based on the comparison operand specified by `IMM5`, +// store the result in the lower element of returned vector, +// and copies the upper element from `a` to the upper element of returned +// vector. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd) +@(require_results, enable_target_feature="avx") +_mm_cmp_sd :: #force_inline proc "c" (a, b: __m128d, $IMM5: u8) -> __m128d where IMM5 < 32 { + return llvm_vcmpsd(a, b, IMM5) +} + +// Compares the lower single-precision (32-bit) floating-point element in +// `a` and `b` based on the comparison operand specified by `IMM5`, +// store the result in the lower element of returned vector, +// and copies the upper 3 packed elements from `a` to the upper elements of +// returned vector. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss) +@(require_results, enable_target_feature="avx") +_mm_cmp_ss :: #force_inline proc "c" (a: __m128, b: __m128, $IMM5: u8) -> __m128 where IMM5 < 32 { + return llvm_vcmpss(a, b, IMM5) +} + +// Converts packed 32-bit integers in `a` to packed double-precision (64-bit) floating-point elements. +@(require_results, enable_target_feature="avx") +_mm256_cvtepi32_pd :: #force_inline proc "c" (a: __m128i) -> __m256d { + return __m256d(transmute(#simd[4]i32)a) +} + +// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements. +@(require_results, enable_target_feature="avx") +_mm256_cvtepi32_ps :: #force_inline proc "c" (a: __m256i) -> __m256 { + return __m256(transmute(#simd[8]i32)a) +} + +// Converts packed double-precision (64-bit) floating-point elements in `a` to packed single-precision (32-bit) floating-point elements. +@(require_results, enable_target_feature="avx") +_mm256_cvtpd_ps :: #force_inline proc "c" (a: __m256d) -> __m128 { + return __m128(a) +} + +// Converts packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32) +@(require_results, enable_target_feature="avx") +_mm256_cvtps_epi32 :: #force_inline proc "c" (a: __m256) -> __m256i { + return transmute(__m256i)llvm_vcvtps2dq(a) +} + +// Converts packed single-precision (32-bit) floating-point elements in `a` to packed double-precision (64-bit) floating-point elements. +@(require_results, enable_target_feature="avx") +_mm256_cvtps_pd :: #force_inline proc "c" (a: __m128) -> __m256d { + return __m256d(a) +} + +// Returns the first element of the input vector of `[4 x double]`. +@(require_results, enable_target_feature="avx") +_mm256_cvtsd_f64 :: #force_inline proc "c" (a: __m256d) -> f64 { + return intrinsics.simd_extract(a, 0) +} + +// Converts packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32) +@(require_results, enable_target_feature="avx") +_mm256_cvttpd_epi32 :: #force_inline proc "c" (a: __m256d) -> __m128i { + return transmute(__m128i)llvm_vcvttpd2dq(a) +} + +// Converts packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32) +@(require_results, enable_target_feature="avx") +_mm256_cvtpd_epi32 :: #force_inline proc "c" (a: __m256d) -> __m128i { + return transmute(__m128i)llvm_vcvtpd2dq(a) +} + +// Converts packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32) +@(require_results, enable_target_feature="avx") +_mm256_cvttps_epi32 :: #force_inline proc "c" (a: __m256) -> __m256i { + return transmute(__m256i)llvm_vcvttps2dq(a) +} + + + +// Extracts 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a`, selected with `imm8`. +@(require_results, enable_target_feature="avx") +_mm256_extractf128_ps :: #force_inline proc "c" (a: __m256, $IMM1: u8) -> __m128 where IMM1 < 2 { + when IMM1 == 0 { + return intrinsics.simd_shuffle(a, _mm256_undefined_ps(), 0, 1, 2, 3) + } else { + return intrinsics.simd_shuffle(a, _mm256_undefined_ps(), 4, 5, 6, 7) + } +} + +// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a`, selected with `imm8`. +@(require_results, enable_target_feature="avx") +_mm256_extractf128_pd :: #force_inline proc "c" (a: __m256d, $IMM1: u8) -> __m128d where IMM1 < 2 { + when IMM1 == 0 { + return intrinsics.simd_shuffle(a, _mm256_undefined_pd(), 0, 1) + } else { + return intrinsics.simd_shuffle(a, _mm256_undefined_pd(), 2, 3) + } +} + +// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`. +@(require_results, enable_target_feature="avx") +_mm256_extractf128_si256 :: #force_inline proc "c" (a: __m256i, $IMM1: u8) -> __m128i where IMM1 < 2 { + when IMM1 == 0 { + dst := intrinsics.simd_shuffle(transmute(#simd[4]i64)a, (#simd[4]i64)(0), 0, 1) + return transmute(__m128i)dst + } else { + dst := intrinsics.simd_shuffle(transmute(#simd[4]i64)a, (#simd[4]i64)(0), 2, 3) + return transmute(__m128i)dst + } +} + +// Extracts a 32-bit integer from `a`, selected with `INDEX`. +@(require_results, enable_target_feature="avx") +_mm256_extract_epi32 :: #force_inline proc "c" (a: __m256i, $INDEX: u8) -> i32 where INDEX < 8 { + return intrinsics.simd_extract(transmute(#simd[8]i32)a, INDEX) +} + +@(require_results, enable_target_feature="avx") +_mm256_cvtsi256_si32 :: #force_inline proc "c" (a: __m256i) -> i32 { + return intrinsics.simd_extract(transmute(#simd[8]i32)a, 0) +} + +// Zeroes the contents of all XMM or YMM registers. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall) +@(enable_target_feature="avx") +_mm256_zeroall :: #force_inline proc "c" () { + llvm_vzeroall() +} + +// Zeroes the upper 128 bits of all YMM registers; the lower 128-bits of the registers are unmodified. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper) +@(enable_target_feature="avx") +_mm256_zeroupper :: #force_inline proc "c" () { + llvm_vzeroupper() +} + +// Shuffles single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using the control in `b`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps) +@(require_results, enable_target_feature="avx") +_mm256_permutevar_ps :: #force_inline proc "c" (a: __m256, b: __m256i) -> __m256 { + return llvm_vpermilps256(a, transmute(#simd[8]i32)b) +} + +// Shuffles single-precision (32-bit) floating-point elements in `a` using the control in `b`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps) +@(require_results, enable_target_feature="avx") +_mm_permutevar_ps :: #force_inline proc "c" (a: __m128, b: __m128i) -> __m128 { + return llvm_vpermilps(a, transmute(#simd[4]i32)b) +} + +// Shuffles single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using the control in `imm8`. +@(require_results, enable_target_feature="avx") +_mm256_permute_ps :: #force_inline proc "c" (a: __m256, $IMM8: u8) -> __m256 { + return intrinsics.simd_shuffle( + a, + _mm256_undefined_ps(), + (IMM8 >> 0) & 0b11, + (IMM8 >> 2) & 0b11, + (IMM8 >> 4) & 0b11, + (IMM8 >> 6) & 0b11, + ((IMM8 >> 0) & 0b11) + 4, + ((IMM8 >> 2) & 0b11) + 4, + ((IMM8 >> 4) & 0b11) + 4, + ((IMM8 >> 6) & 0b11) + 4, + ) +} + +// Shuffles single-precision (32-bit) floating-point elements in `a` using the control in `imm8`. +@(require_results, enable_target_feature="avx") +_mm_permute_ps :: #force_inline proc "c" (a: __m128, $IMM8: u8) -> __m128 { + return intrinsics.simd_shuffle( + a, + _mm_undefined_ps(), + (IMM8 >> 0) & 0b11, + (IMM8 >> 2) & 0b11, + (IMM8 >> 4) & 0b11, + (IMM8 >> 6) & 0b11, + ) +} + +// Shuffles double-precision (64-bit) floating-point elements in `a` within 256-bit lanes using the control in `b`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd) +@(require_results, enable_target_feature="avx") +_mm256_permutevar_pd :: #force_inline proc "c" (a: __m256d, b: __m256i) -> __m256d { + return llvm_vpermilpd256(a, b) +} + +// Shuffles double-precision (64-bit) floating-point elements in `a` using the control in `b`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd) +@(require_results, enable_target_feature="avx") +_mm_permutevar_pd :: #force_inline proc "c" (a: __m128d, b: __m128i) -> __m128d { + return llvm_vpermilpd(a, b) +} + +// Shuffles double-precision (64-bit) floating-point elements in `a` within 128-bit lanes using the control in `imm8`. +@(require_results, enable_target_feature="avx") +_mm256_permute_pd :: #force_inline proc "c" (a: __m256d, $IMM4: u8) -> __m256d where IMM4 < 16 { + return intrinsics.simd_shuffle( + a, + _mm256_undefined_pd(), + ((IMM4 >> 0) & 1), + ((IMM4 >> 1) & 1), + ((IMM4 >> 2) & 1) + 2, + ((IMM4 >> 3) & 1) + 2, + ) +} + +// Shuffles double-precision (64-bit) floating-point elements in `a` using the control in `imm8`. +@(require_results, enable_target_feature="avx") +_mm_permute_pd :: #force_inline proc "c" (a: __m128d, $IMM2: u8) -> __m128d where IMM2 < 4 { + return intrinsics.simd_shuffle( + a, + _mm_undefined_pd(), + (IMM2) & 1, + (IMM2 >> 1) & 1, + ) +} + + + +// Shuffles 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) selected by `imm8` from `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_permute2f128_ps :: #force_inline proc "c" (a, b: __m256, $IMM8: u8) -> __m256 { + return _mm256_castsi256_ps(_mm256_permute2f128_si256( + _mm256_castps_si256(a), + _mm256_castps_si256(b), + IMM8, + )) +} + +// Shuffles 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) selected by `imm8` from `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_permute2f128_pd :: #force_inline proc "c" (a, b: __m256d, $IMM8: u8) -> __m256d { + _mm256_castsi256_pd(_mm256_permute2f128_si256( + _mm256_castpd_si256(a), + _mm256_castpd_si256(b), + IMM8, + )) +} + +// Shuffles 128-bits (composed of integer data) selected by `imm8` from `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_permute2f128_si256 :: #force_inline proc "c" (a, b: __m256i, $IMM8: u8) -> __m256i { + r := intrinsics.simd_shuffle( + a, + b, + 2 * ((IMM8 & 0xf) & 0b11) + 0, + 2 * ((IMM8 & 0xf) & 0b11) + 1, + + 2 * (((IMM8 & 0xf0) >> 4) & 0b11) + 0, + 2 * (((IMM8 & 0xf0) >> 4) & 0b11) + 1, + ) + return intrinsics.simd_shuffle( + r, + __m256i(0), + + 4 if ((IMM8 & 0xf) & 0b1000) != 0 else 0, + 4 if ((IMM8 & 0xf) & 0b1000) != 0 else 1, + + 4 if (((IMM8 & 0xf0)>>4) & 0b1000) != 0 else 2, + 4 if (((IMM8 & 0xf0)>>4) & 0b1000) != 0 else 3, + ) +} + +// Broadcasts a single-precision (32-bit) floating-point element from memory to all elements of the returned vector. +@(require_results, enable_target_feature="avx") +_mm256_broadcast_ss :: #force_inline proc "c" (f: ^f32) -> __m256 { + return _mm256_set1_ps(f^) +} + +// Broadcasts a single-precision (32-bit) floating-point element from memory to all elements of the returned vector. +@(require_results, enable_target_feature="sse,avx") +_mm_broadcast_ss :: #force_inline proc "c" (f: ^f32) -> __m128 { + return _mm_set1_ps(f^) +} + +// Broadcasts a double-precision (64-bit) floating-point element from memory to all elements of the returned vector. +@(require_results, enable_target_feature="avx") +_mm256_broadcast_sd :: #force_inline proc "c" (f: ^f64) -> __m256d { + return _mm256_set1_pd(f^) +} + +// Broadcasts 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of the returned vector. +@(require_results, enable_target_feature="sse,avx") +_mm256_broadcast_ps :: #force_inline proc "c" (a: ^__m128) -> __m256 { + return intrinsics.simd_shuffle(a^, _mm_setzero_ps(), 0, 1, 2, 3, 0, 1, 2, 3) +} + +// Broadcasts 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of the returned vector. +@(require_results, enable_target_feature="sse2,avx") +_mm256_broadcast_pd :: #force_inline proc "c" (a: ^__m128d) -> __m256d { + return intrinsics.simd_shuffle(a^, _mm_setzero_pd(), 0, 1, 0, 1) +} + +// Copies `a` to result, then inserts 128 bits (composed of 4 packed +// single-precision (32-bit) floating-point elements) from `b` into result +// at the location specified by `imm8`. +@(require_results, enable_target_feature="sse,avx") +_mm256_insertf128_ps :: #force_inline proc "c" (a: __m256, b: __m128, $IMM1: u8) -> __m256 where IMM1 < 2 { + when IMM1 == 0 { + return intrinsics.simd_shuffle( + a, + _mm256_castps128_ps256(b), + 8, 9, 10, 11, 4, 5, 6, 7, + ) + } else { + return intrinsics.simd_shuffle( + a, + _mm256_castps128_ps256(b), + 0, 1, 2, 3, 8, 9, 10, 11, + ) + } +} + +// Copies `a` to result, then inserts 128 bits (composed of 2 packed +// double-precision (64-bit) floating-point elements) from `b` into result +// at the location specified by `imm8`. +@(require_results, enable_target_feature="sse2,avx") +_mm256_insertf128_pd :: #force_inline proc "c" (a: __m256d, b: __m128d, $IMM1: u8) -> __m256d where IMM1 < 2 { + when IMM1 == 0 { + return intrinsics.simd_shuffle( + a, + _mm256_castpd128_pd256(b), + 4, 5, 2, 3, + ) + } else { + return intrinsics.simd_shuffle( + a, + _mm256_castpd128_pd256(b), + 0, 1, 4, 5, + ) + } +} + +// Copies `a` to result, then inserts 128 bits from `b` into result at the location specified by `imm8`. +@(require_results, enable_target_feature="avx") +_mm256_insertf128_si256 :: #force_inline proc "c" (a: __m256i, b: __m128i, $IMM1: u8) -> __m256i where IMM1 < 2 { + when IMM1 == 0 { + return intrinsics.simd_shuffle( + a, + _mm256_castsi128_si256(b), + 4, 5, 2, 3, + ) + } else { + return intrinsics.simd_shuffle( + a, + _mm256_castsi128_si256(b), + 0, 1, 4, 5, + ) + } +} + +// Copies `a` to result, and inserts the 8-bit integer `i` into result at the location specified by `index`. +@(require_results, enable_target_feature="avx") +_mm256_insert_epi8 :: #force_inline proc "c" (a: __m256i, i: i8, $INDEX: u8) -> __m256i where INDEX < 32 { + return transmute(__m256i)intrinsics.simd_replace(transmute(#simd[32]i8)a, INDEX, i) +} + +// Copies `a` to result, and inserts the 16-bit integer `i` into result at the location specified by `index`. +@(require_results, enable_target_feature="avx") +_mm256_insert_epi16 :: #force_inline proc "c" (a: __m256i, i: i16, $INDEX: u8) -> __m256i where INDEX < 16 { + return transmute(__m256i)intrinsics.simd_replace(transmute(#simd[16]i16)a, INDEX, i) +} + +// Copies `a` to result, and inserts the 32-bit integer `i` into result at the location specified by `index`. +@(require_results, enable_target_feature="avx") +_mm256_insert_epi32 :: #force_inline proc "c" (a: __m256i, i: i32, $INDEX: u8) -> __m256i where INDEX < 8 { + return transmute(__m256i)intrinsics.simd_replace(transmute(#simd[8]i32)a, INDEX, i) +} + + + +// Loads 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into result. +// `mem_addr` must be aligned on a 32-byte boundary or a +// general-protection exception may be generated. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd) +@(require_results, enable_target_feature="avx") +_mm256_load_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m256d { + return (^__m256d)(mem_addr)^ +} + +// Stores 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from `a` into memory. +// `mem_addr` must be aligned on a 32-byte boundary or a +// general-protection exception may be generated. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd) +@(enable_target_feature="avx") +_mm256_store_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m256d) { + (^__m256d)(mem_addr)^ = a +} + +// Loads 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into result. +// `mem_addr` must be aligned on a 32-byte boundary or a +// general-protection exception may be generated. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps) +@(require_results, enable_target_feature="avx") +_mm256_load_ps :: #force_inline proc "c" (mem_addr: ^f32) -> __m256 { + return (^__m256)(mem_addr)^ +} + +// Stores 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from `a` into memory. +// `mem_addr` must be aligned on a 32-byte boundary or a +// general-protection exception may be generated. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps) +@(enable_target_feature="avx") +_mm256_store_ps :: #force_inline proc "c" (mem_addr: ^f32, a: __m256) { + (^__m256)(mem_addr)^ = a +} + +// Loads 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into result. +// `mem_addr` does not need to be aligned on any particular boundary. +@(enable_target_feature="avx") +_mm256_loadu_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m256d { + return intrinsics.unaligned_load((^__m256d)(mem_addr)) +} + +// Stores 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from `a` into memory. +// `mem_addr` does not need to be aligned on any particular boundary. +@(enable_target_feature="avx") +_mm256_storeu_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m256d) { + intrinsics.unaligned_store((^__m256d)(mem_addr), a) +} + +// Loads 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into result. +// `mem_addr` does not need to be aligned on any particular boundary. +@(require_results, enable_target_feature="avx") +_mm256_loadu_ps :: #force_inline proc "c" (mem_addr: ^f32) -> __m256 { + return intrinsics.unaligned_load((^__m256)(mem_addr)) +} + +// Stores 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from `a` into memory. +// `mem_addr` does not need to be aligned on any particular boundary. +@(enable_target_feature="avx") +_mm256_storeu_ps :: #force_inline proc "c" (mem_addr: ^f32, a: __m256) { + intrinsics.unaligned_store((^__m256)(mem_addr), a) +} + +// Loads 256-bits of integer data from memory into result. +// `mem_addr` must be aligned on a 32-byte boundary or a +// general-protection exception may be generated. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256) +@(require_results, enable_target_feature="avx") +_mm256_load_si256 :: #force_inline proc "c" (mem_addr: ^__m256i) -> __m256i { + return mem_addr^ +} + +// Stores 256-bits of integer data from `a` into memory. +// `mem_addr` must be aligned on a 32-byte boundary or a +// general-protection exception may be generated. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256) +@(enable_target_feature="avx") +_mm256_store_si256 :: #force_inline proc "c" (mem_addr: ^__m256i, a: __m256i) { + mem_addr^ = a +} + +// Loads 256-bits of integer data from memory into result. +// `mem_addr` does not need to be aligned on any particular boundary. +@(require_results, enable_target_feature="avx") +_mm256_loadu_si256 :: #force_inline proc "c" (mem_addr: ^__m256i) -> __m256i { + return intrinsics.unaligned_load(mem_addr) +} + +// Stores 256-bits of integer data from `a` into memory. +// `mem_addr` does not need to be aligned on any particular boundary. +@(enable_target_feature="avx") +_mm256_storeu_si256 :: #force_inline proc "c" (mem_addr: ^__m256i, a: __m256i) { + intrinsics.unaligned_store(mem_addr, a) +} + +// Loads packed double-precision (64-bit) floating-point elements from memory +// into result using `mask` (elements are zeroed out when the high bit of the +// corresponding element is not set). +@(require_results, enable_target_feature="avx") +_mm256_maskload_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m256i) -> __m256d { + mask_mask := intrinsics.simd_shr(mask, 63) + return intrinsics.simd_masked_load(mem_addr, _mm256_setzero_pd(), mask_mask) +} + +// Stores packed double-precision (64-bit) floating-point elements from `a` +// into memory using `mask`. +@(enable_target_feature="avx") +_mm256_maskstore_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m256i, a: __m256d) { + mask_mask := intrinsics.simd_shr(mask, 63) + intrinsics.simd_masked_store(mem_addr, a, mask_mask) +} + + +// Loads packed double-precision (64-bit) floating-point elements from memory +// into result using `mask` (elements are zeroed out when the high bit of the +// corresponding element is not set). +@(require_results, enable_target_feature="sse2,avx") +_mm_maskload_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m128i) -> __m128d { + mask_mask := intrinsics.simd_shr(mask, 63) + return intrinsics.simd_masked_load(mem_addr, _mm_setzero_pd(), mask_mask) +} + +// Stores packed double-precision (64-bit) floating-point elements from `a` +// into memory using `mask`. +@(enable_target_feature="avx") +_mm_maskstore_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m128i, a: __m128d) { + mask_mask := intrinsics.simd_shr(mask, 63) + intrinsics.simd_masked_store(mem_addr, a, mask_mask) +} + +// Loads packed single-precision (32-bit) floating-point elements from memory +// into result using `mask` (elements are zeroed out when the high bit of the +// corresponding element is not set). +@(require_results, enable_target_feature="avx") +_mm256_maskload_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m256i) -> __m256 { + mask_mask := intrinsics.simd_shr(transmute(#simd[8]i32)mask, 31) + return intrinsics.simd_masked_load(mem_addr, _mm256_setzero_ps(), mask_mask) +} + +// Stores packed single-precision (32-bit) floating-point elements from `a` +// into memory using `mask`. +@(enable_target_feature="avx") +_mm256_maskstore_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m256i, a: __m256) { + mask_mask := intrinsics.simd_shr(transmute(#simd[8]i32)mask, 31) + intrinsics.simd_masked_store(mem_addr, a, mask_mask) +} + +// Loads packed single-precision (32-bit) floating-point elements from memory +// into result using `mask` (elements are zeroed out when the high bit of the +// corresponding element is not set). +@(require_results, enable_target_feature="sse,avx") +_mm_maskload_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m128i) -> __m128 { + mask_mask := intrinsics.simd_shr(transmute(#simd[4]i32)mask, 31) + return intrinsics.simd_masked_load(mem_addr, _mm_setzero_ps(), mask_mask) +} + +// Stores packed single-precision (32-bit) floating-point elements from `a` +// into memory using `mask`. +@(enable_target_feature="avx") +_mm_maskstore_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m128i, a: __m128) { + mask_mask := intrinsics.simd_shr(transmute(#simd[4]i32)mask, 31) + intrinsics.simd_masked_store(mem_addr, a, mask_mask) +} + + + + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`, and returns the results. +@(require_results, enable_target_feature="avx") +_mm256_movehdup_ps :: #force_inline proc "c" (a: __m256) -> __m256 { + return intrinsics.simd_shuffle(a, a, 1, 1, 3, 3, 5, 5, 7, 7) +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`, and returns the results. +@(require_results, enable_target_feature="avx") +_mm256_moveldup_ps :: #force_inline proc "c" (a: __m256) -> __m256 { + return intrinsics.simd_shuffle(a, a, 0, 0, 2, 2, 4, 4, 6, 6) +} + +// Duplicate even-indexed double-precision (64-bit) floating-point elements from `a`, and returns the results. +@(require_results, enable_target_feature="avx") +_mm256_movedup_pd :: #force_inline proc "c" (a: __m256d) -> __m256d { + return intrinsics.simd_shuffle(a, a, 0, 0, 2, 2) +} + + +// Loads 256-bits of integer data from unaligned memory into result. +// This intrinsic may perform better than `_mm256_loadu_si256` when the +// data crosses a cache line boundary. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256) +@(require_results, enable_target_feature="avx") +_mm256_lddqu_si256 :: #force_inline proc "c" (mem_addr: ^__m256i) -> __m256i { + return transmute(__m256i)llvm_vlddqu(mem_addr) +} + +// Moves integer data from a 256-bit integer vector to a 32-byte +// aligned memory location. To minimize caching, the data is flagged as +// non-temporal (unlikely to be used again soon) +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256) +// +// # Safety of non-temporal stores +// +// After using this intrinsic, but before any other access to the memory that this intrinsic +// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +// return. +// +// See [`_mm_sfence`] for details. +@(enable_target_feature="avx") +_mm256_stream_si256 :: #force_inline proc "c" (mem_addr: ^__m256i, a: __m256i) { + panic_contextless("TODO: _mm256_stream_si256") +} + +// Moves double-precision values from a 256-bit vector of `[4 x double]` +// to a 32-byte aligned memory location. To minimize caching, the data is +// flagged as non-temporal (unlikely to be used again soon). +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd) +// +// # Safety of non-temporal stores +// +// After using this intrinsic, but before any other access to the memory that this intrinsic +// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +// return. +// +// See [`_mm_sfence`] for details. +@(enable_target_feature="avx") +_mm256_stream_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m256d) { + panic_contextless("TODO: _mm256_stream_pd") +} + +// Moves single-precision floating point values from a 256-bit vector +// of `[8 x float]` to a 32-byte aligned memory location. To minimize +// caching, the data is flagged as non-temporal (unlikely to be used again +// soon). +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps) +// +// # Safety of non-temporal stores +// +// After using this intrinsic, but before any other access to the memory that this intrinsic +// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In +// particular, functions that call this intrinsic should generally call `_mm_sfence` before they +// return. +// +// See [`_mm_sfence`] for details. +@(enable_target_feature="avx") +_mm256_stream_ps :: #force_inline proc "c" (mem_addr: ^f32, a: __m256) { + panic_contextless("TODO: _mm256_stream_ps") +} + +// Computes the approximate reciprocal of packed single-precision (32-bit) floating-point elements in `a`, and returns the results. The maximum +// relative error for this approximation is less than 1.5*2^-12. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps) +@(require_results, enable_target_feature="avx") +_mm256_rcp_ps :: #force_inline proc "c" (a: __m256) -> __m256 { + return llvm_vrcpps(a) +} + +// Computes the approximate reciprocal square root of packed single-precision +// (32-bit) floating-point elements in `a`, and returns the results. +// The maximum relative error for this approximation is less than 1.5*2^-12. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps) +@(require_results, enable_target_feature="avx") +_mm256_rsqrt_ps :: #force_inline proc "c" (a: __m256) -> __m256 { + return llvm_vrsqrtps(a) +} + + + +// Unpacks and interleave double-precision (64-bit) floating-point elements +// from the high half of each 128-bit lane in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_unpackhi_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + return intrinsics.simd_shuffle(a, b, 1, 5, 3, 7) +} + +// Unpacks and interleave single-precision (32-bit) floating-point elements +// from the high half of each 128-bit lane in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_unpackhi_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + return intrinsics.simd_shuffle(a, b, 2, 10, 3, 11, 6, 14, 7, 15) +} + +// Unpacks and interleave double-precision (64-bit) floating-point elements +// from the low half of each 128-bit lane in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_unpacklo_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d { + return intrinsics.simd_shuffle(a, b, 0, 4, 2, 6) +} + +// Unpacks and interleave single-precision (32-bit) floating-point elements +// from the low half of each 128-bit lane in `a` and `b`. +@(require_results, enable_target_feature="avx") +_mm256_unpacklo_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 { + return intrinsics.simd_shuffle(a, b, 0, 8, 1, 9, 4, 12, 5, 13) +} + +// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +// the result is zero, otherwise set `CF` to 0. Return the `ZF` value. +@(require_results, enable_target_feature="avx") +_mm256_testz_si256 :: #force_inline proc "c" (a, b: __m256i) -> i32 { + r := intrinsics.simd_bit_and(a, b) + return i32(0 == intrinsics.simd_reduce_or(r)) +} + +// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +// the result is zero, otherwise set `CF` to 0. Return the `CF` value. +@(require_results, enable_target_feature="avx") +_mm256_testc_si256 :: #force_inline proc "c" (a, b: __m256i) -> i32 { + r := intrinsics.simd_bit_and(intrinsics.simd_bit_xor(a, __m256i(~i64(0))), b) + return i32(0 == intrinsics.simd_reduce_or(r)) +} + + + +// Computes the bitwise AND of 256 bits (representing integer data) in `a` and +// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. +// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if +// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and +// `CF` values are zero, otherwise return 0. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256) +@(require_results, enable_target_feature="avx") +_mm256_testnzc_si256 :: #force_inline proc "c" (a, b: __m256i) -> i32 { + return llvm_ptestnzc256(a, b) +} + +// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit +// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +// NOT of `a` and then AND with `b`, producing an intermediate value, and set +// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +// is zero, otherwise set `CF` to 0. Return the `ZF` value. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd) +@(require_results, enable_target_feature="avx") +_mm256_testz_pd :: #force_inline proc "c" (a, b: __m256d) -> i32 { + return llvm_vtestzpd256(a, b) +} + +// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit +// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +// NOT of `a` and then AND with `b`, producing an intermediate value, and set +// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +// is zero, otherwise set `CF` to 0. Return the `CF` value. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd) +@(require_results, enable_target_feature="avx") +_mm256_testc_pd :: #force_inline proc "c" (a, b: __m256d) -> i32 { + return llvm_vtestcpd256(a, b) +} + +// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit +// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +// NOT of `a` and then AND with `b`, producing an intermediate value, and set +// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +// are zero, otherwise return 0. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd) +@(require_results, enable_target_feature="avx") +_mm256_testnzc_pd :: #force_inline proc "c" (a, b: __m256d) -> i32 { + return llvm_vtestnzcpd256(a, b) +} + +// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit +// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +// NOT of `a` and then AND with `b`, producing an intermediate value, and set +// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +// is zero, otherwise set `CF` to 0. Return the `ZF` value. +@(require_results, enable_target_feature="sse2,avx") +_mm_testz_pd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + r := intrinsics.simd_lanes_lt(transmute(__m128i)_mm_and_pd(a, b), __m128i(0)) + return i32(0 == intrinsics.simd_reduce_or(r)) +} + +// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit +// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +// NOT of `a` and then AND with `b`, producing an intermediate value, and set +// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +// is zero, otherwise set `CF` to 0. Return the `CF` value. +@(require_results, enable_target_feature="sse2,avx") +_mm_testc_pd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + r := intrinsics.simd_lanes_lt(transmute(__m128i)_mm_andnot_pd(a, b), __m128i(0)) + return i32(0 == intrinsics.simd_reduce_or(r)) +} + +// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit +// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the +// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +// NOT of `a` and then AND with `b`, producing an intermediate value, and set +// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value +// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +// are zero, otherwise return 0. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd) +@(require_results, enable_target_feature="avx") +_mm_testnzc_pd :: #force_inline proc "c" (a, b: __m128d) -> i32 { + return llvm_vtestnzcpd(a, b) +} + +// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit +// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +// NOT of `a` and then AND with `b`, producing an intermediate value, and set +// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +// is zero, otherwise set `CF` to 0. Return the `ZF` value. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps) +@(require_results, enable_target_feature="avx") +_mm256_testz_ps :: #force_inline proc "c" (a, b: __m256) -> i32 { + return llvm_vtestzps256(a, b) +} + +// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit +// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +// NOT of `a` and then AND with `b`, producing an intermediate value, and set +// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +// is zero, otherwise set `CF` to 0. Return the `CF` value. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps) +@(require_results, enable_target_feature="avx") +_mm256_testc_ps :: #force_inline proc "c" (a, b: __m256) -> i32 { + return llvm_vtestcps256(a, b) +} + +// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit +// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +// NOT of `a` and then AND with `b`, producing an intermediate value, and set +// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +// are zero, otherwise return 0. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps) +@(require_results, enable_target_feature="avx") +_mm256_testnzc_ps :: #force_inline proc "c" (a, b: __m256) -> i32 { + return llvm_vtestnzcps256(a, b) +} + +// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit +// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +// NOT of `a` and then AND with `b`, producing an intermediate value, and set +// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +// is zero, otherwise set `CF` to 0. Return the `ZF` value. +@(require_results, enable_target_feature="sse,avx") +_mm_testz_ps :: #force_inline proc "c" (a: __m128, b: __m128) -> i32 { + r := intrinsics.simd_lanes_lt(transmute(#simd[4]i32)_mm_and_ps(a, b), (#simd[4]i32)(0)) + return i32(0 == intrinsics.simd_reduce_or(r)) +} + +// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit +// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +// NOT of `a` and then AND with `b`, producing an intermediate value, and set +// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +// is zero, otherwise set `CF` to 0. Return the `CF` value. +@(require_results, enable_target_feature="sse,avx") +_mm_testc_ps :: #force_inline proc "c" (a: __m128, b: __m128) -> i32 { + r := intrinsics.simd_lanes_lt(transmute(#simd[4]i32)_mm_andnot_ps(a, b), (#simd[4]i32)(0)) + return i32(0 == intrinsics.simd_reduce_or(r)) +} + +// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit +// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the +// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise +// NOT of `a` and then AND with `b`, producing an intermediate value, and set +// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value +// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values +// are zero, otherwise return 0. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps) +@(require_results, enable_target_feature="avx") +_mm_testnzc_ps :: #force_inline proc "c" (a: __m128, b: __m128) -> i32 { + return llvm_vtestnzcps(a, b) +} + +// Sets each bit of the returned mask based on the most significant bit of the +// corresponding packed double-precision (64-bit) floating-point element in +// `a`. +@(require_results, enable_target_feature="avx") +_mm256_movemask_pd :: #force_inline proc "c" (a: __m256d) -> i32 { + mask := intrinsics.simd_lanes_lt(transmute(#simd[4]i64)a, (#simd[4]i64)(0)) + return i32(transmute(u8)intrinsics.simd_extract_lsbs(mask)) +} + +// Sets each bit of the returned mask based on the most significant bit of the +// corresponding packed single-precision (32-bit) floating-point element in +// `a`. +@(require_results, enable_target_feature="avx") +_mm256_movemask_ps :: #force_inline proc "c" (a: __m256) -> i32 { + // Propagate the highest bit to the rest, because simd_bitmask + // requires all-1 or all-0. + mask := intrinsics.simd_lanes_lt(transmute(#simd[8]i32)a, (#simd[8]i32)(0)) + return i32(transmute(u8)intrinsics.simd_extract_lsbs(mask)) +} + +// Returns vector of type __m256d with all elements set to zero. +@(require_results, enable_target_feature="avx") +_mm256_setzero_pd :: #force_inline proc "c" () -> __m256d { + return 0 +} + +// Returns vector of type __m256 with all elements set to zero. +@(require_results, enable_target_feature="avx") +_mm256_setzero_ps :: #force_inline proc "c" () -> __m256 { + return 0 +} + +// Returns vector of type __m256i with all elements set to zero. +@(require_results, enable_target_feature="avx") +_mm256_setzero_si256 :: #force_inline proc "c" () -> __m256i { + return 0 +} + +// Sets packed double-precision (64-bit) floating-point elements in returned +// vector with the supplied values. +@(require_results, enable_target_feature="avx") +_mm256_set_pd :: #force_inline proc "c" (a: f64, b: f64, c: f64, d: f64) -> __m256d { + return _mm256_setr_pd(d, c, b, a) +} + +// Sets packed single-precision (32-bit) floating-point elements in returned +// vector with the supplied values. +@(require_results, enable_target_feature="avx") +_mm256_set_ps :: #force_inline proc "c" ( + a: f32, + b: f32, + c: f32, + d: f32, + e: f32, + f: f32, + g: f32, + h: f32, +) -> __m256 { + return _mm256_setr_ps(h, g, f, e, d, c, b, a) +} + +// Sets packed 8-bit integers in returned vector with the supplied values. +@(require_results, enable_target_feature="avx") +_mm256_set_epi8 :: #force_inline proc "c" ( + e00, e01, e02, e03, e04, e05, e06, e07: i8, + e08, e09, e10, e11, e12, e13, e14, e15: i8, + e16, e17, e18, e19, e20, e21, e22, e23: i8, + e24, e25, e26, e27, e28, e29, e30, e31: i8, +) -> __m256i { + return _mm256_setr_epi8( + e31, e30, e29, e28, e27, e26, e25, e24, + e23, e22, e21, e20, e19, e18, e17, e16, + e15, e14, e13, e12, e11, e10, e09, e08, + e07, e06, e05, e04, e03, e02, e01, e00, + ) +} + +// Sets packed 16-bit integers in returned vector with the supplied values. +@(require_results, enable_target_feature="avx") +_mm256_set_epi16 :: #force_inline proc "c" ( + e00, e01, e02, e03, e04, e05, e06, e07: i16, + e08, e09, e10, e11, e12, e13, e14, e15: i16, +) -> __m256i { + return _mm256_setr_epi16( + e15, e14, e13, e12, + e11, e10, e09, e08, + e07, e06, e05, e04, + e03, e02, e01, e00, + ) +} + +// Sets packed 32-bit integers in returned vector with the supplied values. +@(require_results, enable_target_feature="avx") +_mm256_set_epi32 :: #force_inline proc "c" (e0, e1, e2, e3, e4, e5, e6, e7: i32) -> __m256i { + return _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) +} + +// Sets packed 64-bit integers in returned vector with the supplied values. +@(require_results, enable_target_feature="avx") +_mm256_set_epi64x :: #force_inline proc "c" (a: i64, b: i64, c: i64, d: i64) -> __m256i { + return _mm256_setr_epi64x(d, c, b, a) +} + +// Sets packed double-precision (64-bit) floating-point elements in returned +// vector with the supplied values in reverse order. +@(require_results, enable_target_feature="avx") +_mm256_setr_pd :: #force_inline proc "c" (a: f64, b: f64, c: f64, d: f64) -> __m256d { + return __m256d{a, b, c, d} +} + +// Sets packed single-precision (32-bit) floating-point elements in returned +// vector with the supplied values in reverse order. +@(require_results, enable_target_feature="avx") +_mm256_setr_ps :: #force_inline proc "c" (a, b, c, d, e, f, g, h: f32) -> __m256 { + return __m256{a, b, c, d, e, f, g, h} +} + +// Sets packed 8-bit integers in returned vector with the supplied values in +// reverse order. +@(require_results, enable_target_feature="avx") +_mm256_setr_epi8 :: #force_inline proc "c" ( + e00, e01, e02, e03, e04, e05, e06, e07: i8, + e08, e09, e10, e11, e12, e13, e14, e15: i8, + e16, e17, e18, e19, e20, e21, e22, e23: i8, + e24, e25, e26, e27, e28, e29, e30, e31: i8, +) -> __m256i { + return transmute(__m256i)#simd[32]i8{ + e00, e01, e02, e03, e04, e05, e06, e07, + e08, e09, e10, e11, e12, e13, e14, e15, + e16, e17, e18, e19, e20, e21, e22, e23, + e24, e25, e26, e27, e28, e29, e30, e31, + } +} + +// Sets packed 16-bit integers in returned vector with the supplied values in +// reverse order. +@(require_results, enable_target_feature="avx") +_mm256_setr_epi16 :: #force_inline proc "c" ( + e00, e01, e02, e03, e04, e05, e06, e07: i16, + e08, e09, e10, e11, e12, e13, e14, e15: i16, +) -> __m256i { + return transmute(__m256i)#simd[16]i16{ + e00, e01, e02, e03, + e04, e05, e06, e07, + e08, e09, e10, e11, + e12, e13, e14, e15, + } +} + +// Sets packed 32-bit integers in returned vector with the supplied values in +// reverse order. +@(require_results, enable_target_feature="avx") +_mm256_setr_epi32 :: #force_inline proc "c" (e0, e1, e2, e3, e4, e5, e6, e7: i32) -> __m256i { + return transmute(__m256i)#simd[8]i32{e0, e1, e2, e3, e4, e5, e6, e7} +} + +// Sets packed 64-bit integers in returned vector with the supplied values in +// reverse order. +@(require_results, enable_target_feature="avx") +_mm256_setr_epi64x :: #force_inline proc "c" (a: i64, b: i64, c: i64, d: i64) -> __m256i { + return {a, b, c, d} +} + +// Broadcasts double-precision (64-bit) floating-point value `a` to all elements of returned vector. +@(require_results, enable_target_feature="avx") +_mm256_set1_pd :: #force_inline proc "c" (a: f64) -> __m256d { + return a +} + +// Broadcasts single-precision (32-bit) floating-point value `a` to all elements of returned vector. +@(require_results, enable_target_feature="avx") +_mm256_set1_ps :: #force_inline proc "c" (a: f32) -> __m256 { + return a +} + +// Broadcasts 8-bit integer `a` to all elements of returned vector. +// This intrinsic may generate the `vpbroadcastb`. +@(require_results, enable_target_feature="avx") +_mm256_set1_epi8 :: #force_inline proc "c" (a: i8) -> __m256i { + return transmute(__m256i)(#simd[32]i8)(a) +} + +// Broadcasts 16-bit integer `a` to all elements of returned vector. +// This intrinsic may generate the `vpbroadcastw`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16) +@(require_results, enable_target_feature="avx") +_mm256_set1_epi16 :: #force_inline proc "c" (a: i16) -> __m256i { + return transmute(__m256i)(#simd[16]i16)(a) +} + +// Broadcasts 32-bit integer `a` to all elements of returned vector. +// This intrinsic may generate the `vpbroadcastd`. +@(require_results, enable_target_feature="avx") +_mm256_set1_epi32 :: #force_inline proc "c" (a: i32) -> __m256i { + return transmute(__m256i)(#simd[8]i32)(a) +} + +// Broadcasts 64-bit integer `a` to all elements of returned vector. +// This intrinsic may generate the `vpbroadcastq`. +// +// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x) +@(require_results, enable_target_feature="avx") +_mm256_set1_epi64x :: #force_inline proc "c" (a: i64) -> __m256i { + return a +} + +// Cast vector of type __m256d to type __m256. +@(require_results, enable_target_feature="avx") +_mm256_castpd_ps :: #force_inline proc "c" (a: __m256d) -> __m256 { + return transmute(__m256)a +} + +// Cast vector of type __m256 to type __m256d. +@(require_results, enable_target_feature="avx") +_mm256_castps_pd :: #force_inline proc "c" (a: __m256) -> __m256d { + return transmute(__m256d)a +} + +// Casts vector of type __m256 to type __m256i. +@(require_results, enable_target_feature="avx") +_mm256_castps_si256 :: #force_inline proc "c" (a: __m256) -> __m256i { + return transmute(__m256i)a +} + +// Casts vector of type __m256i to type __m256. +@(require_results, enable_target_feature="avx") +_mm256_castsi256_ps :: #force_inline proc "c" (a: __m256i) -> __m256 { + return transmute(__m256)a +} + +// Casts vector of type __m256d to type __m256i. +@(require_results, enable_target_feature="avx") +_mm256_castpd_si256 :: #force_inline proc "c" (a: __m256d) -> __m256i { + return transmute(__m256i)a +} + +// Casts vector of type __m256i to type __m256d. +@(require_results, enable_target_feature="avx") +_mm256_castsi256_pd :: #force_inline proc "c" (a: __m256i) -> __m256d { + return transmute(__m256d)a +} + +// Casts vector of type __m256 to type __m128. +@(require_results, enable_target_feature="avx") +_mm256_castps256_ps128 :: #force_inline proc "c" (a: __m256) -> __m128 { + return intrinsics.simd_shuffle(a, a, 0, 1, 2, 3) +} + +// Casts vector of type __m256d to type __m128d. +@(require_results, enable_target_feature="avx") +_mm256_castpd256_pd128 :: #force_inline proc "c" (a: __m256d) -> __m128d { + return intrinsics.simd_shuffle(a, a, 0, 1) +} + +// Casts vector of type __m256i to type __m128i. +@(require_results, enable_target_feature="avx") +_mm256_castsi256_si128 :: #force_inline proc "c" (a: __m256i) -> __m128i { + return intrinsics.simd_shuffle(a, a, 0, 1) +} + +// Casts vector of type __m128 to type __m256; +// the upper 128 bits of the result are indeterminate. +// +// In the Intel documentation, the upper bits are declared to be "undefined". +@(require_results, enable_target_feature="sse,avx") +_mm256_castps128_ps256 :: #force_inline proc "c" (a: __m128) -> __m256 { + return intrinsics.simd_shuffle(a, _mm_undefined_ps(), 0, 1, 2, 3, 4, 4, 4, 4) +} + +// Casts vector of type __m128d to type __m256d; +// the upper 128 bits of the result are indeterminate. +// +// In the Intel documentation, the upper bits are declared to be "undefined". +@(require_results, enable_target_feature="sse2,avx") +_mm256_castpd128_pd256 :: #force_inline proc "c" (a: __m128d) -> __m256d { + return intrinsics.simd_shuffle(a, _mm_undefined_pd(), 0, 1, 2, 2) +} + +// Casts vector of type __m128i to type __m256i; +// the upper 128 bits of the result are indeterminate. +// +// In the Intel documentation, the upper bits are declared to be "undefined". +@(require_results, enable_target_feature="avx") +_mm256_castsi128_si256 :: #force_inline proc "c" (a: __m128i) -> __m256i { + return intrinsics.simd_shuffle(a, __m128i(0), 0, 1, 2, 2) +} + +// Constructs a 256-bit floating-point vector of `[8 x float]` from a +// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain +// the value of the source vector. The upper 128 bits are set to zero. +@(require_results, enable_target_feature="sse,avx") +_mm256_zextps128_ps256 :: #force_inline proc "c" (a: __m128) -> __m256 { + return intrinsics.simd_shuffle(a, _mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7) +} + +// Constructs a 256-bit integer vector from a 128-bit integer vector. +// The lower 128 bits contain the value of the source vector. The upper +// 128 bits are set to zero. +@(require_results, enable_target_feature="avx") +_mm256_zextsi128_si256 :: #force_inline proc "c" (a: __m128i) -> __m256i { + return intrinsics.simd_shuffle(a, __m128i(0), 0, 1, 2, 3) +} + +// Constructs a 256-bit floating-point vector of `[4 x double]` from a +// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits +// contain the value of the source vector. The upper 128 bits are set +// to zero. +@(require_results, enable_target_feature="sse2,avx") +_mm256_zextpd128_pd256 :: #force_inline proc "c" (a: __m128d) -> __m256d { + return intrinsics.simd_shuffle(a, _mm_setzero_pd(), 0, 1, 2, 3) +} + +// Returns vector of type `__m256` with indeterminate elements. +// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +@(require_results, enable_target_feature="avx") +_mm256_undefined_ps :: #force_inline proc "c" () -> __m256 { + return 0 +} + +// Returns vector of type `__m256d` with indeterminate elements. +// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +@(require_results, enable_target_feature="avx") +_mm256_undefined_pd :: #force_inline proc "c" () -> __m256d { + return 0 +} + +// Returns vector of type __m256i with with indeterminate elements. +// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +@(require_results, enable_target_feature="avx") +_mm256_undefined_si256 :: #force_inline proc "c" () -> __m256i { + return 0 +} + +// Sets packed __m256 returned vector with the supplied values. +@(require_results, enable_target_feature="avx") +_mm256_set_m128 :: #force_inline proc "c" (hi: __m128, lo: __m128) -> __m256 { + return intrinsics.simd_shuffle(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7) +} + +// Sets packed __m256d returned vector with the supplied values. +@(require_results, enable_target_feature="avx") +_mm256_set_m128d :: #force_inline proc "c" (hi: __m128d, lo: __m128d) -> __m256d { + hi := transmute(__m128)hi + lo := transmute(__m128)lo + return transmute(__m256d)_mm256_set_m128(hi, lo) +} + +// Sets packed __m256i returned vector with the supplied values. +@(require_results, enable_target_feature="avx") +_mm256_set_m128i :: #force_inline proc "c" (hi: __m128i, lo: __m128i) -> __m256i { + hi := transmute(__m128)hi + lo := transmute(__m128)lo + return transmute(__m256i)_mm256_set_m128(hi, lo) +} + +// Sets packed __m256 returned vector with the supplied values. +@(require_results, enable_target_feature="avx") +_mm256_setr_m128 :: #force_inline proc "c" (lo: __m128, hi: __m128) -> __m256 { + return _mm256_set_m128(hi, lo) +} + +// Sets packed __m256d returned vector with the supplied values. +@(require_results, enable_target_feature="avx") +_mm256_setr_m128d :: #force_inline proc "c" (lo: __m128d, hi: __m128d) -> __m256d { + return _mm256_set_m128d(hi, lo) +} + +// Sets packed __m256i returned vector with the supplied values. +@(require_results, enable_target_feature="avx") +_mm256_setr_m128i :: #force_inline proc "c" (lo: __m128i, hi: __m128i) -> __m256i { + return _mm256_set_m128i(hi, lo) +} + +// Loads two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value. +// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +@(require_results, enable_target_feature="sse,avx") +_mm256_loadu2_m128 :: #force_inline proc "c" (hiaddr, loaddr: ^f32) -> __m256 { + a := _mm256_castps128_ps256(_mm_loadu_ps(loaddr)) + return _mm256_insertf128_ps(a, _mm_loadu_ps(hiaddr), 1) +} + +// Loads two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value. +// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +@(require_results, enable_target_feature="sse2,avx") +_mm256_loadu2_m128d :: #force_inline proc "c" (hiaddr, loaddr: ^f64) -> __m256d { + a := _mm256_castpd128_pd256(_mm_loadu_pd(loaddr)) + return _mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1) +} + +// Loads two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value. +// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +@(require_results, enable_target_feature="sse2,avx") +_mm256_loadu2_m128i :: #force_inline proc "c" (hiaddr, loaddr: ^__m128i) -> __m256i { + a := _mm256_castsi128_si256(_mm_loadu_si128(loaddr)) + return _mm256_insertf128_si256(a, _mm_loadu_si128(hiaddr), 1) +} + +// Stores the high and low 128-bit halves (each composed of 4 packed +// single-precision (32-bit) floating-point elements) from `a` into memory two +// different 128-bit locations. +// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +@(enable_target_feature="sse,avx") +_mm256_storeu2_m128 :: #force_inline proc "c" (hiaddr, loaddr: ^f32, a: __m256) { + lo := _mm256_castps256_ps128(a) + _mm_storeu_ps(loaddr, lo) + hi := _mm256_extractf128_ps(a, 1) + _mm_storeu_ps(hiaddr, hi) +} + +// Stores the high and low 128-bit halves (each composed of 2 packed +// double-precision (64-bit) floating-point elements) from `a` into memory two +// different 128-bit locations. +// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +@(enable_target_feature="sse2,avx") +_mm256_storeu2_m128d :: #force_inline proc "c" (hiaddr, loaddr: ^f64, a: __m256d) { + lo := _mm256_castpd256_pd128(a) + _mm_storeu_pd(loaddr, lo) + hi := _mm256_extractf128_pd(a, 1) + _mm_storeu_pd(hiaddr, hi) +} + +// Stores the high and low 128-bit halves (each composed of integer data) from +// `a` into memory two different 128-bit locations. +// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +@(enable_target_feature="sse2,avx") +_mm256_storeu2_m128i :: #force_inline proc "c" (hiaddr, loaddr: ^__m128i, a: __m256i) { + lo := _mm256_castsi256_si128(a) + _mm_storeu_si128(loaddr, lo) + hi := _mm256_extractf128_si256(a, 1) + _mm_storeu_si128(hiaddr, hi) +} + +// Returns the first element of the input vector of `[8 x float]`. +@(require_results, enable_target_feature="avx") +_mm256_cvtss_f32 :: #force_inline proc "c" (a: __m256) -> f32 { + return intrinsics.simd_extract(a, 0) +} + + + +@(require_results, enable_target_feature="avx") +_mm256_insert_epi64 :: #force_inline proc "c" (a: __m256i, i: i64, $idx: u32) -> __m256i { + return intrinsics.simd_replace(transmute(#simd[4]i64)a, idx, i) +} + +@(require_results, enable_target_feature="avx") +_mm256_extract_epi64 :: #force_inline proc "c" (a: __m256i, $idx: u32) -> i64 { + return intrinsics.simd_extract(transmute(#simd[4]i64)a, idx) +} + + +@(private, default_calling_convention="none") +foreign _ { + @(link_name="llvm.x86.avx.round.pd.256") llvm_roundpd256 :: proc(a: __m256d, #const b: u32) -> __m256d --- + @(link_name="llvm.x86.avx.round.ps.256") llvm_roundps256 :: proc(a: __m256, #const b: u32) -> __m256 --- + @(link_name="llvm.x86.avx.dp.ps.256") llvm_vdpps :: proc(a, b: __m256, #const imm8: u8) -> __m256 --- + @(link_name="llvm.x86.sse2.cmp.pd") llvm_vcmppd :: proc(a, b: __m128d, #const imm8: u8) -> __m128d --- + @(link_name="llvm.x86.avx.cmp.pd.256") llvm_vcmppd256 :: proc(a, b: __m256d, imm8: u8) -> __m256d --- + @(link_name="llvm.x86.sse.cmp.ps") llvm_vcmpps :: proc(a: __m128, b: __m128, #const imm8: u8) -> __m128 --- + @(link_name="llvm.x86.avx.cmp.ps.256") llvm_vcmpps256 :: proc(a, b: __m256, imm8: u8) -> __m256 --- + @(link_name="llvm.x86.sse2.cmp.sd") llvm_vcmpsd :: proc(a, b: __m128d, #const imm8: u8) -> __m128d --- + @(link_name="llvm.x86.sse.cmp.ss") llvm_vcmpss :: proc(a: __m128, b: __m128, #const imm8: u8) -> __m128 --- + @(link_name="llvm.x86.avx.cvt.ps2dq.256") llvm_vcvtps2dq :: proc(a: __m256) -> #simd[8]i32 --- + @(link_name="llvm.x86.avx.cvtt.pd2dq.256") llvm_vcvttpd2dq :: proc(a: __m256d) -> #simd[4]i32 --- + @(link_name="llvm.x86.avx.cvt.pd2dq.256") llvm_vcvtpd2dq :: proc(a: __m256d) -> #simd[4]i32 --- + @(link_name="llvm.x86.avx.cvtt.ps2dq.256") llvm_vcvttps2dq :: proc(a: __m256) -> #simd[8]i32 --- + @(link_name="llvm.x86.avx.vzeroall") llvm_vzeroall :: proc() --- + @(link_name="llvm.x86.avx.vzeroupper") llvm_vzeroupper :: proc() --- + @(link_name="llvm.x86.avx.vpermilvar.ps.256") llvm_vpermilps256 :: proc(a: __m256, b: #simd[8]i32) -> __m256 --- + @(link_name="llvm.x86.avx.vpermilvar.ps") llvm_vpermilps :: proc(a: __m128, b: #simd[4]i32) -> __m128 --- + @(link_name="llvm.x86.avx.vpermilvar.pd.256") llvm_vpermilpd256 :: proc(a: __m256d, b: #simd[4]i64) -> __m256d --- + @(link_name="llvm.x86.avx.vpermilvar.pd") llvm_vpermilpd :: proc(a: __m128d, b: #simd[2]i64) -> __m128d --- + @(link_name="llvm.x86.avx.ldu.dq.256") llvm_vlddqu :: proc(mem_addr: rawptr) -> #simd[32]i8 --- + @(link_name="llvm.x86.avx.rcp.ps.256") llvm_vrcpps :: proc(a: __m256) -> __m256 --- + @(link_name="llvm.x86.avx.rsqrt.ps.256") llvm_vrsqrtps :: proc(a: __m256) -> __m256 --- + @(link_name="llvm.x86.avx.ptestnzc.256") llvm_ptestnzc256 :: proc(a: #simd[4]i64, b: #simd[4]i64) -> i32 --- + @(link_name="llvm.x86.avx.vtestz.pd.256") llvm_vtestzpd256 :: proc(a, b: __m256d) -> i32 --- + @(link_name="llvm.x86.avx.vtestc.pd.256") llvm_vtestcpd256 :: proc(a, b: __m256d) -> i32 --- + @(link_name="llvm.x86.avx.vtestnzc.pd.256") llvm_vtestnzcpd256 :: proc(a, b: __m256d) -> i32 --- + @(link_name="llvm.x86.avx.vtestnzc.pd") llvm_vtestnzcpd :: proc(a, b: __m128d) -> i32 --- + @(link_name="llvm.x86.avx.vtestz.ps.256") llvm_vtestzps256 :: proc(a, b: __m256) -> i32 --- + @(link_name="llvm.x86.avx.vtestc.ps.256") llvm_vtestcps256 :: proc(a, b: __m256) -> i32 --- + @(link_name="llvm.x86.avx.vtestnzc.ps.256") llvm_vtestnzcps256 :: proc(a, b: __m256) -> i32 --- + @(link_name="llvm.x86.avx.vtestnzc.ps") llvm_vtestnzcps :: proc(a: __m128, b: __m128) -> i32 --- + @(link_name="llvm.x86.avx.min.ps.256") llvm_vminps :: proc(a, b: __m256) -> __m256 --- + @(link_name="llvm.x86.avx.max.ps.256") llvm_vmaxps :: proc(a, b: __m256) -> __m256 --- + @(link_name="llvm.x86.avx.min.pd.256") llvm_vminpd :: proc(a, b: __m256d) -> __m256d --- + @(link_name="llvm.x86.avx.max.pd.256") llvm_vmaxpd :: proc(a, b: __m256d) -> __m256d --- +}