Files
Odin/core/simd/x86/avx.odin
2026-04-08 15:42:40 +01:00

1855 lines
80 KiB
Odin

#+build i386, amd64
package simd_x86
import "base:intrinsics"
// Adds packed double-precision (64-bit) floating-point elements in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_add_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
return intrinsics.simd_add(a, b)
}
// Adds packed single-precision (32-bit) floating-point elements in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_add_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
return intrinsics.simd_add(a, b)
}
// Computes the bitwise AND of a packed double-precision (64-bit) floating-point elements in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_and_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
a := transmute(#simd[4]u64)a
b := transmute(#simd[4]u64)b
return transmute(__m256d)intrinsics.simd_bit_and(a, b)
}
// Computes the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_and_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
a := transmute(#simd[8]u32)a
b := transmute(#simd[8]u32)b
return transmute(__m256)intrinsics.simd_bit_and(a, b)
}
// Computes the bitwise OR packed double-precision (64-bit) floating-point elements in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_or_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
a := transmute(#simd[4]u64)a
b := transmute(#simd[4]u64)b
return transmute(__m256d)intrinsics.simd_bit_or(a, b)
}
// Computes the bitwise OR packed single-precision (32-bit) floating-point elements in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_or_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
a := transmute(#simd[8]u32)a
b := transmute(#simd[8]u32)b
return transmute(__m256)intrinsics.simd_bit_or(a, b)
}
// Shuffles double-precision (64-bit) floating-point elements within 128-bit lanes using the control in `imm8`.
@(require_results, enable_target_feature="avx")
_mm256_shuffle_pd :: #force_inline proc "c" (a, b: __m256d, $MASK: u8) -> __m256d {
return intrinsics.simd_shuffle(
a,
b,
MASK & 1,
((MASK >> 1) & 1) + 4,
((MASK >> 2) & 1) + 2,
((MASK >> 3) & 1) + 6,
)
}
// Shuffles single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using the control in `imm8`.
@(require_results, enable_target_feature="avx")
_mm256_shuffle_ps :: #force_inline proc "c" (a, b: __m256, $MASK: u8) -> __m256 {
return intrinsics.simd_shuffle(
a,
b,
MASK & 0b11,
(MASK >> 2) & 0b11,
((MASK >> 4) & 0b11) + 8,
((MASK >> 6) & 0b11) + 8,
(MASK & 0b11) + 4,
((MASK >> 2) & 0b11) + 4,
((MASK >> 4) & 0b11) + 12,
((MASK >> 6) & 0b11) + 12,
)
}
// Computes the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`, and then AND with `b`.
@(require_results, enable_target_feature="avx")
_mm256_andnot_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
a := transmute(#simd[4]u64)a
b := transmute(#simd[4]u64)b
return transmute(__m256d)intrinsics.simd_bit_and(intrinsics.simd_bit_xor((#simd[4]u64)(0), a), b)
}
// Computes the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`.
@(require_results, enable_target_feature="avx")
_mm256_andnot_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
a := transmute(#simd[8]u32)a
b := transmute(#simd[8]u32)b
return transmute(__m256)intrinsics.simd_bit_and(intrinsics.simd_bit_xor((#simd[8]u32)(0), a), b)
}
// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and returns packed maximum values
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
@(require_results, enable_target_feature="avx")
_mm256_max_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
return llvm_vmaxpd(a, b)
}
// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and returns packed maximum values
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
@(require_results, enable_target_feature="avx")
_mm256_max_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
return llvm_vmaxps(a, b)
}
// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and returns packed minimum values
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
@(require_results, enable_target_feature="avx")
_mm256_min_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
return llvm_vminpd(a, b)
}
// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and returns packed minimum values
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
@(require_results, enable_target_feature="avx")
_mm256_min_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
return llvm_vminps(a, b)
}
// Multiplies packed double-precision (64-bit) floating-point elements in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_mul_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
return intrinsics.simd_mul(a, b)
}
// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_mul_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
return intrinsics.simd_mul(a, b)
}
// Alternatively adds and subtracts packed double-precision (64-bit) floating-point elements in `a` to/from packed elements in `b`.
@(require_results, enable_target_feature="avx")
_mm256_addsub_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
add := intrinsics.simd_add(a, b)
sub := intrinsics.simd_sub(a, b)
return intrinsics.simd_shuffle(add, sub, 4, 1, 6, 3)
}
// Alternatively adds and subtracts packed single-precision (32-bit) floating-point elements in `a` to/from packed elements in `b`.
@(require_results, enable_target_feature="avx")
_mm256_addsub_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
add := intrinsics.simd_add(a, b)
sub := intrinsics.simd_sub(a, b)
return intrinsics.simd_shuffle(add, sub, 8, 1, 10, 3, 12, 5, 14, 7)
}
// Subtracts packed double-precision (64-bit) floating-point elements in `b`
// from packed elements in `a`.
@(require_results, enable_target_feature="avx")
_mm256_sub_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
return intrinsics.simd_sub(a, b)
}
// Subtracts packed single-precision (32-bit) floating-point elements in `b`
// from packed elements in `a`.
@(require_results, enable_target_feature="avx")
_mm256_sub_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
return intrinsics.simd_sub(a, b)
}
// Computes the division of each of the 8 packed 32-bit floating-point elements
// in `a` by the corresponding packed elements in `b`.
@(require_results, enable_target_feature="avx")
_mm256_div_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
return intrinsics.simd_div(a, b)
}
// Computes the division of each of the 4 packed 64-bit floating-point elements
// in `a` by the corresponding packed elements in `b`.
@(require_results, enable_target_feature="avx")
_mm256_div_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
return intrinsics.simd_div(a, b)
}
// Rounds packed double-precision (64-bit) floating point elements in `a`
// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
//
// - `0x00`: Round to the nearest whole number.
// - `0x01`: Round down, toward negative infinity.
// - `0x02`: Round up, toward positive infinity.
// - `0x03`: Truncate the values.
//
// For a complete list of options, check [the LLVM docs][llvm_docs].
//
// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
@(require_results, enable_target_feature="avx")
_mm256_round_pd :: #force_inline proc "c" (a: __m256d, $ROUNDING: u8) -> __m256d where ROUNDING < 16 {
return llvm_roundpd256(a, ROUNDING)
}
// Rounds packed double-precision (64-bit) floating point elements in `a`
// toward positive infinity.
@(require_results, enable_target_feature="avx")
_mm256_ceil_pd :: #force_inline proc "c" (a: __m256d) -> __m256d {
return intrinsics.simd_ceil(a)
}
// Rounds packed double-precision (64-bit) floating point elements in `a`
// toward negative infinity.
@(require_results, enable_target_feature="avx")
_mm256_floor_pd :: #force_inline proc "c" (a: __m256d) -> __m256d {
return intrinsics.simd_floor(a)
}
// Rounds packed single-precision (32-bit) floating point elements in `a`
// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
//
// - `0x00`: Round to the nearest whole number.
// - `0x01`: Round down, toward negative infinity.
// - `0x02`: Round up, toward positive infinity.
// - `0x03`: Truncate the values.
//
// For a complete list of options, check [the LLVM docs][llvm_docs].
//
// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
@(require_results, enable_target_feature="avx")
_mm256_round_ps :: #force_inline proc(a: __m256, $ROUNDING: u8) -> __m256 where ROUNDING < 16 {
return llvm_roundps256(a, u32(ROUNDING))
}
// Rounds packed single-precision (32-bit) floating point elements in `a`
// toward positive infinity.
@(require_results, enable_target_feature="avx")
_mm256_ceil_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
return intrinsics.simd_ceil(a)
}
// Rounds packed single-precision (32-bit) floating point elements in `a`
// toward negative infinity.
@(require_results, enable_target_feature="avx")
_mm256_floor_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
return intrinsics.simd_floor(a)
}
// Returns the square root of packed single-precision (32-bit) floating point elements in `a`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
@(require_results, enable_target_feature="avx")
_mm256_sqrt_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
return intrinsics.sqrt(a)
}
// Returns the square root of packed double-precision (64-bit) floating point elements in `a`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
@(require_results, enable_target_feature="avx")
_mm256_sqrt_pd :: #force_inline proc "c" (a: __m256d) -> __m256d {
return intrinsics.sqrt(a)
}
// Blends packed double-precision (64-bit) floating-point elements from
// `a` and `b` using control mask `imm8`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
@(require_results, enable_target_feature="avx")
_mm256_blend_pd :: #force_inline proc "c" (a, b: __m256d, $IIM4: u32) -> __m256d where IMM4 < 16 {
return intrinsics.simd_shuffle(
a,
b,
((IMM4 >> 0) & 1) * 4 + 0,
((IMM4 >> 1) & 1) * 4 + 1,
((IMM4 >> 2) & 1) * 4 + 2,
((IMM4 >> 3) & 1) * 4 + 3,
)
}
// Blends packed single-precision (32-bit) floating-point elements from
// `a` and `b` using control mask `imm8`.
@(require_results, enable_target_feature="avx")
_mm256_blend_ps :: #force_inline proc "c" (a, b: __m256, $IMM8: u8) -> __m256 {
return intrinsics.simd_shuffle(
a,
b,
((IMM8 >> 0) & 1) * 8 + 0,
((IMM8 >> 1) & 1) * 8 + 1,
((IMM8 >> 2) & 1) * 8 + 2,
((IMM8 >> 3) & 1) * 8 + 3,
((IMM8 >> 4) & 1) * 8 + 4,
((IMM8 >> 5) & 1) * 8 + 5,
((IMM8 >> 6) & 1) * 8 + 6,
((IMM8 >> 7) & 1) * 8 + 7,
)
}
// Blends packed double-precision (64-bit) floating-point elements from
// `a` and `b` using `c` as a mask.
@(require_results, enable_target_feature="avx")
_mm256_blendv_pd :: #force_inline proc "c" (a, b: __m256d, c: __m256d) -> __m256d {
mask := intrinsics.simd_lanes_lt(transmute(#simd[4]i64)c, 0)
return intrinsics.simd_select(mask, b, a)
}
// Blends packed single-precision (32-bit) floating-point elements from
// `a` and `b` using `c` as a mask.
@(require_results, enable_target_feature="avx")
_mm256_blendv_ps :: #force_inline proc "c" (a, b: __m256, c: __m256) -> __m256 {
mask := intrinsics.simd_lanes_lt(transmute(#simd[8]i32)c, 0)
return intrinsics.simd_select(mask, b, a)
}
// Conditionally multiplies the packed single-precision (32-bit) floating-point elements in `a` and `b` using the high 4 bits in `imm8`,
// sum the four products, and conditionally return the sum
// using the low 4 bits of `imm8`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
@(require_results, enable_target_feature="avx")
_mm256_dp_ps :: #force_inline proc "c" (a, b: __m256, $IMM8: i8) -> __m256 {
return llvm_vdpps(a, b, IMM8)
}
// Horizontal addition of adjacent pairs in the two packed vectors
// of 4 64-bit floating points `a` and `b`.
// In the result, sums of elements from `a` are returned in even locations,
// while sums of elements from `b` are returned in odd locations.
@(require_results, enable_target_feature="avx")
_mm256_hadd_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
even := intrinsics.simd_shuffle(a, b, 0, 4, 2, 6)
odd := intrinsics.simd_shuffle(a, b, 1, 5, 3, 7)
return intrinsics.simd_add(even, odd)
}
// Horizontal addition of adjacent pairs in the two packed vectors
// of 8 32-bit floating points `a` and `b`.
// In the result, sums of elements from `a` are returned in locations of
// indices 0, 1, 4, 5; while sums of elements from `b` are locations
// 2, 3, 6, 7.
@(require_results, enable_target_feature="avx")
_mm256_hadd_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
even := intrinsics.simd_shuffle(a, b, 0, 2, 8, 10, 4, 6, 12, 14)
odd := intrinsics.simd_shuffle(a, b, 1, 3, 9, 11, 5, 7, 13, 15)
return intrinsics.simd_add(even, odd)
}
// Horizontal subtraction of adjacent pairs in the two packed vectors
// of 4 64-bit floating points `a` and `b`.
// In the result, sums of elements from `a` are returned in even locations,
// while sums of elements from `b` are returned in odd locations.
@(require_results, enable_target_feature="avx")
_mm256_hsub_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
even := intrinsics.simd_shuffle(a, b, 0, 4, 2, 6)
odd := intrinsics.simd_shuffle(a, b, 1, 5, 3, 7)
return intrinsics.simd_sub(even, odd)
}
// Horizontal subtraction of adjacent pairs in the two packed vectors
// of 8 32-bit floating points `a` and `b`.
// In the result, sums of elements from `a` are returned in locations of
// indices 0, 1, 4, 5; while sums of elements from `b` are locations
// 2, 3, 6, 7.
@(require_results, enable_target_feature="avx")
_mm256_hsub_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
even := intrinsics.simd_shuffle(a, b, 0, 2, 8, 10, 4, 6, 12, 14)
odd := intrinsics.simd_shuffle(a, b, 1, 3, 9, 11, 5, 7, 13, 15)
return intrinsics.simd_sub(even, odd)
}
// Computes the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_xor_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
a := transmute(#simd[4]u64)a
b := transmute(#simd[4]u64)b
return transmute(__m256d)intrinsics.simd_bit_xor(a, b)
}
// Computes the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_xor_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
a := transmute(#simd[8]u32)a
b := transmute(#simd[8]u32)b
return transmute(__m256)intrinsics.simd_bit_xor(a, b)
}
_CMP_EQ_OQ :: 0x00 // Equal (ordered, non-signaling)
_CMP_LT_OS :: 0x01 // Less-than (ordered, signaling)
_CMP_LE_OS :: 0x02 // Less-than-or-equal (ordered, signaling)
_CMP_UNORD_Q :: 0x03 // Unordered (non-signaling)
_CMP_NEQ_UQ :: 0x04 // Not-equal (unordered, non-signaling)
_CMP_NLT_US :: 0x05 // Not-less-than (unordered, signaling)
_CMP_NLE_US :: 0x06 // Not-less-than-or-equal (unordered, signaling)
_CMP_ORD_Q :: 0x07 // Ordered (non-signaling)
_CMP_EQ_UQ :: 0x08 // Equal (unordered, non-signaling)
_CMP_NGE_US :: 0x09 // Not-greater-than-or-equal (unordered, signaling)
_CMP_NGT_US :: 0x0a // Not-greater-than (unordered, signaling)
_CMP_FALSE_OQ :: 0x0b // False (ordered, non-signaling)
_CMP_NEQ_OQ :: 0x0c // Not-equal (ordered, non-signaling)
_CMP_GE_OS :: 0x0d // Greater-than-or-equal (ordered, signaling)
_CMP_GT_OS :: 0x0e // Greater-than (ordered, signaling)
_CMP_TRUE_UQ :: 0x0f // True (unordered, non-signaling)
_CMP_EQ_OS :: 0x10 // Equal (ordered, signaling)
_CMP_LT_OQ :: 0x11 // Less-than (ordered, non-signaling)
_CMP_LE_OQ :: 0x12 // Less-than-or-equal (ordered, non-signaling)
_CMP_UNORD_S :: 0x13 // Unordered (signaling)
_CMP_NEQ_US :: 0x14 // Not-equal (unordered, signaling)
_CMP_NLT_UQ :: 0x15 // Not-less-than (unordered, non-signaling)
_CMP_NLE_UQ :: 0x16 // Not-less-than-or-equal (unordered, non-signaling)
_CMP_ORD_S :: 0x17 // Ordered (signaling)
_CMP_EQ_US :: 0x18 // Equal (unordered, signaling)
_CMP_NGE_UQ :: 0x19 // Not-greater-than-or-equal (unordered, non-signaling)
_CMP_NGT_UQ :: 0x1a // Not-greater-than (unordered, non-signaling)
_CMP_FALSE_OS :: 0x1b // False (ordered, signaling)
_CMP_NEQ_OS :: 0x1c // Not-equal (ordered, signaling)
_CMP_GE_OQ :: 0x1d // Greater-than-or-equal (ordered, non-signaling)
_CMP_GT_OQ :: 0x1e // Greater-than (ordered, non-signaling)
_CMP_TRUE_US :: 0x1f // True (unordered, signaling)
// Compares packed double-precision (64-bit) floating-point elements in `a` and `b` based on the comparison operand specified by `IMM5`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
@(require_results, enable_target_feature="avx")
_mm_cmp_pd :: #force_inline proc "c" (a, b: __m128d, $IMM5: u8) -> __m128d where IMM5 < 32 {
return llvm_vcmppd(a, b, IMM5)
}
// Compares packed double-precision (64-bit) floating-point elements in `a` and `b` based on the comparison operand specified by `IMM5`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
@(require_results, enable_target_feature="avx")
_mm256_cmp_pd :: #force_inline proc "c" (a, b: __m256d, $IMM5: u8) -> __m256d where IMM5 < 32 {
return llvm_vcmppd256(a, b, IMM5)
}
// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` based on the comparison operand specified by `IMM5`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
@(require_results, enable_target_feature="avx")
_mm_cmp_ps :: #force_inline proc "c" (a: __m128, b: __m128, $IMM5: u8) -> __m128 where IMM5 < 32 {
return llvm_vcmpps(a, b, IMM5)
}
// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` based on the comparison operand specified by `IMM5`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
@(require_results, enable_target_feature="avx")
_mm256_cmp_ps :: #force_inline proc "c" (a, b: __m256, $IMM5: u8) -> __m256 where IMM5 < 32 {
return llvm_vcmpps256(a, b, IMM5)
}
// Compares the lower double-precision (64-bit) floating-point element in
// `a` and `b` based on the comparison operand specified by `IMM5`,
// store the result in the lower element of returned vector,
// and copies the upper element from `a` to the upper element of returned
// vector.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
@(require_results, enable_target_feature="avx")
_mm_cmp_sd :: #force_inline proc "c" (a, b: __m128d, $IMM5: u8) -> __m128d where IMM5 < 32 {
return llvm_vcmpsd(a, b, IMM5)
}
// Compares the lower single-precision (32-bit) floating-point element in
// `a` and `b` based on the comparison operand specified by `IMM5`,
// store the result in the lower element of returned vector,
// and copies the upper 3 packed elements from `a` to the upper elements of
// returned vector.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
@(require_results, enable_target_feature="avx")
_mm_cmp_ss :: #force_inline proc "c" (a: __m128, b: __m128, $IMM5: u8) -> __m128 where IMM5 < 32 {
return llvm_vcmpss(a, b, IMM5)
}
// Converts packed 32-bit integers in `a` to packed double-precision (64-bit) floating-point elements.
@(require_results, enable_target_feature="avx")
_mm256_cvtepi32_pd :: #force_inline proc "c" (a: __m128i) -> __m256d {
return __m256d(transmute(#simd[4]i32)a)
}
// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
@(require_results, enable_target_feature="avx")
_mm256_cvtepi32_ps :: #force_inline proc "c" (a: __m256i) -> __m256 {
return __m256(transmute(#simd[8]i32)a)
}
// Converts packed double-precision (64-bit) floating-point elements in `a` to packed single-precision (32-bit) floating-point elements.
@(require_results, enable_target_feature="avx")
_mm256_cvtpd_ps :: #force_inline proc "c" (a: __m256d) -> __m128 {
return __m128(a)
}
// Converts packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
@(require_results, enable_target_feature="avx")
_mm256_cvtps_epi32 :: #force_inline proc "c" (a: __m256) -> __m256i {
return transmute(__m256i)llvm_vcvtps2dq(a)
}
// Converts packed single-precision (32-bit) floating-point elements in `a` to packed double-precision (64-bit) floating-point elements.
@(require_results, enable_target_feature="avx")
_mm256_cvtps_pd :: #force_inline proc "c" (a: __m128) -> __m256d {
return __m256d(a)
}
// Returns the first element of the input vector of `[4 x double]`.
@(require_results, enable_target_feature="avx")
_mm256_cvtsd_f64 :: #force_inline proc "c" (a: __m256d) -> f64 {
return intrinsics.simd_extract(a, 0)
}
// Converts packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
@(require_results, enable_target_feature="avx")
_mm256_cvttpd_epi32 :: #force_inline proc "c" (a: __m256d) -> __m128i {
return transmute(__m128i)llvm_vcvttpd2dq(a)
}
// Converts packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
@(require_results, enable_target_feature="avx")
_mm256_cvtpd_epi32 :: #force_inline proc "c" (a: __m256d) -> __m128i {
return transmute(__m128i)llvm_vcvtpd2dq(a)
}
// Converts packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
@(require_results, enable_target_feature="avx")
_mm256_cvttps_epi32 :: #force_inline proc "c" (a: __m256) -> __m256i {
return transmute(__m256i)llvm_vcvttps2dq(a)
}
// Extracts 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a`, selected with `imm8`.
@(require_results, enable_target_feature="avx")
_mm256_extractf128_ps :: #force_inline proc "c" (a: __m256, $IMM1: u8) -> __m128 where IMM1 < 2 {
when IMM1 == 0 {
return intrinsics.simd_shuffle(a, _mm256_undefined_ps(), 0, 1, 2, 3)
} else {
return intrinsics.simd_shuffle(a, _mm256_undefined_ps(), 4, 5, 6, 7)
}
}
// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a`, selected with `imm8`.
@(require_results, enable_target_feature="avx")
_mm256_extractf128_pd :: #force_inline proc "c" (a: __m256d, $IMM1: u8) -> __m128d where IMM1 < 2 {
when IMM1 == 0 {
return intrinsics.simd_shuffle(a, _mm256_undefined_pd(), 0, 1)
} else {
return intrinsics.simd_shuffle(a, _mm256_undefined_pd(), 2, 3)
}
}
// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
@(require_results, enable_target_feature="avx")
_mm256_extractf128_si256 :: #force_inline proc "c" (a: __m256i, $IMM1: u8) -> __m128i where IMM1 < 2 {
when IMM1 == 0 {
dst := intrinsics.simd_shuffle(transmute(#simd[4]i64)a, (#simd[4]i64)(0), 0, 1)
return transmute(__m128i)dst
} else {
dst := intrinsics.simd_shuffle(transmute(#simd[4]i64)a, (#simd[4]i64)(0), 2, 3)
return transmute(__m128i)dst
}
}
// Extracts a 32-bit integer from `a`, selected with `INDEX`.
@(require_results, enable_target_feature="avx")
_mm256_extract_epi32 :: #force_inline proc "c" (a: __m256i, $INDEX: u8) -> i32 where INDEX < 8 {
return intrinsics.simd_extract(transmute(#simd[8]i32)a, INDEX)
}
@(require_results, enable_target_feature="avx")
_mm256_cvtsi256_si32 :: #force_inline proc "c" (a: __m256i) -> i32 {
return intrinsics.simd_extract(transmute(#simd[8]i32)a, 0)
}
// Zeroes the contents of all XMM or YMM registers.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
@(enable_target_feature="avx")
_mm256_zeroall :: #force_inline proc "c" () {
llvm_vzeroall()
}
// Zeroes the upper 128 bits of all YMM registers; the lower 128-bits of the registers are unmodified.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
@(enable_target_feature="avx")
_mm256_zeroupper :: #force_inline proc "c" () {
llvm_vzeroupper()
}
// Shuffles single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using the control in `b`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
@(require_results, enable_target_feature="avx")
_mm256_permutevar_ps :: #force_inline proc "c" (a: __m256, b: __m256i) -> __m256 {
return llvm_vpermilps256(a, transmute(#simd[8]i32)b)
}
// Shuffles single-precision (32-bit) floating-point elements in `a` using the control in `b`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
@(require_results, enable_target_feature="avx")
_mm_permutevar_ps :: #force_inline proc "c" (a: __m128, b: __m128i) -> __m128 {
return llvm_vpermilps(a, transmute(#simd[4]i32)b)
}
// Shuffles single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using the control in `imm8`.
@(require_results, enable_target_feature="avx")
_mm256_permute_ps :: #force_inline proc "c" (a: __m256, $IMM8: u8) -> __m256 {
return intrinsics.simd_shuffle(
a,
_mm256_undefined_ps(),
(IMM8 >> 0) & 0b11,
(IMM8 >> 2) & 0b11,
(IMM8 >> 4) & 0b11,
(IMM8 >> 6) & 0b11,
((IMM8 >> 0) & 0b11) + 4,
((IMM8 >> 2) & 0b11) + 4,
((IMM8 >> 4) & 0b11) + 4,
((IMM8 >> 6) & 0b11) + 4,
)
}
// Shuffles single-precision (32-bit) floating-point elements in `a` using the control in `imm8`.
@(require_results, enable_target_feature="avx")
_mm_permute_ps :: #force_inline proc "c" (a: __m128, $IMM8: u8) -> __m128 {
return intrinsics.simd_shuffle(
a,
_mm_undefined_ps(),
(IMM8 >> 0) & 0b11,
(IMM8 >> 2) & 0b11,
(IMM8 >> 4) & 0b11,
(IMM8 >> 6) & 0b11,
)
}
// Shuffles double-precision (64-bit) floating-point elements in `a` within 256-bit lanes using the control in `b`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
@(require_results, enable_target_feature="avx")
_mm256_permutevar_pd :: #force_inline proc "c" (a: __m256d, b: __m256i) -> __m256d {
return llvm_vpermilpd256(a, b)
}
// Shuffles double-precision (64-bit) floating-point elements in `a` using the control in `b`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
@(require_results, enable_target_feature="avx")
_mm_permutevar_pd :: #force_inline proc "c" (a: __m128d, b: __m128i) -> __m128d {
return llvm_vpermilpd(a, b)
}
// Shuffles double-precision (64-bit) floating-point elements in `a` within 128-bit lanes using the control in `imm8`.
@(require_results, enable_target_feature="avx")
_mm256_permute_pd :: #force_inline proc "c" (a: __m256d, $IMM4: u8) -> __m256d where IMM4 < 16 {
return intrinsics.simd_shuffle(
a,
_mm256_undefined_pd(),
((IMM4 >> 0) & 1),
((IMM4 >> 1) & 1),
((IMM4 >> 2) & 1) + 2,
((IMM4 >> 3) & 1) + 2,
)
}
// Shuffles double-precision (64-bit) floating-point elements in `a` using the control in `imm8`.
@(require_results, enable_target_feature="avx")
_mm_permute_pd :: #force_inline proc "c" (a: __m128d, $IMM2: u8) -> __m128d where IMM2 < 4 {
return intrinsics.simd_shuffle(
a,
_mm_undefined_pd(),
(IMM2) & 1,
(IMM2 >> 1) & 1,
)
}
// Shuffles 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) selected by `imm8` from `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_permute2f128_ps :: #force_inline proc "c" (a, b: __m256, $IMM8: u8) -> __m256 {
return _mm256_castsi256_ps(_mm256_permute2f128_si256(
_mm256_castps_si256(a),
_mm256_castps_si256(b),
IMM8,
))
}
// Shuffles 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) selected by `imm8` from `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_permute2f128_pd :: #force_inline proc "c" (a, b: __m256d, $IMM8: u8) -> __m256d {
_mm256_castsi256_pd(_mm256_permute2f128_si256(
_mm256_castpd_si256(a),
_mm256_castpd_si256(b),
IMM8,
))
}
// Shuffles 128-bits (composed of integer data) selected by `imm8` from `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_permute2f128_si256 :: #force_inline proc "c" (a, b: __m256i, $IMM8: u8) -> __m256i {
r := intrinsics.simd_shuffle(
a,
b,
2 * ((IMM8 & 0xf) & 0b11) + 0,
2 * ((IMM8 & 0xf) & 0b11) + 1,
2 * (((IMM8 & 0xf0) >> 4) & 0b11) + 0,
2 * (((IMM8 & 0xf0) >> 4) & 0b11) + 1,
)
return intrinsics.simd_shuffle(
r,
__m256i(0),
4 if ((IMM8 & 0xf) & 0b1000) != 0 else 0,
4 if ((IMM8 & 0xf) & 0b1000) != 0 else 1,
4 if (((IMM8 & 0xf0)>>4) & 0b1000) != 0 else 2,
4 if (((IMM8 & 0xf0)>>4) & 0b1000) != 0 else 3,
)
}
// Broadcasts a single-precision (32-bit) floating-point element from memory to all elements of the returned vector.
@(require_results, enable_target_feature="avx")
_mm256_broadcast_ss :: #force_inline proc "c" (f: ^f32) -> __m256 {
return _mm256_set1_ps(f^)
}
// Broadcasts a single-precision (32-bit) floating-point element from memory to all elements of the returned vector.
@(require_results, enable_target_feature="sse,avx")
_mm_broadcast_ss :: #force_inline proc "c" (f: ^f32) -> __m128 {
return _mm_set1_ps(f^)
}
// Broadcasts a double-precision (64-bit) floating-point element from memory to all elements of the returned vector.
@(require_results, enable_target_feature="avx")
_mm256_broadcast_sd :: #force_inline proc "c" (f: ^f64) -> __m256d {
return _mm256_set1_pd(f^)
}
// Broadcasts 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of the returned vector.
@(require_results, enable_target_feature="sse,avx")
_mm256_broadcast_ps :: #force_inline proc "c" (a: ^__m128) -> __m256 {
return intrinsics.simd_shuffle(a^, _mm_setzero_ps(), 0, 1, 2, 3, 0, 1, 2, 3)
}
// Broadcasts 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of the returned vector.
@(require_results, enable_target_feature="sse2,avx")
_mm256_broadcast_pd :: #force_inline proc "c" (a: ^__m128d) -> __m256d {
return intrinsics.simd_shuffle(a^, _mm_setzero_pd(), 0, 1, 0, 1)
}
// Copies `a` to result, then inserts 128 bits (composed of 4 packed
// single-precision (32-bit) floating-point elements) from `b` into result
// at the location specified by `imm8`.
@(require_results, enable_target_feature="sse,avx")
_mm256_insertf128_ps :: #force_inline proc "c" (a: __m256, b: __m128, $IMM1: u8) -> __m256 where IMM1 < 2 {
when IMM1 == 0 {
return intrinsics.simd_shuffle(
a,
_mm256_castps128_ps256(b),
8, 9, 10, 11, 4, 5, 6, 7,
)
} else {
return intrinsics.simd_shuffle(
a,
_mm256_castps128_ps256(b),
0, 1, 2, 3, 8, 9, 10, 11,
)
}
}
// Copies `a` to result, then inserts 128 bits (composed of 2 packed
// double-precision (64-bit) floating-point elements) from `b` into result
// at the location specified by `imm8`.
@(require_results, enable_target_feature="sse2,avx")
_mm256_insertf128_pd :: #force_inline proc "c" (a: __m256d, b: __m128d, $IMM1: u8) -> __m256d where IMM1 < 2 {
when IMM1 == 0 {
return intrinsics.simd_shuffle(
a,
_mm256_castpd128_pd256(b),
4, 5, 2, 3,
)
} else {
return intrinsics.simd_shuffle(
a,
_mm256_castpd128_pd256(b),
0, 1, 4, 5,
)
}
}
// Copies `a` to result, then inserts 128 bits from `b` into result at the location specified by `imm8`.
@(require_results, enable_target_feature="avx")
_mm256_insertf128_si256 :: #force_inline proc "c" (a: __m256i, b: __m128i, $IMM1: u8) -> __m256i where IMM1 < 2 {
when IMM1 == 0 {
return intrinsics.simd_shuffle(
a,
_mm256_castsi128_si256(b),
4, 5, 2, 3,
)
} else {
return intrinsics.simd_shuffle(
a,
_mm256_castsi128_si256(b),
0, 1, 4, 5,
)
}
}
// Copies `a` to result, and inserts the 8-bit integer `i` into result at the location specified by `index`.
@(require_results, enable_target_feature="avx")
_mm256_insert_epi8 :: #force_inline proc "c" (a: __m256i, i: i8, $INDEX: u8) -> __m256i where INDEX < 32 {
return transmute(__m256i)intrinsics.simd_replace(transmute(#simd[32]i8)a, INDEX, i)
}
// Copies `a` to result, and inserts the 16-bit integer `i` into result at the location specified by `index`.
@(require_results, enable_target_feature="avx")
_mm256_insert_epi16 :: #force_inline proc "c" (a: __m256i, i: i16, $INDEX: u8) -> __m256i where INDEX < 16 {
return transmute(__m256i)intrinsics.simd_replace(transmute(#simd[16]i16)a, INDEX, i)
}
// Copies `a` to result, and inserts the 32-bit integer `i` into result at the location specified by `index`.
@(require_results, enable_target_feature="avx")
_mm256_insert_epi32 :: #force_inline proc "c" (a: __m256i, i: i32, $INDEX: u8) -> __m256i where INDEX < 8 {
return transmute(__m256i)intrinsics.simd_replace(transmute(#simd[8]i32)a, INDEX, i)
}
// Loads 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into result.
// `mem_addr` must be aligned on a 32-byte boundary or a
// general-protection exception may be generated.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
@(require_results, enable_target_feature="avx")
_mm256_load_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m256d {
return (^__m256d)(mem_addr)^
}
// Stores 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from `a` into memory.
// `mem_addr` must be aligned on a 32-byte boundary or a
// general-protection exception may be generated.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
@(enable_target_feature="avx")
_mm256_store_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m256d) {
(^__m256d)(mem_addr)^ = a
}
// Loads 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into result.
// `mem_addr` must be aligned on a 32-byte boundary or a
// general-protection exception may be generated.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
@(require_results, enable_target_feature="avx")
_mm256_load_ps :: #force_inline proc "c" (mem_addr: ^f32) -> __m256 {
return (^__m256)(mem_addr)^
}
// Stores 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from `a` into memory.
// `mem_addr` must be aligned on a 32-byte boundary or a
// general-protection exception may be generated.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
@(enable_target_feature="avx")
_mm256_store_ps :: #force_inline proc "c" (mem_addr: ^f32, a: __m256) {
(^__m256)(mem_addr)^ = a
}
// Loads 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into result.
// `mem_addr` does not need to be aligned on any particular boundary.
@(enable_target_feature="avx")
_mm256_loadu_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m256d {
return intrinsics.unaligned_load((^__m256d)(mem_addr))
}
// Stores 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from `a` into memory.
// `mem_addr` does not need to be aligned on any particular boundary.
@(enable_target_feature="avx")
_mm256_storeu_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m256d) {
intrinsics.unaligned_store((^__m256d)(mem_addr), a)
}
// Loads 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into result.
// `mem_addr` does not need to be aligned on any particular boundary.
@(require_results, enable_target_feature="avx")
_mm256_loadu_ps :: #force_inline proc "c" (mem_addr: ^f32) -> __m256 {
return intrinsics.unaligned_load((^__m256)(mem_addr))
}
// Stores 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from `a` into memory.
// `mem_addr` does not need to be aligned on any particular boundary.
@(enable_target_feature="avx")
_mm256_storeu_ps :: #force_inline proc "c" (mem_addr: ^f32, a: __m256) {
intrinsics.unaligned_store((^__m256)(mem_addr), a)
}
// Loads 256-bits of integer data from memory into result.
// `mem_addr` must be aligned on a 32-byte boundary or a
// general-protection exception may be generated.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
@(require_results, enable_target_feature="avx")
_mm256_load_si256 :: #force_inline proc "c" (mem_addr: ^__m256i) -> __m256i {
return mem_addr^
}
// Stores 256-bits of integer data from `a` into memory.
// `mem_addr` must be aligned on a 32-byte boundary or a
// general-protection exception may be generated.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
@(enable_target_feature="avx")
_mm256_store_si256 :: #force_inline proc "c" (mem_addr: ^__m256i, a: __m256i) {
mem_addr^ = a
}
// Loads 256-bits of integer data from memory into result.
// `mem_addr` does not need to be aligned on any particular boundary.
@(require_results, enable_target_feature="avx")
_mm256_loadu_si256 :: #force_inline proc "c" (mem_addr: ^__m256i) -> __m256i {
return intrinsics.unaligned_load(mem_addr)
}
// Stores 256-bits of integer data from `a` into memory.
// `mem_addr` does not need to be aligned on any particular boundary.
@(enable_target_feature="avx")
_mm256_storeu_si256 :: #force_inline proc "c" (mem_addr: ^__m256i, a: __m256i) {
intrinsics.unaligned_store(mem_addr, a)
}
// Loads packed double-precision (64-bit) floating-point elements from memory
// into result using `mask` (elements are zeroed out when the high bit of the
// corresponding element is not set).
@(require_results, enable_target_feature="avx")
_mm256_maskload_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m256i) -> __m256d {
mask_mask := intrinsics.simd_shr(mask, 63)
return intrinsics.simd_masked_load(mem_addr, _mm256_setzero_pd(), mask_mask)
}
// Stores packed double-precision (64-bit) floating-point elements from `a`
// into memory using `mask`.
@(enable_target_feature="avx")
_mm256_maskstore_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m256i, a: __m256d) {
mask_mask := intrinsics.simd_shr(mask, 63)
intrinsics.simd_masked_store(mem_addr, a, mask_mask)
}
// Loads packed double-precision (64-bit) floating-point elements from memory
// into result using `mask` (elements are zeroed out when the high bit of the
// corresponding element is not set).
@(require_results, enable_target_feature="sse2,avx")
_mm_maskload_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m128i) -> __m128d {
mask_mask := intrinsics.simd_shr(mask, 63)
return intrinsics.simd_masked_load(mem_addr, _mm_setzero_pd(), mask_mask)
}
// Stores packed double-precision (64-bit) floating-point elements from `a`
// into memory using `mask`.
@(enable_target_feature="avx")
_mm_maskstore_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m128i, a: __m128d) {
mask_mask := intrinsics.simd_shr(mask, 63)
intrinsics.simd_masked_store(mem_addr, a, mask_mask)
}
// Loads packed single-precision (32-bit) floating-point elements from memory
// into result using `mask` (elements are zeroed out when the high bit of the
// corresponding element is not set).
@(require_results, enable_target_feature="avx")
_mm256_maskload_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m256i) -> __m256 {
mask_mask := intrinsics.simd_shr(transmute(#simd[8]i32)mask, 31)
return intrinsics.simd_masked_load(mem_addr, _mm256_setzero_ps(), mask_mask)
}
// Stores packed single-precision (32-bit) floating-point elements from `a`
// into memory using `mask`.
@(enable_target_feature="avx")
_mm256_maskstore_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m256i, a: __m256) {
mask_mask := intrinsics.simd_shr(transmute(#simd[8]i32)mask, 31)
intrinsics.simd_masked_store(mem_addr, a, mask_mask)
}
// Loads packed single-precision (32-bit) floating-point elements from memory
// into result using `mask` (elements are zeroed out when the high bit of the
// corresponding element is not set).
@(require_results, enable_target_feature="sse,avx")
_mm_maskload_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m128i) -> __m128 {
mask_mask := intrinsics.simd_shr(transmute(#simd[4]i32)mask, 31)
return intrinsics.simd_masked_load(mem_addr, _mm_setzero_ps(), mask_mask)
}
// Stores packed single-precision (32-bit) floating-point elements from `a`
// into memory using `mask`.
@(enable_target_feature="avx")
_mm_maskstore_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m128i, a: __m128) {
mask_mask := intrinsics.simd_shr(transmute(#simd[4]i32)mask, 31)
intrinsics.simd_masked_store(mem_addr, a, mask_mask)
}
// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`, and returns the results.
@(require_results, enable_target_feature="avx")
_mm256_movehdup_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
return intrinsics.simd_shuffle(a, a, 1, 1, 3, 3, 5, 5, 7, 7)
}
// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`, and returns the results.
@(require_results, enable_target_feature="avx")
_mm256_moveldup_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
return intrinsics.simd_shuffle(a, a, 0, 0, 2, 2, 4, 4, 6, 6)
}
// Duplicate even-indexed double-precision (64-bit) floating-point elements from `a`, and returns the results.
@(require_results, enable_target_feature="avx")
_mm256_movedup_pd :: #force_inline proc "c" (a: __m256d) -> __m256d {
return intrinsics.simd_shuffle(a, a, 0, 0, 2, 2)
}
// Loads 256-bits of integer data from unaligned memory into result.
// This intrinsic may perform better than `_mm256_loadu_si256` when the
// data crosses a cache line boundary.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
@(require_results, enable_target_feature="avx")
_mm256_lddqu_si256 :: #force_inline proc "c" (mem_addr: ^__m256i) -> __m256i {
return transmute(__m256i)llvm_vlddqu(mem_addr)
}
/*
// Moves integer data from a 256-bit integer vector to a 32-byte
// aligned memory location. To minimize caching, the data is flagged as
// non-temporal (unlikely to be used again soon)
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
//
// # Safety of non-temporal stores
//
// After using this intrinsic, but before any other access to the memory that this intrinsic
// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
// return.
//
// See [`_mm_sfence`] for details.
@(enable_target_feature="avx")
_mm256_stream_si256 :: #force_inline proc "c" (mem_addr: ^__m256i, a: __m256i) {
panic_contextless("TODO: _mm256_stream_si256")
}
// Moves double-precision values from a 256-bit vector of `[4 x double]`
// to a 32-byte aligned memory location. To minimize caching, the data is
// flagged as non-temporal (unlikely to be used again soon).
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
//
// # Safety of non-temporal stores
//
// After using this intrinsic, but before any other access to the memory that this intrinsic
// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
// return.
//
// See [`_mm_sfence`] for details.
@(enable_target_feature="avx")
_mm256_stream_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m256d) {
panic_contextless("TODO: _mm256_stream_pd")
}
// Moves single-precision floating point values from a 256-bit vector
// of `[8 x float]` to a 32-byte aligned memory location. To minimize
// caching, the data is flagged as non-temporal (unlikely to be used again
// soon).
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
//
// # Safety of non-temporal stores
//
// After using this intrinsic, but before any other access to the memory that this intrinsic
// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
// return.
//
// See [`_mm_sfence`] for details.
@(enable_target_feature="avx")
_mm256_stream_ps :: #force_inline proc "c" (mem_addr: ^f32, a: __m256) {
panic_contextless("TODO: _mm256_stream_ps")
}
*/
// Computes the approximate reciprocal of packed single-precision (32-bit) floating-point elements in `a`, and returns the results. The maximum
// relative error for this approximation is less than 1.5*2^-12.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
@(require_results, enable_target_feature="avx")
_mm256_rcp_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
return llvm_vrcpps(a)
}
// Computes the approximate reciprocal square root of packed single-precision
// (32-bit) floating-point elements in `a`, and returns the results.
// The maximum relative error for this approximation is less than 1.5*2^-12.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
@(require_results, enable_target_feature="avx")
_mm256_rsqrt_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
return llvm_vrsqrtps(a)
}
// Unpacks and interleave double-precision (64-bit) floating-point elements
// from the high half of each 128-bit lane in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_unpackhi_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
return intrinsics.simd_shuffle(a, b, 1, 5, 3, 7)
}
// Unpacks and interleave single-precision (32-bit) floating-point elements
// from the high half of each 128-bit lane in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_unpackhi_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
return intrinsics.simd_shuffle(a, b, 2, 10, 3, 11, 6, 14, 7, 15)
}
// Unpacks and interleave double-precision (64-bit) floating-point elements
// from the low half of each 128-bit lane in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_unpacklo_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
return intrinsics.simd_shuffle(a, b, 0, 4, 2, 6)
}
// Unpacks and interleave single-precision (32-bit) floating-point elements
// from the low half of each 128-bit lane in `a` and `b`.
@(require_results, enable_target_feature="avx")
_mm256_unpacklo_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
return intrinsics.simd_shuffle(a, b, 0, 8, 1, 9, 4, 12, 5, 13)
}
// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
@(require_results, enable_target_feature="avx")
_mm256_testz_si256 :: #force_inline proc "c" (a, b: __m256i) -> i32 {
r := intrinsics.simd_bit_and(a, b)
return i32(0 == intrinsics.simd_reduce_or(r))
}
// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
@(require_results, enable_target_feature="avx")
_mm256_testc_si256 :: #force_inline proc "c" (a, b: __m256i) -> i32 {
r := intrinsics.simd_bit_and(intrinsics.simd_bit_xor(a, __m256i(~i64(0))), b)
return i32(0 == intrinsics.simd_reduce_or(r))
}
// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
// `CF` values are zero, otherwise return 0.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
@(require_results, enable_target_feature="avx")
_mm256_testnzc_si256 :: #force_inline proc "c" (a, b: __m256i) -> i32 {
return llvm_ptestnzc256(a, b)
}
// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit
// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
// NOT of `a` and then AND with `b`, producing an intermediate value, and set
// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
// is zero, otherwise set `CF` to 0. Return the `ZF` value.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
@(require_results, enable_target_feature="avx")
_mm256_testz_pd :: #force_inline proc "c" (a, b: __m256d) -> i32 {
return llvm_vtestzpd256(a, b)
}
// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit
// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
// NOT of `a` and then AND with `b`, producing an intermediate value, and set
// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
// is zero, otherwise set `CF` to 0. Return the `CF` value.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
@(require_results, enable_target_feature="avx")
_mm256_testc_pd :: #force_inline proc "c" (a, b: __m256d) -> i32 {
return llvm_vtestcpd256(a, b)
}
// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit
// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
// NOT of `a` and then AND with `b`, producing an intermediate value, and set
// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
// are zero, otherwise return 0.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
@(require_results, enable_target_feature="avx")
_mm256_testnzc_pd :: #force_inline proc "c" (a, b: __m256d) -> i32 {
return llvm_vtestnzcpd256(a, b)
}
// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit
// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
// NOT of `a` and then AND with `b`, producing an intermediate value, and set
// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
// is zero, otherwise set `CF` to 0. Return the `ZF` value.
@(require_results, enable_target_feature="sse2,avx")
_mm_testz_pd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
r := intrinsics.simd_lanes_lt(transmute(__m128i)_mm_and_pd(a, b), __m128i(0))
return i32(0 == intrinsics.simd_reduce_or(r))
}
// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit
// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
// NOT of `a` and then AND with `b`, producing an intermediate value, and set
// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
// is zero, otherwise set `CF` to 0. Return the `CF` value.
@(require_results, enable_target_feature="sse2,avx")
_mm_testc_pd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
r := intrinsics.simd_lanes_lt(transmute(__m128i)_mm_andnot_pd(a, b), __m128i(0))
return i32(0 == intrinsics.simd_reduce_or(r))
}
// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit
// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
// NOT of `a` and then AND with `b`, producing an intermediate value, and set
// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
// are zero, otherwise return 0.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
@(require_results, enable_target_feature="avx")
_mm_testnzc_pd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
return llvm_vtestnzcpd(a, b)
}
// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit
// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
// NOT of `a` and then AND with `b`, producing an intermediate value, and set
// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
// is zero, otherwise set `CF` to 0. Return the `ZF` value.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
@(require_results, enable_target_feature="avx")
_mm256_testz_ps :: #force_inline proc "c" (a, b: __m256) -> i32 {
return llvm_vtestzps256(a, b)
}
// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit
// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
// NOT of `a` and then AND with `b`, producing an intermediate value, and set
// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
// is zero, otherwise set `CF` to 0. Return the `CF` value.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
@(require_results, enable_target_feature="avx")
_mm256_testc_ps :: #force_inline proc "c" (a, b: __m256) -> i32 {
return llvm_vtestcps256(a, b)
}
// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit
// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
// NOT of `a` and then AND with `b`, producing an intermediate value, and set
// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
// are zero, otherwise return 0.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
@(require_results, enable_target_feature="avx")
_mm256_testnzc_ps :: #force_inline proc "c" (a, b: __m256) -> i32 {
return llvm_vtestnzcps256(a, b)
}
// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit
// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
// NOT of `a` and then AND with `b`, producing an intermediate value, and set
// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
// is zero, otherwise set `CF` to 0. Return the `ZF` value.
@(require_results, enable_target_feature="sse,avx")
_mm_testz_ps :: #force_inline proc "c" (a: __m128, b: __m128) -> i32 {
r := intrinsics.simd_lanes_lt(transmute(#simd[4]i32)_mm_and_ps(a, b), (#simd[4]i32)(0))
return i32(0 == intrinsics.simd_reduce_or(r))
}
// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit
// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
// NOT of `a` and then AND with `b`, producing an intermediate value, and set
// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
// is zero, otherwise set `CF` to 0. Return the `CF` value.
@(require_results, enable_target_feature="sse,avx")
_mm_testc_ps :: #force_inline proc "c" (a: __m128, b: __m128) -> i32 {
r := intrinsics.simd_lanes_lt(transmute(#simd[4]i32)_mm_andnot_ps(a, b), (#simd[4]i32)(0))
return i32(0 == intrinsics.simd_reduce_or(r))
}
// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit
// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
// NOT of `a` and then AND with `b`, producing an intermediate value, and set
// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
// are zero, otherwise return 0.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
@(require_results, enable_target_feature="avx")
_mm_testnzc_ps :: #force_inline proc "c" (a: __m128, b: __m128) -> i32 {
return llvm_vtestnzcps(a, b)
}
// Sets each bit of the returned mask based on the most significant bit of the
// corresponding packed double-precision (64-bit) floating-point element in
// `a`.
@(require_results, enable_target_feature="avx")
_mm256_movemask_pd :: #force_inline proc "c" (a: __m256d) -> i32 {
mask := intrinsics.simd_lanes_lt(transmute(#simd[4]i64)a, (#simd[4]i64)(0))
return i32(transmute(u8)intrinsics.simd_extract_lsbs(mask))
}
// Sets each bit of the returned mask based on the most significant bit of the
// corresponding packed single-precision (32-bit) floating-point element in
// `a`.
@(require_results, enable_target_feature="avx")
_mm256_movemask_ps :: #force_inline proc "c" (a: __m256) -> i32 {
// Propagate the highest bit to the rest, because simd_bitmask
// requires all-1 or all-0.
mask := intrinsics.simd_lanes_lt(transmute(#simd[8]i32)a, (#simd[8]i32)(0))
return i32(transmute(u8)intrinsics.simd_extract_lsbs(mask))
}
// Returns vector of type __m256d with all elements set to zero.
@(require_results, enable_target_feature="avx")
_mm256_setzero_pd :: #force_inline proc "c" () -> __m256d {
return 0
}
// Returns vector of type __m256 with all elements set to zero.
@(require_results, enable_target_feature="avx")
_mm256_setzero_ps :: #force_inline proc "c" () -> __m256 {
return 0
}
// Returns vector of type __m256i with all elements set to zero.
@(require_results, enable_target_feature="avx")
_mm256_setzero_si256 :: #force_inline proc "c" () -> __m256i {
return 0
}
// Sets packed double-precision (64-bit) floating-point elements in returned
// vector with the supplied values.
@(require_results, enable_target_feature="avx")
_mm256_set_pd :: #force_inline proc "c" (a: f64, b: f64, c: f64, d: f64) -> __m256d {
return _mm256_setr_pd(d, c, b, a)
}
// Sets packed single-precision (32-bit) floating-point elements in returned
// vector with the supplied values.
@(require_results, enable_target_feature="avx")
_mm256_set_ps :: #force_inline proc "c" (
a: f32,
b: f32,
c: f32,
d: f32,
e: f32,
f: f32,
g: f32,
h: f32,
) -> __m256 {
return _mm256_setr_ps(h, g, f, e, d, c, b, a)
}
// Sets packed 8-bit integers in returned vector with the supplied values.
@(require_results, enable_target_feature="avx")
_mm256_set_epi8 :: #force_inline proc "c" (
e00, e01, e02, e03, e04, e05, e06, e07: i8,
e08, e09, e10, e11, e12, e13, e14, e15: i8,
e16, e17, e18, e19, e20, e21, e22, e23: i8,
e24, e25, e26, e27, e28, e29, e30, e31: i8,
) -> __m256i {
return _mm256_setr_epi8(
e31, e30, e29, e28, e27, e26, e25, e24,
e23, e22, e21, e20, e19, e18, e17, e16,
e15, e14, e13, e12, e11, e10, e09, e08,
e07, e06, e05, e04, e03, e02, e01, e00,
)
}
// Sets packed 16-bit integers in returned vector with the supplied values.
@(require_results, enable_target_feature="avx")
_mm256_set_epi16 :: #force_inline proc "c" (
e00, e01, e02, e03, e04, e05, e06, e07: i16,
e08, e09, e10, e11, e12, e13, e14, e15: i16,
) -> __m256i {
return _mm256_setr_epi16(
e15, e14, e13, e12,
e11, e10, e09, e08,
e07, e06, e05, e04,
e03, e02, e01, e00,
)
}
// Sets packed 32-bit integers in returned vector with the supplied values.
@(require_results, enable_target_feature="avx")
_mm256_set_epi32 :: #force_inline proc "c" (e0, e1, e2, e3, e4, e5, e6, e7: i32) -> __m256i {
return _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
}
// Sets packed 64-bit integers in returned vector with the supplied values.
@(require_results, enable_target_feature="avx")
_mm256_set_epi64x :: #force_inline proc "c" (a: i64, b: i64, c: i64, d: i64) -> __m256i {
return _mm256_setr_epi64x(d, c, b, a)
}
// Sets packed double-precision (64-bit) floating-point elements in returned
// vector with the supplied values in reverse order.
@(require_results, enable_target_feature="avx")
_mm256_setr_pd :: #force_inline proc "c" (a: f64, b: f64, c: f64, d: f64) -> __m256d {
return __m256d{a, b, c, d}
}
// Sets packed single-precision (32-bit) floating-point elements in returned
// vector with the supplied values in reverse order.
@(require_results, enable_target_feature="avx")
_mm256_setr_ps :: #force_inline proc "c" (a, b, c, d, e, f, g, h: f32) -> __m256 {
return __m256{a, b, c, d, e, f, g, h}
}
// Sets packed 8-bit integers in returned vector with the supplied values in
// reverse order.
@(require_results, enable_target_feature="avx")
_mm256_setr_epi8 :: #force_inline proc "c" (
e00, e01, e02, e03, e04, e05, e06, e07: i8,
e08, e09, e10, e11, e12, e13, e14, e15: i8,
e16, e17, e18, e19, e20, e21, e22, e23: i8,
e24, e25, e26, e27, e28, e29, e30, e31: i8,
) -> __m256i {
return transmute(__m256i)#simd[32]i8{
e00, e01, e02, e03, e04, e05, e06, e07,
e08, e09, e10, e11, e12, e13, e14, e15,
e16, e17, e18, e19, e20, e21, e22, e23,
e24, e25, e26, e27, e28, e29, e30, e31,
}
}
// Sets packed 16-bit integers in returned vector with the supplied values in
// reverse order.
@(require_results, enable_target_feature="avx")
_mm256_setr_epi16 :: #force_inline proc "c" (
e00, e01, e02, e03, e04, e05, e06, e07: i16,
e08, e09, e10, e11, e12, e13, e14, e15: i16,
) -> __m256i {
return transmute(__m256i)#simd[16]i16{
e00, e01, e02, e03,
e04, e05, e06, e07,
e08, e09, e10, e11,
e12, e13, e14, e15,
}
}
// Sets packed 32-bit integers in returned vector with the supplied values in
// reverse order.
@(require_results, enable_target_feature="avx")
_mm256_setr_epi32 :: #force_inline proc "c" (e0, e1, e2, e3, e4, e5, e6, e7: i32) -> __m256i {
return transmute(__m256i)#simd[8]i32{e0, e1, e2, e3, e4, e5, e6, e7}
}
// Sets packed 64-bit integers in returned vector with the supplied values in
// reverse order.
@(require_results, enable_target_feature="avx")
_mm256_setr_epi64x :: #force_inline proc "c" (a: i64, b: i64, c: i64, d: i64) -> __m256i {
return {a, b, c, d}
}
// Broadcasts double-precision (64-bit) floating-point value `a` to all elements of returned vector.
@(require_results, enable_target_feature="avx")
_mm256_set1_pd :: #force_inline proc "c" (a: f64) -> __m256d {
return a
}
// Broadcasts single-precision (32-bit) floating-point value `a` to all elements of returned vector.
@(require_results, enable_target_feature="avx")
_mm256_set1_ps :: #force_inline proc "c" (a: f32) -> __m256 {
return a
}
// Broadcasts 8-bit integer `a` to all elements of returned vector.
// This intrinsic may generate the `vpbroadcastb`.
@(require_results, enable_target_feature="avx")
_mm256_set1_epi8 :: #force_inline proc "c" (a: i8) -> __m256i {
return transmute(__m256i)(#simd[32]i8)(a)
}
// Broadcasts 16-bit integer `a` to all elements of returned vector.
// This intrinsic may generate the `vpbroadcastw`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
@(require_results, enable_target_feature="avx")
_mm256_set1_epi16 :: #force_inline proc "c" (a: i16) -> __m256i {
return transmute(__m256i)(#simd[16]i16)(a)
}
// Broadcasts 32-bit integer `a` to all elements of returned vector.
// This intrinsic may generate the `vpbroadcastd`.
@(require_results, enable_target_feature="avx")
_mm256_set1_epi32 :: #force_inline proc "c" (a: i32) -> __m256i {
return transmute(__m256i)(#simd[8]i32)(a)
}
// Broadcasts 64-bit integer `a` to all elements of returned vector.
// This intrinsic may generate the `vpbroadcastq`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
@(require_results, enable_target_feature="avx")
_mm256_set1_epi64x :: #force_inline proc "c" (a: i64) -> __m256i {
return a
}
// Cast vector of type __m256d to type __m256.
@(require_results, enable_target_feature="avx")
_mm256_castpd_ps :: #force_inline proc "c" (a: __m256d) -> __m256 {
return transmute(__m256)a
}
// Cast vector of type __m256 to type __m256d.
@(require_results, enable_target_feature="avx")
_mm256_castps_pd :: #force_inline proc "c" (a: __m256) -> __m256d {
return transmute(__m256d)a
}
// Casts vector of type __m256 to type __m256i.
@(require_results, enable_target_feature="avx")
_mm256_castps_si256 :: #force_inline proc "c" (a: __m256) -> __m256i {
return transmute(__m256i)a
}
// Casts vector of type __m256i to type __m256.
@(require_results, enable_target_feature="avx")
_mm256_castsi256_ps :: #force_inline proc "c" (a: __m256i) -> __m256 {
return transmute(__m256)a
}
// Casts vector of type __m256d to type __m256i.
@(require_results, enable_target_feature="avx")
_mm256_castpd_si256 :: #force_inline proc "c" (a: __m256d) -> __m256i {
return transmute(__m256i)a
}
// Casts vector of type __m256i to type __m256d.
@(require_results, enable_target_feature="avx")
_mm256_castsi256_pd :: #force_inline proc "c" (a: __m256i) -> __m256d {
return transmute(__m256d)a
}
// Casts vector of type __m256 to type __m128.
@(require_results, enable_target_feature="avx")
_mm256_castps256_ps128 :: #force_inline proc "c" (a: __m256) -> __m128 {
return intrinsics.simd_shuffle(a, a, 0, 1, 2, 3)
}
// Casts vector of type __m256d to type __m128d.
@(require_results, enable_target_feature="avx")
_mm256_castpd256_pd128 :: #force_inline proc "c" (a: __m256d) -> __m128d {
return intrinsics.simd_shuffle(a, a, 0, 1)
}
// Casts vector of type __m256i to type __m128i.
@(require_results, enable_target_feature="avx")
_mm256_castsi256_si128 :: #force_inline proc "c" (a: __m256i) -> __m128i {
return intrinsics.simd_shuffle(a, a, 0, 1)
}
// Casts vector of type __m128 to type __m256;
// the upper 128 bits of the result are indeterminate.
//
// In the Intel documentation, the upper bits are declared to be "undefined".
@(require_results, enable_target_feature="sse,avx")
_mm256_castps128_ps256 :: #force_inline proc "c" (a: __m128) -> __m256 {
return intrinsics.simd_shuffle(a, _mm_undefined_ps(), 0, 1, 2, 3, 4, 4, 4, 4)
}
// Casts vector of type __m128d to type __m256d;
// the upper 128 bits of the result are indeterminate.
//
// In the Intel documentation, the upper bits are declared to be "undefined".
@(require_results, enable_target_feature="sse2,avx")
_mm256_castpd128_pd256 :: #force_inline proc "c" (a: __m128d) -> __m256d {
return intrinsics.simd_shuffle(a, _mm_undefined_pd(), 0, 1, 2, 2)
}
// Casts vector of type __m128i to type __m256i;
// the upper 128 bits of the result are indeterminate.
//
// In the Intel documentation, the upper bits are declared to be "undefined".
@(require_results, enable_target_feature="avx")
_mm256_castsi128_si256 :: #force_inline proc "c" (a: __m128i) -> __m256i {
return intrinsics.simd_shuffle(a, __m128i(0), 0, 1, 2, 2)
}
// Constructs a 256-bit floating-point vector of `[8 x float]` from a
// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
// the value of the source vector. The upper 128 bits are set to zero.
@(require_results, enable_target_feature="sse,avx")
_mm256_zextps128_ps256 :: #force_inline proc "c" (a: __m128) -> __m256 {
return intrinsics.simd_shuffle(a, _mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7)
}
// Constructs a 256-bit integer vector from a 128-bit integer vector.
// The lower 128 bits contain the value of the source vector. The upper
// 128 bits are set to zero.
@(require_results, enable_target_feature="avx")
_mm256_zextsi128_si256 :: #force_inline proc "c" (a: __m128i) -> __m256i {
return intrinsics.simd_shuffle(a, __m128i(0), 0, 1, 2, 3)
}
// Constructs a 256-bit floating-point vector of `[4 x double]` from a
// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
// contain the value of the source vector. The upper 128 bits are set
// to zero.
@(require_results, enable_target_feature="sse2,avx")
_mm256_zextpd128_pd256 :: #force_inline proc "c" (a: __m128d) -> __m256d {
return intrinsics.simd_shuffle(a, _mm_setzero_pd(), 0, 1, 2, 3)
}
// Returns vector of type `__m256` with indeterminate elements.
// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
@(require_results, enable_target_feature="avx")
_mm256_undefined_ps :: #force_inline proc "c" () -> __m256 {
return 0
}
// Returns vector of type `__m256d` with indeterminate elements.
// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
@(require_results, enable_target_feature="avx")
_mm256_undefined_pd :: #force_inline proc "c" () -> __m256d {
return 0
}
// Returns vector of type __m256i with with indeterminate elements.
// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
@(require_results, enable_target_feature="avx")
_mm256_undefined_si256 :: #force_inline proc "c" () -> __m256i {
return 0
}
// Sets packed __m256 returned vector with the supplied values.
@(require_results, enable_target_feature="avx")
_mm256_set_m128 :: #force_inline proc "c" (hi: __m128, lo: __m128) -> __m256 {
return intrinsics.simd_shuffle(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7)
}
// Sets packed __m256d returned vector with the supplied values.
@(require_results, enable_target_feature="avx")
_mm256_set_m128d :: #force_inline proc "c" (hi: __m128d, lo: __m128d) -> __m256d {
hi := transmute(__m128)hi
lo := transmute(__m128)lo
return transmute(__m256d)_mm256_set_m128(hi, lo)
}
// Sets packed __m256i returned vector with the supplied values.
@(require_results, enable_target_feature="avx")
_mm256_set_m128i :: #force_inline proc "c" (hi: __m128i, lo: __m128i) -> __m256i {
hi := transmute(__m128)hi
lo := transmute(__m128)lo
return transmute(__m256i)_mm256_set_m128(hi, lo)
}
// Sets packed __m256 returned vector with the supplied values.
@(require_results, enable_target_feature="avx")
_mm256_setr_m128 :: #force_inline proc "c" (lo: __m128, hi: __m128) -> __m256 {
return _mm256_set_m128(hi, lo)
}
// Sets packed __m256d returned vector with the supplied values.
@(require_results, enable_target_feature="avx")
_mm256_setr_m128d :: #force_inline proc "c" (lo: __m128d, hi: __m128d) -> __m256d {
return _mm256_set_m128d(hi, lo)
}
// Sets packed __m256i returned vector with the supplied values.
@(require_results, enable_target_feature="avx")
_mm256_setr_m128i :: #force_inline proc "c" (lo: __m128i, hi: __m128i) -> __m256i {
return _mm256_set_m128i(hi, lo)
}
// Loads two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value.
// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
@(require_results, enable_target_feature="sse,avx")
_mm256_loadu2_m128 :: #force_inline proc "c" (hiaddr, loaddr: ^f32) -> __m256 {
a := _mm256_castps128_ps256(_mm_loadu_ps(loaddr))
return _mm256_insertf128_ps(a, _mm_loadu_ps(hiaddr), 1)
}
// Loads two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value.
// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
@(require_results, enable_target_feature="sse2,avx")
_mm256_loadu2_m128d :: #force_inline proc "c" (hiaddr, loaddr: ^f64) -> __m256d {
a := _mm256_castpd128_pd256(_mm_loadu_pd(loaddr))
return _mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1)
}
// Loads two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value.
// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
@(require_results, enable_target_feature="sse2,avx")
_mm256_loadu2_m128i :: #force_inline proc "c" (hiaddr, loaddr: ^__m128i) -> __m256i {
a := _mm256_castsi128_si256(_mm_loadu_si128(loaddr))
return _mm256_insertf128_si256(a, _mm_loadu_si128(hiaddr), 1)
}
// Stores the high and low 128-bit halves (each composed of 4 packed
// single-precision (32-bit) floating-point elements) from `a` into memory two
// different 128-bit locations.
// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
@(enable_target_feature="sse,avx")
_mm256_storeu2_m128 :: #force_inline proc "c" (hiaddr, loaddr: ^f32, a: __m256) {
lo := _mm256_castps256_ps128(a)
_mm_storeu_ps(loaddr, lo)
hi := _mm256_extractf128_ps(a, 1)
_mm_storeu_ps(hiaddr, hi)
}
// Stores the high and low 128-bit halves (each composed of 2 packed
// double-precision (64-bit) floating-point elements) from `a` into memory two
// different 128-bit locations.
// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
@(enable_target_feature="sse2,avx")
_mm256_storeu2_m128d :: #force_inline proc "c" (hiaddr, loaddr: ^f64, a: __m256d) {
lo := _mm256_castpd256_pd128(a)
_mm_storeu_pd(loaddr, lo)
hi := _mm256_extractf128_pd(a, 1)
_mm_storeu_pd(hiaddr, hi)
}
// Stores the high and low 128-bit halves (each composed of integer data) from
// `a` into memory two different 128-bit locations.
// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
@(enable_target_feature="sse2,avx")
_mm256_storeu2_m128i :: #force_inline proc "c" (hiaddr, loaddr: ^__m128i, a: __m256i) {
lo := _mm256_castsi256_si128(a)
_mm_storeu_si128(loaddr, lo)
hi := _mm256_extractf128_si256(a, 1)
_mm_storeu_si128(hiaddr, hi)
}
// Returns the first element of the input vector of `[8 x float]`.
@(require_results, enable_target_feature="avx")
_mm256_cvtss_f32 :: #force_inline proc "c" (a: __m256) -> f32 {
return intrinsics.simd_extract(a, 0)
}
@(require_results, enable_target_feature="avx")
_mm256_insert_epi64 :: #force_inline proc "c" (a: __m256i, i: i64, $idx: u32) -> __m256i {
return intrinsics.simd_replace(transmute(#simd[4]i64)a, idx, i)
}
@(require_results, enable_target_feature="avx")
_mm256_extract_epi64 :: #force_inline proc "c" (a: __m256i, $idx: u32) -> i64 {
return intrinsics.simd_extract(transmute(#simd[4]i64)a, idx)
}
@(private, default_calling_convention="none")
foreign _ {
@(link_name="llvm.x86.avx.round.pd.256") llvm_roundpd256 :: proc(a: __m256d, #const b: u32) -> __m256d ---
@(link_name="llvm.x86.avx.round.ps.256") llvm_roundps256 :: proc(a: __m256, #const b: u32) -> __m256 ---
@(link_name="llvm.x86.avx.dp.ps.256") llvm_vdpps :: proc(a, b: __m256, #const imm8: u8) -> __m256 ---
@(link_name="llvm.x86.sse2.cmp.pd") llvm_vcmppd :: proc(a, b: __m128d, #const imm8: u8) -> __m128d ---
@(link_name="llvm.x86.avx.cmp.pd.256") llvm_vcmppd256 :: proc(a, b: __m256d, imm8: u8) -> __m256d ---
@(link_name="llvm.x86.sse.cmp.ps") llvm_vcmpps :: proc(a: __m128, b: __m128, #const imm8: u8) -> __m128 ---
@(link_name="llvm.x86.avx.cmp.ps.256") llvm_vcmpps256 :: proc(a, b: __m256, imm8: u8) -> __m256 ---
@(link_name="llvm.x86.sse2.cmp.sd") llvm_vcmpsd :: proc(a, b: __m128d, #const imm8: u8) -> __m128d ---
@(link_name="llvm.x86.sse.cmp.ss") llvm_vcmpss :: proc(a: __m128, b: __m128, #const imm8: u8) -> __m128 ---
@(link_name="llvm.x86.avx.cvt.ps2dq.256") llvm_vcvtps2dq :: proc(a: __m256) -> #simd[8]i32 ---
@(link_name="llvm.x86.avx.cvtt.pd2dq.256") llvm_vcvttpd2dq :: proc(a: __m256d) -> #simd[4]i32 ---
@(link_name="llvm.x86.avx.cvt.pd2dq.256") llvm_vcvtpd2dq :: proc(a: __m256d) -> #simd[4]i32 ---
@(link_name="llvm.x86.avx.cvtt.ps2dq.256") llvm_vcvttps2dq :: proc(a: __m256) -> #simd[8]i32 ---
@(link_name="llvm.x86.avx.vzeroall") llvm_vzeroall :: proc() ---
@(link_name="llvm.x86.avx.vzeroupper") llvm_vzeroupper :: proc() ---
@(link_name="llvm.x86.avx.vpermilvar.ps.256") llvm_vpermilps256 :: proc(a: __m256, b: #simd[8]i32) -> __m256 ---
@(link_name="llvm.x86.avx.vpermilvar.ps") llvm_vpermilps :: proc(a: __m128, b: #simd[4]i32) -> __m128 ---
@(link_name="llvm.x86.avx.vpermilvar.pd.256") llvm_vpermilpd256 :: proc(a: __m256d, b: #simd[4]i64) -> __m256d ---
@(link_name="llvm.x86.avx.vpermilvar.pd") llvm_vpermilpd :: proc(a: __m128d, b: #simd[2]i64) -> __m128d ---
@(link_name="llvm.x86.avx.ldu.dq.256") llvm_vlddqu :: proc(mem_addr: rawptr) -> #simd[32]i8 ---
@(link_name="llvm.x86.avx.rcp.ps.256") llvm_vrcpps :: proc(a: __m256) -> __m256 ---
@(link_name="llvm.x86.avx.rsqrt.ps.256") llvm_vrsqrtps :: proc(a: __m256) -> __m256 ---
@(link_name="llvm.x86.avx.ptestnzc.256") llvm_ptestnzc256 :: proc(a: #simd[4]i64, b: #simd[4]i64) -> i32 ---
@(link_name="llvm.x86.avx.vtestz.pd.256") llvm_vtestzpd256 :: proc(a, b: __m256d) -> i32 ---
@(link_name="llvm.x86.avx.vtestc.pd.256") llvm_vtestcpd256 :: proc(a, b: __m256d) -> i32 ---
@(link_name="llvm.x86.avx.vtestnzc.pd.256") llvm_vtestnzcpd256 :: proc(a, b: __m256d) -> i32 ---
@(link_name="llvm.x86.avx.vtestnzc.pd") llvm_vtestnzcpd :: proc(a, b: __m128d) -> i32 ---
@(link_name="llvm.x86.avx.vtestz.ps.256") llvm_vtestzps256 :: proc(a, b: __m256) -> i32 ---
@(link_name="llvm.x86.avx.vtestc.ps.256") llvm_vtestcps256 :: proc(a, b: __m256) -> i32 ---
@(link_name="llvm.x86.avx.vtestnzc.ps.256") llvm_vtestnzcps256 :: proc(a, b: __m256) -> i32 ---
@(link_name="llvm.x86.avx.vtestnzc.ps") llvm_vtestnzcps :: proc(a: __m128, b: __m128) -> i32 ---
@(link_name="llvm.x86.avx.min.ps.256") llvm_vminps :: proc(a, b: __m256) -> __m256 ---
@(link_name="llvm.x86.avx.max.ps.256") llvm_vmaxps :: proc(a, b: __m256) -> __m256 ---
@(link_name="llvm.x86.avx.min.pd.256") llvm_vminpd :: proc(a, b: __m256d) -> __m256d ---
@(link_name="llvm.x86.avx.max.pd.256") llvm_vmaxpd :: proc(a, b: __m256d) -> __m256d ---
}