// Odin/core/simd/x86/avx.odin

#+build i386, amd64
package simd_x86

import "base:intrinsics"

// Lane-wise addition of the packed double-precision (64-bit) floating-point
// elements of `a` and `b`.
//
// Returns: a vector whose every lane is `a[i] + b[i]`.
@(require_results, enable_target_feature="avx")
_mm256_add_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	sum := intrinsics.simd_add(a, b)
	return sum
}

// Lane-wise addition of the packed single-precision (32-bit) floating-point
// elements of `a` and `b`.
//
// Returns: a vector whose every lane is `a[i] + b[i]`.
@(require_results, enable_target_feature="avx")
_mm256_add_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	sum := intrinsics.simd_add(a, b)
	return sum
}

// Computes the bitwise AND of the packed double-precision (64-bit)
// floating-point elements in `a` and `b`.
//
// Both operands are reinterpreted as four `u64` lanes, AND-ed, and the bits
// are reinterpreted back as `__m256d`.
@(require_results, enable_target_feature="avx")
_mm256_and_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	lhs_bits := transmute(#simd[4]u64)a
	rhs_bits := transmute(#simd[4]u64)b
	anded := intrinsics.simd_bit_and(lhs_bits, rhs_bits)
	return transmute(__m256d)anded
}

// Computes the bitwise AND of the packed single-precision (32-bit)
// floating-point elements in `a` and `b`.
//
// Both operands are reinterpreted as eight `u32` lanes, AND-ed, and the bits
// are reinterpreted back as `__m256`.
@(require_results, enable_target_feature="avx")
_mm256_and_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	lhs_bits := transmute(#simd[8]u32)a
	rhs_bits := transmute(#simd[8]u32)b
	anded := intrinsics.simd_bit_and(lhs_bits, rhs_bits)
	return transmute(__m256)anded
}

// Computes the bitwise OR of the packed double-precision (64-bit)
// floating-point elements in `a` and `b`.
//
// Both operands are reinterpreted as four `u64` lanes, OR-ed, and the bits
// are reinterpreted back as `__m256d`.
@(require_results, enable_target_feature="avx")
_mm256_or_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	lhs_bits := transmute(#simd[4]u64)a
	rhs_bits := transmute(#simd[4]u64)b
	ored := intrinsics.simd_bit_or(lhs_bits, rhs_bits)
	return transmute(__m256d)ored
}

// Computes the bitwise OR of the packed single-precision (32-bit)
// floating-point elements in `a` and `b`.
//
// Both operands are reinterpreted as eight `u32` lanes, OR-ed, and the bits
// are reinterpreted back as `__m256`.
@(require_results, enable_target_feature="avx")
_mm256_or_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	lhs_bits := transmute(#simd[8]u32)a
	rhs_bits := transmute(#simd[8]u32)b
	ored := intrinsics.simd_bit_or(lhs_bits, rhs_bits)
	return transmute(__m256)ored
}

// Shuffles double-precision (64-bit) floating-point elements of `a` and `b`
// within 128-bit lanes, using the compile-time control `MASK`.
//
// Shuffle indices 0..3 address `a`, 4..7 address `b`. Each of the low four
// bits of `MASK` picks the low (0) or high (1) element of a 128-bit lane:
//   result[0] <- a[bit 0],     result[1] <- b[bit 1],
//   result[2] <- a[2 + bit 2], result[3] <- b[2 + bit 3].
@(require_results, enable_target_feature="avx")
_mm256_shuffle_pd :: #force_inline proc "c" (a, b: __m256d, $MASK: u8) -> __m256d {
	shuffled := intrinsics.simd_shuffle(
		a,
		b,
		(MASK >> 0) & 1,
		4 + ((MASK >> 1) & 1),
		2 + ((MASK >> 2) & 1),
		6 + ((MASK >> 3) & 1),
	)
	return shuffled
}


// Shuffles single-precision (32-bit) floating-point elements of `a` and `b`
// within 128-bit lanes, using the compile-time control `MASK`.
//
// Shuffle indices 0..7 address `a`, 8..15 address `b`. `MASK` is read as four
// 2-bit selectors; the same selectors are applied to both 128-bit lanes.
// In each lane the first two results come from `a` and the last two from `b`.
@(require_results, enable_target_feature="avx")
_mm256_shuffle_ps :: #force_inline proc "c" (a, b: __m256, $MASK: u8) -> __m256 {
	shuffled := intrinsics.simd_shuffle(
		a,
		b,
		// low 128-bit lane
		(MASK >> 0) & 0b11,
		(MASK >> 2) & 0b11,
		8 + ((MASK >> 4) & 0b11),
		8 + ((MASK >> 6) & 0b11),
		// high 128-bit lane
		4 + ((MASK >> 0) & 0b11),
		4 + ((MASK >> 2) & 0b11),
		12 + ((MASK >> 4) & 0b11),
		12 + ((MASK >> 6) & 0b11),
	)
	return shuffled
}


// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
// elements in `a`, and then ANDs the result with `b`: `(~a) & b`.
//
// Both operands are reinterpreted as four `u64` lanes for the bit operations
// and the result is reinterpreted back as `__m256d`.
@(require_results, enable_target_feature="avx")
_mm256_andnot_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	a := transmute(#simd[4]u64)a
	b := transmute(#simd[4]u64)b
	// XOR with all-ones flips every bit, i.e. bitwise NOT.
	// (XOR with zero would leave `a` unchanged and yield a plain AND.)
	all_ones := (#simd[4]u64)(max(u64))
	not_a := intrinsics.simd_bit_xor(all_ones, a)
	return transmute(__m256d)intrinsics.simd_bit_and(not_a, b)
}

// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
// elements in `a`, and then ANDs the result with `b`: `(~a) & b`.
//
// Both operands are reinterpreted as eight `u32` lanes for the bit operations
// and the result is reinterpreted back as `__m256`.
@(require_results, enable_target_feature="avx")
_mm256_andnot_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	a := transmute(#simd[8]u32)a
	b := transmute(#simd[8]u32)b
	// XOR with all-ones flips every bit, i.e. bitwise NOT.
	// (XOR with zero would leave `a` unchanged and yield a plain AND.)
	all_ones := (#simd[8]u32)(max(u32))
	not_a := intrinsics.simd_bit_xor(all_ones, a)
	return transmute(__m256)intrinsics.simd_bit_and(not_a, b)
}


// Compares packed double-precision (64-bit) floating-point elements in `a`
// and `b`, and returns the packed maximum values.
//
// Thin wrapper over `llvm_vmaxpd`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
@(require_results, enable_target_feature="avx")
_mm256_max_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	maxima := llvm_vmaxpd(a, b)
	return maxima
}

// Compares packed single-precision (32-bit) floating-point elements in `a`
// and `b`, and returns the packed maximum values.
//
// Thin wrapper over `llvm_vmaxps`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
@(require_results, enable_target_feature="avx")
_mm256_max_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	maxima := llvm_vmaxps(a, b)
	return maxima
}

// Compares packed double-precision (64-bit) floating-point elements in `a`
// and `b`, and returns the packed minimum values.
//
// Thin wrapper over `llvm_vminpd`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
@(require_results, enable_target_feature="avx")
_mm256_min_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	minima := llvm_vminpd(a, b)
	return minima
}

// Compares packed single-precision (32-bit) floating-point elements in `a`
// and `b`, and returns the packed minimum values.
//
// Thin wrapper over `llvm_vminps`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
@(require_results, enable_target_feature="avx")
_mm256_min_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	minima := llvm_vminps(a, b)
	return minima
}


// Lane-wise multiplication of the packed double-precision (64-bit)
// floating-point elements of `a` and `b`.
//
// Returns: a vector whose every lane is `a[i] * b[i]`.
@(require_results, enable_target_feature="avx")
_mm256_mul_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	product := intrinsics.simd_mul(a, b)
	return product
}

// Lane-wise multiplication of the packed single-precision (32-bit)
// floating-point elements of `a` and `b`.
//
// Returns: a vector whose every lane is `a[i] * b[i]`.
@(require_results, enable_target_feature="avx")
_mm256_mul_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	product := intrinsics.simd_mul(a, b)
	return product
}

// Alternately subtracts and adds packed double-precision (64-bit)
// floating-point elements: even result lanes hold `a[i] - b[i]`, odd result
// lanes hold `a[i] + b[i]`.
@(require_results, enable_target_feature="avx")
_mm256_addsub_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	sums  := intrinsics.simd_add(a, b)
	diffs := intrinsics.simd_sub(a, b)
	// Indices 0..3 address `sums`, 4..7 address `diffs`:
	// take differences at even positions and sums at odd positions.
	interleaved := intrinsics.simd_shuffle(sums, diffs, 4, 1, 6, 3)
	return interleaved
}


// Alternately subtracts and adds packed single-precision (32-bit)
// floating-point elements: even result lanes hold `a[i] - b[i]`, odd result
// lanes hold `a[i] + b[i]`.
@(require_results, enable_target_feature="avx")
_mm256_addsub_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	sums  := intrinsics.simd_add(a, b)
	diffs := intrinsics.simd_sub(a, b)
	// Indices 0..7 address `sums`, 8..15 address `diffs`:
	// take differences at even positions and sums at odd positions.
	interleaved := intrinsics.simd_shuffle(sums, diffs, 8, 1, 10, 3, 12, 5, 14, 7)
	return interleaved
}


// Lane-wise subtraction of packed double-precision (64-bit) floating-point
// elements: every result lane is `a[i] - b[i]`.
@(require_results, enable_target_feature="avx")
_mm256_sub_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	difference := intrinsics.simd_sub(a, b)
	return difference
}

// Lane-wise subtraction of packed single-precision (32-bit) floating-point
// elements: every result lane is `a[i] - b[i]`.
@(require_results, enable_target_feature="avx")
_mm256_sub_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	difference := intrinsics.simd_sub(a, b)
	return difference
}

// Lane-wise division of the 8 packed 32-bit floating-point elements of `a`
// by the corresponding elements of `b`: every result lane is `a[i] / b[i]`.
@(require_results, enable_target_feature="avx")
_mm256_div_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	quotient := intrinsics.simd_div(a, b)
	return quotient
}

// Lane-wise division of the 4 packed 64-bit floating-point elements of `a`
// by the corresponding elements of `b`: every result lane is `a[i] / b[i]`.
@(require_results, enable_target_feature="avx")
_mm256_div_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	quotient := intrinsics.simd_div(a, b)
	return quotient
}


// Rounds the packed double-precision (64-bit) floating-point elements in `a`
// according to the compile-time flag `ROUNDING` (must be less than 16).
// Commonly used values of `ROUNDING`:
//
// - `0x00`: Round to the nearest whole number.
// - `0x01`: Round down, toward negative infinity.
// - `0x02`: Round up, toward positive infinity.
// - `0x03`: Truncate the values.
//
// For a complete list of options, check [the LLVM docs][llvm_docs].
//
// Thin wrapper over `llvm_roundpd256`.
//
// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
@(require_results, enable_target_feature="avx")
_mm256_round_pd :: #force_inline proc "c" (a: __m256d, $ROUNDING: u8) -> __m256d where ROUNDING < 16 {
	rounded := llvm_roundpd256(a, ROUNDING)
	return rounded
}

// Rounds each packed double-precision (64-bit) floating-point element in `a`
// up, toward positive infinity.
@(require_results, enable_target_feature="avx")
_mm256_ceil_pd :: #force_inline proc "c" (a: __m256d) -> __m256d {
	rounded_up := intrinsics.simd_ceil(a)
	return rounded_up
}

// Rounds each packed double-precision (64-bit) floating-point element in `a`
// down, toward negative infinity.
@(require_results, enable_target_feature="avx")
_mm256_floor_pd :: #force_inline proc "c" (a: __m256d) -> __m256d {
	rounded_down := intrinsics.simd_floor(a)
	return rounded_down
}


// Rounds packed single-precision (32-bit) floating point elements in `a`
// according to the compile-time flag `ROUNDING` (must be less than 16).
// Commonly used values of `ROUNDING`:
//
// - `0x00`: Round to the nearest whole number.
// - `0x01`: Round down, toward negative infinity.
// - `0x02`: Round up, toward positive infinity.
// - `0x03`: Truncate the values.
//
// For a complete list of options, check [the LLVM docs][llvm_docs].
//
// Thin wrapper over `llvm_roundps256`; the flag is widened to `u32` before
// being passed on.
//
// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
@(require_results, enable_target_feature="avx")
_mm256_round_ps :: #force_inline proc "c" (a: __m256, $ROUNDING: u8) -> __m256 where ROUNDING < 16 {
	// Uses the "c" calling convention like every other intrinsic wrapper in
	// this file, so it needs no implicit `context` and can be called from
	// "c"/"contextless" procedures.
	return llvm_roundps256(a, u32(ROUNDING))
}

// Rounds each packed single-precision (32-bit) floating-point element in `a`
// up, toward positive infinity.
@(require_results, enable_target_feature="avx")
_mm256_ceil_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
	rounded_up := intrinsics.simd_ceil(a)
	return rounded_up
}

// Rounds each packed single-precision (32-bit) floating-point element in `a`
// down, toward negative infinity.
@(require_results, enable_target_feature="avx")
_mm256_floor_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
	rounded_down := intrinsics.simd_floor(a)
	return rounded_down
}

// Computes the square root of each packed single-precision (32-bit)
// floating-point element in `a`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
@(require_results, enable_target_feature="avx")
_mm256_sqrt_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
	roots := intrinsics.sqrt(a)
	return roots
}

// Computes the square root of each packed double-precision (64-bit)
// floating-point element in `a`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
@(require_results, enable_target_feature="avx")
_mm256_sqrt_pd :: #force_inline proc "c" (a: __m256d) -> __m256d {
	roots := intrinsics.sqrt(a)
	return roots
}


// Blends packed double-precision (64-bit) floating-point elements from
// `a` and `b` using the compile-time control mask `IMM4` (must be < 16).
//
// Bit `i` of `IMM4` selects the source of result lane `i`:
// 0 takes `a[i]`, 1 takes `b[i]`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
@(require_results, enable_target_feature="avx")
_mm256_blend_pd :: #force_inline proc "c" (a, b: __m256d, $IMM4: u32) -> __m256d where IMM4 < 16 {
	// Shuffle indices 0..3 address `a`, 4..7 address `b`; a set mask bit
	// adds 4 to the lane index, switching the source from `a` to `b`.
	return intrinsics.simd_shuffle(
		a,
		b,
		((IMM4 >> 0) & 1) * 4 + 0,
		((IMM4 >> 1) & 1) * 4 + 1,
		((IMM4 >> 2) & 1) * 4 + 2,
		((IMM4 >> 3) & 1) * 4 + 3,
	)
}

// Blends packed single-precision (32-bit) floating-point elements from
// `a` and `b` using the compile-time control mask `IMM8`.
//
// Bit `i` of `IMM8` selects the source of result lane `i`:
// 0 takes `a[i]`, 1 takes `b[i]`.
@(require_results, enable_target_feature="avx")
_mm256_blend_ps :: #force_inline proc "c" (a, b: __m256, $IMM8: u8) -> __m256 {
	// Shuffle indices 0..7 address `a`, 8..15 address `b`; a set mask bit
	// adds 8 to the lane index, switching the source from `a` to `b`.
	blended := intrinsics.simd_shuffle(
		a,
		b,
		0 + 8 * ((IMM8 >> 0) & 1),
		1 + 8 * ((IMM8 >> 1) & 1),
		2 + 8 * ((IMM8 >> 2) & 1),
		3 + 8 * ((IMM8 >> 3) & 1),
		4 + 8 * ((IMM8 >> 4) & 1),
		5 + 8 * ((IMM8 >> 5) & 1),
		6 + 8 * ((IMM8 >> 6) & 1),
		7 + 8 * ((IMM8 >> 7) & 1),
	)
	return blended
}


// Blends packed double-precision (64-bit) floating-point elements from
// `a` and `b` using `c` as a per-lane mask.
//
// Each lane of `c` is reinterpreted as an `i64`; where that integer is
// negative the result lane is taken from `b`, otherwise from `a`.
@(require_results, enable_target_feature="avx")
_mm256_blendv_pd :: #force_inline proc "c" (a, b: __m256d, c: __m256d) -> __m256d {
	selector := transmute(#simd[4]i64)c
	take_b := intrinsics.simd_lanes_lt(selector, 0)
	return intrinsics.simd_select(take_b, b, a)
}

// Blends packed single-precision (32-bit) floating-point elements from
// `a` and `b` using `c` as a per-lane mask.
//
// Each lane of `c` is reinterpreted as an `i32`; where that integer is
// negative the result lane is taken from `b`, otherwise from `a`.
@(require_results, enable_target_feature="avx")
_mm256_blendv_ps :: #force_inline proc "c" (a, b: __m256, c: __m256) -> __m256 {
	selector := transmute(#simd[8]i32)c
	take_b := intrinsics.simd_lanes_lt(selector, 0)
	return intrinsics.simd_select(take_b, b, a)
}


// Conditionally multiplies the packed single-precision (32-bit)
// floating-point elements in `a` and `b` using the high 4 bits of `IMM8`,
// sums the four products, and conditionally returns the sum using the low
// 4 bits of `IMM8`.
//
// Thin wrapper over `llvm_vdpps`. Note that `IMM8` is declared as `i8`
// here, unlike the `u8` immediates used by most other procedures in this file.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
@(require_results, enable_target_feature="avx")
_mm256_dp_ps :: #force_inline proc "c" (a, b: __m256, $IMM8: i8) -> __m256 {
	dot := llvm_vdpps(a, b, IMM8)
	return dot
}

// Horizontal addition of adjacent pairs in the two packed vectors
// of 4 64-bit floating points `a` and `b`.
// Sums of pairs from `a` land in even result lanes, sums of pairs from `b`
// in odd result lanes:
//   [a0+a1, b0+b1, a2+a3, b2+b3]
@(require_results, enable_target_feature="avx")
_mm256_hadd_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	// Gather the first and the second member of every adjacent pair
	// (indices 0..3 address `a`, 4..7 address `b`).
	firsts  := intrinsics.simd_shuffle(a, b, 0, 4, 2, 6)
	seconds := intrinsics.simd_shuffle(a, b, 1, 5, 3, 7)
	return intrinsics.simd_add(firsts, seconds)
}


// Horizontal addition of adjacent pairs in the two packed vectors
// of 8 32-bit floating points `a` and `b`.
// Sums of pairs from `a` land in result lanes 0, 1, 4, 5; sums of pairs
// from `b` land in result lanes 2, 3, 6, 7.
@(require_results, enable_target_feature="avx")
_mm256_hadd_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	// Gather the first and the second member of every adjacent pair
	// (indices 0..7 address `a`, 8..15 address `b`).
	firsts  := intrinsics.simd_shuffle(a, b, 0, 2, 8, 10, 4, 6, 12, 14)
	seconds := intrinsics.simd_shuffle(a, b, 1, 3, 9, 11, 5, 7, 13, 15)
	return intrinsics.simd_add(firsts, seconds)
}

// Horizontal subtraction of adjacent pairs in the two packed vectors
// of 4 64-bit floating points `a` and `b`.
// Differences of pairs from `a` land in even result lanes, differences of
// pairs from `b` in odd result lanes:
//   [a0-a1, b0-b1, a2-a3, b2-b3]
@(require_results, enable_target_feature="avx")
_mm256_hsub_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	// Gather the minuend (first) and subtrahend (second) of every adjacent
	// pair (indices 0..3 address `a`, 4..7 address `b`).
	minuends    := intrinsics.simd_shuffle(a, b, 0, 4, 2, 6)
	subtrahends := intrinsics.simd_shuffle(a, b, 1, 5, 3, 7)
	return intrinsics.simd_sub(minuends, subtrahends)
}

// Horizontal subtraction of adjacent pairs in the two packed vectors
// of 8 32-bit floating points `a` and `b`.
// Differences of pairs from `a` land in result lanes 0, 1, 4, 5;
// differences of pairs from `b` land in result lanes 2, 3, 6, 7.
@(require_results, enable_target_feature="avx")
_mm256_hsub_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	// Gather the minuend (first) and subtrahend (second) of every adjacent
	// pair (indices 0..7 address `a`, 8..15 address `b`).
	minuends    := intrinsics.simd_shuffle(a, b, 0, 2, 8, 10, 4, 6, 12, 14)
	subtrahends := intrinsics.simd_shuffle(a, b, 1, 3, 9, 11, 5, 7, 13, 15)
	return intrinsics.simd_sub(minuends, subtrahends)
}

// Computes the bitwise XOR of the packed double-precision (64-bit)
// floating-point elements in `a` and `b`.
//
// Both operands are reinterpreted as four `u64` lanes, XOR-ed, and the bits
// are reinterpreted back as `__m256d`.
@(require_results, enable_target_feature="avx")
_mm256_xor_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	lhs_bits := transmute(#simd[4]u64)a
	rhs_bits := transmute(#simd[4]u64)b
	xored := intrinsics.simd_bit_xor(lhs_bits, rhs_bits)
	return transmute(__m256d)xored
}

// Computes the bitwise XOR of the packed single-precision (32-bit)
// floating-point elements in `a` and `b`.
//
// Both operands are reinterpreted as eight `u32` lanes, XOR-ed, and the bits
// are reinterpreted back as `__m256`.
@(require_results, enable_target_feature="avx")
_mm256_xor_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	lhs_bits := transmute(#simd[8]u32)a
	rhs_bits := transmute(#simd[8]u32)b
	xored := intrinsics.simd_bit_xor(lhs_bits, rhs_bits)
	return transmute(__m256)xored
}


// Comparison predicate codes (0x00..0x1f), intended for the `IMM5` argument
// of the `_mm*_cmp_*` procedures in this file (which require `IMM5 < 32`).
// Suffix legend, as spelled out in the per-constant comments:
//   O/U = ordered / unordered, Q/S = non-signaling / signaling.
_CMP_EQ_OQ    :: 0x00 // Equal (ordered, non-signaling)
_CMP_LT_OS    :: 0x01 // Less-than (ordered, signaling)
_CMP_LE_OS    :: 0x02 // Less-than-or-equal (ordered, signaling)
_CMP_UNORD_Q  :: 0x03 // Unordered (non-signaling)
_CMP_NEQ_UQ   :: 0x04 // Not-equal (unordered, non-signaling)
_CMP_NLT_US   :: 0x05 // Not-less-than (unordered, signaling)
_CMP_NLE_US   :: 0x06 // Not-less-than-or-equal (unordered, signaling)
_CMP_ORD_Q    :: 0x07 // Ordered (non-signaling)
_CMP_EQ_UQ    :: 0x08 // Equal (unordered, non-signaling)
_CMP_NGE_US   :: 0x09 // Not-greater-than-or-equal (unordered, signaling)
_CMP_NGT_US   :: 0x0a // Not-greater-than (unordered, signaling)
_CMP_FALSE_OQ :: 0x0b // False (ordered, non-signaling)
_CMP_NEQ_OQ   :: 0x0c // Not-equal (ordered, non-signaling)
_CMP_GE_OS    :: 0x0d // Greater-than-or-equal (ordered, signaling)
_CMP_GT_OS    :: 0x0e // Greater-than (ordered, signaling)
_CMP_TRUE_UQ  :: 0x0f // True (unordered, non-signaling)
_CMP_EQ_OS    :: 0x10 // Equal (ordered, signaling)
_CMP_LT_OQ    :: 0x11 // Less-than (ordered, non-signaling)
_CMP_LE_OQ    :: 0x12 // Less-than-or-equal (ordered, non-signaling)
_CMP_UNORD_S  :: 0x13 // Unordered (signaling)
_CMP_NEQ_US   :: 0x14 // Not-equal (unordered, signaling)
_CMP_NLT_UQ   :: 0x15 // Not-less-than (unordered, non-signaling)
_CMP_NLE_UQ   :: 0x16 // Not-less-than-or-equal (unordered, non-signaling)
_CMP_ORD_S    :: 0x17 // Ordered (signaling)
_CMP_EQ_US    :: 0x18 // Equal (unordered, signaling)
_CMP_NGE_UQ   :: 0x19 // Not-greater-than-or-equal (unordered, non-signaling)
_CMP_NGT_UQ   :: 0x1a // Not-greater-than (unordered, non-signaling)
_CMP_FALSE_OS :: 0x1b // False (ordered, signaling)
_CMP_NEQ_OS   :: 0x1c // Not-equal (ordered, signaling)
_CMP_GE_OQ    :: 0x1d // Greater-than-or-equal (ordered, non-signaling)
_CMP_GT_OQ    :: 0x1e // Greater-than (ordered, non-signaling)
_CMP_TRUE_US  :: 0x1f // True (unordered, signaling)


// Compares packed double-precision (64-bit) floating-point elements in `a`
// and `b` using the comparison predicate given by the compile-time
// `IMM5` (must be < 32; see the `_CMP_*` constants).
//
// Thin wrapper over `llvm_vcmppd`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
@(require_results, enable_target_feature="avx")
_mm_cmp_pd :: #force_inline proc "c" (a, b: __m128d, $IMM5: u8) -> __m128d where IMM5 < 32 {
	compared := llvm_vcmppd(a, b, IMM5)
	return compared
}

// Compares packed double-precision (64-bit) floating-point elements in `a`
// and `b` using the comparison predicate given by the compile-time
// `IMM5` (must be < 32; see the `_CMP_*` constants).
//
// Thin wrapper over `llvm_vcmppd256`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
@(require_results, enable_target_feature="avx")
_mm256_cmp_pd :: #force_inline proc "c" (a, b: __m256d, $IMM5: u8) -> __m256d where IMM5 < 32 {
	compared := llvm_vcmppd256(a, b, IMM5)
	return compared
}

// Compares packed single-precision (32-bit) floating-point elements in `a`
// and `b` using the comparison predicate given by the compile-time
// `IMM5` (must be < 32; see the `_CMP_*` constants).
//
// Thin wrapper over `llvm_vcmpps`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
@(require_results, enable_target_feature="avx")
_mm_cmp_ps :: #force_inline proc "c" (a: __m128, b: __m128, $IMM5: u8) -> __m128 where IMM5 < 32 {
	compared := llvm_vcmpps(a, b, IMM5)
	return compared
}

// Compares packed single-precision (32-bit) floating-point elements in `a`
// and `b` using the comparison predicate given by the compile-time
// `IMM5` (must be < 32; see the `_CMP_*` constants).
//
// Thin wrapper over `llvm_vcmpps256`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
@(require_results, enable_target_feature="avx")
_mm256_cmp_ps :: #force_inline proc "c" (a, b: __m256, $IMM5: u8) -> __m256 where IMM5 < 32 {
	compared := llvm_vcmpps256(a, b, IMM5)
	return compared
}

// Compares the lower double-precision (64-bit) floating-point element of
// `a` and `b` using the comparison predicate given by the compile-time
// `IMM5` (must be < 32), stores the result in the lower element of the
// returned vector, and copies the upper element from `a` to the upper
// element of the returned vector.
//
// Thin wrapper over `llvm_vcmpsd`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
@(require_results, enable_target_feature="avx")
_mm_cmp_sd :: #force_inline proc "c" (a, b: __m128d, $IMM5: u8) -> __m128d where IMM5 < 32 {
	compared := llvm_vcmpsd(a, b, IMM5)
	return compared
}

// Compares the lower single-precision (32-bit) floating-point element of
// `a` and `b` using the comparison predicate given by the compile-time
// `IMM5` (must be < 32), stores the result in the lower element of the
// returned vector, and copies the upper 3 packed elements from `a` to the
// upper elements of the returned vector.
//
// Thin wrapper over `llvm_vcmpss`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
@(require_results, enable_target_feature="avx")
_mm_cmp_ss :: #force_inline proc "c" (a: __m128, b: __m128, $IMM5: u8) -> __m128 where IMM5 < 32 {
	compared := llvm_vcmpss(a, b, IMM5)
	return compared
}

// Converts the packed 32-bit integers in `a` to packed double-precision
// (64-bit) floating-point elements.
//
// `a` is first reinterpreted as four `i32` lanes, then each lane is
// value-converted to the element type of `__m256d`.
@(require_results, enable_target_feature="avx")
_mm256_cvtepi32_pd :: #force_inline proc "c" (a: __m128i) -> __m256d {
	ints := transmute(#simd[4]i32)a
	return __m256d(ints)
}

// Converts the packed 32-bit integers in `a` to packed single-precision
// (32-bit) floating-point elements.
//
// `a` is first reinterpreted as eight `i32` lanes, then each lane is
// value-converted to the element type of `__m256`.
@(require_results, enable_target_feature="avx")
_mm256_cvtepi32_ps :: #force_inline proc "c" (a: __m256i) -> __m256 {
	ints := transmute(#simd[8]i32)a
	return __m256(ints)
}

// Converts the packed double-precision (64-bit) floating-point elements in
// `a` to packed single-precision (32-bit) floating-point elements, via a
// lane-wise value cast to `__m128`.
@(require_results, enable_target_feature="avx")
_mm256_cvtpd_ps :: #force_inline proc "c" (a: __m256d) -> __m128 {
	narrowed := cast(__m128)a
	return narrowed
}

// Converts the packed single-precision (32-bit) floating-point elements in
// `a` to packed 32-bit integers.
//
// The result of `llvm_vcvtps2dq` is reinterpreted as `__m256i`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
@(require_results, enable_target_feature="avx")
_mm256_cvtps_epi32 :: #force_inline proc "c" (a: __m256) -> __m256i {
	ints := llvm_vcvtps2dq(a)
	return transmute(__m256i)ints
}

// Converts the packed single-precision (32-bit) floating-point elements in
// `a` to packed double-precision (64-bit) floating-point elements, via a
// lane-wise value cast to `__m256d`.
@(require_results, enable_target_feature="avx")
_mm256_cvtps_pd :: #force_inline proc "c" (a: __m128) -> __m256d {
	widened := cast(__m256d)a
	return widened
}

// Returns element 0 (the first element) of the `[4 x double]` input vector.
@(require_results, enable_target_feature="avx")
_mm256_cvtsd_f64 :: #force_inline proc "c" (a: __m256d) -> f64 {
	first := intrinsics.simd_extract(a, 0)
	return first
}

// Converts the packed double-precision (64-bit) floating-point elements in
// `a` to packed 32-bit integers with truncation.
//
// The result of `llvm_vcvttpd2dq` is reinterpreted as `__m128i`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
@(require_results, enable_target_feature="avx")
_mm256_cvttpd_epi32 :: #force_inline proc "c" (a: __m256d) -> __m128i {
	ints := llvm_vcvttpd2dq(a)
	return transmute(__m128i)ints
}

// Converts the packed double-precision (64-bit) floating-point elements in
// `a` to packed 32-bit integers.
//
// The result of `llvm_vcvtpd2dq` is reinterpreted as `__m128i`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
@(require_results, enable_target_feature="avx")
_mm256_cvtpd_epi32 :: #force_inline proc "c" (a: __m256d) -> __m128i {
	ints := llvm_vcvtpd2dq(a)
	return transmute(__m128i)ints
}

// Converts the packed single-precision (32-bit) floating-point elements in
// `a` to packed 32-bit integers with truncation.
//
// The result of `llvm_vcvttps2dq` is reinterpreted as `__m256i`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
@(require_results, enable_target_feature="avx")
_mm256_cvttps_epi32 :: #force_inline proc "c" (a: __m256) -> __m256i {
	ints := llvm_vcvttps2dq(a)
	return transmute(__m256i)ints
}


// Extracts 128 bits (4 packed single-precision (32-bit) floating-point
// elements) from `a`, selected by the compile-time `IMM1` (0 or 1):
// 0 yields elements 0..3, 1 yields elements 4..7.
@(require_results, enable_target_feature="avx")
_mm256_extractf128_ps :: #force_inline proc "c" (a: __m256, $IMM1: u8) -> __m128 where IMM1 < 2 {
	// The second shuffle operand is never indexed; it only satisfies the
	// two-vector shape of `simd_shuffle`.
	filler := _mm256_undefined_ps()
	when IMM1 != 0 {
		return intrinsics.simd_shuffle(a, filler, 4, 5, 6, 7)
	} else {
		return intrinsics.simd_shuffle(a, filler, 0, 1, 2, 3)
	}
}

// Extracts 128 bits (2 packed double-precision (64-bit) floating-point
// elements) from `a`, selected by the compile-time `IMM1` (0 or 1):
// 0 yields elements 0..1, 1 yields elements 2..3.
@(require_results, enable_target_feature="avx")
_mm256_extractf128_pd :: #force_inline proc "c" (a: __m256d, $IMM1: u8) -> __m128d where IMM1 < 2 {
	// The second shuffle operand is never indexed; it only satisfies the
	// two-vector shape of `simd_shuffle`.
	filler := _mm256_undefined_pd()
	when IMM1 != 0 {
		return intrinsics.simd_shuffle(a, filler, 2, 3)
	} else {
		return intrinsics.simd_shuffle(a, filler, 0, 1)
	}
}

// Extracts 128 bits (integer data) from `a`, selected by the compile-time
// `IMM1` (0 or 1): 0 yields the first two 64-bit lanes, 1 the last two.
@(require_results, enable_target_feature="avx")
_mm256_extractf128_si256 :: #force_inline proc "c" (a: __m256i, $IMM1: u8) -> __m128i where IMM1 < 2 {
	lanes := transmute(#simd[4]i64)a
	// Second shuffle operand is never indexed; a zero vector is used as filler.
	filler := (#simd[4]i64)(0)
	when IMM1 != 0 {
		upper := intrinsics.simd_shuffle(lanes, filler, 2, 3)
		return transmute(__m128i)upper
	} else {
		lower := intrinsics.simd_shuffle(lanes, filler, 0, 1)
		return transmute(__m128i)lower
	}
}

// Extracts the 32-bit integer at lane `INDEX` (compile-time, must be < 8)
// from `a`, viewed as eight `i32` lanes.
@(require_results, enable_target_feature="avx")
_mm256_extract_epi32 :: #force_inline proc "c" (a: __m256i, $INDEX: u8) -> i32 where INDEX < 8 {
	lanes := transmute(#simd[8]i32)a
	return intrinsics.simd_extract(lanes, INDEX)
}

// Returns the first (lane 0) 32-bit integer of `a`, viewed as eight
// `i32` lanes.
@(require_results, enable_target_feature="avx")
_mm256_cvtsi256_si32 :: #force_inline proc "c" (a: __m256i) -> i32 {
	lanes := transmute(#simd[8]i32)a
	return intrinsics.simd_extract(lanes, 0)
}

// Zeroes the contents of all XMM or YMM registers.
//
// Takes no arguments and returns nothing; it simply invokes `llvm_vzeroall`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
@(enable_target_feature="avx")
_mm256_zeroall :: #force_inline proc "c" () {
	llvm_vzeroall()
}

// Zeroes the upper 128 bits of all YMM registers; the lower 128 bits of the
// registers are unmodified.
//
// Takes no arguments and returns nothing; it simply invokes `llvm_vzeroupper`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
@(enable_target_feature="avx")
_mm256_zeroupper :: #force_inline proc "c" () {
	llvm_vzeroupper()
}

// Shuffles single-precision (32-bit) floating-point elements in `a` within
// 128-bit lanes using the run-time control vector `b`.
//
// `b` is reinterpreted as eight `i32` lanes and handed to `llvm_vpermilps256`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
@(require_results, enable_target_feature="avx")
_mm256_permutevar_ps :: #force_inline proc "c" (a: __m256, b: __m256i) -> __m256 {
	control := transmute(#simd[8]i32)b
	return llvm_vpermilps256(a, control)
}

// Shuffles single-precision (32-bit) floating-point elements in `a` using
// the run-time control vector `b`.
//
// `b` is reinterpreted as four `i32` lanes and handed to `llvm_vpermilps`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
@(require_results, enable_target_feature="avx")
_mm_permutevar_ps :: #force_inline proc "c" (a: __m128, b: __m128i) -> __m128 {
	control := transmute(#simd[4]i32)b
	return llvm_vpermilps(a, control)
}

// Shuffles single-precision (32-bit) floating-point elements in `a` within
// 128-bit lanes using the compile-time control `IMM8`.
//
// `IMM8` is read as four 2-bit selectors; selector `k` picks the source
// element for result position `k` of each 128-bit lane, and the same
// selectors are applied to both lanes.
@(require_results, enable_target_feature="avx")
_mm256_permute_ps :: #force_inline proc "c" (a: __m256, $IMM8: u8) -> __m256 {
	// The second shuffle operand is never indexed (all indices are < 8).
	filler := _mm256_undefined_ps()
	return intrinsics.simd_shuffle(
		a,
		filler,
		// low 128-bit lane: elements 0..3
		(IMM8 >> 0) & 0b11,
		(IMM8 >> 2) & 0b11,
		(IMM8 >> 4) & 0b11,
		(IMM8 >> 6) & 0b11,
		// high 128-bit lane: elements 4..7
		4 + ((IMM8 >> 0) & 0b11),
		4 + ((IMM8 >> 2) & 0b11),
		4 + ((IMM8 >> 4) & 0b11),
		4 + ((IMM8 >> 6) & 0b11),
	)
}

// Shuffles single-precision (32-bit) floating-point elements in `a` using
// the compile-time control `IMM8`.
//
// `IMM8` is read as four 2-bit selectors; selector `k` picks the element of
// `a` that is placed at result position `k`.
@(require_results, enable_target_feature="avx")
_mm_permute_ps :: #force_inline proc "c" (a: __m128, $IMM8: u8) -> __m128 {
	// The second shuffle operand is never indexed (all indices are < 4).
	filler := _mm_undefined_ps()
	permuted := intrinsics.simd_shuffle(
		a,
		filler,
		(IMM8 >> 0) & 0b11,
		(IMM8 >> 2) & 0b11,
		(IMM8 >> 4) & 0b11,
		(IMM8 >> 6) & 0b11,
	)
	return permuted
}

// Shuffles double-precision (64-bit) floating-point elements in `a` within
// 128-bit lanes (per Intel's description of this intrinsic) using the
// run-time control vector `b`.
//
// Thin wrapper over `llvm_vpermilpd256`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
@(require_results, enable_target_feature="avx")
_mm256_permutevar_pd :: #force_inline proc "c" (a: __m256d, b: __m256i) -> __m256d {
	permuted := llvm_vpermilpd256(a, b)
	return permuted
}

// Shuffles double-precision (64-bit) floating-point elements in `a` using
// the run-time control vector `b`.
//
// Thin wrapper over `llvm_vpermilpd`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
@(require_results, enable_target_feature="avx")
_mm_permutevar_pd :: #force_inline proc "c" (a: __m128d, b: __m128i) -> __m128d {
	permuted := llvm_vpermilpd(a, b)
	return permuted
}

// Shuffles double-precision (64-bit) floating-point elements in `a` within
// 128-bit lanes using the compile-time control `IMM4` (must be < 16).
//
// Bit `k` of `IMM4` picks, for result position `k`, the low (0) or high (1)
// element of the 128-bit lane that position `k` belongs to.
@(require_results, enable_target_feature="avx")
_mm256_permute_pd :: #force_inline proc "c" (a: __m256d, $IMM4: u8) -> __m256d where IMM4 < 16 {
	// The second shuffle operand is never indexed (all indices are < 4).
	filler := _mm256_undefined_pd()
	return intrinsics.simd_shuffle(
		a,
		filler,
		// low 128-bit lane: elements 0..1
		(IMM4 >> 0) & 1,
		(IMM4 >> 1) & 1,
		// high 128-bit lane: elements 2..3
		2 + ((IMM4 >> 2) & 1),
		2 + ((IMM4 >> 3) & 1),
	)
}

// Shuffles double-precision (64-bit) floating-point elements in `a` using
// the compile-time control `IMM2` (must be < 4).
//
// Bit 0 of `IMM2` picks the element placed at result position 0, bit 1 the
// element placed at result position 1.
@(require_results, enable_target_feature="avx")
_mm_permute_pd :: #force_inline proc "c" (a: __m128d, $IMM2: u8) -> __m128d where IMM2 < 4 {
	// The second shuffle operand is never indexed (all indices are < 2).
	filler := _mm_undefined_pd()
	permuted := intrinsics.simd_shuffle(
		a,
		filler,
		(IMM2 >> 0) & 1,
		(IMM2 >> 1) & 1,
	)
	return permuted
}


// Shuffles 256 bits (8 packed single-precision (32-bit) floating-point
// elements) selected by the compile-time `IMM8` from `a` and `b`.
//
// Implemented by casting both operands to `__m256i`, delegating to
// `_mm256_permute2f128_si256`, and casting the result back.
@(require_results, enable_target_feature="avx")
_mm256_permute2f128_ps :: #force_inline proc "c" (a, b: __m256, $IMM8: u8) -> __m256 {
	a_bits := _mm256_castps_si256(a)
	b_bits := _mm256_castps_si256(b)
	permuted := _mm256_permute2f128_si256(a_bits, b_bits, IMM8)
	return _mm256_castsi256_ps(permuted)
}

// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
// floating-point elements) selected by the compile-time `IMM8` from `a`
// and `b`.
//
// Implemented by casting both operands to `__m256i`, delegating to
// `_mm256_permute2f128_si256`, and casting the result back.
@(require_results, enable_target_feature="avx")
_mm256_permute2f128_pd :: #force_inline proc "c" (a, b: __m256d, $IMM8: u8) -> __m256d {
	// The result must be returned explicitly; without `return` the computed
	// value is discarded and the procedure has no return statement.
	return _mm256_castsi256_pd(_mm256_permute2f128_si256(
		_mm256_castpd_si256(a),
		_mm256_castpd_si256(b),
		IMM8,
	))
}

// Shuffles 128-bit halves (integer data) selected by the compile-time
// `IMM8` from `a` and `b`.
//
// The low nibble of `IMM8` controls the low 128 bits of the result, the
// high nibble controls the high 128 bits. Within each nibble:
// - bits 1:0 choose the source half: 0 = low of `a`, 1 = high of `a`,
//   2 = low of `b`, 3 = high of `b`;
// - bit 3, when set, zeroes that half of the result instead.
//
// NOTE(review): the lane arithmetic assumes `__m256i` has four 64-bit
// lanes, as the transmutes elsewhere in this file suggest.
@(require_results, enable_target_feature="avx")
_mm256_permute2f128_si256 :: #force_inline proc "c" (a, b: __m256i, $IMM8: u8) -> __m256i {
	// Stage 1: pick one 128-bit half (two adjacent lanes) for each result
	// half. Shuffle indices 0..3 address `a`, 4..7 address `b`.
	picked := intrinsics.simd_shuffle(
		a,
		b,
		2 * (IMM8 & 0b11),
		2 * (IMM8 & 0b11) + 1,
		2 * ((IMM8 >> 4) & 0b11),
		2 * ((IMM8 >> 4) & 0b11) + 1,
	)
	// Stage 2: apply the zeroing bits. Index 4 is lane 0 of the zero vector;
	// otherwise each lane of `picked` keeps its position.
	zeroes := __m256i(0)
	return intrinsics.simd_shuffle(
		picked,
		zeroes,
		0 if (IMM8 & 0b0000_1000) == 0 else 4,
		1 if (IMM8 & 0b0000_1000) == 0 else 4,
		2 if (IMM8 & 0b1000_0000) == 0 else 4,
		3 if (IMM8 & 0b1000_0000) == 0 else 4,
	)
}

// Loads the single-precision (32-bit) floating-point value pointed to by
// `f` and broadcasts it to all elements of the returned vector
// (via `_mm256_set1_ps`).
@(require_results, enable_target_feature="avx")
_mm256_broadcast_ss :: #force_inline proc "c" (f: ^f32) -> __m256 {
	value := f^
	return _mm256_set1_ps(value)
}

// Loads the single-precision (32-bit) floating-point value pointed to by
// `f` and broadcasts it to all elements of the returned 128-bit vector
// (via `_mm_set1_ps`).
@(require_results, enable_target_feature="sse,avx")
_mm_broadcast_ss :: #force_inline proc "c" (f: ^f32) -> __m128 {
	value := f^
	return _mm_set1_ps(value)
}

// Loads the double-precision (64-bit) floating-point value pointed to by
// `f` and broadcasts it to all elements of the returned vector
// (via `_mm256_set1_pd`).
@(require_results, enable_target_feature="avx")
_mm256_broadcast_sd :: #force_inline proc "c" (f: ^f64) -> __m256d {
	value := f^
	return _mm256_set1_pd(value)
}

// Loads 128 bits (4 packed single-precision (32-bit) floating-point
// elements) through the pointer `a` and repeats them in both 128-bit halves
// of the returned vector.
@(require_results, enable_target_feature="sse,avx")
_mm256_broadcast_ps :: #force_inline proc "c" (a: ^__m128) -> __m256 {
	loaded := a^
	// Only indices 0..3 (the loaded vector) are used; the zero vector is
	// just the required second shuffle operand.
	filler := _mm_setzero_ps()
	return intrinsics.simd_shuffle(loaded, filler, 0, 1, 2, 3, 0, 1, 2, 3)
}

// Loads 128 bits (2 packed double-precision (64-bit) floating-point elements)
// from memory at `a` and repeats them in both halves of the returned vector.
@(require_results, enable_target_feature="sse2,avx")
_mm256_broadcast_pd :: #force_inline proc "c" (a: ^__m128d) -> __m256d {
	pair := a^
	// Only indices 0..1 are used, so the zero operand never contributes.
	return intrinsics.simd_shuffle(
		pair,
		_mm_setzero_pd(),
		0, 1,
		0, 1,
	)
}

// Returns a copy of `a` in which one 128-bit half (4 packed single-precision
// (32-bit) floating-point elements) has been replaced by `b`.
// `IMM1 == 0` replaces the low half, `IMM1 == 1` replaces the high half.
@(require_results, enable_target_feature="sse,avx")
_mm256_insertf128_ps :: #force_inline proc "c" (a: __m256, b: __m128, $IMM1: u8) -> __m256 where IMM1 < 2 {
	// Widen `b` so it can serve as the second shuffle operand (indices 8..15);
	// only its first four lanes (indices 8..11) are ever selected.
	widened := _mm256_castps128_ps256(b)
	when IMM1 != 0 {
		// Keep the low half of `a`, take the high half from `b`.
		return intrinsics.simd_shuffle(a, widened, 0, 1, 2, 3, 8, 9, 10, 11)
	} else {
		// Take the low half from `b`, keep the high half of `a`.
		return intrinsics.simd_shuffle(a, widened, 8, 9, 10, 11, 4, 5, 6, 7)
	}
}

// Returns a copy of `a` in which one 128-bit half (2 packed double-precision
// (64-bit) floating-point elements) has been replaced by `b`.
// `IMM1 == 0` replaces the low half, `IMM1 == 1` replaces the high half.
@(require_results, enable_target_feature="sse2,avx")
_mm256_insertf128_pd :: #force_inline proc "c" (a: __m256d, b: __m128d, $IMM1: u8) -> __m256d where IMM1 < 2 {
	// Widen `b` so it can serve as the second shuffle operand (indices 4..7);
	// only its first two lanes (indices 4..5) are ever selected.
	widened := _mm256_castpd128_pd256(b)
	when IMM1 != 0 {
		// Keep the low half of `a`, take the high half from `b`.
		return intrinsics.simd_shuffle(a, widened, 0, 1, 4, 5)
	} else {
		// Take the low half from `b`, keep the high half of `a`.
		return intrinsics.simd_shuffle(a, widened, 4, 5, 2, 3)
	}
}

// Returns a copy of `a` in which one 128-bit half has been replaced by `b`.
// `IMM1 == 0` replaces the low half, `IMM1 == 1` replaces the high half.
@(require_results, enable_target_feature="avx")
_mm256_insertf128_si256 :: #force_inline proc "c" (a: __m256i, b: __m128i, $IMM1: u8) -> __m256i where IMM1 < 2 {
	// Widen `b` so it can serve as the second shuffle operand (indices 4..7);
	// only its first two lanes (indices 4..5) are ever selected.
	widened := _mm256_castsi128_si256(b)
	when IMM1 != 0 {
		// Keep the low half of `a`, take the high half from `b`.
		return intrinsics.simd_shuffle(a, widened, 0, 1, 4, 5)
	} else {
		// Take the low half from `b`, keep the high half of `a`.
		return intrinsics.simd_shuffle(a, widened, 4, 5, 2, 3)
	}
}

// Returns a copy of `a`, viewed as 32 packed 8-bit integers, with the element
// at position `INDEX` replaced by `i`.
@(require_results, enable_target_feature="avx")
_mm256_insert_epi8 :: #force_inline proc "c" (a: __m256i, i: i8, $INDEX: u8) -> __m256i where INDEX < 32 {
	bytes := transmute(#simd[32]i8)a
	updated := intrinsics.simd_replace(bytes, INDEX, i)
	return transmute(__m256i)updated
}

// Returns a copy of `a`, viewed as 16 packed 16-bit integers, with the element
// at position `INDEX` replaced by `i`.
@(require_results, enable_target_feature="avx")
_mm256_insert_epi16 :: #force_inline proc "c" (a: __m256i, i: i16, $INDEX: u8) -> __m256i where INDEX < 16 {
	words := transmute(#simd[16]i16)a
	updated := intrinsics.simd_replace(words, INDEX, i)
	return transmute(__m256i)updated
}

// Returns a copy of `a`, viewed as 8 packed 32-bit integers, with the element
// at position `INDEX` replaced by `i`.
@(require_results, enable_target_feature="avx")
_mm256_insert_epi32 :: #force_inline proc "c" (a: __m256i, i: i32, $INDEX: u8) -> __m256i where INDEX < 8 {
	dwords := transmute(#simd[8]i32)a
	updated := intrinsics.simd_replace(dwords, INDEX, i)
	return transmute(__m256i)updated
}


// Reads 256 bits (4 packed double-precision (64-bit) floating-point elements)
// from memory and returns them.
// `mem_addr` must be aligned on a 32-byte boundary or a
// general-protection exception may be generated.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
@(require_results, enable_target_feature="avx")
_mm256_load_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m256d {
	// Reinterpret the scalar pointer as a pointer to a whole vector.
	src := (^__m256d)(mem_addr)
	return src^
}

// Writes 256 bits (4 packed double-precision (64-bit) floating-point elements)
// from `a` to memory.
// `mem_addr` must be aligned on a 32-byte boundary or a
// general-protection exception may be generated.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
@(enable_target_feature="avx")
_mm256_store_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m256d) {
	// Reinterpret the scalar pointer as a pointer to a whole vector.
	dst := (^__m256d)(mem_addr)
	dst^ = a
}

// Reads 256 bits (8 packed single-precision (32-bit) floating-point elements)
// from memory and returns them.
// `mem_addr` must be aligned on a 32-byte boundary or a
// general-protection exception may be generated.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
@(require_results, enable_target_feature="avx")
_mm256_load_ps :: #force_inline proc "c" (mem_addr: ^f32) -> __m256 {
	// Reinterpret the scalar pointer as a pointer to a whole vector.
	src := (^__m256)(mem_addr)
	return src^
}

// Writes 256 bits (8 packed single-precision (32-bit) floating-point elements)
// from `a` to memory.
// `mem_addr` must be aligned on a 32-byte boundary or a
// general-protection exception may be generated.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
@(enable_target_feature="avx")
_mm256_store_ps :: #force_inline proc "c" (mem_addr: ^f32, a: __m256) {
	// Reinterpret the scalar pointer as a pointer to a whole vector.
	dst := (^__m256)(mem_addr)
	dst^ = a
}

// Loads 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into result.
// `mem_addr` does not need to be aligned on any particular boundary.
//
// The result is marked `require_results`, matching the other load procedures
// in this file: a load whose value is discarded is almost certainly a mistake.
@(require_results, enable_target_feature="avx")
_mm256_loadu_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m256d {
	// Reinterpret the scalar pointer as a vector pointer and read it without
	// assuming vector alignment.
	return intrinsics.unaligned_load((^__m256d)(mem_addr))
}

// Writes 256 bits (4 packed double-precision (64-bit) floating-point elements)
// from `a` to memory.
// `mem_addr` does not need to be aligned on any particular boundary.
@(enable_target_feature="avx")
_mm256_storeu_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m256d) {
	dst := (^__m256d)(mem_addr)
	intrinsics.unaligned_store(dst, a)
}

// Reads 256 bits (8 packed single-precision (32-bit) floating-point elements)
// from memory and returns them.
// `mem_addr` does not need to be aligned on any particular boundary.
@(require_results, enable_target_feature="avx")
_mm256_loadu_ps :: #force_inline proc "c" (mem_addr: ^f32) -> __m256 {
	src := (^__m256)(mem_addr)
	return intrinsics.unaligned_load(src)
}

// Writes 256 bits (8 packed single-precision (32-bit) floating-point elements)
// from `a` to memory.
// `mem_addr` does not need to be aligned on any particular boundary.
@(enable_target_feature="avx")
_mm256_storeu_ps :: #force_inline proc "c" (mem_addr: ^f32, a: __m256) {
	dst := (^__m256)(mem_addr)
	intrinsics.unaligned_store(dst, a)
}

// Reads 256 bits of integer data from memory and returns them.
// `mem_addr` must be aligned on a 32-byte boundary or a
// general-protection exception may be generated.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
@(require_results, enable_target_feature="avx")
_mm256_load_si256 :: #force_inline proc "c" (mem_addr: ^__m256i) -> __m256i {
	loaded := mem_addr^
	return loaded
}

// Writes the 256 bits of integer data in `a` to memory.
// `mem_addr` must be aligned on a 32-byte boundary or a
// general-protection exception may be generated.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
@(enable_target_feature="avx")
_mm256_store_si256 :: #force_inline proc "c" (mem_addr: ^__m256i, a: __m256i) {
	dst := mem_addr
	dst^ = a
}

// Reads 256 bits of integer data from memory and returns them.
// `mem_addr` does not need to be aligned on any particular boundary.
@(require_results, enable_target_feature="avx")
_mm256_loadu_si256 :: #force_inline proc "c" (mem_addr: ^__m256i) -> __m256i {
	loaded := intrinsics.unaligned_load(mem_addr)
	return loaded
}

// Writes the 256 bits of integer data in `a` to memory.
// `mem_addr` does not need to be aligned on any particular boundary.
@(enable_target_feature="avx")
_mm256_storeu_si256 :: #force_inline proc "c" (mem_addr: ^__m256i, a: __m256i) {
	dst := mem_addr
	intrinsics.unaligned_store(dst, a)
}

// Conditionally loads packed double-precision (64-bit) floating-point elements
// from memory. An element is loaded when the high bit of the corresponding
// 64-bit element of `mask` is set; otherwise that element of the result is zero.
@(require_results, enable_target_feature="avx")
_mm256_maskload_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m256i) -> __m256d {
	// Shifting each 64-bit lane right by 63 leaves only its most significant bit's contribution.
	lane_select := intrinsics.simd_shr(mask, 63)
	// Lanes that are not selected take their value from this zero vector.
	fallback := _mm256_setzero_pd()
	return intrinsics.simd_masked_load(mem_addr, fallback, lane_select)
}

// Conditionally stores packed double-precision (64-bit) floating-point
// elements from `a` to memory, with the per-element selection derived from
// the high bit of each 64-bit element of `mask`.
@(enable_target_feature="avx")
_mm256_maskstore_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m256i, a: __m256d) {
	// Shifting each 64-bit lane right by 63 leaves only its most significant bit's contribution.
	lane_select := intrinsics.simd_shr(mask, 63)
	intrinsics.simd_masked_store(mem_addr, a, lane_select)
}


// Conditionally loads packed double-precision (64-bit) floating-point elements
// from memory into a 128-bit vector. An element is loaded when the high bit of
// the corresponding 64-bit element of `mask` is set; otherwise that element of
// the result is zero.
@(require_results, enable_target_feature="sse2,avx")
_mm_maskload_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m128i) -> __m128d {
	// Shifting each 64-bit lane right by 63 leaves only its most significant bit's contribution.
	lane_select := intrinsics.simd_shr(mask, 63)
	// Lanes that are not selected take their value from this zero vector.
	fallback := _mm_setzero_pd()
	return intrinsics.simd_masked_load(mem_addr, fallback, lane_select)
}

// Conditionally stores packed double-precision (64-bit) floating-point
// elements from the 128-bit vector `a` to memory, with the per-element
// selection derived from the high bit of each 64-bit element of `mask`.
@(enable_target_feature="avx")
_mm_maskstore_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m128i, a: __m128d) {
	// Shifting each 64-bit lane right by 63 leaves only its most significant bit's contribution.
	lane_select := intrinsics.simd_shr(mask, 63)
	intrinsics.simd_masked_store(mem_addr, a, lane_select)
}

// Conditionally loads packed single-precision (32-bit) floating-point elements
// from memory. An element is loaded when the high bit of the corresponding
// 32-bit element of `mask` is set; otherwise that element of the result is zero.
@(require_results, enable_target_feature="avx")
_mm256_maskload_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m256i) -> __m256 {
	// View the mask as 8 x 32-bit lanes so it lines up with the f32 elements.
	mask_lanes := transmute(#simd[8]i32)mask
	// Shifting each 32-bit lane right by 31 leaves only its most significant bit's contribution.
	lane_select := intrinsics.simd_shr(mask_lanes, 31)
	// Lanes that are not selected take their value from this zero vector.
	fallback := _mm256_setzero_ps()
	return intrinsics.simd_masked_load(mem_addr, fallback, lane_select)
}

// Conditionally stores packed single-precision (32-bit) floating-point
// elements from `a` to memory, with the per-element selection derived from
// the high bit of each 32-bit element of `mask`.
@(enable_target_feature="avx")
_mm256_maskstore_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m256i, a: __m256) {
	// View the mask as 8 x 32-bit lanes so it lines up with the f32 elements.
	mask_lanes := transmute(#simd[8]i32)mask
	// Shifting each 32-bit lane right by 31 leaves only its most significant bit's contribution.
	lane_select := intrinsics.simd_shr(mask_lanes, 31)
	intrinsics.simd_masked_store(mem_addr, a, lane_select)
}

// Conditionally loads packed single-precision (32-bit) floating-point elements
// from memory into a 128-bit vector. An element is loaded when the high bit of
// the corresponding 32-bit element of `mask` is set; otherwise that element of
// the result is zero.
@(require_results, enable_target_feature="sse,avx")
_mm_maskload_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m128i) -> __m128 {
	// View the mask as 4 x 32-bit lanes so it lines up with the f32 elements.
	mask_lanes := transmute(#simd[4]i32)mask
	// Shifting each 32-bit lane right by 31 leaves only its most significant bit's contribution.
	lane_select := intrinsics.simd_shr(mask_lanes, 31)
	// Lanes that are not selected take their value from this zero vector.
	fallback := _mm_setzero_ps()
	return intrinsics.simd_masked_load(mem_addr, fallback, lane_select)
}

// Conditionally stores packed single-precision (32-bit) floating-point
// elements from the 128-bit vector `a` to memory, with the per-element
// selection derived from the high bit of each 32-bit element of `mask`.
@(enable_target_feature="avx")
_mm_maskstore_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m128i, a: __m128) {
	// View the mask as 4 x 32-bit lanes so it lines up with the f32 elements.
	mask_lanes := transmute(#simd[4]i32)mask
	// Shifting each 32-bit lane right by 31 leaves only its most significant bit's contribution.
	lane_select := intrinsics.simd_shr(mask_lanes, 31)
	intrinsics.simd_masked_store(mem_addr, a, lane_select)
}


// Returns a vector in which every odd-indexed single-precision (32-bit)
// floating-point element of `a` appears twice: elements 1, 1, 3, 3, 5, 5, 7, 7.
@(require_results, enable_target_feature="avx")
_mm256_movehdup_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
	return intrinsics.simd_shuffle(
		a,
		a,
		1, 1,
		3, 3,
		5, 5,
		7, 7,
	)
}

// Returns a vector in which every even-indexed single-precision (32-bit)
// floating-point element of `a` appears twice: elements 0, 0, 2, 2, 4, 4, 6, 6.
@(require_results, enable_target_feature="avx")
_mm256_moveldup_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
	return intrinsics.simd_shuffle(
		a,
		a,
		0, 0,
		2, 2,
		4, 4,
		6, 6,
	)
}

// Returns a vector in which every even-indexed double-precision (64-bit)
// floating-point element of `a` appears twice: elements 0, 0, 2, 2.
@(require_results, enable_target_feature="avx")
_mm256_movedup_pd :: #force_inline proc "c" (a: __m256d) -> __m256d {
	return intrinsics.simd_shuffle(
		a,
		a,
		0, 0,
		2, 2,
	)
}


// Reads 256 bits of integer data from unaligned memory and returns them.
// This intrinsic may perform better than `_mm256_loadu_si256` when the
// data crosses a cache line boundary.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
@(require_results, enable_target_feature="avx")
_mm256_lddqu_si256 :: #force_inline proc "c" (mem_addr: ^__m256i) -> __m256i {
	// The load itself is delegated to `llvm_vlddqu`; its result is
	// reinterpreted as `__m256i`.
	raw := llvm_vlddqu(mem_addr)
	return transmute(__m256i)raw
}

/*
// Moves integer data from a 256-bit integer vector to a 32-byte
// aligned memory location. To minimize caching, the data is flagged as
// non-temporal (unlikely to be used again soon)
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
//
// # Safety of non-temporal stores
//
// After using this intrinsic, but before any other access to the memory that this intrinsic
// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
// return.
//
// See [`_mm_sfence`] for details.
@(enable_target_feature="avx")
_mm256_stream_si256 :: #force_inline proc "c" (mem_addr: ^__m256i, a: __m256i) {
	panic_contextless("TODO: _mm256_stream_si256")
}

// Moves double-precision values from a 256-bit vector of `[4 x double]`
// to a 32-byte aligned memory location. To minimize caching, the data is
// flagged as non-temporal (unlikely to be used again soon).
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
//
// # Safety of non-temporal stores
//
// After using this intrinsic, but before any other access to the memory that this intrinsic
// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
// return.
//
// See [`_mm_sfence`] for details.
@(enable_target_feature="avx")
_mm256_stream_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m256d) {
	panic_contextless("TODO: _mm256_stream_pd")
}

// Moves single-precision floating point values from a 256-bit vector
// of `[8 x float]` to a 32-byte aligned memory location. To minimize
// caching, the data is flagged as non-temporal (unlikely to be used again
// soon).
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
//
// # Safety of non-temporal stores
//
// After using this intrinsic, but before any other access to the memory that this intrinsic
// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
// return.
//
// See [`_mm_sfence`] for details.
@(enable_target_feature="avx")
_mm256_stream_ps :: #force_inline proc "c" (mem_addr: ^f32, a: __m256) {
	panic_contextless("TODO: _mm256_stream_ps")
}
*/

// Returns the approximate reciprocal of each packed single-precision (32-bit)
// floating-point element in `a`. The maximum relative error for this
// approximation is less than 1.5*2^-12.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
@(require_results, enable_target_feature="avx")
_mm256_rcp_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
	reciprocal := llvm_vrcpps(a)
	return reciprocal
}

// Returns the approximate reciprocal square root of each packed
// single-precision (32-bit) floating-point element in `a`.
// The maximum relative error for this approximation is less than 1.5*2^-12.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
@(require_results, enable_target_feature="avx")
_mm256_rsqrt_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
	inv_sqrt := llvm_vrsqrtps(a)
	return inv_sqrt
}


// Interleaves the double-precision (64-bit) floating-point elements taken
// from the high half of each 128-bit lane of `a` and `b`.
// Result: a[1], b[1], a[3], b[3].
@(require_results, enable_target_feature="avx")
_mm256_unpackhi_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	// Indices 0..3 address `a`, indices 4..7 address `b`.
	return intrinsics.simd_shuffle(
		a,
		b,
		1, 5,
		3, 7,
	)
}

// Interleaves the single-precision (32-bit) floating-point elements taken
// from the high half of each 128-bit lane of `a` and `b`.
// Result: a[2], b[2], a[3], b[3], a[6], b[6], a[7], b[7].
@(require_results, enable_target_feature="avx")
_mm256_unpackhi_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	// Indices 0..7 address `a`, indices 8..15 address `b`.
	return intrinsics.simd_shuffle(
		a,
		b,
		2, 10, 3, 11,
		6, 14, 7, 15,
	)
}

// Interleaves the double-precision (64-bit) floating-point elements taken
// from the low half of each 128-bit lane of `a` and `b`.
// Result: a[0], b[0], a[2], b[2].
@(require_results, enable_target_feature="avx")
_mm256_unpacklo_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
	// Indices 0..3 address `a`, indices 4..7 address `b`.
	return intrinsics.simd_shuffle(
		a,
		b,
		0, 4,
		2, 6,
	)
}

// Interleaves the single-precision (32-bit) floating-point elements taken
// from the low half of each 128-bit lane of `a` and `b`.
// Result: a[0], b[0], a[1], b[1], a[4], b[4], a[5], b[5].
@(require_results, enable_target_feature="avx")
_mm256_unpacklo_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
	// Indices 0..7 address `a`, indices 8..15 address `b`.
	return intrinsics.simd_shuffle(
		a,
		b,
		0, 8, 1, 9,
		4, 12, 5, 13,
	)
}

// Tests 256 bits of integer data.
// `ZF` is 1 when `a AND b` is all zero, otherwise 0.
// `CF` is 1 when `(NOT a) AND b` is all zero, otherwise 0.
// Returns the `ZF` value.
@(require_results, enable_target_feature="avx")
_mm256_testz_si256 :: #force_inline proc "c" (a, b: __m256i) -> i32 {
	common := intrinsics.simd_bit_and(a, b)
	// OR-ing all lanes together yields zero only if every bit is clear.
	folded := intrinsics.simd_reduce_or(common)
	return i32(folded == 0)
}

// Tests 256 bits of integer data.
// `ZF` is 1 when `a AND b` is all zero, otherwise 0.
// `CF` is 1 when `(NOT a) AND b` is all zero, otherwise 0.
// Returns the `CF` value.
@(require_results, enable_target_feature="avx")
_mm256_testc_si256 :: #force_inline proc "c" (a, b: __m256i) -> i32 {
	// XOR with all-ones is a bitwise NOT.
	all_ones := __m256i(~i64(0))
	not_a := intrinsics.simd_bit_xor(a, all_ones)
	remaining := intrinsics.simd_bit_and(not_a, b)
	// OR-ing all lanes together yields zero only if every bit is clear.
	folded := intrinsics.simd_reduce_or(remaining)
	return i32(folded == 0)
}


// Tests 256 bits of integer data.
// `ZF` is 1 when `a AND b` is all zero, otherwise 0.
// `CF` is 1 when `(NOT a) AND b` is all zero, otherwise 0.
// Returns 1 if both the `ZF` and `CF` values are zero, otherwise returns 0.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
@(require_results, enable_target_feature="avx")
_mm256_testnzc_si256 :: #force_inline proc "c" (a, b: __m256i) -> i32 {
	// The whole test is delegated to `llvm_ptestnzc256`.
	flag := llvm_ptestnzc256(a, b)
	return flag
}

// Tests the sign bits of 256 bits of packed double-precision (64-bit)
// floating-point elements.
// `ZF` is 1 when the sign bit of every 64-bit element of `a AND b` is zero,
// otherwise 0.
// `CF` is 1 when the sign bit of every 64-bit element of `(NOT a) AND b` is
// zero, otherwise 0.
// Returns the `ZF` value.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
@(require_results, enable_target_feature="avx")
_mm256_testz_pd :: #force_inline proc "c" (a, b: __m256d) -> i32 {
	// The whole test is delegated to `llvm_vtestzpd256`.
	flag := llvm_vtestzpd256(a, b)
	return flag
}

// Tests the sign bits of 256 bits of packed double-precision (64-bit)
// floating-point elements.
// `ZF` is 1 when the sign bit of every 64-bit element of `a AND b` is zero,
// otherwise 0.
// `CF` is 1 when the sign bit of every 64-bit element of `(NOT a) AND b` is
// zero, otherwise 0.
// Returns the `CF` value.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
@(require_results, enable_target_feature="avx")
_mm256_testc_pd :: #force_inline proc "c" (a, b: __m256d) -> i32 {
	// The whole test is delegated to `llvm_vtestcpd256`.
	flag := llvm_vtestcpd256(a, b)
	return flag
}

// Tests the sign bits of 256 bits of packed double-precision (64-bit)
// floating-point elements.
// `ZF` is 1 when the sign bit of every 64-bit element of `a AND b` is zero,
// otherwise 0.
// `CF` is 1 when the sign bit of every 64-bit element of `(NOT a) AND b` is
// zero, otherwise 0.
// Returns 1 if both the `ZF` and `CF` values are zero, otherwise returns 0.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
@(require_results, enable_target_feature="avx")
_mm256_testnzc_pd :: #force_inline proc "c" (a, b: __m256d) -> i32 {
	// The whole test is delegated to `llvm_vtestnzcpd256`.
	flag := llvm_vtestnzcpd256(a, b)
	return flag
}

// Tests the sign bits of 128 bits of packed double-precision (64-bit)
// floating-point elements.
// `ZF` is 1 when the sign bit of every 64-bit element of `a AND b` is zero,
// otherwise 0.
// `CF` is 1 when the sign bit of every 64-bit element of `(NOT a) AND b` is
// zero, otherwise 0.
// Returns the `ZF` value.
@(require_results, enable_target_feature="sse2,avx")
_mm_testz_pd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
	// Reinterpret `a AND b` as signed 64-bit lanes: a lane is negative
	// exactly when its sign bit is set.
	anded := transmute(__m128i)_mm_and_pd(a, b)
	sign_set := intrinsics.simd_lanes_lt(anded, __m128i(0))
	// No lane flagged means every sign bit was clear.
	folded := intrinsics.simd_reduce_or(sign_set)
	return i32(folded == 0)
}

// Tests the sign bits of 128 bits of packed double-precision (64-bit)
// floating-point elements.
// `ZF` is 1 when the sign bit of every 64-bit element of `a AND b` is zero,
// otherwise 0.
// `CF` is 1 when the sign bit of every 64-bit element of `(NOT a) AND b` is
// zero, otherwise 0.
// Returns the `CF` value.
@(require_results, enable_target_feature="sse2,avx")
_mm_testc_pd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
	// Reinterpret the and-not result as signed 64-bit lanes: a lane is
	// negative exactly when its sign bit is set.
	masked := transmute(__m128i)_mm_andnot_pd(a, b)
	sign_set := intrinsics.simd_lanes_lt(masked, __m128i(0))
	// No lane flagged means every sign bit was clear.
	folded := intrinsics.simd_reduce_or(sign_set)
	return i32(folded == 0)
}

// Tests the sign bits of 128 bits of packed double-precision (64-bit)
// floating-point elements.
// `ZF` is 1 when the sign bit of every 64-bit element of `a AND b` is zero,
// otherwise 0.
// `CF` is 1 when the sign bit of every 64-bit element of `(NOT a) AND b` is
// zero, otherwise 0.
// Returns 1 if both the `ZF` and `CF` values are zero, otherwise returns 0.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
@(require_results, enable_target_feature="avx")
_mm_testnzc_pd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
	// The whole test is delegated to `llvm_vtestnzcpd`.
	flag := llvm_vtestnzcpd(a, b)
	return flag
}

// Tests the sign bits of 256 bits of packed single-precision (32-bit)
// floating-point elements.
// `ZF` is 1 when the sign bit of every 32-bit element of `a AND b` is zero,
// otherwise 0.
// `CF` is 1 when the sign bit of every 32-bit element of `(NOT a) AND b` is
// zero, otherwise 0.
// Returns the `ZF` value.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
@(require_results, enable_target_feature="avx")
_mm256_testz_ps :: #force_inline proc "c" (a, b: __m256) -> i32 {
	// The whole test is delegated to `llvm_vtestzps256`.
	flag := llvm_vtestzps256(a, b)
	return flag
}

// Tests the sign bits of 256 bits of packed single-precision (32-bit)
// floating-point elements.
// `ZF` is 1 when the sign bit of every 32-bit element of `a AND b` is zero,
// otherwise 0.
// `CF` is 1 when the sign bit of every 32-bit element of `(NOT a) AND b` is
// zero, otherwise 0.
// Returns the `CF` value.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
@(require_results, enable_target_feature="avx")
_mm256_testc_ps :: #force_inline proc "c" (a, b: __m256) -> i32 {
	// The whole test is delegated to `llvm_vtestcps256`.
	flag := llvm_vtestcps256(a, b)
	return flag
}

// Tests the sign bits of 256 bits of packed single-precision (32-bit)
// floating-point elements.
// `ZF` is 1 when the sign bit of every 32-bit element of `a AND b` is zero,
// otherwise 0.
// `CF` is 1 when the sign bit of every 32-bit element of `(NOT a) AND b` is
// zero, otherwise 0.
// Returns 1 if both the `ZF` and `CF` values are zero, otherwise returns 0.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
@(require_results, enable_target_feature="avx")
_mm256_testnzc_ps :: #force_inline proc "c" (a, b: __m256) -> i32 {
	// The whole test is delegated to `llvm_vtestnzcps256`.
	flag := llvm_vtestnzcps256(a, b)
	return flag
}

// Tests the sign bits of 128 bits of packed single-precision (32-bit)
// floating-point elements.
// `ZF` is 1 when the sign bit of every 32-bit element of `a AND b` is zero,
// otherwise 0.
// `CF` is 1 when the sign bit of every 32-bit element of `(NOT a) AND b` is
// zero, otherwise 0.
// Returns the `ZF` value.
@(require_results, enable_target_feature="sse,avx")
_mm_testz_ps :: #force_inline proc "c" (a: __m128, b: __m128) -> i32 {
	// Reinterpret `a AND b` as signed 32-bit lanes: a lane is negative
	// exactly when its sign bit is set.
	anded := transmute(#simd[4]i32)_mm_and_ps(a, b)
	sign_set := intrinsics.simd_lanes_lt(anded, (#simd[4]i32)(0))
	// No lane flagged means every sign bit was clear.
	folded := intrinsics.simd_reduce_or(sign_set)
	return i32(folded == 0)
}

// Tests the sign bits of 128 bits of packed single-precision (32-bit)
// floating-point elements.
// `ZF` is 1 when the sign bit of every 32-bit element of `a AND b` is zero,
// otherwise 0.
// `CF` is 1 when the sign bit of every 32-bit element of `(NOT a) AND b` is
// zero, otherwise 0.
// Returns the `CF` value.
@(require_results, enable_target_feature="sse,avx")
_mm_testc_ps :: #force_inline proc "c" (a: __m128, b: __m128) -> i32 {
	// Reinterpret the and-not result as signed 32-bit lanes: a lane is
	// negative exactly when its sign bit is set.
	masked := transmute(#simd[4]i32)_mm_andnot_ps(a, b)
	sign_set := intrinsics.simd_lanes_lt(masked, (#simd[4]i32)(0))
	// No lane flagged means every sign bit was clear.
	folded := intrinsics.simd_reduce_or(sign_set)
	return i32(folded == 0)
}

// Tests the sign bits of 128 bits of packed single-precision (32-bit)
// floating-point elements.
// `ZF` is 1 when the sign bit of every 32-bit element of `a AND b` is zero,
// otherwise 0.
// `CF` is 1 when the sign bit of every 32-bit element of `(NOT a) AND b` is
// zero, otherwise 0.
// Returns 1 if both the `ZF` and `CF` values are zero, otherwise returns 0.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
@(require_results, enable_target_feature="avx")
_mm_testnzc_ps :: #force_inline proc "c" (a: __m128, b: __m128) -> i32 {
	// The whole test is delegated to `llvm_vtestnzcps`.
	flag := llvm_vtestnzcps(a, b)
	return flag
}

// Builds an integer mask in which each bit reflects the most significant bit
// of the corresponding packed double-precision (64-bit) floating-point
// element in `a`.
@(require_results, enable_target_feature="avx")
_mm256_movemask_pd :: #force_inline proc "c" (a: __m256d) -> i32 {
	// Viewed as signed 64-bit integers, a lane is negative exactly when its
	// most significant bit is set.
	lanes := transmute(#simd[4]i64)a
	sign_set := intrinsics.simd_lanes_lt(lanes, (#simd[4]i64)(0))
	// Collapse the per-lane comparison result into one bit per lane.
	packed := transmute(u8)intrinsics.simd_extract_lsbs(sign_set)
	return i32(packed)
}

// Builds an integer mask in which each bit reflects the most significant bit
// of the corresponding packed single-precision (32-bit) floating-point
// element in `a`.
@(require_results, enable_target_feature="avx")
_mm256_movemask_ps :: #force_inline proc "c" (a: __m256) -> i32 {
	// Viewed as signed 32-bit integers, a lane is negative exactly when its
	// most significant bit is set, so the comparison turns the sign bit into
	// a per-lane flag before the bits are extracted.
	lanes := transmute(#simd[8]i32)a
	sign_set := intrinsics.simd_lanes_lt(lanes, (#simd[8]i32)(0))
	// Collapse the per-lane comparison result into one bit per lane.
	packed := transmute(u8)intrinsics.simd_extract_lsbs(sign_set)
	return i32(packed)
}

// Produces a `__m256d` vector whose elements are all zero.
@(require_results, enable_target_feature="avx")
_mm256_setzero_pd :: #force_inline proc "c" () -> __m256d {
	zeroed: __m256d = 0
	return zeroed
}

// Produces a `__m256` vector whose elements are all zero.
@(require_results, enable_target_feature="avx")
_mm256_setzero_ps :: #force_inline proc "c" () -> __m256 {
	zeroed: __m256 = 0
	return zeroed
}

// Produces a `__m256i` vector whose elements are all zero.
@(require_results, enable_target_feature="avx")
_mm256_setzero_si256 :: #force_inline proc "c" () -> __m256i {
	zeroed: __m256i = 0
	return zeroed
}

// Builds a vector of packed double-precision (64-bit) floating-point elements
// from the supplied values. The first argument becomes the last element:
// the result is {d, c, b, a}.
@(require_results, enable_target_feature="avx")
_mm256_set_pd :: #force_inline proc "c" (a: f64, b: f64, c: f64, d: f64) -> __m256d {
	// Same as `_mm256_setr_pd(d, c, b, a)`, written out directly.
	return __m256d{d, c, b, a}
}

// Builds a vector of packed single-precision (32-bit) floating-point elements
// from the supplied values. The first argument becomes the last element:
// the result is {h, g, f, e, d, c, b, a}.
@(require_results, enable_target_feature="avx")
_mm256_set_ps :: #force_inline proc "c" (
	a: f32,
	b: f32,
	c: f32,
	d: f32,
	e: f32,
	f: f32,
	g: f32,
	h: f32,
) -> __m256 {
	// Same as `_mm256_setr_ps(h, g, f, e, d, c, b, a)`, written out directly.
	return __m256{
		h, g, f, e,
		d, c, b, a,
	}
}

// Builds a vector of 32 packed 8-bit integers from the supplied values.
// The first argument (`e00`) becomes the last element and the last argument
// (`e31`) becomes the first.
@(require_results, enable_target_feature="avx")
_mm256_set_epi8 :: #force_inline proc "c" (
	e00, e01, e02, e03, e04, e05, e06, e07: i8,
	e08, e09, e10, e11, e12, e13, e14, e15: i8,
	e16, e17, e18, e19, e20, e21, e22, e23: i8,
	e24, e25, e26, e27, e28, e29, e30, e31: i8,
) -> __m256i {
	// Same as calling `_mm256_setr_epi8` with the arguments reversed,
	// written out directly.
	reversed := #simd[32]i8{
		e31, e30, e29, e28, e27, e26, e25, e24,
		e23, e22, e21, e20, e19, e18, e17, e16,
		e15, e14, e13, e12, e11, e10, e09, e08,
		e07, e06, e05, e04, e03, e02, e01, e00,
	}
	return transmute(__m256i)reversed
}

// Builds a vector of 16 packed 16-bit integers from the supplied values by
// forwarding them, in reversed order, to `_mm256_setr_epi16`.
@(require_results, enable_target_feature="avx")
_mm256_set_epi16 :: #force_inline proc "c" (
	e00, e01, e02, e03, e04, e05, e06, e07: i16,
	e08, e09, e10, e11, e12, e13, e14, e15: i16,
) -> __m256i {
	packed := _mm256_setr_epi16(
		e15, e14, e13, e12, e11, e10, e09, e08,
		e07, e06, e05, e04, e03, e02, e01, e00,
	)
	return packed
}

// Builds a vector of 8 packed 32-bit integers from the supplied values by
// forwarding them, in reversed order, to `_mm256_setr_epi32`.
@(require_results, enable_target_feature="avx")
_mm256_set_epi32 :: #force_inline proc "c" (e0, e1, e2, e3, e4, e5, e6, e7: i32) -> __m256i {
	packed := _mm256_setr_epi32(
		e7, e6, e5, e4,
		e3, e2, e1, e0,
	)
	return packed
}

// Builds a vector of four 64-bit integers from the supplied values.
// The last argument (`d`) is stored in element 0 and the first argument
// (`a`) in element 3.
@(require_results, enable_target_feature="avx")
_mm256_set_epi64x :: #force_inline proc "c" (a: i64, b: i64, c: i64, d: i64) -> __m256i {
	return __m256i{d, c, b, a}
}

// Builds a vector of four double-precision (64-bit) floating-point elements
// from the supplied values, in the reverse of `_mm256_set_pd`'s order:
// the first argument (`a`) is stored in element 0 and the last (`d`) in element 3.
@(require_results, enable_target_feature="avx")
_mm256_setr_pd :: #force_inline proc "c" (a: f64, b: f64, c: f64, d: f64) -> __m256d {
	packed: __m256d = {a, b, c, d}
	return packed
}

// Builds a vector of eight single-precision (32-bit) floating-point elements
// from the supplied values, in the reverse of `_mm256_set_ps`'s order:
// the first argument (`a`) is stored in element 0 and the last (`h`) in element 7.
@(require_results, enable_target_feature="avx")
_mm256_setr_ps :: #force_inline proc "c" (a, b, c, d, e, f, g, h: f32) -> __m256 {
	packed: __m256 = {a, b, c, d, e, f, g, h}
	return packed
}

// Builds a vector of thirty-two 8-bit integers from the supplied values, in
// the reverse of `_mm256_set_epi8`'s order: the first argument (`e00`) is
// stored in element 0 and the last (`e31`) in element 31.
@(require_results, enable_target_feature="avx")
_mm256_setr_epi8 :: #force_inline proc "c" (
	e00, e01, e02, e03, e04, e05, e06, e07: i8,
	e08, e09, e10, e11, e12, e13, e14, e15: i8,
	e16, e17, e18, e19, e20, e21, e22, e23: i8,
	e24, e25, e26, e27, e28, e29, e30, e31: i8,
) -> __m256i {
	bytes := #simd[32]i8{
		e00, e01, e02, e03, e04, e05, e06, e07,
		e08, e09, e10, e11, e12, e13, e14, e15,
		e16, e17, e18, e19, e20, e21, e22, e23,
		e24, e25, e26, e27, e28, e29, e30, e31,
	}
	// Reinterpret the 32 bytes as the package's 256-bit integer vector type.
	return transmute(__m256i)bytes
}

// Builds a vector of sixteen 16-bit integers from the supplied values, in
// the reverse of `_mm256_set_epi16`'s order: the first argument (`e00`) is
// stored in element 0 and the last (`e15`) in element 15.
@(require_results, enable_target_feature="avx")
_mm256_setr_epi16 :: #force_inline proc "c" (
	e00, e01, e02, e03, e04, e05, e06, e07: i16,
	e08, e09, e10, e11, e12, e13, e14, e15: i16,
) -> __m256i {
	words := #simd[16]i16{
		e00, e01, e02, e03, e04, e05, e06, e07,
		e08, e09, e10, e11, e12, e13, e14, e15,
	}
	// Reinterpret the 16 words as the package's 256-bit integer vector type.
	return transmute(__m256i)words
}

// Builds a vector of eight 32-bit integers from the supplied values, in
// the reverse of `_mm256_set_epi32`'s order: the first argument (`e0`) is
// stored in element 0 and the last (`e7`) in element 7.
@(require_results, enable_target_feature="avx")
_mm256_setr_epi32 :: #force_inline proc "c" (e0, e1, e2, e3, e4, e5, e6, e7: i32) -> __m256i {
	dwords := #simd[8]i32{e0, e1, e2, e3, e4, e5, e6, e7}
	return transmute(__m256i)dwords
}

// Builds a vector of four 64-bit integers from the supplied values, in
// the reverse of `_mm256_set_epi64x`'s order: the first argument (`a`) is
// stored in element 0 and the last (`d`) in element 3.
@(require_results, enable_target_feature="avx")
_mm256_setr_epi64x :: #force_inline proc "c" (a: i64, b: i64, c: i64, d: i64) -> __m256i {
	return __m256i{a, b, c, d}
}

// Returns a vector whose double-precision (64-bit) floating-point elements
// are all equal to `a`.
@(require_results, enable_target_feature="avx")
_mm256_set1_pd :: #force_inline proc "c" (a: f64) -> __m256d {
	return __m256d(a)
}

// Returns a vector whose single-precision (32-bit) floating-point elements
// are all equal to `a`.
@(require_results, enable_target_feature="avx")
_mm256_set1_ps :: #force_inline proc "c" (a: f32) -> __m256 {
	return __m256(a)
}

// Returns a vector whose thirty-two 8-bit integer elements are all equal to `a`.
// This intrinsic may generate the `vpbroadcastb`.
@(require_results, enable_target_feature="avx")
_mm256_set1_epi8 :: #force_inline proc "c" (a: i8) -> __m256i {
	splat: #simd[32]i8 = a
	return transmute(__m256i)splat
}

// Returns a vector whose sixteen 16-bit integer elements are all equal to `a`.
// This intrinsic may generate the `vpbroadcastw`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
@(require_results, enable_target_feature="avx")
_mm256_set1_epi16 :: #force_inline proc "c" (a: i16) -> __m256i {
	splat: #simd[16]i16 = a
	return transmute(__m256i)splat
}

// Returns a vector whose eight 32-bit integer elements are all equal to `a`.
// This intrinsic may generate the `vpbroadcastd`.
@(require_results, enable_target_feature="avx")
_mm256_set1_epi32 :: #force_inline proc "c" (a: i32) -> __m256i {
	splat: #simd[8]i32 = a
	return transmute(__m256i)splat
}

// Returns a vector whose four 64-bit integer elements are all equal to `a`.
// This intrinsic may generate the `vpbroadcastq`.
//
// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
@(require_results, enable_target_feature="avx")
_mm256_set1_epi64x :: #force_inline proc "c" (a: i64) -> __m256i {
	return __m256i(a)
}

// Reinterprets the bits of a `__m256d` vector as a `__m256` vector.
// The bit pattern is left untouched; no value conversion takes place.
@(require_results, enable_target_feature="avx")
_mm256_castpd_ps :: #force_inline proc "c" (a: __m256d) -> __m256 {
	as_ps := transmute(__m256)a
	return as_ps
}

// Reinterprets the bits of a `__m256` vector as a `__m256d` vector.
// The bit pattern is left untouched; no value conversion takes place.
@(require_results, enable_target_feature="avx")
_mm256_castps_pd :: #force_inline proc "c" (a: __m256) -> __m256d {
	as_pd := transmute(__m256d)a
	return as_pd
}

// Reinterprets the bits of a `__m256` vector as a `__m256i` vector.
// The bit pattern is left untouched; no value conversion takes place.
@(require_results, enable_target_feature="avx")
_mm256_castps_si256 :: #force_inline proc "c" (a: __m256) -> __m256i {
	as_int := transmute(__m256i)a
	return as_int
}

// Reinterprets the bits of a `__m256i` vector as a `__m256` vector.
// The bit pattern is left untouched; no value conversion takes place.
@(require_results, enable_target_feature="avx")
_mm256_castsi256_ps :: #force_inline proc "c" (a: __m256i) -> __m256 {
	as_ps := transmute(__m256)a
	return as_ps
}

// Reinterprets the bits of a `__m256d` vector as a `__m256i` vector.
// The bit pattern is left untouched; no value conversion takes place.
@(require_results, enable_target_feature="avx")
_mm256_castpd_si256 :: #force_inline proc "c" (a: __m256d) -> __m256i {
	as_int := transmute(__m256i)a
	return as_int
}

// Reinterprets the bits of a `__m256i` vector as a `__m256d` vector.
// The bit pattern is left untouched; no value conversion takes place.
@(require_results, enable_target_feature="avx")
_mm256_castsi256_pd :: #force_inline proc "c" (a: __m256i) -> __m256d {
	as_pd := transmute(__m256d)a
	return as_pd
}

// Narrows a `__m256` vector to a `__m128` vector by keeping elements 0..3
// of `a`; the remaining four elements are dropped.
@(require_results, enable_target_feature="avx")
_mm256_castps256_ps128 :: #force_inline proc "c" (a: __m256) -> __m128 {
	lower: __m128 = intrinsics.simd_shuffle(a, a, 0, 1, 2, 3)
	return lower
}

// Narrows a `__m256d` vector to a `__m128d` vector by keeping elements 0 and 1
// of `a`; the remaining two elements are dropped.
@(require_results, enable_target_feature="avx")
_mm256_castpd256_pd128 :: #force_inline proc "c" (a: __m256d) -> __m128d {
	lower: __m128d = intrinsics.simd_shuffle(a, a, 0, 1)
	return lower
}

// Narrows a `__m256i` vector to a `__m128i` vector by keeping elements 0 and 1
// of `a`; the remaining two elements are dropped.
@(require_results, enable_target_feature="avx")
_mm256_castsi256_si128 :: #force_inline proc "c" (a: __m256i) -> __m128i {
	lower: __m128i = intrinsics.simd_shuffle(a, a, 0, 1)
	return lower
}

// Widens a `__m128` vector to a `__m256` vector. Elements 0..3 of the result
// are the elements of `a`; the upper 128 bits are indeterminate and must not
// be relied upon.
//
// Intel's documentation describes the upper bits as "undefined". Here they are
// all taken from the first element of `_mm_undefined_ps()`.
@(require_results, enable_target_feature="sse,avx")
_mm256_castps128_ps256 :: #force_inline proc "c" (a: __m128) -> __m256 {
	filler := _mm_undefined_ps()
	// Indices 0..3 select from `a`; index 4 selects element 0 of `filler`.
	return intrinsics.simd_shuffle(a, filler, 0, 1, 2, 3, 4, 4, 4, 4)
}

// Widens a `__m128d` vector to a `__m256d` vector. Elements 0 and 1 of the
// result are the elements of `a`; the upper 128 bits are indeterminate and
// must not be relied upon.
//
// Intel's documentation describes the upper bits as "undefined". Here they are
// both taken from the first element of `_mm_undefined_pd()`.
@(require_results, enable_target_feature="sse2,avx")
_mm256_castpd128_pd256 :: #force_inline proc "c" (a: __m128d) -> __m256d {
	filler := _mm_undefined_pd()
	// Indices 0..1 select from `a`; index 2 selects element 0 of `filler`.
	return intrinsics.simd_shuffle(a, filler, 0, 1, 2, 2)
}

// Widens a `__m128i` vector to a `__m256i` vector. Elements 0 and 1 of the
// result are the elements of `a`; the upper 128 bits are to be treated as
// indeterminate and must not be relied upon.
//
// Intel's documentation describes the upper bits as "undefined". Here they are
// both taken from the first element of a zeroed `__m128i`.
@(require_results, enable_target_feature="avx")
_mm256_castsi128_si256 :: #force_inline proc "c" (a: __m128i) -> __m256i {
	filler: __m128i = 0
	// Indices 0..1 select from `a`; index 2 selects element 0 of `filler`.
	return intrinsics.simd_shuffle(a, filler, 0, 1, 2, 2)
}

// Zero-extends a 128-bit vector of `[4 x float]` to a 256-bit vector of
// `[8 x float]`. Elements 0..3 of the result are the elements of `a`;
// elements 4..7 come from `_mm_setzero_ps()`.
@(require_results, enable_target_feature="sse,avx")
_mm256_zextps128_ps256 :: #force_inline proc "c" (a: __m128) -> __m256 {
	upper := _mm_setzero_ps()
	// Indices 0..3 select from `a`, 4..7 from `upper`.
	return intrinsics.simd_shuffle(a, upper, 0, 1, 2, 3, 4, 5, 6, 7)
}

// Zero-extends a 128-bit integer vector to a 256-bit integer vector.
// Elements 0 and 1 of the result are the elements of `a`; elements 2 and 3
// are zero.
@(require_results, enable_target_feature="avx")
_mm256_zextsi128_si256 :: #force_inline proc "c" (a: __m128i) -> __m256i {
	upper: __m128i = 0
	// Indices 0..1 select from `a`, 2..3 from `upper`.
	return intrinsics.simd_shuffle(a, upper, 0, 1, 2, 3)
}

// Zero-extends a 128-bit vector of `[2 x double]` to a 256-bit vector of
// `[4 x double]`. Elements 0 and 1 of the result are the elements of `a`;
// elements 2 and 3 come from `_mm_setzero_pd()`.
@(require_results, enable_target_feature="sse2,avx")
_mm256_zextpd128_pd256 :: #force_inline proc "c" (a: __m128d) -> __m256d {
	upper := _mm_setzero_pd()
	// Indices 0..1 select from `a`, 2..3 from `upper`.
	return intrinsics.simd_shuffle(a, upper, 0, 1, 2, 3)
}

// Returns a vector of type `__m256` whose elements callers should treat as
// indeterminate.
// Despite the word "undefined" in the name (following Intel's naming scheme),
// this implementation always returns a valid value: every element is zero.
@(require_results, enable_target_feature="avx")
_mm256_undefined_ps :: #force_inline proc "c" () -> __m256 {
	placeholder: __m256 = 0
	return placeholder
}

// Returns a vector of type `__m256d` whose elements callers should treat as
// indeterminate.
// Despite the word "undefined" in the name (following Intel's naming scheme),
// this implementation always returns a valid value: every element is zero.
@(require_results, enable_target_feature="avx")
_mm256_undefined_pd :: #force_inline proc "c" () -> __m256d {
	placeholder: __m256d = 0
	return placeholder
}

// Returns a vector of type `__m256i` whose elements callers should treat as
// indeterminate.
// Despite the word "undefined" in the name (following Intel's naming scheme),
// this implementation always returns a valid value: every element is zero.
@(require_results, enable_target_feature="avx")
_mm256_undefined_si256 :: #force_inline proc "c" () -> __m256i {
	placeholder: __m256i = 0
	return placeholder
}

// Combines two `__m128` vectors into one `__m256` vector: elements 0..3 of
// the result are taken from `lo` and elements 4..7 from `hi`.
@(require_results, enable_target_feature="avx")
_mm256_set_m128 :: #force_inline proc "c" (hi: __m128, lo: __m128) -> __m256 {
	// Shuffle indices 0..3 address `lo`, 4..7 address `hi`.
	joined: __m256 = intrinsics.simd_shuffle(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7)
	return joined
}

// Combines two `__m128d` vectors into one `__m256d` vector, with `lo` and `hi`
// passed through to `_mm256_set_m128`. Both inputs are bit-reinterpreted as
// `__m128` for that call, and the result is reinterpreted back as `__m256d`.
@(require_results, enable_target_feature="avx")
_mm256_set_m128d :: #force_inline proc "c" (hi: __m128d, lo: __m128d) -> __m256d {
	joined := _mm256_set_m128(transmute(__m128)hi, transmute(__m128)lo)
	return transmute(__m256d)joined
}

// Combines two `__m128i` vectors into one `__m256i` vector, with `lo` and `hi`
// passed through to `_mm256_set_m128`. Both inputs are bit-reinterpreted as
// `__m128` for that call, and the result is reinterpreted back as `__m256i`.
@(require_results, enable_target_feature="avx")
_mm256_set_m128i :: #force_inline proc "c" (hi: __m128i, lo: __m128i) -> __m256i {
	joined := _mm256_set_m128(transmute(__m128)hi, transmute(__m128)lo)
	return transmute(__m256i)joined
}

// Combines two `__m128` vectors into one `__m256` vector. Same result as
// `_mm256_set_m128` but with the parameters in the opposite order: elements
// 0..3 of the result are taken from `lo` and elements 4..7 from `hi`.
@(require_results, enable_target_feature="avx")
_mm256_setr_m128 :: #force_inline proc "c" (lo: __m128, hi: __m128) -> __m256 {
	// Shuffle indices 0..3 address `lo`, 4..7 address `hi`.
	return intrinsics.simd_shuffle(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7)
}

// Combines two `__m128d` vectors into one `__m256d` vector. Same result as
// `_mm256_set_m128d` but with the parameters in the opposite order
// (`lo` first, then `hi`).
@(require_results, enable_target_feature="avx")
_mm256_setr_m128d :: #force_inline proc "c" (lo: __m128d, hi: __m128d) -> __m256d {
	joined := _mm256_set_m128(transmute(__m128)hi, transmute(__m128)lo)
	return transmute(__m256d)joined
}

// Combines two `__m128i` vectors into one `__m256i` vector. Same result as
// `_mm256_set_m128i` but with the parameters in the opposite order
// (`lo` first, then `hi`).
@(require_results, enable_target_feature="avx")
_mm256_setr_m128i :: #force_inline proc "c" (lo: __m128i, hi: __m128i) -> __m256i {
	joined := _mm256_set_m128(transmute(__m128)hi, transmute(__m128)lo)
	return transmute(__m256i)joined
}

// Loads two 128-bit values (each composed of 4 packed single-precision
// (32-bit) floating-point elements) from memory and combines them into a
// single 256-bit value: the data at `loaddr` forms the low half and the data
// at `hiaddr` is inserted as the high half.
// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
@(require_results, enable_target_feature="sse,avx")
_mm256_loadu2_m128 :: #force_inline proc "c" (hiaddr, loaddr: ^f32) -> __m256 {
	low_half := _mm_loadu_ps(loaddr)
	high_half := _mm_loadu_ps(hiaddr)
	widened := _mm256_castps128_ps256(low_half)
	return _mm256_insertf128_ps(widened, high_half, 1)
}

// Loads two 128-bit values (each composed of 2 packed double-precision
// (64-bit) floating-point elements) from memory and combines them into a
// single 256-bit value: the data at `loaddr` forms the low half and the data
// at `hiaddr` is inserted as the high half.
// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
@(require_results, enable_target_feature="sse2,avx")
_mm256_loadu2_m128d :: #force_inline proc "c" (hiaddr, loaddr: ^f64) -> __m256d {
	low_half := _mm_loadu_pd(loaddr)
	high_half := _mm_loadu_pd(hiaddr)
	widened := _mm256_castpd128_pd256(low_half)
	return _mm256_insertf128_pd(widened, high_half, 1)
}

// Loads two 128-bit values (composed of integer data) from memory and
// combines them into a single 256-bit value: the data at `loaddr` forms the
// low half and the data at `hiaddr` is inserted as the high half.
// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
@(require_results, enable_target_feature="sse2,avx")
_mm256_loadu2_m128i :: #force_inline proc "c" (hiaddr, loaddr: ^__m128i) -> __m256i {
	low_half := _mm_loadu_si128(loaddr)
	high_half := _mm_loadu_si128(hiaddr)
	widened := _mm256_castsi128_si256(low_half)
	return _mm256_insertf128_si256(widened, high_half, 1)
}

// Splits `a` into its low and high 128-bit halves (each composed of 4 packed
// single-precision (32-bit) floating-point elements) and writes them to two
// separate memory locations: the low half to `loaddr` first, then the high
// half to `hiaddr`.
// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
@(enable_target_feature="sse,avx")
_mm256_storeu2_m128 :: #force_inline proc "c" (hiaddr, loaddr: ^f32, a: __m256) {
	lower_half := _mm256_castps256_ps128(a)
	upper_half := _mm256_extractf128_ps(a, 1)
	_mm_storeu_ps(loaddr, lower_half)
	_mm_storeu_ps(hiaddr, upper_half)
}

// Splits `a` into its low and high 128-bit halves (each composed of 2 packed
// double-precision (64-bit) floating-point elements) and writes them to two
// separate memory locations: the low half to `loaddr` first, then the high
// half to `hiaddr`.
// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
@(enable_target_feature="sse2,avx")
_mm256_storeu2_m128d :: #force_inline proc "c" (hiaddr, loaddr: ^f64, a: __m256d) {
	lower_half := _mm256_castpd256_pd128(a)
	upper_half := _mm256_extractf128_pd(a, 1)
	_mm_storeu_pd(loaddr, lower_half)
	_mm_storeu_pd(hiaddr, upper_half)
}

// Splits `a` into its low and high 128-bit halves (each composed of integer
// data) and writes them to two separate memory locations: the low half to
// `loaddr` first, then the high half to `hiaddr`.
// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
@(enable_target_feature="sse2,avx")
_mm256_storeu2_m128i :: #force_inline proc "c" (hiaddr, loaddr: ^__m128i, a: __m256i) {
	lower_half := _mm256_castsi256_si128(a)
	upper_half := _mm256_extractf128_si256(a, 1)
	_mm_storeu_si128(loaddr, lower_half)
	_mm_storeu_si128(hiaddr, upper_half)
}

// Extracts element 0 of the input vector of `[8 x float]` as a scalar `f32`.
@(require_results, enable_target_feature="avx")
_mm256_cvtss_f32 :: #force_inline proc "c" (a: __m256) -> f32 {
	first: f32 = intrinsics.simd_extract(a, 0)
	return first
}


// Returns a copy of `a`, viewed as four 64-bit integers, in which the element
// at position `idx` has been replaced by `i`.
// `idx` is a compile-time constant parameter.
@(require_results, enable_target_feature="avx")
_mm256_insert_epi64 :: #force_inline proc "c" (a: __m256i, i: i64, $idx: u32) -> __m256i {
	lanes := transmute(#simd[4]i64)a
	return intrinsics.simd_replace(lanes, idx, i)
}

// Returns the 64-bit integer at position `idx` of `a`, viewed as four 64-bit
// integers.
// `idx` is a compile-time constant parameter.
@(require_results, enable_target_feature="avx")
_mm256_extract_epi64 :: #force_inline proc "c" (a: __m256i, $idx: u32) -> i64 {
	lanes := transmute(#simd[4]i64)a
	return intrinsics.simd_extract(lanes, idx)
}


// Package-private foreign declarations. Each entry binds a local `llvm_*`
// procedure name to the symbol given in its `link_name`; every symbol here
// lives under the `llvm.x86.` namespace (AVX, plus a few SSE/SSE2 compare
// entries). Parameters marked `#const` must be compile-time constants.
@(private, default_calling_convention="none")
foreign _ {
	// Rounding and dot product.
	@(link_name="llvm.x86.avx.round.pd.256")      llvm_roundpd256    :: proc(a: __m256d, #const b: u32) -> __m256d ---
	@(link_name="llvm.x86.avx.round.ps.256")      llvm_roundps256    :: proc(a: __m256, #const b: u32) -> __m256 ---
	@(link_name="llvm.x86.avx.dp.ps.256")         llvm_vdpps         :: proc(a, b: __m256, #const imm8: u8) -> __m256 ---
	// Comparisons selected by `imm8`.
	// NOTE(review): the two 256-bit variants (`llvm_vcmppd256`, `llvm_vcmpps256`) declare
	// `imm8` without `#const`, unlike the 128-bit/scalar variants — confirm this is intentional.
	@(link_name="llvm.x86.sse2.cmp.pd")           llvm_vcmppd        :: proc(a, b: __m128d, #const imm8: u8) -> __m128d ---
	@(link_name="llvm.x86.avx.cmp.pd.256")        llvm_vcmppd256     :: proc(a, b: __m256d, imm8: u8) -> __m256d ---
	@(link_name="llvm.x86.sse.cmp.ps")            llvm_vcmpps        :: proc(a: __m128, b: __m128, #const imm8: u8) -> __m128 ---
	@(link_name="llvm.x86.avx.cmp.ps.256")        llvm_vcmpps256     :: proc(a, b: __m256, imm8: u8) -> __m256 ---
	@(link_name="llvm.x86.sse2.cmp.sd")           llvm_vcmpsd        :: proc(a, b: __m128d, #const imm8: u8) -> __m128d ---
	@(link_name="llvm.x86.sse.cmp.ss")            llvm_vcmpss        :: proc(a: __m128, b: __m128, #const imm8: u8) -> __m128 ---
	// Floating-point to 32-bit integer conversions (`cvt` and `cvtt` variants).
	@(link_name="llvm.x86.avx.cvt.ps2dq.256")     llvm_vcvtps2dq     :: proc(a: __m256) -> #simd[8]i32 ---
	@(link_name="llvm.x86.avx.cvtt.pd2dq.256")    llvm_vcvttpd2dq    :: proc(a: __m256d) -> #simd[4]i32 ---
	@(link_name="llvm.x86.avx.cvt.pd2dq.256")     llvm_vcvtpd2dq     :: proc(a: __m256d) -> #simd[4]i32 ---
	@(link_name="llvm.x86.avx.cvtt.ps2dq.256")    llvm_vcvttps2dq    :: proc(a: __m256) -> #simd[8]i32 ---
	// Register zeroing; these take no arguments and return nothing.
	@(link_name="llvm.x86.avx.vzeroall")          llvm_vzeroall      :: proc() ---
	@(link_name="llvm.x86.avx.vzeroupper")        llvm_vzeroupper    :: proc() ---
	// Variable permutes: `b` is an integer control vector with one element per element of `a`.
	@(link_name="llvm.x86.avx.vpermilvar.ps.256") llvm_vpermilps256  :: proc(a: __m256, b: #simd[8]i32) -> __m256 ---
	@(link_name="llvm.x86.avx.vpermilvar.ps")     llvm_vpermilps     :: proc(a: __m128, b: #simd[4]i32) -> __m128 ---
	@(link_name="llvm.x86.avx.vpermilvar.pd.256") llvm_vpermilpd256  :: proc(a: __m256d, b: #simd[4]i64) -> __m256d ---
	@(link_name="llvm.x86.avx.vpermilvar.pd")     llvm_vpermilpd     :: proc(a: __m128d, b: #simd[2]i64) -> __m128d ---
	// 256-bit load from a raw pointer, returned as 32 bytes.
	@(link_name="llvm.x86.avx.ldu.dq.256")        llvm_vlddqu        :: proc(mem_addr: rawptr) -> #simd[32]i8 ---
	// Reciprocal and reciprocal square root on packed single-precision values.
	@(link_name="llvm.x86.avx.rcp.ps.256")        llvm_vrcpps        :: proc(a: __m256) -> __m256 ---
	@(link_name="llvm.x86.avx.rsqrt.ps.256")      llvm_vrsqrtps      :: proc(a: __m256) -> __m256 ---
	// Test operations; each takes two vectors and returns an `i32` result.
	@(link_name="llvm.x86.avx.ptestnzc.256")      llvm_ptestnzc256   :: proc(a: #simd[4]i64, b: #simd[4]i64) -> i32 ---
	@(link_name="llvm.x86.avx.vtestz.pd.256")     llvm_vtestzpd256   :: proc(a, b: __m256d) -> i32 ---
	@(link_name="llvm.x86.avx.vtestc.pd.256")     llvm_vtestcpd256   :: proc(a, b: __m256d) -> i32 ---
	@(link_name="llvm.x86.avx.vtestnzc.pd.256")   llvm_vtestnzcpd256 :: proc(a, b: __m256d) -> i32 ---
	@(link_name="llvm.x86.avx.vtestnzc.pd")       llvm_vtestnzcpd    :: proc(a, b: __m128d) -> i32 ---
	@(link_name="llvm.x86.avx.vtestz.ps.256")     llvm_vtestzps256   :: proc(a, b: __m256) -> i32 ---
	@(link_name="llvm.x86.avx.vtestc.ps.256")     llvm_vtestcps256   :: proc(a, b: __m256) -> i32 ---
	@(link_name="llvm.x86.avx.vtestnzc.ps.256")   llvm_vtestnzcps256 :: proc(a, b: __m256) -> i32 ---
	@(link_name="llvm.x86.avx.vtestnzc.ps")       llvm_vtestnzcps    :: proc(a: __m128, b: __m128) -> i32 ---
	// Packed minimum / maximum.
	@(link_name="llvm.x86.avx.min.ps.256")        llvm_vminps        :: proc(a, b: __m256) -> __m256 ---
	@(link_name="llvm.x86.avx.max.ps.256")        llvm_vmaxps        :: proc(a, b: __m256) -> __m256 ---
	@(link_name="llvm.x86.avx.min.pd.256")        llvm_vminpd        :: proc(a, b: __m256d) -> __m256d ---
	@(link_name="llvm.x86.avx.max.pd.256")        llvm_vmaxpd        :: proc(a, b: __m256d) -> __m256d ---
}