diff --git a/core/simd/x86/avx.odin b/core/simd/x86/avx.odin
new file mode 100644
index 000000000..c274f4903
--- /dev/null
+++ b/core/simd/x86/avx.odin
@@ -0,0 +1,1852 @@
+#+build i386, amd64
+package simd_x86
+
+import "base:intrinsics"
+
+// Adds packed double-precision (64-bit) floating-point elements in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_add_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	return intrinsics.simd_add(a, b)
+}
+
+// Adds packed single-precision (32-bit) floating-point elements in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_add_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	return intrinsics.simd_add(a, b)
+}
+
+// Computes the bitwise AND of a packed double-precision (64-bit) floating-point elements in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_and_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	a := transmute(#simd[4]u64)a
+	b := transmute(#simd[4]u64)b
+	return transmute(__m256d)intrinsics.simd_bit_and(a, b)
+}
+
+// Computes the bitwise AND of packed single-precision (32-bit) floating-point elements in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_and_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	a := transmute(#simd[8]u32)a
+	b := transmute(#simd[8]u32)b
+	return transmute(__m256)intrinsics.simd_bit_and(a, b)
+}
+
+// Computes the bitwise OR packed double-precision (64-bit) floating-point elements in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_or_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	a := transmute(#simd[4]u64)a
+	b := transmute(#simd[4]u64)b
+	return transmute(__m256d)intrinsics.simd_bit_or(a, b)
+}
+
+// Computes the bitwise OR packed single-precision (32-bit) floating-point elements in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_or_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	a := transmute(#simd[8]u32)a
+	b := transmute(#simd[8]u32)b
+	return transmute(__m256)intrinsics.simd_bit_or(a, b)
+}
+
+// Shuffles double-precision (64-bit) floating-point elements within 128-bit lanes using the control in `imm8`.
+@(require_results, enable_target_feature="avx")
+_mm256_shuffle_pd :: #force_inline proc "c" (a, b: __m256d, $MASK: u8) -> __m256d {
+	return intrinsics.simd_shuffle(
+		a,
+		b,
+		MASK & 1,
+		((MASK >> 1) & 1) + 4,
+		((MASK >> 2) & 1) + 2,
+		((MASK >> 3) & 1) + 6,
+	)
+}
+
+
+// Shuffles single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using the control in `imm8`.
+@(require_results, enable_target_feature="avx")
+_mm256_shuffle_ps :: #force_inline proc "c" (a, b: __m256, $MASK: u8) -> __m256 {
+	return intrinsics.simd_shuffle(
+		a,
+		b,
+		MASK & 0b11,
+		(MASK >> 2) & 0b11,
+		((MASK >> 4) & 0b11) + 8,
+		((MASK >> 6) & 0b11) + 8,
+		(MASK & 0b11) + 4,
+		((MASK >> 2) & 0b11) + 4,
+		((MASK >> 4) & 0b11) + 12,
+		((MASK >> 6) & 0b11) + 12,
+	)
+}
+
+
+
+// Computes the bitwise NOT of packed double-precision (64-bit) floating-point elements in `a`, and then AND with `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_andnot_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	a := transmute(#simd[4]u64)a
+	b := transmute(#simd[4]u64)b
+	return transmute(__m256d)intrinsics.simd_bit_and(intrinsics.simd_bit_xor((#simd[4]u64)(0), a), b)
+}
+
+// Computes the bitwise NOT of packed single-precision (32-bit) floating-point elements in `a` and then AND with `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_andnot_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	a := transmute(#simd[8]u32)a
+	b := transmute(#simd[8]u32)b
+	return transmute(__m256)intrinsics.simd_bit_and(intrinsics.simd_bit_xor((#simd[8]u32)(0), a), b)
+}
+
+
+
+// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and returns packed maximum values
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
+@(require_results, enable_target_feature="avx")
+_mm256_max_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	return llvm_vmaxpd(a, b)
+}
+
+// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and returns packed maximum values
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_max_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	return llvm_vmaxps(a, b)
+}
+
+// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and returns packed minimum values
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
+@(require_results, enable_target_feature="avx")
+_mm256_min_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	return llvm_vminpd(a, b)
+}
+
+// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and returns packed minimum values
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_min_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	return llvm_vminps(a, b)
+}
+
+
+
+// Multiplies packed double-precision (64-bit) floating-point elements in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_mul_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	return intrinsics.simd_mul(a, b)
+}
+
+// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_mul_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	return intrinsics.simd_mul(a, b)
+}
+
+// Alternatively adds and subtracts packed double-precision (64-bit) floating-point elements in `a` to/from packed elements in `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_addsub_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	add := intrinsics.simd_add(a, b)
+	sub := intrinsics.simd_sub(a, b)
+	return intrinsics.simd_shuffle(add, sub, 4, 1, 6, 3)
+}
+
+
+// Alternatively adds and subtracts packed single-precision (32-bit) floating-point elements in `a` to/from packed elements in `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_addsub_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	add := intrinsics.simd_add(a, b)
+	sub := intrinsics.simd_sub(a, b)
+	return intrinsics.simd_shuffle(add, sub, 8, 1, 10, 3, 12, 5, 14, 7)
+}
+
+
+// Subtracts packed double-precision (64-bit) floating-point elements in `b`
+// from packed elements in `a`.
+@(require_results, enable_target_feature="avx")
+_mm256_sub_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	return intrinsics.simd_sub(a, b)
+}
+
+// Subtracts packed single-precision (32-bit) floating-point elements in `b`
+// from packed elements in `a`.
+@(require_results, enable_target_feature="avx")
+_mm256_sub_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	return intrinsics.simd_sub(a, b)
+}
+
+// Computes the division of each of the 8 packed 32-bit floating-point elements
+// in `a` by the corresponding packed elements in `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_div_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	return intrinsics.simd_div(a, b)
+}
+
+// Computes the division of each of the 4 packed 64-bit floating-point elements
+// in `a` by the corresponding packed elements in `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_div_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	return intrinsics.simd_div(a, b)
+}
+
+
+
+// Rounds packed double-precision (64-bit) floating point elements in `a`
+// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
+//
+// - `0x00`: Round to the nearest whole number.
+// - `0x01`: Round down, toward negative infinity.
+// - `0x02`: Round up, toward positive infinity.
+// - `0x03`: Truncate the values.
+//
+// For a complete list of options, check [the LLVM docs][llvm_docs].
+//
+// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
+@(require_results, enable_target_feature="avx")
+_mm256_round_pd :: #force_inline proc "c" (a: __m256d, $ROUNDING: u8) -> __m256d where ROUNDING < 16 {
+	return llvm_roundpd256(a, ROUNDING)
+}
+
+// Rounds packed double-precision (64-bit) floating point elements in `a`
+// toward positive infinity.
+@(require_results, enable_target_feature="avx")
+_mm256_ceil_pd :: #force_inline proc "c" (a: __m256d) -> __m256d {
+	return intrinsics.simd_ceil(a)
+}
+
+// Rounds packed double-precision (64-bit) floating point elements in `a`
+// toward negative infinity.
+@(require_results, enable_target_feature="avx")
+_mm256_floor_pd :: #force_inline proc "c" (a: __m256d) -> __m256d {
+	return intrinsics.simd_floor(a)
+}
+
+
+
+// Rounds packed single-precision (32-bit) floating point elements in `a`
+// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
+//
+// - `0x00`: Round to the nearest whole number.
+// - `0x01`: Round down, toward negative infinity.
+// - `0x02`: Round up, toward positive infinity.
+// - `0x03`: Truncate the values.
+//
+// For a complete list of options, check [the LLVM docs][llvm_docs].
+//
+// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_round_ps :: #force_inline proc(a: __m256, $ROUNDING: u8) -> __m256 where ROUNDING < 16 {
+	return llvm_roundps256(a, u32(ROUNDING))
+}
+
+// Rounds packed single-precision (32-bit) floating point elements in `a`
+// toward positive infinity.
+@(require_results, enable_target_feature="avx")
+_mm256_ceil_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
+	return intrinsics.simd_ceil(a)
+}
+
+// Rounds packed single-precision (32-bit) floating point elements in `a`
+// toward negative infinity.
+@(require_results, enable_target_feature="avx")
+_mm256_floor_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
+	return intrinsics.simd_floor(a)
+}
+
+// Returns the square root of packed single-precision (32-bit) floating point elements in `a`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_sqrt_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
+	return intrinsics.sqrt(a)
+}
+
+// Returns the square root of packed double-precision (64-bit) floating point elements in `a`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
+@(require_results, enable_target_feature="avx")
+_mm256_sqrt_pd :: #force_inline proc "c" (a: __m256d) -> __m256d {
+	return intrinsics.sqrt(a)
+}
+
+
+
+// Blends packed double-precision (64-bit) floating-point elements from
+// `a` and `b` using control mask `imm8`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
+@(require_results, enable_target_feature="avx")
+_mm256_blend_pd :: #force_inline proc "c" (a, b: __m256d, $IIM4: u32) -> __m256d where IMM4 < 16 {
+	return intrinsics.simd_shuffle(
+		a,
+		b,
+		((IMM4 >> 0) & 1) * 4 + 0,
+		((IMM4 >> 1) & 1) * 4 + 1,
+		((IMM4 >> 2) & 1) * 4 + 2,
+		((IMM4 >> 3) & 1) * 4 + 3,
+	)
+}
+
+// Blends packed single-precision (32-bit) floating-point elements from
+// `a` and `b` using control mask `imm8`.
+@(require_results, enable_target_feature="avx")
+_mm256_blend_ps :: #force_inline proc "c" (a, b: __m256, $IMM8: u8) -> __m256 {
+	return intrinsics.simd_shuffle(
+		a,
+		b,
+		((IMM8 >> 0) & 1) * 8 + 0,
+		((IMM8 >> 1) & 1) * 8 + 1,
+		((IMM8 >> 2) & 1) * 8 + 2,
+		((IMM8 >> 3) & 1) * 8 + 3,
+		((IMM8 >> 4) & 1) * 8 + 4,
+		((IMM8 >> 5) & 1) * 8 + 5,
+		((IMM8 >> 6) & 1) * 8 + 6,
+		((IMM8 >> 7) & 1) * 8 + 7,
+	)
+}
+
+
+
+// Blends packed double-precision (64-bit) floating-point elements from
+// `a` and `b` using `c` as a mask.
+@(require_results, enable_target_feature="avx")
+_mm256_blendv_pd :: #force_inline proc "c" (a, b: __m256d, c: __m256d) -> __m256d {
+	mask := intrinsics.simd_lanes_lt(transmute(#simd[4]i64)c, 0)
+	return intrinsics.simd_select(mask, b, a)
+}
+
+// Blends packed single-precision (32-bit) floating-point elements from
+// `a` and `b` using `c` as a mask.
+@(require_results, enable_target_feature="avx")
+_mm256_blendv_ps :: #force_inline proc "c" (a, b: __m256, c: __m256) -> __m256 {
+	mask := intrinsics.simd_lanes_lt(transmute(#simd[8]i32)c, 0)
+	return intrinsics.simd_select(mask, b, a)
+}
+
+
+
+// Conditionally multiplies the packed single-precision (32-bit) floating-point elements in `a` and `b` using the high 4 bits in `imm8`,
+// sum the four products, and conditionally return the sum
+//  using the low 4 bits of `imm8`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_dp_ps :: #force_inline proc "c" (a, b: __m256, $IMM8: i8) -> __m256 {
+	return llvm_vdpps(a, b, IMM8)
+}
+
+// Horizontal addition of adjacent pairs in the two packed vectors
+// of 4 64-bit floating points `a` and `b`.
+// In the result, sums of elements from `a` are returned in even locations,
+// while sums of elements from `b` are returned in odd locations.
+@(require_results, enable_target_feature="avx")
+_mm256_hadd_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	even := intrinsics.simd_shuffle(a, b, 0, 4, 2, 6)
+	odd  := intrinsics.simd_shuffle(a, b, 1, 5, 3, 7)
+	return intrinsics.simd_add(even, odd)
+}
+
+
+
+// Horizontal addition of adjacent pairs in the two packed vectors
+// of 8 32-bit floating points `a` and `b`.
+// In the result, sums of elements from `a` are returned in locations of
+// indices 0, 1, 4, 5; while sums of elements from `b` are locations
+// 2, 3, 6, 7.
+@(require_results, enable_target_feature="avx")
+_mm256_hadd_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	even := intrinsics.simd_shuffle(a, b, 0, 2, 8, 10, 4, 6, 12, 14)
+	odd  := intrinsics.simd_shuffle(a, b, 1, 3, 9, 11, 5, 7, 13, 15)
+	return intrinsics.simd_add(even, odd)
+}
+
+// Horizontal subtraction of adjacent pairs in the two packed vectors
+// of 4 64-bit floating points `a` and `b`.
+// In the result, sums of elements from `a` are returned in even locations,
+// while sums of elements from `b` are returned in odd locations.
+@(require_results, enable_target_feature="avx")
+_mm256_hsub_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	even := intrinsics.simd_shuffle(a, b, 0, 4, 2, 6)
+	odd  := intrinsics.simd_shuffle(a, b, 1, 5, 3, 7)
+	return intrinsics.simd_sub(even, odd)
+}
+
+// Horizontal subtraction of adjacent pairs in the two packed vectors
+// of 8 32-bit floating points `a` and `b`.
+// In the result, sums of elements from `a` are returned in locations of
+// indices 0, 1, 4, 5; while sums of elements from `b` are locations
+// 2, 3, 6, 7.
+@(require_results, enable_target_feature="avx")
+_mm256_hsub_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	even := intrinsics.simd_shuffle(a, b, 0, 2, 8, 10, 4, 6, 12, 14)
+	odd  := intrinsics.simd_shuffle(a, b, 1, 3, 9, 11, 5, 7, 13, 15)
+	return intrinsics.simd_sub(even, odd)
+}
+
+// Computes the bitwise XOR of packed double-precision (64-bit) floating-point elements in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_xor_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	a := transmute(#simd[4]u64)a
+	b := transmute(#simd[4]u64)b
+	return transmute(__m256d)intrinsics.simd_bit_xor(a, b)
+}
+
+// Computes the bitwise XOR of packed single-precision (32-bit) floating-point elements in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_xor_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	a := transmute(#simd[8]u32)a
+	b := transmute(#simd[8]u32)b
+	return transmute(__m256)intrinsics.simd_bit_xor(a, b)
+}
+
+
+
+_CMP_EQ_OQ    :: 0x00 // Equal (ordered, non-signaling)
+_CMP_LT_OS    :: 0x01 // Less-than (ordered, signaling)
+_CMP_LE_OS    :: 0x02 // Less-than-or-equal (ordered, signaling)
+_CMP_UNORD_Q  :: 0x03 // Unordered (non-signaling)
+_CMP_NEQ_UQ   :: 0x04 // Not-equal (unordered, non-signaling)
+_CMP_NLT_US   :: 0x05 // Not-less-than (unordered, signaling)
+_CMP_NLE_US   :: 0x06 // Not-less-than-or-equal (unordered, signaling)
+_CMP_ORD_Q    :: 0x07 // Ordered (non-signaling)
+_CMP_EQ_UQ    :: 0x08 // Equal (unordered, non-signaling)
+_CMP_NGE_US   :: 0x09 // Not-greater-than-or-equal (unordered, signaling)
+_CMP_NGT_US   :: 0x0a // Not-greater-than (unordered, signaling)
+_CMP_FALSE_OQ :: 0x0b // False (ordered, non-signaling)
+_CMP_NEQ_OQ   :: 0x0c // Not-equal (ordered, non-signaling)
+_CMP_GE_OS    :: 0x0d // Greater-than-or-equal (ordered, signaling)
+_CMP_GT_OS    :: 0x0e // Greater-than (ordered, signaling)
+_CMP_TRUE_UQ  :: 0x0f // True (unordered, non-signaling)
+_CMP_EQ_OS    :: 0x10 // Equal (ordered, signaling)
+_CMP_LT_OQ    :: 0x11 // Less-than (ordered, non-signaling)
+_CMP_LE_OQ    :: 0x12 // Less-than-or-equal (ordered, non-signaling)
+_CMP_UNORD_S  :: 0x13 // Unordered (signaling)
+_CMP_NEQ_US   :: 0x14 // Not-equal (unordered, signaling)
+_CMP_NLT_UQ   :: 0x15 // Not-less-than (unordered, non-signaling)
+_CMP_NLE_UQ   :: 0x16 // Not-less-than-or-equal (unordered, non-signaling)
+_CMP_ORD_S    :: 0x17 // Ordered (signaling)
+_CMP_EQ_US    :: 0x18 // Equal (unordered, signaling)
+_CMP_NGE_UQ   :: 0x19 // Not-greater-than-or-equal (unordered, non-signaling)
+_CMP_NGT_UQ   :: 0x1a // Not-greater-than (unordered, non-signaling)
+_CMP_FALSE_OS :: 0x1b // False (ordered, signaling)
+_CMP_NEQ_OS   :: 0x1c // Not-equal (ordered, signaling)
+_CMP_GE_OQ    :: 0x1d // Greater-than-or-equal (ordered, non-signaling)
+_CMP_GT_OQ    :: 0x1e // Greater-than (ordered, non-signaling)
+_CMP_TRUE_US  :: 0x1f // True (unordered, signaling)
+
+
+
+// Compares packed double-precision (64-bit) floating-point elements in `a` and `b` based on the comparison operand specified by `IMM5`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
+@(require_results, enable_target_feature="avx")
+_mm_cmp_pd :: #force_inline proc "c" (a, b: __m128d, $IMM5: u8) -> __m128d where IMM5 < 32 {
+	return llvm_vcmppd(a, b, IMM5)
+}
+
+// Compares packed double-precision (64-bit) floating-point elements in `a` and `b` based on the comparison operand specified by `IMM5`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
+@(require_results, enable_target_feature="avx")
+_mm256_cmp_pd :: #force_inline proc "c" (a, b: __m256d, $IMM5: u8) -> __m256d where IMM5 < 32 {
+	return llvm_vcmppd256(a, b, IMM5)
+}
+
+// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` based on the comparison operand specified by `IMM5`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
+@(require_results, enable_target_feature="avx")
+_mm_cmp_ps :: #force_inline proc "c" (a: __m128, b: __m128, $IMM5: u8) -> __m128 where IMM5 < 32 {
+	return llvm_vcmpps(a, b, IMM5)
+}
+
+// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` based on the comparison operand specified by `IMM5`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_cmp_ps :: #force_inline proc "c" (a, b: __m256, $IMM5: u8) -> __m256 where IMM5 < 32 {
+	return llvm_vcmpps256(a, b, IMM5)
+}
+
+// Compares the lower double-precision (64-bit) floating-point element in
+// `a` and `b` based on the comparison operand specified by `IMM5`,
+// store the result in the lower element of returned vector,
+// and copies the upper element from `a` to the upper element of returned
+// vector.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
+@(require_results, enable_target_feature="avx")
+_mm_cmp_sd :: #force_inline proc "c" (a, b: __m128d, $IMM5: u8) -> __m128d where IMM5 < 32 {
+	return llvm_vcmpsd(a, b, IMM5)
+}
+
+// Compares the lower single-precision (32-bit) floating-point element in
+// `a` and `b` based on the comparison operand specified by `IMM5`,
+// store the result in the lower element of returned vector,
+// and copies the upper 3 packed elements from `a` to the upper elements of
+// returned vector.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
+@(require_results, enable_target_feature="avx")
+_mm_cmp_ss :: #force_inline proc "c" (a: __m128, b: __m128, $IMM5: u8) -> __m128 where IMM5 < 32 {
+	return llvm_vcmpss(a, b, IMM5)
+}
+
+// Converts packed 32-bit integers in `a` to packed double-precision (64-bit) floating-point elements.
+@(require_results, enable_target_feature="avx")
+_mm256_cvtepi32_pd :: #force_inline proc "c" (a: __m128i) -> __m256d {
+	return __m256d(transmute(#simd[4]i32)a)
+}
+
+// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) floating-point elements.
+@(require_results, enable_target_feature="avx")
+_mm256_cvtepi32_ps :: #force_inline proc "c" (a: __m256i) -> __m256 {
+	return __m256(transmute(#simd[8]i32)a)
+}
+
+// Converts packed double-precision (64-bit) floating-point elements in `a` to packed single-precision (32-bit) floating-point elements.
+@(require_results, enable_target_feature="avx")
+_mm256_cvtpd_ps :: #force_inline proc "c" (a: __m256d) -> __m128 {
+	return __m128(a)
+}
+
+// Converts packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
+@(require_results, enable_target_feature="avx")
+_mm256_cvtps_epi32 :: #force_inline proc "c" (a: __m256) -> __m256i {
+	return transmute(__m256i)llvm_vcvtps2dq(a)
+}
+
+// Converts packed single-precision (32-bit) floating-point elements in `a` to packed double-precision (64-bit) floating-point elements.
+@(require_results, enable_target_feature="avx")
+_mm256_cvtps_pd :: #force_inline proc "c" (a: __m128) -> __m256d {
+	return __m256d(a)
+}
+
+// Returns the first element of the input vector of `[4 x double]`.
+@(require_results, enable_target_feature="avx")
+_mm256_cvtsd_f64 :: #force_inline proc "c" (a: __m256d) -> f64 {
+	return intrinsics.simd_extract(a, 0)
+}
+
+// Converts packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
+@(require_results, enable_target_feature="avx")
+_mm256_cvttpd_epi32 :: #force_inline proc "c" (a: __m256d) -> __m128i {
+	return transmute(__m128i)llvm_vcvttpd2dq(a)
+}
+
+// Converts packed double-precision (64-bit) floating-point elements in `a` to packed 32-bit integers.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
+@(require_results, enable_target_feature="avx")
+_mm256_cvtpd_epi32 :: #force_inline proc "c" (a: __m256d) -> __m128i {
+	return transmute(__m128i)llvm_vcvtpd2dq(a)
+}
+
+// Converts packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers with truncation.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
+@(require_results, enable_target_feature="avx")
+_mm256_cvttps_epi32 :: #force_inline proc "c" (a: __m256) -> __m256i {
+	return transmute(__m256i)llvm_vcvttps2dq(a)
+}
+
+
+
+// Extracts 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from `a`, selected with `imm8`.
+@(require_results, enable_target_feature="avx")
+_mm256_extractf128_ps :: #force_inline proc "c" (a: __m256, $IMM1: u8) -> __m128 where IMM1 < 2 {
+	when IMM1 == 0 {
+		return intrinsics.simd_shuffle(a, _mm256_undefined_ps(), 0, 1, 2, 3)
+	} else {
+		return intrinsics.simd_shuffle(a, _mm256_undefined_ps(), 4, 5, 6, 7)
+	}
+}
+
+// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a`, selected with `imm8`.
+@(require_results, enable_target_feature="avx")
+_mm256_extractf128_pd :: #force_inline proc "c" (a: __m256d, $IMM1: u8) -> __m128d where IMM1 < 2 {
+	when IMM1 == 0 {
+		return intrinsics.simd_shuffle(a, _mm256_undefined_pd(), 0, 1)
+	} else {
+		return intrinsics.simd_shuffle(a, _mm256_undefined_pd(), 2, 3)
+	}
+}
+
+// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
+@(require_results, enable_target_feature="avx")
+_mm256_extractf128_si256 :: #force_inline proc "c" (a: __m256i, $IMM1: u8) -> __m128i where IMM1 < 2 {
+	when IMM1 == 0 {
+		dst := intrinsics.simd_shuffle(transmute(#simd[4]i64)a, (#simd[4]i64)(0), 0, 1)
+		return transmute(__m128i)dst
+	} else {
+		dst := intrinsics.simd_shuffle(transmute(#simd[4]i64)a, (#simd[4]i64)(0), 2, 3)
+		return transmute(__m128i)dst
+	}
+}
+
+// Extracts a 32-bit integer from `a`, selected with `INDEX`.
+@(require_results, enable_target_feature="avx")
+_mm256_extract_epi32 :: #force_inline proc "c" (a: __m256i, $INDEX: u8) -> i32 where INDEX < 8 {
+	return intrinsics.simd_extract(transmute(#simd[8]i32)a, INDEX)
+}
+
+@(require_results, enable_target_feature="avx")
+_mm256_cvtsi256_si32 :: #force_inline proc "c" (a: __m256i) -> i32 {
+	return intrinsics.simd_extract(transmute(#simd[8]i32)a, 0)
+}
+
+// Zeroes the contents of all XMM or YMM registers.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
+@(enable_target_feature="avx")
+_mm256_zeroall :: #force_inline proc "c" () {
+	llvm_vzeroall()
+}
+
+// Zeroes the upper 128 bits of all YMM registers; the lower 128-bits of the registers are unmodified.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
+@(enable_target_feature="avx")
+_mm256_zeroupper :: #force_inline proc "c" () {
+	llvm_vzeroupper()
+}
+
+// Shuffles single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using the control in `b`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_permutevar_ps :: #force_inline proc "c" (a: __m256, b: __m256i) -> __m256 {
+	return llvm_vpermilps256(a, transmute(#simd[8]i32)b)
+}
+
+// Shuffles single-precision (32-bit) floating-point elements in `a` using the control in `b`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
+@(require_results, enable_target_feature="avx")
+_mm_permutevar_ps :: #force_inline proc "c" (a: __m128, b: __m128i) -> __m128 {
+	return llvm_vpermilps(a, transmute(#simd[4]i32)b)
+}
+
+// Shuffles single-precision (32-bit) floating-point elements in `a` within 128-bit lanes using the control in `imm8`.
+@(require_results, enable_target_feature="avx")
+_mm256_permute_ps :: #force_inline proc "c" (a: __m256, $IMM8: u8) -> __m256 {
+	return intrinsics.simd_shuffle(
+		a,
+		_mm256_undefined_ps(),
+		(IMM8 >> 0) & 0b11,
+		(IMM8 >> 2) & 0b11,
+		(IMM8 >> 4) & 0b11,
+		(IMM8 >> 6) & 0b11,
+		((IMM8 >> 0) & 0b11) + 4,
+		((IMM8 >> 2) & 0b11) + 4,
+		((IMM8 >> 4) & 0b11) + 4,
+		((IMM8 >> 6) & 0b11) + 4,
+	)
+}
+
+// Shuffles single-precision (32-bit) floating-point elements in `a` using the control in `imm8`.
+@(require_results, enable_target_feature="avx")
+_mm_permute_ps :: #force_inline proc "c" (a: __m128, $IMM8: u8) -> __m128 {
+	return intrinsics.simd_shuffle(
+		a,
+		_mm_undefined_ps(),
+		(IMM8 >> 0) & 0b11,
+		(IMM8 >> 2) & 0b11,
+		(IMM8 >> 4) & 0b11,
+		(IMM8 >> 6) & 0b11,
+	)
+}
+
+// Shuffles double-precision (64-bit) floating-point elements in `a` within 256-bit lanes using the control in `b`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
+@(require_results, enable_target_feature="avx")
+_mm256_permutevar_pd :: #force_inline proc "c" (a: __m256d, b: __m256i) -> __m256d {
+	return llvm_vpermilpd256(a, b)
+}
+
+// Shuffles double-precision (64-bit) floating-point elements in `a` using the control in `b`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
+@(require_results, enable_target_feature="avx")
+_mm_permutevar_pd :: #force_inline proc "c" (a: __m128d, b: __m128i) -> __m128d {
+	return llvm_vpermilpd(a, b)
+}
+
+// Shuffles double-precision (64-bit) floating-point elements in `a` within 128-bit lanes using the control in `imm8`.
+@(require_results, enable_target_feature="avx")
+_mm256_permute_pd :: #force_inline proc "c" (a: __m256d, $IMM4: u8) -> __m256d where IMM4 < 16 {
+	return intrinsics.simd_shuffle(
+		a,
+		_mm256_undefined_pd(),
+		((IMM4 >> 0) & 1),
+		((IMM4 >> 1) & 1),
+		((IMM4 >> 2) & 1) + 2,
+		((IMM4 >> 3) & 1) + 2,
+	)
+}
+
+// Shuffles double-precision (64-bit) floating-point elements in `a` using the control in `imm8`.
+@(require_results, enable_target_feature="avx")
+_mm_permute_pd :: #force_inline proc "c" (a: __m128d, $IMM2: u8) -> __m128d where IMM2 < 4 {
+	return intrinsics.simd_shuffle(
+		a,
+		_mm_undefined_pd(),
+		(IMM2) & 1,
+		(IMM2 >> 1) & 1,
+	)
+}
+
+
+
+// Shuffles 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) selected by `imm8` from `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_permute2f128_ps :: #force_inline proc "c" (a, b: __m256, $IMM8: u8) -> __m256 {
+	return _mm256_castsi256_ps(_mm256_permute2f128_si256(
+		_mm256_castps_si256(a),
+		_mm256_castps_si256(b),
+		IMM8,
+	))
+}
+
+// Shuffles 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) selected by `imm8` from `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_permute2f128_pd :: #force_inline proc "c" (a, b: __m256d, $IMM8: u8) -> __m256d {
+	_mm256_castsi256_pd(_mm256_permute2f128_si256(
+		_mm256_castpd_si256(a),
+		_mm256_castpd_si256(b),
+		IMM8,
+	))
+}
+
+// Shuffles 128-bits (composed of integer data) selected by `imm8` from `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_permute2f128_si256 :: #force_inline proc "c" (a, b: __m256i, $IMM8: u8) -> __m256i {
+	r := intrinsics.simd_shuffle(
+		a,
+		b,
+		2 * ((IMM8 & 0xf) & 0b11) + 0,
+		2 * ((IMM8 & 0xf) & 0b11) + 1,
+
+		2 * (((IMM8 & 0xf0) >> 4) & 0b11) + 0,
+		2 * (((IMM8 & 0xf0) >> 4) & 0b11) + 1,
+	)
+	return intrinsics.simd_shuffle(
+		r,
+		__m256i(0),
+
+		4 if ((IMM8 & 0xf) & 0b1000) != 0 else 0,
+		4 if ((IMM8 & 0xf) & 0b1000) != 0 else 1,
+
+		4 if (((IMM8 & 0xf0)>>4) & 0b1000) != 0 else 2,
+		4 if (((IMM8 & 0xf0)>>4) & 0b1000) != 0 else 3,
+	)
+}
+
+// Broadcasts a single-precision (32-bit) floating-point element from memory to all elements of the returned vector.
+@(require_results, enable_target_feature="avx")
+_mm256_broadcast_ss :: #force_inline proc "c" (f: ^f32) -> __m256 {
+	return _mm256_set1_ps(f^)
+}
+
+// Broadcasts a single-precision (32-bit) floating-point element from memory to all elements of the returned vector.
+@(require_results, enable_target_feature="sse,avx")
+_mm_broadcast_ss :: #force_inline proc "c" (f: ^f32) -> __m128 {
+	return _mm_set1_ps(f^)
+}
+
+// Broadcasts a double-precision (64-bit) floating-point element from memory to all elements of the returned vector.
+@(require_results, enable_target_feature="avx")
+_mm256_broadcast_sd :: #force_inline proc "c" (f: ^f64) -> __m256d {
+	return _mm256_set1_pd(f^)
+}
+
+// Broadcasts 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of the returned vector.
+@(require_results, enable_target_feature="sse,avx")
+_mm256_broadcast_ps :: #force_inline proc "c" (a: ^__m128) -> __m256 {
+	return intrinsics.simd_shuffle(a^, _mm_setzero_ps(), 0, 1, 2, 3, 0, 1, 2, 3)
+}
+
+// Broadcasts 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of the returned vector.
+@(require_results, enable_target_feature="sse2,avx")
+_mm256_broadcast_pd :: #force_inline proc "c" (a: ^__m128d) -> __m256d {
+	return intrinsics.simd_shuffle(a^, _mm_setzero_pd(), 0, 1, 0, 1)
+}
+
+// Copies `a` to result, then inserts 128 bits (composed of 4 packed
+// single-precision (32-bit) floating-point elements) from `b` into result
+// at the location specified by `imm8`.
+@(require_results, enable_target_feature="sse,avx")
+_mm256_insertf128_ps :: #force_inline proc "c" (a: __m256, b: __m128, $IMM1: u8) -> __m256 where IMM1 < 2 {
+	when IMM1 == 0  {
+		return intrinsics.simd_shuffle(
+			a,
+			_mm256_castps128_ps256(b),
+			8, 9, 10, 11, 4, 5, 6, 7,
+		)
+	} else {
+		return intrinsics.simd_shuffle(
+			a,
+			_mm256_castps128_ps256(b),
+			0, 1, 2, 3, 8, 9, 10, 11,
+		)
+	}
+}
+
+// Copies `a` to result, then inserts 128 bits (composed of 2 packed
+// double-precision (64-bit) floating-point elements) from `b` into result
+// at the location specified by `imm8`.
+@(require_results, enable_target_feature="sse2,avx")
+_mm256_insertf128_pd :: #force_inline proc "c" (a: __m256d, b: __m128d, $IMM1: u8) -> __m256d where IMM1 < 2 {
+	when IMM1 == 0 {
+		return intrinsics.simd_shuffle(
+			a,
+			_mm256_castpd128_pd256(b),
+			4, 5, 2, 3,
+		)
+	} else {
+		return intrinsics.simd_shuffle(
+			a,
+			_mm256_castpd128_pd256(b),
+			0, 1, 4, 5,
+		)
+	}
+}
+
+// Copies `a` to result, then inserts 128 bits from `b` into result at the location specified by `imm8`.
+@(require_results, enable_target_feature="avx")
+_mm256_insertf128_si256 :: #force_inline proc "c" (a: __m256i, b: __m128i, $IMM1: u8) -> __m256i where IMM1 < 2 {
+	when IMM1 == 0 {
+		return intrinsics.simd_shuffle(
+			a,
+			_mm256_castsi128_si256(b),
+			4, 5, 2, 3,
+		)
+	} else {
+		return intrinsics.simd_shuffle(
+			a,
+			_mm256_castsi128_si256(b),
+			0, 1, 4, 5,
+		)
+	}
+}
+
+// Copies `a` to result, and inserts the 8-bit integer `i` into result at the location specified by `index`.
+@(require_results, enable_target_feature="avx")
+_mm256_insert_epi8 :: #force_inline proc "c" (a: __m256i, i: i8, $INDEX: u8) -> __m256i where INDEX < 32 {
+	return transmute(__m256i)intrinsics.simd_replace(transmute(#simd[32]i8)a, INDEX, i)
+}
+
+// Copies `a` to result, and inserts the 16-bit integer `i` into result at the location specified by `index`.
+@(require_results, enable_target_feature="avx")
+_mm256_insert_epi16 :: #force_inline proc "c" (a: __m256i, i: i16, $INDEX: u8) -> __m256i where INDEX < 16 {
+	return transmute(__m256i)intrinsics.simd_replace(transmute(#simd[16]i16)a, INDEX, i)
+}
+
+// Copies `a` to result, and inserts the 32-bit integer `i` into result at the location specified by `index`.
+@(require_results, enable_target_feature="avx")
+_mm256_insert_epi32 :: #force_inline proc "c" (a: __m256i, i: i32, $INDEX: u8) -> __m256i where INDEX < 8 {
+	return transmute(__m256i)intrinsics.simd_replace(transmute(#simd[8]i32)a, INDEX, i)
+}
+
+
+
+// Loads 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into result.
+// `mem_addr` must be aligned on a 32-byte boundary or a
+// general-protection exception may be generated.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
+@(require_results, enable_target_feature="avx")
+_mm256_load_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m256d {
+	return (^__m256d)(mem_addr)^
+}
+
+// Stores 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from `a` into memory.
+// `mem_addr` must be aligned on a 32-byte boundary or a
+// general-protection exception may be generated.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
+@(enable_target_feature="avx")
+_mm256_store_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m256d) {
+	(^__m256d)(mem_addr)^ = a
+}
+
+// Loads 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into result.
+// `mem_addr` must be aligned on a 32-byte boundary or a
+// general-protection exception may be generated.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_load_ps :: #force_inline proc "c" (mem_addr: ^f32) -> __m256 {
+	return (^__m256)(mem_addr)^
+}
+
+// Stores 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from `a` into memory.
+// `mem_addr` must be aligned on a 32-byte boundary or a
+// general-protection exception may be generated.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
+@(enable_target_feature="avx")
+_mm256_store_ps :: #force_inline proc "c" (mem_addr: ^f32, a: __m256) {
+	(^__m256)(mem_addr)^ = a
+}
+
+// Loads 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into result.
+// `mem_addr` does not need to be aligned on any particular boundary.
+@(enable_target_feature="avx")
+_mm256_loadu_pd :: #force_inline proc "c" (mem_addr: ^f64) -> __m256d {
+	return intrinsics.unaligned_load((^__m256d)(mem_addr))
+}
+
+// Stores 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from `a` into memory.
+// `mem_addr` does not need to be aligned on any particular boundary.
+@(enable_target_feature="avx")
+_mm256_storeu_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m256d) {
+	intrinsics.unaligned_store((^__m256d)(mem_addr), a)
+}
+
+// Loads 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into result.
+// `mem_addr` does not need to be aligned on any particular boundary.
+@(require_results, enable_target_feature="avx")
+_mm256_loadu_ps :: #force_inline proc "c" (mem_addr: ^f32) -> __m256 {
+	return intrinsics.unaligned_load((^__m256)(mem_addr))
+}
+
+// Stores 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from `a` into memory.
+// `mem_addr` does not need to be aligned on any particular boundary.
+@(enable_target_feature="avx")
+_mm256_storeu_ps :: #force_inline proc "c" (mem_addr: ^f32, a: __m256) {
+	intrinsics.unaligned_store((^__m256)(mem_addr), a)
+}
+
+// Loads 256-bits of integer data from memory into result.
+// `mem_addr` must be aligned on a 32-byte boundary or a
+// general-protection exception may be generated.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
+@(require_results, enable_target_feature="avx")
+_mm256_load_si256 :: #force_inline proc "c" (mem_addr: ^__m256i) -> __m256i {
+	return mem_addr^
+}
+
+// Stores 256-bits of integer data from `a` into memory.
+// `mem_addr` must be aligned on a 32-byte boundary or a
+// general-protection exception may be generated.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
+@(enable_target_feature="avx")
+_mm256_store_si256 :: #force_inline proc "c" (mem_addr: ^__m256i, a: __m256i) {
+	mem_addr^ = a
+}
+
+// Loads 256-bits of integer data from memory into result.
+// `mem_addr` does not need to be aligned on any particular boundary.
+@(require_results, enable_target_feature="avx")
+_mm256_loadu_si256 :: #force_inline proc "c" (mem_addr: ^__m256i) -> __m256i {
+	return intrinsics.unaligned_load(mem_addr)
+}
+
+// Stores 256-bits of integer data from `a` into memory.
+// `mem_addr` does not need to be aligned on any particular boundary.
+@(enable_target_feature="avx")
+_mm256_storeu_si256 :: #force_inline proc "c" (mem_addr: ^__m256i, a: __m256i) {
+	intrinsics.unaligned_store(mem_addr, a)
+}
+
+// Loads packed double-precision (64-bit) floating-point elements from memory
+// into result using `mask` (elements are zeroed out when the high bit of the
+// corresponding element is not set).
+@(require_results, enable_target_feature="avx")
+_mm256_maskload_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m256i) -> __m256d {
+	mask_mask := intrinsics.simd_shr(mask, 63)
+	return intrinsics.simd_masked_load(mem_addr, _mm256_setzero_pd(), mask_mask)
+}
+
+// Stores packed double-precision (64-bit) floating-point elements from `a`
+// into memory using `mask`.
+@(enable_target_feature="avx")
+_mm256_maskstore_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m256i, a: __m256d) {
+	mask_mask := intrinsics.simd_shr(mask, 63)
+	intrinsics.simd_masked_store(mem_addr, a, mask_mask)
+}
+
+
+// Loads packed double-precision (64-bit) floating-point elements from memory
+// into result using `mask` (elements are zeroed out when the high bit of the
+// corresponding element is not set).
+@(require_results, enable_target_feature="sse2,avx")
+_mm_maskload_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m128i) -> __m128d {
+	mask_mask := intrinsics.simd_shr(mask, 63)
+	return intrinsics.simd_masked_load(mem_addr, _mm_setzero_pd(), mask_mask)
+}
+
+// Stores packed double-precision (64-bit) floating-point elements from `a`
+// into memory using `mask`.
+@(enable_target_feature="avx")
+_mm_maskstore_pd :: #force_inline proc "c" (mem_addr: ^f64, mask: __m128i, a: __m128d) {
+	mask_mask := intrinsics.simd_shr(mask, 63)
+	intrinsics.simd_masked_store(mem_addr, a, mask_mask)
+}
+
+// Loads packed single-precision (32-bit) floating-point elements from memory
+// into result using `mask` (elements are zeroed out when the high bit of the
+// corresponding element is not set).
+@(require_results, enable_target_feature="avx")
+_mm256_maskload_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m256i) -> __m256 {
+	mask_mask := intrinsics.simd_shr(transmute(#simd[8]i32)mask, 31)
+	return intrinsics.simd_masked_load(mem_addr, _mm256_setzero_ps(), mask_mask)
+}
+
+// Stores packed single-precision (32-bit) floating-point elements from `a`
+// into memory using `mask`.
+@(enable_target_feature="avx")
+_mm256_maskstore_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m256i, a: __m256) {
+	mask_mask := intrinsics.simd_shr(transmute(#simd[8]i32)mask, 31)
+	intrinsics.simd_masked_store(mem_addr, a, mask_mask)
+}
+
+// Loads packed single-precision (32-bit) floating-point elements from memory
+// into result using `mask` (elements are zeroed out when the high bit of the
+// corresponding element is not set).
+@(require_results, enable_target_feature="sse,avx")
+_mm_maskload_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m128i) -> __m128 {
+	mask_mask := intrinsics.simd_shr(transmute(#simd[4]i32)mask, 31)
+	return intrinsics.simd_masked_load(mem_addr, _mm_setzero_ps(), mask_mask)
+}
+
+// Stores packed single-precision (32-bit) floating-point elements from `a`
+// into memory using `mask`.
+@(enable_target_feature="avx")
+_mm_maskstore_ps :: #force_inline proc "c" (mem_addr: ^f32, mask: __m128i, a: __m128) {
+	mask_mask := intrinsics.simd_shr(transmute(#simd[4]i32)mask, 31)
+	intrinsics.simd_masked_store(mem_addr, a, mask_mask)
+}
+
+
+
+
+// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`, and returns the results.
+@(require_results, enable_target_feature="avx")
+_mm256_movehdup_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
+	return intrinsics.simd_shuffle(a, a, 1, 1, 3, 3, 5, 5, 7, 7)
+}
+
+// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`, and returns the results.
+@(require_results, enable_target_feature="avx")
+_mm256_moveldup_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
+	return intrinsics.simd_shuffle(a, a, 0, 0, 2, 2, 4, 4, 6, 6)
+}
+
+// Duplicate even-indexed double-precision (64-bit) floating-point elements from `a`, and returns the results.
+@(require_results, enable_target_feature="avx")
+_mm256_movedup_pd :: #force_inline proc "c" (a: __m256d) -> __m256d {
+	return intrinsics.simd_shuffle(a, a, 0, 0, 2, 2)
+}
+
+
+// Loads 256-bits of integer data from unaligned memory into result.
+// This intrinsic may perform better than `_mm256_loadu_si256` when the
+// data crosses a cache line boundary.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
+@(require_results, enable_target_feature="avx")
+_mm256_lddqu_si256 :: #force_inline proc "c" (mem_addr: ^__m256i) -> __m256i {
+	return transmute(__m256i)llvm_vlddqu(mem_addr)
+}
+
+// Moves integer data from a 256-bit integer vector to a 32-byte
+// aligned memory location. To minimize caching, the data is flagged as
+// non-temporal (unlikely to be used again soon)
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
+//
+// # Safety of non-temporal stores
+//
+// After using this intrinsic, but before any other access to the memory that this intrinsic
+// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
+// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
+// return.
+//
+// See [`_mm_sfence`] for details.
+@(enable_target_feature="avx")
+_mm256_stream_si256 :: #force_inline proc "c" (mem_addr: ^__m256i, a: __m256i) {
+	panic_contextless("TODO: _mm256_stream_si256")
+}
+
+// Moves double-precision values from a 256-bit vector of `[4 x double]`
+// to a 32-byte aligned memory location. To minimize caching, the data is
+// flagged as non-temporal (unlikely to be used again soon).
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
+//
+// # Safety of non-temporal stores
+//
+// After using this intrinsic, but before any other access to the memory that this intrinsic
+// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
+// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
+// return.
+//
+// See [`_mm_sfence`] for details.
+@(enable_target_feature="avx")
+_mm256_stream_pd :: #force_inline proc "c" (mem_addr: ^f64, a: __m256d) {
+	panic_contextless("TODO: _mm256_stream_pd")
+}
+
+// Moves single-precision floating point values from a 256-bit vector
+// of `[8 x float]` to a 32-byte aligned memory location. To minimize
+// caching, the data is flagged as non-temporal (unlikely to be used again
+// soon).
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
+//
+// # Safety of non-temporal stores
+//
+// After using this intrinsic, but before any other access to the memory that this intrinsic
+// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
+// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
+// return.
+//
+// See [`_mm_sfence`] for details.
+@(enable_target_feature="avx")
+_mm256_stream_ps :: #force_inline proc "c" (mem_addr: ^f32, a: __m256) {
+	panic_contextless("TODO: _mm256_stream_ps")
+}
+
+// Computes the approximate reciprocal of packed single-precision (32-bit) floating-point elements in `a`, and returns the results. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_rcp_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
+	return llvm_vrcpps(a)
+}
+
+// Computes the approximate reciprocal square root of packed single-precision
+// (32-bit) floating-point elements in `a`, and returns the results.
+// The maximum relative error for this approximation is less than 1.5*2^-12.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_rsqrt_ps :: #force_inline proc "c" (a: __m256) -> __m256 {
+	return llvm_vrsqrtps(a)
+}
+
+
+
+// Unpacks and interleave double-precision (64-bit) floating-point elements
+// from the high half of each 128-bit lane in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_unpackhi_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	return intrinsics.simd_shuffle(a, b, 1, 5, 3, 7)
+}
+
+// Unpacks and interleave single-precision (32-bit) floating-point elements
+// from the high half of each 128-bit lane in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_unpackhi_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	return intrinsics.simd_shuffle(a, b, 2, 10, 3, 11, 6, 14, 7, 15)
+}
+
+// Unpacks and interleave double-precision (64-bit) floating-point elements
+// from the low half of each 128-bit lane in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_unpacklo_pd :: #force_inline proc "c" (a, b: __m256d) -> __m256d {
+	return intrinsics.simd_shuffle(a, b, 0, 4, 2, 6)
+}
+
+// Unpacks and interleave single-precision (32-bit) floating-point elements
+// from the low half of each 128-bit lane in `a` and `b`.
+@(require_results, enable_target_feature="avx")
+_mm256_unpacklo_ps :: #force_inline proc "c" (a, b: __m256) -> __m256 {
+	return intrinsics.simd_shuffle(a, b, 0, 8, 1, 9, 4, 12, 5, 13)
+}
+
+// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
+// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
+// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
+// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
+@(require_results, enable_target_feature="avx")
+_mm256_testz_si256 :: #force_inline proc "c" (a, b: __m256i) -> i32 {
+	r := intrinsics.simd_bit_and(a, b)
+	return i32(0 == intrinsics.simd_reduce_or(r))
+}
+
+// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
+// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
+// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
+// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
+@(require_results, enable_target_feature="avx")
+_mm256_testc_si256 :: #force_inline proc "c" (a, b: __m256i) -> i32 {
+	r := intrinsics.simd_bit_and(intrinsics.simd_bit_xor(a, __m256i(~i64(0))), b)
+	return i32(0 == intrinsics.simd_reduce_or(r))
+}
+
+
+
+// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
+// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
+// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
+// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
+// `CF` values are zero, otherwise return 0.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
+@(require_results, enable_target_feature="avx")
+_mm256_testnzc_si256 :: #force_inline proc "c" (a, b: __m256i) -> i32 {
+	return llvm_ptestnzc256(a, b)
+}
+
+// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+// is zero, otherwise set `CF` to 0. Return the `ZF` value.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
+@(require_results, enable_target_feature="avx")
+_mm256_testz_pd :: #force_inline proc "c" (a, b: __m256d) -> i32 {
+	return llvm_vtestzpd256(a, b)
+}
+
+// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+// is zero, otherwise set `CF` to 0. Return the `CF` value.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
+@(require_results, enable_target_feature="avx")
+_mm256_testc_pd :: #force_inline proc "c" (a, b: __m256d) -> i32 {
+	return llvm_vtestcpd256(a, b)
+}
+
+// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
+// are zero, otherwise return 0.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
+@(require_results, enable_target_feature="avx")
+_mm256_testnzc_pd :: #force_inline proc "c" (a, b: __m256d) -> i32 {
+	return llvm_vtestnzcpd256(a, b)
+}
+
+// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+// is zero, otherwise set `CF` to 0. Return the `ZF` value.
+@(require_results, enable_target_feature="sse2,avx")
+_mm_testz_pd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	r := intrinsics.simd_lanes_lt(transmute(__m128i)_mm_and_pd(a, b), __m128i(0))
+	return i32(0 == intrinsics.simd_reduce_or(r))
+}
+
+// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+// is zero, otherwise set `CF` to 0. Return the `CF` value.
+@(require_results, enable_target_feature="sse2,avx")
+_mm_testc_pd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	r := intrinsics.simd_lanes_lt(transmute(__m128i)_mm_andnot_pd(a, b), __m128i(0))
+	return i32(0 == intrinsics.simd_reduce_or(r))
+}
+
+// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
+// are zero, otherwise return 0.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
+@(require_results, enable_target_feature="avx")
+_mm_testnzc_pd :: #force_inline proc "c" (a, b: __m128d) -> i32 {
+	return llvm_vtestnzcpd(a, b)
+}
+
+// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+// is zero, otherwise set `CF` to 0. Return the `ZF` value.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_testz_ps :: #force_inline proc "c" (a, b: __m256) -> i32 {
+	return llvm_vtestzps256(a, b)
+}
+
+// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+// is zero, otherwise set `CF` to 0. Return the `CF` value.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_testc_ps :: #force_inline proc "c" (a, b: __m256) -> i32 {
+	return llvm_vtestcps256(a, b)
+}
+
+// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
+// are zero, otherwise return 0.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
+@(require_results, enable_target_feature="avx")
+_mm256_testnzc_ps :: #force_inline proc "c" (a, b: __m256) -> i32 {
+	return llvm_vtestnzcps256(a, b)
+}
+
+// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+// is zero, otherwise set `CF` to 0. Return the `ZF` value.
+@(require_results, enable_target_feature="sse,avx")
+_mm_testz_ps :: #force_inline proc "c" (a: __m128, b: __m128) -> i32 {
+	r := intrinsics.simd_lanes_lt(transmute(#simd[4]i32)_mm_and_ps(a, b), (#simd[4]i32)(0))
+	return i32(0 == intrinsics.simd_reduce_or(r))
+}
+
+// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+// is zero, otherwise set `CF` to 0. Return the `CF` value.
+@(require_results, enable_target_feature="sse,avx")
+_mm_testc_ps :: #force_inline proc "c" (a: __m128, b: __m128) -> i32 {
+	r := intrinsics.simd_lanes_lt(transmute(#simd[4]i32)_mm_andnot_ps(a, b), (#simd[4]i32)(0))
+	return i32(0 == intrinsics.simd_reduce_or(r))
+}
+
+// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
+// are zero, otherwise return 0.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
+@(require_results, enable_target_feature="avx")
+_mm_testnzc_ps :: #force_inline proc "c" (a: __m128, b: __m128) -> i32 {
+	return llvm_vtestnzcps(a, b)
+}
+
+// Sets each bit of the returned mask based on the most significant bit of the
+// corresponding packed double-precision (64-bit) floating-point element in
+// `a`.
+@(require_results, enable_target_feature="avx")
+_mm256_movemask_pd :: #force_inline proc "c" (a: __m256d) -> i32 {
+	mask := intrinsics.simd_lanes_lt(transmute(#simd[4]i64)a, (#simd[4]i64)(0))
+	return i32(transmute(u8)intrinsics.simd_extract_lsbs(mask))
+}
+
+// Sets each bit of the returned mask based on the most significant bit of the
+// corresponding packed single-precision (32-bit) floating-point element in
+// `a`.
+@(require_results, enable_target_feature="avx")
+_mm256_movemask_ps :: #force_inline proc "c" (a: __m256) -> i32 {
+	// Propagate the highest bit to the rest, because simd_bitmask
+	// requires all-1 or all-0.
+	mask := intrinsics.simd_lanes_lt(transmute(#simd[8]i32)a, (#simd[8]i32)(0))
+	return i32(transmute(u8)intrinsics.simd_extract_lsbs(mask))
+}
+
+// Returns vector of type __m256d with all elements set to zero.
+@(require_results, enable_target_feature="avx")
+_mm256_setzero_pd :: #force_inline proc "c" () -> __m256d {
+	return 0
+}
+
+// Returns vector of type __m256 with all elements set to zero.
+@(require_results, enable_target_feature="avx")
+_mm256_setzero_ps :: #force_inline proc "c" () -> __m256 {
+	return 0
+}
+
+// Returns vector of type __m256i with all elements set to zero.
+@(require_results, enable_target_feature="avx")
+_mm256_setzero_si256 :: #force_inline proc "c" () -> __m256i {
+	return 0
+}
+
+// Sets packed double-precision (64-bit) floating-point elements in returned
+// vector with the supplied values.
+@(require_results, enable_target_feature="avx")
+_mm256_set_pd :: #force_inline proc "c" (a: f64, b: f64, c: f64, d: f64) -> __m256d {
+	return _mm256_setr_pd(d, c, b, a)
+}
+
+// Sets packed single-precision (32-bit) floating-point elements in returned
+// vector with the supplied values.
+@(require_results, enable_target_feature="avx")
+_mm256_set_ps :: #force_inline proc "c" (
+	a: f32,
+	b: f32,
+	c: f32,
+	d: f32,
+	e: f32,
+	f: f32,
+	g: f32,
+	h: f32,
+) -> __m256 {
+	return _mm256_setr_ps(h, g, f, e, d, c, b, a)
+}
+
+// Sets packed 8-bit integers in returned vector with the supplied values.
+@(require_results, enable_target_feature="avx")
+_mm256_set_epi8 :: #force_inline proc "c" (
+	e00, e01, e02, e03, e04, e05, e06, e07: i8,
+	e08, e09, e10, e11, e12, e13, e14, e15: i8,
+	e16, e17, e18, e19, e20, e21, e22, e23: i8,
+	e24, e25, e26, e27, e28, e29, e30, e31: i8,
+) -> __m256i {
+	return _mm256_setr_epi8(
+		e31, e30, e29, e28, e27, e26, e25, e24,
+		e23, e22, e21, e20, e19, e18, e17, e16,
+		e15, e14, e13, e12, e11, e10, e09, e08,
+		e07, e06, e05, e04, e03, e02, e01, e00,
+	)
+}
+
+// Sets packed 16-bit integers in returned vector with the supplied values.
+@(require_results, enable_target_feature="avx")
+_mm256_set_epi16 :: #force_inline proc "c" (
+	e00, e01, e02, e03, e04, e05, e06, e07: i16,
+	e08, e09, e10, e11, e12, e13, e14, e15: i16,
+) -> __m256i {
+	return _mm256_setr_epi16(
+		e15, e14, e13, e12,
+		e11, e10, e09, e08,
+		e07, e06, e05, e04,
+		e03, e02, e01, e00,
+	)
+}
+
+// Sets packed 32-bit integers in returned vector with the supplied values.
+@(require_results, enable_target_feature="avx")
+_mm256_set_epi32 :: #force_inline proc "c" (e0, e1, e2, e3, e4, e5, e6, e7: i32) -> __m256i {
+	return _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
+}
+
+// Sets packed 64-bit integers in returned vector with the supplied values.
+@(require_results, enable_target_feature="avx")
+_mm256_set_epi64x :: #force_inline proc "c" (a: i64, b: i64, c: i64, d: i64) -> __m256i {
+	return _mm256_setr_epi64x(d, c, b, a)
+}
+
+// Sets packed double-precision (64-bit) floating-point elements in returned
+// vector with the supplied values in reverse order.
+@(require_results, enable_target_feature="avx")
+_mm256_setr_pd :: #force_inline proc "c" (a: f64, b: f64, c: f64, d: f64) -> __m256d {
+	return __m256d{a, b, c, d}
+}
+
+// Sets packed single-precision (32-bit) floating-point elements in returned
+// vector with the supplied values in reverse order.
+@(require_results, enable_target_feature="avx")
+_mm256_setr_ps :: #force_inline proc "c" (a, b, c, d, e, f, g, h: f32) -> __m256 {
+	return __m256{a, b, c, d, e, f, g, h}
+}
+
+// Sets packed 8-bit integers in returned vector with the supplied values in
+// reverse order.
+@(require_results, enable_target_feature="avx")
+_mm256_setr_epi8 :: #force_inline proc "c" (
+	e00, e01, e02, e03, e04, e05, e06, e07: i8,
+	e08, e09, e10, e11, e12, e13, e14, e15: i8,
+	e16, e17, e18, e19, e20, e21, e22, e23: i8,
+	e24, e25, e26, e27, e28, e29, e30, e31: i8,
+) -> __m256i {
+	return transmute(__m256i)#simd[32]i8{
+		e00, e01, e02, e03, e04, e05, e06, e07,
+		e08, e09, e10, e11, e12, e13, e14, e15,
+		e16, e17, e18, e19, e20, e21, e22, e23,
+		e24, e25, e26, e27, e28, e29, e30, e31,
+	}
+}
+
+// Sets packed 16-bit integers in returned vector with the supplied values in
+// reverse order.
+@(require_results, enable_target_feature="avx")
+_mm256_setr_epi16 :: #force_inline proc "c" (
+	e00, e01, e02, e03, e04, e05, e06, e07: i16,
+	e08, e09, e10, e11, e12, e13, e14, e15: i16,
+) -> __m256i {
+	return transmute(__m256i)#simd[16]i16{
+		e00, e01, e02, e03,
+		e04, e05, e06, e07,
+		e08, e09, e10, e11,
+		e12, e13, e14, e15,
+	}
+}
+
+// Sets packed 32-bit integers in returned vector with the supplied values in
+// reverse order.
+@(require_results, enable_target_feature="avx")
+_mm256_setr_epi32 :: #force_inline proc "c" (e0, e1, e2, e3, e4, e5, e6, e7: i32) -> __m256i {
+	return transmute(__m256i)#simd[8]i32{e0, e1, e2, e3, e4, e5, e6, e7}
+}
+
+// Sets packed 64-bit integers in returned vector with the supplied values in
+// reverse order.
+@(require_results, enable_target_feature="avx")
+_mm256_setr_epi64x :: #force_inline proc "c" (a: i64, b: i64, c: i64, d: i64) -> __m256i {
+	return {a, b, c, d}
+}
+
+// Broadcasts double-precision (64-bit) floating-point value `a` to all elements of returned vector.
+@(require_results, enable_target_feature="avx")
+_mm256_set1_pd :: #force_inline proc "c" (a: f64) -> __m256d {
+	return a
+}
+
+// Broadcasts single-precision (32-bit) floating-point value `a` to all elements of returned vector.
+@(require_results, enable_target_feature="avx")
+_mm256_set1_ps :: #force_inline proc "c" (a: f32) -> __m256 {
+	return a
+}
+
+// Broadcasts 8-bit integer `a` to all elements of returned vector.
+// This intrinsic may generate the `vpbroadcastb`.
+@(require_results, enable_target_feature="avx")
+_mm256_set1_epi8 :: #force_inline proc "c" (a: i8) -> __m256i {
+	return transmute(__m256i)(#simd[32]i8)(a)
+}
+
+// Broadcasts 16-bit integer `a` to all elements of returned vector.
+// This intrinsic may generate the `vpbroadcastw`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
+@(require_results, enable_target_feature="avx")
+_mm256_set1_epi16 :: #force_inline proc "c" (a: i16) -> __m256i {
+	return transmute(__m256i)(#simd[16]i16)(a)
+}
+
+// Broadcasts 32-bit integer `a` to all elements of returned vector.
+// This intrinsic may generate the `vpbroadcastd`.
+@(require_results, enable_target_feature="avx")
+_mm256_set1_epi32 :: #force_inline proc "c" (a: i32) -> __m256i {
+	return transmute(__m256i)(#simd[8]i32)(a)
+}
+
+// Broadcasts 64-bit integer `a` to all elements of returned vector.
+// This intrinsic may generate the `vpbroadcastq`.
+//
+// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
+@(require_results, enable_target_feature="avx")
+_mm256_set1_epi64x :: #force_inline proc "c" (a: i64) -> __m256i {
+	return a
+}
+
+// Cast vector of type __m256d to type __m256.
+@(require_results, enable_target_feature="avx")
+_mm256_castpd_ps :: #force_inline proc "c" (a: __m256d) -> __m256 {
+	return transmute(__m256)a
+}
+
+// Cast vector of type __m256 to type __m256d.
+@(require_results, enable_target_feature="avx")
+_mm256_castps_pd :: #force_inline proc "c" (a: __m256) -> __m256d {
+	return transmute(__m256d)a
+}
+
+// Casts vector of type __m256 to type __m256i.
+@(require_results, enable_target_feature="avx")
+_mm256_castps_si256 :: #force_inline proc "c" (a: __m256) -> __m256i {
+	return transmute(__m256i)a
+}
+
+// Casts vector of type __m256i to type __m256.
+@(require_results, enable_target_feature="avx")
+_mm256_castsi256_ps :: #force_inline proc "c" (a: __m256i) -> __m256 {
+	return transmute(__m256)a
+}
+
+// Casts vector of type __m256d to type __m256i.
+@(require_results, enable_target_feature="avx")
+_mm256_castpd_si256 :: #force_inline proc "c" (a: __m256d) -> __m256i {
+	return transmute(__m256i)a
+}
+
+// Casts vector of type __m256i to type __m256d.
+@(require_results, enable_target_feature="avx")
+_mm256_castsi256_pd :: #force_inline proc "c" (a: __m256i) -> __m256d {
+	return transmute(__m256d)a
+}
+
+// Casts vector of type __m256 to type __m128.
+@(require_results, enable_target_feature="avx")
+_mm256_castps256_ps128 :: #force_inline proc "c" (a: __m256) -> __m128 {
+	return intrinsics.simd_shuffle(a, a, 0, 1, 2, 3)
+}
+
+// Casts vector of type __m256d to type __m128d.
+@(require_results, enable_target_feature="avx")
+_mm256_castpd256_pd128 :: #force_inline proc "c" (a: __m256d) -> __m128d {
+	return intrinsics.simd_shuffle(a, a, 0, 1)
+}
+
+// Casts vector of type __m256i to type __m128i.
+@(require_results, enable_target_feature="avx")
+_mm256_castsi256_si128 :: #force_inline proc "c" (a: __m256i) -> __m128i {
+	return intrinsics.simd_shuffle(a, a, 0, 1)
+}
+
+// Casts vector of type __m128 to type __m256;
+// the upper 128 bits of the result are indeterminate.
+//
+// In the Intel documentation, the upper bits are declared to be "undefined".
+@(require_results, enable_target_feature="sse,avx")
+_mm256_castps128_ps256 :: #force_inline proc "c" (a: __m128) -> __m256 {
+	return intrinsics.simd_shuffle(a, _mm_undefined_ps(), 0, 1, 2, 3, 4, 4, 4, 4)
+}
+
+// Casts vector of type __m128d to type __m256d;
+// the upper 128 bits of the result are indeterminate.
+//
+// In the Intel documentation, the upper bits are declared to be "undefined".
+@(require_results, enable_target_feature="sse2,avx")
+_mm256_castpd128_pd256 :: #force_inline proc "c" (a: __m128d) -> __m256d {
+	return intrinsics.simd_shuffle(a, _mm_undefined_pd(), 0, 1, 2, 2)
+}
+
+// Casts vector of type __m128i to type __m256i;
+// the upper 128 bits of the result are indeterminate.
+//
+// In the Intel documentation, the upper bits are declared to be "undefined".
+@(require_results, enable_target_feature="avx")
+_mm256_castsi128_si256 :: #force_inline proc "c" (a: __m128i) -> __m256i {
+	return intrinsics.simd_shuffle(a, __m128i(0), 0, 1, 2, 2)
+}
+
+// Constructs a 256-bit floating-point vector of `[8 x float]` from a
+// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
+// the value of the source vector. The upper 128 bits are set to zero.
+@(require_results, enable_target_feature="sse,avx")
+_mm256_zextps128_ps256 :: #force_inline proc "c" (a: __m128) -> __m256 {
+	return intrinsics.simd_shuffle(a, _mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7)
+}
+
+// Constructs a 256-bit integer vector from a 128-bit integer vector.
+// The lower 128 bits contain the value of the source vector. The upper
+// 128 bits are set to zero.
+@(require_results, enable_target_feature="avx")
+_mm256_zextsi128_si256 :: #force_inline proc "c" (a: __m128i) -> __m256i {
+	return intrinsics.simd_shuffle(a, __m128i(0), 0, 1, 2, 3)
+}
+
+// Constructs a 256-bit floating-point vector of `[4 x double]` from a
+// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
+// contain the value of the source vector. The upper 128 bits are set
+// to zero.
+@(require_results, enable_target_feature="sse2,avx")
+_mm256_zextpd128_pd256 :: #force_inline proc "c" (a: __m128d) -> __m256d {
+	return intrinsics.simd_shuffle(a, _mm_setzero_pd(), 0, 1, 2, 3)
+}
+
+// Returns vector of type `__m256` with indeterminate elements.
+// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+@(require_results, enable_target_feature="avx")
+_mm256_undefined_ps :: #force_inline proc "c" () -> __m256 {
+	return 0
+}
+
+// Returns vector of type `__m256d` with indeterminate elements.
+// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+@(require_results, enable_target_feature="avx")
+_mm256_undefined_pd :: #force_inline proc "c" () -> __m256d {
+	return 0
+}
+
+// Returns vector of type __m256i with with indeterminate elements.
+// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+@(require_results, enable_target_feature="avx")
+_mm256_undefined_si256 :: #force_inline proc "c" () -> __m256i {
+	return 0
+}
+
+// Sets packed __m256 returned vector with the supplied values.
+@(require_results, enable_target_feature="avx")
+_mm256_set_m128 :: #force_inline proc "c" (hi: __m128, lo: __m128) -> __m256 {
+	return intrinsics.simd_shuffle(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7)
+}
+
+// Sets packed __m256d returned vector with the supplied values.
+@(require_results, enable_target_feature="avx")
+_mm256_set_m128d :: #force_inline proc "c" (hi: __m128d, lo: __m128d) -> __m256d {
+	hi := transmute(__m128)hi
+	lo := transmute(__m128)lo
+	return transmute(__m256d)_mm256_set_m128(hi, lo)
+}
+
+// Sets packed __m256i returned vector with the supplied values.
+@(require_results, enable_target_feature="avx")
+_mm256_set_m128i :: #force_inline proc "c" (hi: __m128i, lo: __m128i) -> __m256i {
+	hi := transmute(__m128)hi
+	lo := transmute(__m128)lo
+	return transmute(__m256i)_mm256_set_m128(hi, lo)
+}
+
+// Sets packed __m256 returned vector with the supplied values.
+@(require_results, enable_target_feature="avx")
+_mm256_setr_m128 :: #force_inline proc "c" (lo: __m128, hi: __m128) -> __m256 {
+	return _mm256_set_m128(hi, lo)
+}
+
+// Sets packed __m256d returned vector with the supplied values.
+@(require_results, enable_target_feature="avx")
+_mm256_setr_m128d :: #force_inline proc "c" (lo: __m128d, hi: __m128d) -> __m256d {
+	return _mm256_set_m128d(hi, lo)
+}
+
+// Sets packed __m256i returned vector with the supplied values.
+@(require_results, enable_target_feature="avx")
+_mm256_setr_m128i :: #force_inline proc "c" (lo: __m128i, hi: __m128i) -> __m256i {
+	return _mm256_set_m128i(hi, lo)
+}
+
+// Loads two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value.
+// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+@(require_results, enable_target_feature="sse,avx")
+_mm256_loadu2_m128 :: #force_inline proc "c" (hiaddr, loaddr: ^f32) -> __m256 {
+	a := _mm256_castps128_ps256(_mm_loadu_ps(loaddr))
+	return _mm256_insertf128_ps(a, _mm_loadu_ps(hiaddr), 1)
+}
+
+// Loads two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value.
+// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+@(require_results, enable_target_feature="sse2,avx")
+_mm256_loadu2_m128d :: #force_inline proc "c" (hiaddr, loaddr: ^f64) -> __m256d {
+	a := _mm256_castpd128_pd256(_mm_loadu_pd(loaddr))
+	return _mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1)
+}
+
+// Loads two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value.
+// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+@(require_results, enable_target_feature="sse2,avx")
+_mm256_loadu2_m128i :: #force_inline proc "c" (hiaddr, loaddr: ^__m128i) -> __m256i {
+	a := _mm256_castsi128_si256(_mm_loadu_si128(loaddr))
+	return _mm256_insertf128_si256(a, _mm_loadu_si128(hiaddr), 1)
+}
+
+// Stores the high and low 128-bit halves (each composed of 4 packed
+// single-precision (32-bit) floating-point elements) from `a` into memory two
+// different 128-bit locations.
+// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+@(enable_target_feature="sse,avx")
+_mm256_storeu2_m128 :: #force_inline proc "c" (hiaddr, loaddr: ^f32, a: __m256) {
+	lo := _mm256_castps256_ps128(a)
+	_mm_storeu_ps(loaddr, lo)
+	hi := _mm256_extractf128_ps(a, 1)
+	_mm_storeu_ps(hiaddr, hi)
+}
+
+// Stores the high and low 128-bit halves (each composed of 2 packed
+// double-precision (64-bit) floating-point elements) from `a` into memory two
+// different 128-bit locations.
+// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+@(enable_target_feature="sse2,avx")
+_mm256_storeu2_m128d :: #force_inline proc "c" (hiaddr, loaddr: ^f64, a: __m256d) {
+	lo := _mm256_castpd256_pd128(a)
+	_mm_storeu_pd(loaddr, lo)
+	hi := _mm256_extractf128_pd(a, 1)
+	_mm_storeu_pd(hiaddr, hi)
+}
+
+// Stores the high and low 128-bit halves (each composed of integer data) from
+// `a` into memory two different 128-bit locations.
+// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
+@(enable_target_feature="sse2,avx")
+_mm256_storeu2_m128i :: #force_inline proc "c" (hiaddr, loaddr: ^__m128i, a: __m256i) {
+	lo := _mm256_castsi256_si128(a)
+	_mm_storeu_si128(loaddr, lo)
+	hi := _mm256_extractf128_si256(a, 1)
+	_mm_storeu_si128(hiaddr, hi)
+}
+
+// Returns the first element of the input vector of `[8 x float]`.
+@(require_results, enable_target_feature="avx")
+_mm256_cvtss_f32 :: #force_inline proc "c" (a: __m256) -> f32 {
+	return intrinsics.simd_extract(a, 0)
+}
+
+
+
+@(require_results, enable_target_feature="avx")
+_mm256_insert_epi64 :: #force_inline proc "c" (a: __m256i, i: i64, $idx: u32) -> __m256i {
+	return intrinsics.simd_replace(transmute(#simd[4]i64)a, idx, i)
+}
+
+@(require_results, enable_target_feature="avx")
+_mm256_extract_epi64 :: #force_inline proc "c" (a: __m256i, $idx: u32) -> i64 {
+	return intrinsics.simd_extract(transmute(#simd[4]i64)a, idx)
+}
+
+
+@(private, default_calling_convention="none")
+foreign _ {
+	@(link_name="llvm.x86.avx.round.pd.256")      llvm_roundpd256    :: proc(a: __m256d, #const b: u32) -> __m256d ---
+	@(link_name="llvm.x86.avx.round.ps.256")      llvm_roundps256    :: proc(a: __m256, #const b: u32) -> __m256 ---
+	@(link_name="llvm.x86.avx.dp.ps.256")         llvm_vdpps         :: proc(a, b: __m256, #const imm8: u8) -> __m256 ---
+	@(link_name="llvm.x86.sse2.cmp.pd")           llvm_vcmppd        :: proc(a, b: __m128d, #const imm8: u8) -> __m128d ---
+	@(link_name="llvm.x86.avx.cmp.pd.256")        llvm_vcmppd256     :: proc(a, b: __m256d, imm8: u8) -> __m256d ---
+	@(link_name="llvm.x86.sse.cmp.ps")            llvm_vcmpps        :: proc(a: __m128, b: __m128, #const imm8: u8) -> __m128 ---
+	@(link_name="llvm.x86.avx.cmp.ps.256")        llvm_vcmpps256     :: proc(a, b: __m256, imm8: u8) -> __m256 ---
+	@(link_name="llvm.x86.sse2.cmp.sd")           llvm_vcmpsd        :: proc(a, b: __m128d, #const imm8: u8) -> __m128d ---
+	@(link_name="llvm.x86.sse.cmp.ss")            llvm_vcmpss        :: proc(a: __m128, b: __m128, #const imm8: u8) -> __m128 ---
+	@(link_name="llvm.x86.avx.cvt.ps2dq.256")     llvm_vcvtps2dq     :: proc(a: __m256) -> #simd[8]i32 ---
+	@(link_name="llvm.x86.avx.cvtt.pd2dq.256")    llvm_vcvttpd2dq    :: proc(a: __m256d) -> #simd[4]i32 ---
+	@(link_name="llvm.x86.avx.cvt.pd2dq.256")     llvm_vcvtpd2dq     :: proc(a: __m256d) -> #simd[4]i32 ---
+	@(link_name="llvm.x86.avx.cvtt.ps2dq.256")    llvm_vcvttps2dq    :: proc(a: __m256) -> #simd[8]i32 ---
+	@(link_name="llvm.x86.avx.vzeroall")          llvm_vzeroall      :: proc() ---
+	@(link_name="llvm.x86.avx.vzeroupper")        llvm_vzeroupper    :: proc() ---
+	@(link_name="llvm.x86.avx.vpermilvar.ps.256") llvm_vpermilps256  :: proc(a: __m256, b: #simd[8]i32) -> __m256 ---
+	@(link_name="llvm.x86.avx.vpermilvar.ps")     llvm_vpermilps     :: proc(a: __m128, b: #simd[4]i32) -> __m128 ---
+	@(link_name="llvm.x86.avx.vpermilvar.pd.256") llvm_vpermilpd256  :: proc(a: __m256d, b: #simd[4]i64) -> __m256d ---
+	@(link_name="llvm.x86.avx.vpermilvar.pd")     llvm_vpermilpd     :: proc(a: __m128d, b: #simd[2]i64) -> __m128d ---
+	@(link_name="llvm.x86.avx.ldu.dq.256")        llvm_vlddqu        :: proc(mem_addr: rawptr) -> #simd[32]i8 ---
+	@(link_name="llvm.x86.avx.rcp.ps.256")        llvm_vrcpps        :: proc(a: __m256) -> __m256 ---
+	@(link_name="llvm.x86.avx.rsqrt.ps.256")      llvm_vrsqrtps      :: proc(a: __m256) -> __m256 ---
+	@(link_name="llvm.x86.avx.ptestnzc.256")      llvm_ptestnzc256   :: proc(a: #simd[4]i64, b: #simd[4]i64) -> i32 ---
+	@(link_name="llvm.x86.avx.vtestz.pd.256")     llvm_vtestzpd256   :: proc(a, b: __m256d) -> i32 ---
+	@(link_name="llvm.x86.avx.vtestc.pd.256")     llvm_vtestcpd256   :: proc(a, b: __m256d) -> i32 ---
+	@(link_name="llvm.x86.avx.vtestnzc.pd.256")   llvm_vtestnzcpd256 :: proc(a, b: __m256d) -> i32 ---
+	@(link_name="llvm.x86.avx.vtestnzc.pd")       llvm_vtestnzcpd    :: proc(a, b: __m128d) -> i32 ---
+	@(link_name="llvm.x86.avx.vtestz.ps.256")     llvm_vtestzps256   :: proc(a, b: __m256) -> i32 ---
+	@(link_name="llvm.x86.avx.vtestc.ps.256")     llvm_vtestcps256   :: proc(a, b: __m256) -> i32 ---
+	@(link_name="llvm.x86.avx.vtestnzc.ps.256")   llvm_vtestnzcps256 :: proc(a, b: __m256) -> i32 ---
+	@(link_name="llvm.x86.avx.vtestnzc.ps")       llvm_vtestnzcps    :: proc(a: __m128, b: __m128) -> i32 ---
+	@(link_name="llvm.x86.avx.min.ps.256")        llvm_vminps        :: proc(a, b: __m256) -> __m256 ---
+	@(link_name="llvm.x86.avx.max.ps.256")        llvm_vmaxps        :: proc(a, b: __m256) -> __m256 ---
+	@(link_name="llvm.x86.avx.min.pd.256")        llvm_vminpd        :: proc(a, b: __m256d) -> __m256d ---
+	@(link_name="llvm.x86.avx.max.pd.256")        llvm_vmaxpd        :: proc(a, b: __m256d) -> __m256d ---
+}