From 45899dcd62cb653c39eb7bbdec37b7eb53491459 Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 20 Sep 2020 21:57:15 +0000 Subject: [PATCH 01/20] castps128_512 --- crates/core_arch/avx512f.md | 2 +- crates/core_arch/src/x86/avx512f.rs | 53 +++++++++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 18 ++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index c978a63461..353dab7a3e 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -32,7 +32,7 @@ * [ ] [`_mm512_castpd512_pd256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd256&expand=5236) * [ ] [`_mm512_castpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ps&expand=5236) * [ ] [`_mm512_castpd_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_si512&expand=5236) - * [ ] [`_mm512_castps128_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps128_ps512&expand=5236) + * [x] [`_mm512_castps128_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps128_ps512&expand=5236) * [ ] [`_mm512_castps256_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps256_ps512&expand=5236) * [ ] [`_mm512_castps512_ps128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps128&expand=5236) * [ ] [`_mm512_castps512_ps256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps256&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 3f9bbfb3e1..80cfe54a58 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10228,6 +10228,32 @@ pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { let zero = _mm512_setzero_pd().as_f64x8(); transmute(simd_select_bitmask(k, mov, zero)) } +/* +/// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_insertf32x4&expand=3155) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf32x4))] +pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128, imm8: i32) -> __m512 { + let a = a.as_f32x16(); + let b = b.as_f32x4(); + match imm8 & 3 { + 0 => transmute(simd_insert(a, 0, b)), + 1 => transmute(simd_insert(a, 1, b)), + 2 => transmute(simd_insert(a, 2, b)), + _ => transmute(simd_insert(a, 3, b)), + } +} +*/ +/// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps128_ps512&expand=621) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 { + simd_shuffle16(a, _mm_set1_ps(-1.), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]) +} /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
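// ---------------------------------------------------------------------------
// Reviewer note (not part of the patch): a minimal usage sketch for the cast
// added above. `_mm512_castps128_ps512` is zero-cost; lanes 4..15 of the
// result are unspecified (this implementation happens to fill them with -1.0),
// so callers must not rely on their contents. The helper name below is
// hypothetical, and `_mm512_castps512_ps128` is only added later in this
// series.
#[target_feature(enable = "avx512f")]
unsafe fn widen_then_narrow(x: std::arch::x86_64::__m128) -> std::arch::x86_64::__m128 {
    use std::arch::x86_64::*;
    let wide: __m512 = _mm512_castps128_ps512(x); // lanes 0..3 == x
    _mm512_castps512_ps128(wide) // recovers exactly the original 4 lanes
}
// ---------------------------------------------------------------------------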
 ///
@@ -18367,6 +18393,33 @@ mod tests {
         );
         assert_eq_m512(r, e);
     }
+/*
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_insertf32x4() {
+        let a = _mm512_setr_ps(
+            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+        );
+        let b = _mm_setr_ps(
+            17., 18., 19., 20.,
+        );
+        let r = _mm512_insertf32x4(a, b, 0);
+        let e = _mm512_setr_ps(
+            17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+        );
+        assert_eq_m512(r, e);
+    }
+*/
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_castps128_ps512() {
+        let a = _mm_setr_ps(
+            17., 18., 19., 20.,
+        );
+        let r = _mm512_castps128_ps512(a);
+        let e = _mm512_setr_ps(
+            17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
+        );
+        assert_eq_m512(r, e);
+    }
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_and_epi32() {
diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs
index abe99f23a2..a649cf3e2d 100644
--- a/crates/core_arch/src/x86/mod.rs
+++ b/crates/core_arch/src/x86/mod.rs
@@ -433,6 +433,24 @@ impl m256iExt for __m256i {
     }
 }
 
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m128Ext: Sized {
+    fn as_m128(self) -> __m128;
+
+    #[inline]
+    fn as_f32x4(self) -> crate::core_arch::simd::f32x4 {
+        unsafe { transmute(self.as_m128()) }
+    }
+}
+
+impl m128Ext for __m128 {
+    #[inline]
+    fn as_m128(self) -> Self {
+        self
+    }
+}
+
 #[allow(non_camel_case_types)]
 #[unstable(feature = "stdsimd_internal", issue = "none")]
 pub(crate) trait m256Ext: Sized {
From c73dd0f9ed175a5a3ecad1a84887ef95cee7bb35 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Mon, 21 Sep 2020 00:55:17 +0000
Subject: [PATCH 02/20] castps_pd, castps_si512, castps512_ps256,
 castps512_ps128, castps128_ps512, castps256_ps512

---
 crates/core_arch/avx512f.md         | 10 +-
 crates/core_arch/src/x86/avx512f.rs | 97 +++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 353dab7a3e..4fb16cd404 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -33,11 +33,11 @@
  * [ ] [`_mm512_castpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ps&expand=5236)
  * [ ] [`_mm512_castpd_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_si512&expand=5236)
  * [x] [`_mm512_castps128_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps128_ps512&expand=5236)
- * [ ] [`_mm512_castps256_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps256_ps512&expand=5236)
- * [ ] [`_mm512_castps512_ps128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps128&expand=5236)
- * [ ] [`_mm512_castps512_ps256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps256&expand=5236)
- * [ ] [`_mm512_castps_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_pd&expand=5236)
- * [ ] [`_mm512_castps_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_si512&expand=5236)
+ * [x] [`_mm512_castps256_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps256_ps512&expand=5236)
+ * [x] [`_mm512_castps512_ps128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps128&expand=5236)
+ * [x]
[`_mm512_castps512_ps256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps256&expand=5236) + * [x] [`_mm512_castps_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_pd&expand=5236) + * [x] [`_mm512_castps_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_si512&expand=5236) * [ ] [`_mm512_castsi128_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi128_si512&expand=5236) * [ ] [`_mm512_castsi256_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi256_si512&expand=5236) * [ ] [`_mm512_castsi512_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 80cfe54a58..01bc8b1ad0 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10255,6 +10255,51 @@ pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 { simd_shuffle16(a, _mm_set1_ps(-1.), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]) } +/// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps256_ps512&expand=623) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 { + simd_shuffle16(a, _mm256_set1_ps(-1.), [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]) +} + +/// Cast vector of type __m512 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps512_ps128&expand=624) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 { + simd_shuffle4(a, a, [0, 1, 2, 3]) +} + +/// Cast vector of type __m512 to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps512_ps256&expand=625) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castps512_ps256(a: __m512) -> __m256 { + simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +/// Cast vector of type __m512 to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps_pd&expand=616) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castps_pd(a: __m512) -> __m512d { + transmute(a.as_m512()) +} + +/// Cast vector of type __m512 to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps_si512&expand=619) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castps_si512(a: __m512) -> __m512i { + transmute(a.as_m512()) +} + /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. @@ -18421,6 +18466,58 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps256_ps512() { + let a = _mm256_setr_ps( + 17., 18., 19., 20., 21., 22., 23., 24., + ); + let r = _mm512_castps256_ps512(a); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps512_ps128() { + let a = _mm512_setr_ps( + 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., + ); + let r = _mm512_castps512_ps128(a); + let e = _mm_setr_ps( + 17., 18., 19., 20., + ); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps512_ps256() { + let a = _mm512_setr_ps( + 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1., + ); + let r = _mm512_castps512_ps256(a); + let e = _mm256_setr_ps( + 17., 18., 19., 20., 21., 22., 23., 24., + ); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps_pd() { + let a = _mm512_set1_ps(1.); + let r = _mm512_castps_pd(a); + let e = _mm512_set1_pd(0.007812501848093234); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps_si512() { + let a = _mm512_set1_ps(1.); + let r = _mm512_castps_si512(a); + let e = _mm512_set1_epi32(1065353216); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( From 4e9d4d5f33617a24831c344ff2f51d1756e2060e Mon Sep 17 00:00:00 2001 From: jironglin Date: Mon, 21 Sep 2020 21:46:32 +0000 Subject: [PATCH 03/20] castpd_ps, castpd_si512, castpd512_pd256, castpd512_pd128, castpd128_pd512, castpd256_pd512 --- crates/core_arch/avx512f.md | 8 +-- crates/core_arch/src/x86/avx512f.rs | 82 +++++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 64 ++++++++++++++++++++ 3 files changed, 149 insertions(+), 5 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 4fb16cd404..7cf023cbf5 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -26,10 +26,10 @@ * [ ] [`_mm512_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=5236) * [ ] [`_mm512_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=5236) * [ ] [`_mm512_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastss_ps&expand=5236) - * [ ] [`_mm512_castpd128_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd128_pd512&expand=5236) - * [ ] [`_mm512_castpd256_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd256_pd512&expand=5236) - * [ ] 
[`_mm512_castpd512_pd128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd128&expand=5236) - * [ ] [`_mm512_castpd512_pd256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd256&expand=5236) + * [x] [`_mm512_castpd128_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd128_pd512&expand=5236) + * [x] [`_mm512_castpd256_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd256_pd512&expand=5236) + * [x] [`_mm512_castpd512_pd128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd128&expand=5236) + * [x] [`_mm512_castpd512_pd256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd256&expand=5236) * [ ] [`_mm512_castpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ps&expand=5236) * [ ] [`_mm512_castpd_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_si512&expand=5236) * [x] [`_mm512_castps128_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps128_ps512&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 01bc8b1ad0..4f4b11e762 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10300,6 +10300,74 @@ pub unsafe fn _mm512_castps_si512(a: __m512) -> __m512i { transmute(a.as_m512()) } +/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd128_pd512&expand=609) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d { + simd_shuffle8(a, _mm_set1_pd(-1.), [0, 1, 2, 2, 2, 2, 2, 2]) +} + +/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd256_pd512&expand=611) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d { + simd_shuffle8(a, _mm256_set1_pd(-1.), [0, 1, 2, 3, 4, 4, 4, 4]) +} + +/// Cast vector of type __m512d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd512_pd128&expand=612) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d { + simd_shuffle2(a, a, [0, 1]) +} + +/// Cast vector of type __m512d to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd512_pd256&expand=613) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castpd512_pd256(a: __m512d) -> __m256d { + simd_shuffle4(a, a, [0, 1, 2, 3]) +} + +/// Cast vector of type __m512d to type __m512. 
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd_ps&expand=604) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castpd_ps(a: __m512d) -> __m512 { + transmute(a.as_m512d()) +} + +/// Cast vector of type __m512d to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd_si512&expand=607) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castpd_si512(a: __m512d) -> __m512i { + transmute(a.as_m512d()) +} + +/* +/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi128_si512&expand=629) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i { + let a = a.as_m128i(); + transmute(simd_shuffle16(a, _mm_set1_epi32(-1).as_m128i(), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]).as_m512i()) +//pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 { + //simd_shuffle16(a, _mm_set1_ps(-1.), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]) +} +*/ + /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. @@ -18517,7 +18585,19 @@ mod tests { let e = _mm512_set1_epi32(1065353216); assert_eq_m512i(r, e); } - +/* + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi128_si512() { + let a = _mm_setr_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_castsi128_si512(a); + let e = _mm512_setr_epi32( + 17, 18, 19, 20, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + ); + assert_eq_m512i(r, e); + } +*/ #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 036cf36c74..c59c84ff6e 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4278,6 +4278,70 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd128_pd512() { + let a = _mm_setr_pd( + 17., 18., + ); + let r = _mm512_castpd128_pd512(a); + let e = _mm512_setr_pd( + 17., 18., -1., -1., -1., -1., -1., -1. 
+ ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd256_pd512() { + let a = _mm256_setr_pd( + 17., 18., 19., 20., + ); + let r = _mm512_castpd256_pd512(a); + let e = _mm512_setr_pd( + 17., 18., 19., 20., -1., -1., -1., -1., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd512_pd128() { + let a = _mm512_setr_pd( + 17., 18., -1., -1., -1., -1., -1., -1., + ); + let r = _mm512_castpd512_pd128(a); + let e = _mm_setr_pd( + 17., 18., + ); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd512_pd256() { + let a = _mm512_setr_pd( + 17., 18., 19., 20., -1., -1., -1., -1., + ); + let r = _mm512_castpd512_pd256(a); + let e = _mm256_setr_pd( + 17., 18., 19., 20., + ); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd_ps() { + let a = _mm512_set1_pd(1.); + let r = _mm512_castpd_ps(a); + let e = _mm512_set_ps(1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd_si512() { + let a = _mm512_set1_pd(1.); + let r = _mm512_castpd_si512(a); + let e = _mm512_set_epi32(1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); From 1d18142476efd28cc20333473bb010a7c4981c43 Mon Sep 17 00:00:00 2001 From: jironglin Date: Tue, 22 Sep 2020 00:52:56 +0000 Subject: [PATCH 04/20] castsi512_si128, castsi512_si256, castsi128_si512, castsi256_si512, castsi512_ps, castsi512_pd --- crates/core_arch/avx512f.md | 16 +++---- crates/core_arch/src/x86/avx512f.rs | 66 ++++++++++++++++++-------- crates/core_arch/src/x86_64/avx512f.rs | 64 +++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 27 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 7cf023cbf5..a5f95b23ef 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -30,20 +30,20 @@ * [x] [`_mm512_castpd256_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd256_pd512&expand=5236) * [x] [`_mm512_castpd512_pd128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd128&expand=5236) * [x] [`_mm512_castpd512_pd256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd256&expand=5236) - * [ ] [`_mm512_castpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ps&expand=5236) - * [ ] [`_mm512_castpd_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_si512&expand=5236) + * [x] [`_mm512_castpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ps&expand=5236) + * [x] [`_mm512_castpd_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_si512&expand=5236) * [x] [`_mm512_castps128_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps128_ps512&expand=5236) * [x] [`_mm512_castps256_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps256_ps512&expand=5236) * [x] 
[`_mm512_castps512_ps128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps128&expand=5236) * [x] [`_mm512_castps512_ps256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps256&expand=5236) * [x] [`_mm512_castps_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_pd&expand=5236) * [x] [`_mm512_castps_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_si512&expand=5236) - * [ ] [`_mm512_castsi128_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi128_si512&expand=5236) - * [ ] [`_mm512_castsi256_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi256_si512&expand=5236) - * [ ] [`_mm512_castsi512_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_pd&expand=5236) - * [ ] [`_mm512_castsi512_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ps&expand=5236) - * [ ] [`_mm512_castsi512_si128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_si128&expand=5236) - * [ ] [`_mm512_castsi512_si256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_si256&expand=5236) + * [x] [`_mm512_castsi128_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi128_si512&expand=5236) + * [x] [`_mm512_castsi256_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi256_si512&expand=5236) + * [x] [`_mm512_castsi512_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_pd&expand=5236) + * [x] [`_mm512_castsi512_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ps&expand=5236) + * [x] [`_mm512_castsi512_si128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_si128&expand=5236) + * [x] [`_mm512_castsi512_si256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_si256&expand=5236) * [x] [`_mm512_cmp_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi32_mask&expand=5236) * [x] [`_mm512_cmp_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi64_mask&expand=5236) * [x] [`_mm512_cmp_epu32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epu32_mask&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 4f4b11e762..7ae09d463e 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10354,19 +10354,59 @@ pub unsafe fn _mm512_castpd_si512(a: __m512d) -> __m512i { transmute(a.as_m512d()) } -/* /// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi128_si512&expand=629) #[inline] #[target_feature(enable = "avx512f")] pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i { - let a = a.as_m128i(); - transmute(simd_shuffle16(a, _mm_set1_epi32(-1).as_m128i(), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]).as_m512i()) -//pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 { - //simd_shuffle16(a, _mm_set1_ps(-1.), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]) + simd_shuffle8(a, _mm_set1_epi64x(-1), [0, 1, 2, 2, 2, 2, 2, 2]) +} + +/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi256_si512&expand=633) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i { + simd_shuffle8(a, _mm256_set1_epi64x(-1), [0, 1, 2, 3, 4, 4, 4, 4]) +} + +/// Cast vector of type __m512i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_si128&expand=636) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i { + simd_shuffle2(a, a, [0, 1]) +} + +/// Cast vector of type __m512i to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_si256&expand=637) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castsi512_si256(a: __m512i) -> __m256i { + simd_shuffle4(a, a, [0, 1, 2, 3]) +} + +/// Cast vector of type __m512i to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_ps&expand=635) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castsi512_ps(a: __m512i) -> __m512 { + transmute(a) +} + +/// Cast vector of type __m512i to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_pd&expand=634) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { + transmute(a) } -*/ /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
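// ---------------------------------------------------------------------------
// Reviewer note (not part of the patch): a sketch of how the si512 casts added
// above compose. Widening leaves the new upper lanes unspecified (this
// implementation fills them with -1), narrowing drops the upper lanes, and the
// ps/pd/si512 casts are pure bit reinterpretations, so a round trip through
// them is lossless. Hypothetical helper, assuming std::arch with AVX512F:
#[target_feature(enable = "avx512f")]
unsafe fn cast_roundtrip(v: std::arch::x86_64::__m128i) -> std::arch::x86_64::__m128i {
    use std::arch::x86_64::*;
    let wide = _mm512_castsi128_si512(v); // upper 384 bits unspecified
    let as_ps = _mm512_castsi512_ps(wide); // reinterpret the same 512 bits as f32x16
    let back = _mm512_castps_si512(as_ps); // reinterpret back; no bits change
    _mm512_castsi512_si128(back) // low 128 bits are exactly `v`
}
// ---------------------------------------------------------------------------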
/// @@ -18585,19 +18625,7 @@ mod tests { let e = _mm512_set1_epi32(1065353216); assert_eq_m512i(r, e); } -/* - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_castsi128_si512() { - let a = _mm_setr_epi32( - 17, 18, 19, 20, - ); - let r = _mm512_castsi128_si512(a); - let e = _mm512_setr_epi32( - 17, 18, 19, 20, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - ); - assert_eq_m512i(r, e); - } -*/ + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index c59c84ff6e..327d78e2ec 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4342,6 +4342,70 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi128_si512() { + let a = _mm_setr_epi64x( + 17, 18, + ); + let r = _mm512_castsi128_si512(a); + let e = _mm512_setr_epi64( + 17, 18, -1, -1, -1, -1, -1, -1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi256_si512() { + let a = _mm256_setr_epi64x( + 17, 18, 19, 20, + ); + let r = _mm512_castsi256_si512(a); + let e = _mm512_setr_epi64( + 17, 18, 19, 20, -1, -1, -1, -1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi512_si128() { + let a = _mm512_setr_epi64( + 17, 18, -1, -1, -1, -1, -1, -1, + ); + let r = _mm512_castsi512_si128(a); + let e = _mm_setr_epi64x( + 17, 18, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi512_si256() { + let a = _mm512_setr_epi64( + 17, 18, 19, 20, -1, -1, -1, -1, + ); + let r = _mm512_castsi512_si256(a); + let e = _mm256_setr_epi64x( + 17, 18, 19, 20, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi512_ps() { + let a = _mm512_set1_epi64(1<<62); + let r = _mm512_castsi512_ps(a); + let e = _mm512_set_ps(2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi512_pd() { + let a = _mm512_set1_epi64(1<<62); + let r = _mm512_castsi512_pd(a); + let e = _mm512_set_pd(2., 2., 2., 2., 2., 2., 2., 2.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); From f2723d8d76c9d69eccd42473addc7eddb4d54005 Mon Sep 17 00:00:00 2001 From: jironglin Date: Tue, 22 Sep 2020 01:34:58 +0000 Subject: [PATCH 05/20] broadcastsd_pd, broadcastss_ps --- crates/core_arch/avx512f.md | 4 ++-- crates/core_arch/src/x86/avx512f.rs | 30 +++++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 10 +++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index a5f95b23ef..7118655272 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -24,8 +24,8 @@ * [ ] [`_mm512_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i64x4&expand=5236) * [ ] [`_mm512_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=5236) * [ ] [`_mm512_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=5236) - * [ ] 
[`_mm512_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=5236)
- * [ ] [`_mm512_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastss_ps&expand=5236)
+ * [x] [`_mm512_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=5236)
+ * [x] [`_mm512_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastss_ps&expand=5236)
  * [x] [`_mm512_castpd128_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd128_pd512&expand=5236)
  * [x] [`_mm512_castpd256_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd256_pd512&expand=5236)
  * [x] [`_mm512_castpd512_pd128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd128&expand=5236)
  * [x] [`_mm512_castpd512_pd256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd256&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 7ae09d463e..fcc58ff560 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -10408,8 +10408,26 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d {
     transmute(a)
 }
 
-/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.
 ///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastsd_pd&expand=567)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d {
+    simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastss_ps&expand=578)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 {
+    simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+}
+
 /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst.
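// ---------------------------------------------------------------------------
// Reviewer note (not part of the patch): usage sketch for the two element
// broadcasts above — both splat lane 0 of their 128-bit source across the full
// 512-bit register. Hypothetical helper, assuming std::arch with AVX512F:
#[target_feature(enable = "avx512f")]
unsafe fn splat_f32(x: f32) -> std::arch::x86_64::__m512 {
    use std::arch::x86_64::*;
    let v = _mm_set_ss(x); // x in lane 0, zeros elsewhere
    _mm512_broadcastss_ps(v) // all 16 lanes now hold x
}
// ---------------------------------------------------------------------------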
 ///
@@ -18644,6 +18758,16 @@ mod tests {
         assert_eq_m512i(r, e);
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_broadcastss_ps() {
+        let a = _mm_set_ps(
+            17., 18., 19., 20.,
+        );
+        let r = _mm512_broadcastss_ps(a);
+        let e = _mm512_set1_ps(20.);
+        assert_eq_m512(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_and_epi32() {
         let a = _mm512_set_epi32(
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 327d78e2ec..7401a96ecf 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -4406,6 +4406,16 @@ mod tests {
         assert_eq_m512d(r, e);
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_broadcastsd_pd() {
+        let a = _mm_set_pd(
+            17., 18.,
+        );
+        let r = _mm512_broadcastsd_pd(a);
+        let e = _mm512_set1_pd(18.);
+        assert_eq_m512d(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_and_epi64() {
         let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
From def78a5eab34706f65f036d7b90148f742481a36 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Tue, 22 Sep 2020 15:22:48 +0000
Subject: [PATCH 06/20] broadcastd_epi32, broadcastq_epi64

---
 crates/core_arch/avx512f.md            |  20 +--
 crates/core_arch/src/x86/avx512f.rs    | 186 ++++++++++++++++++++++++-
 crates/core_arch/src/x86_64/avx512f.rs |  60 ++++++++
 3 files changed, 252 insertions(+), 14 deletions(-)

diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 7118655272..fd4916c930 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -22,8 +22,8 @@
  * [ ] [`_mm512_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f64x4&expand=5236)
  * [ ] [`_mm512_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i32x4&expand=5236)
  * [ ] [`_mm512_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i64x4&expand=5236)
- * [ ] [`_mm512_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=5236)
- * [ ] [`_mm512_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=5236)
+ * [x] [`_mm512_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=5236)
+ * [x] [`_mm512_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=5236)
  * [x] [`_mm512_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=5236)
  * [x] [`_mm512_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastss_ps&expand=5236)
@@ -298,10 +298,10 @@
  * [ ] [`_mm512_mask_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f64x4&expand=5236)
  * [ ] [`_mm512_mask_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i32x4&expand=5236)
  * [ ]
[`_mm512_mask_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i64x4&expand=5236) - * [ ] [`_mm512_mask_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastd_epi32&expand=5236) - * [ ] [`_mm512_mask_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastq_epi64&expand=5236) - * [ ] [`_mm512_mask_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastsd_pd&expand=5236) - * [ ] [`_mm512_mask_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastss_ps&expand=5236) + * [x] [`_mm512_mask_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastd_epi32&expand=5236) + * [x] [`_mm512_mask_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastq_epi64&expand=5236) + * [x] [`_mm512_mask_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastsd_pd&expand=5236) + * [x] [`_mm512_mask_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastss_ps&expand=5236) * [x] [`_mm512_mask_cmp_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi32_mask&expand=5236) * [x] [`_mm512_mask_cmp_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi64_mask&expand=5236) * [x] [`_mm512_mask_cmp_epu32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epu32_mask&expand=5236) @@ -694,10 +694,10 @@ * [ ] [`_mm512_maskz_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f64x4&expand=5236) * [ ] [`_mm512_maskz_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i32x4&expand=5236) * [ ] [`_mm512_maskz_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i64x4&expand=5236) - * [ ] [`_mm512_maskz_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastd_epi32&expand=5236) - * [ ] [`_mm512_maskz_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastq_epi64&expand=5236) - * [ ] [`_mm512_maskz_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastsd_pd&expand=5236) - * [ ] [`_mm512_maskz_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastss_ps&expand=5236) + * [x] [`_mm512_maskz_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastd_epi32&expand=5236) + * [x] [`_mm512_maskz_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastq_epi64&expand=5236) + * [x] [`_mm512_maskz_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastsd_pd&expand=5236) + * [x] [`_mm512_maskz_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastss_ps&expand=5236) * [ ] 
[`_mm512_maskz_compress_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi32&expand=5236)
  * [ ] [`_mm512_maskz_compress_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi64&expand=5236)
  * [ ] [`_mm512_maskz_compress_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_pd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index fcc58ff560..5f6a06705e 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -10408,16 +10408,74 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d {
     transmute(a)
 }
 
-/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.
+/// Broadcast the low packed 32-bit integer from a to all elements of dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastsd_pd&expand=567)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastd_epi32&expand=545)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vbroadcastsd))]
-pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d {
+#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i {
+    let a = _mm512_castsi128_si512(a).as_i32x16();
+    let ret = simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+    transmute::<i32x16, _>(ret)
+}
+
+/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastd_epi32&expand=546)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm512_mask_broadcastd_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
+    let broadcast = _mm512_broadcastd_epi32(a).as_i32x16();
+    transmute(simd_select_bitmask(k, broadcast, src.as_i32x16()))
+}
+
+/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastd_epi32&expand=547)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i {
+    let broadcast = _mm512_broadcastd_epi32(a).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastq_epi64&expand=560)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastq
+pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i {
+    simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastq_epi64&expand=561)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq
+pub unsafe fn _mm512_mask_broadcastq_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+    let broadcast = _mm512_broadcastq_epi64(a).as_i64x8();
+    transmute(simd_select_bitmask(k, broadcast, src.as_i64x8()))
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastq_epi64&expand=562)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq
+pub unsafe fn _mm512_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m512i {
+    let broadcast = _mm512_broadcastq_epi64(a).as_i64x8();
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
 /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastss_ps&expand=578)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 {
     simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 }
 
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastss_ps&expand=579)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm512_mask_broadcastss_ps(src: __m512, k: __mmask16, a: __m128) -> __m512 {
+    let broadcast = _mm512_broadcastss_ps(a).as_f32x16();
+    transmute(simd_select_bitmask(k, broadcast, src.as_f32x16()))
+}
+
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastss_ps&expand=580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm512_maskz_broadcastss_ps(k: __mmask16, a: __m128) -> __m512 {
+    let broadcast = _mm512_broadcastss_ps(a).as_f32x16();
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastsd_pd&expand=567)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d {
+    simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
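// ---------------------------------------------------------------------------
// Reviewer note (not part of the patch): the mask_/maskz_ pairs added in this
// patch differ only in what lands in lanes whose mask bit is clear — the
// corresponding lane of `src` (writemask) versus zero (zeromask). A minimal
// sketch with a hypothetical helper, assuming AVX512F:
#[target_feature(enable = "avx512f")]
unsafe fn masked_broadcast_demo(src: std::arch::x86_64::__m512, a: std::arch::x86_64::__m128) {
    use std::arch::x86_64::*;
    // lanes 0..7 take a[0]; lanes 8..15 are copied from src
    let merged = _mm512_mask_broadcastss_ps(src, 0b00000000_11111111, a);
    // lanes 0..7 take a[0]; lanes 8..15 are zeroed
    let zeroed = _mm512_maskz_broadcastss_ps(0b00000000_11111111, a);
    let _ = (merged, zeroed);
}
// ---------------------------------------------------------------------------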
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastsd_pd&expand=568) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub unsafe fn _mm512_mask_broadcastsd_pd(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { + let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastsd_pd&expand=569) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { + let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi32&expand=272) @@ -18644,6 +18758,43 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcastd_epi32() { + let a = _mm_set_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_broadcastd_epi32(a); + let e = _mm512_set1_epi32(20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcastd_epi32() { + let src = _mm512_set1_epi32(20); + let a = _mm_set_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_mask_broadcastd_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcastd_epi32(src, 0b11111111_11111111, a); + let e = _mm512_set1_epi32(20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcastd_epi32() { + let a = _mm_set_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_maskz_broadcastd_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_broadcastd_epi32(0b00000000_11111111, a); + let e = _mm512_setr_epi32( + 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcastss_ps() { let a = _mm_set_ps( @@ -18654,6 +18805,33 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcastss_ps() { + let src = _mm512_set1_ps(20.); + let a = _mm_set_ps( + 17., 18., 19., 20., + ); + let r = _mm512_mask_broadcastss_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_broadcastss_ps(src, 0b11111111_11111111, a); + let e = _mm512_set1_ps(20.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcastss_ps() { + let a = _mm_set_ps( + 17., 18., 19., 20., + ); + let r = _mm512_maskz_broadcastss_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_broadcastss_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 20., 20., 20., 20., 20., 20., 20., 20., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 
7401a96ecf..8f92b73e34 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -4406,6 +4406,41 @@ mod tests {
         assert_eq_m512d(r, e);
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_broadcastq_epi64() {
+        let a = _mm_set_epi64x(
+            17, 18,
+        );
+        let r = _mm512_broadcastq_epi64(a);
+        let e = _mm512_set1_epi64(18);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_broadcastq_epi64() {
+        let src = _mm512_set1_epi64(18);
+        let a = _mm_set_epi64x(
+            17, 18,
+        );
+        let r = _mm512_mask_broadcastq_epi64(src, 0, a);
+        assert_eq_m512i(r, src);
+        let r = _mm512_mask_broadcastq_epi64(src, 0b01111111, a);
+        let e = _mm512_set1_epi64(18);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_maskz_broadcastq_epi64() {
+        let a = _mm_set_epi64x(
+            17, 18,
+        );
+        let r = _mm512_maskz_broadcastq_epi64(0, a);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_broadcastq_epi64(0b00001111, a);
+        let e = _mm512_set_epi64(0, 0, 0, 0, 18, 18, 18, 18);
+        assert_eq_m512i(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_broadcastsd_pd() {
         let a = _mm_set_pd(
@@ -4416,6 +4451,31 @@ mod tests {
         assert_eq_m512d(r, e);
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_broadcastsd_pd() {
+        let src = _mm512_set1_pd(18.);
+        let a = _mm_set_pd(
+            17., 18.,
+        );
+        let r = _mm512_mask_broadcastsd_pd(src, 0, a);
+        assert_eq_m512d(r, src);
+        let r = _mm512_mask_broadcastsd_pd(src, 0b01111111, a);
+        let e = _mm512_set1_pd(18.);
+        assert_eq_m512d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_maskz_broadcastsd_pd() {
+        let a = _mm_set_pd(
+            17., 18.,
+        );
+        let r = _mm512_maskz_broadcastsd_pd(0, a);
+        assert_eq_m512d(r, _mm512_setzero_pd());
+        let r = _mm512_maskz_broadcastsd_pd(0b00001111, a);
+        let e = _mm512_set_pd(0., 0., 0., 0., 18., 18., 18., 18.);
+        assert_eq_m512d(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_and_epi64() {
         let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
From bc95e37b21a3a830851ee9e0e2b639b8906e779b Mon Sep 17 00:00:00 2001
From: jironglin
Date: Tue, 22 Sep 2020 20:50:26 +0000
Subject: [PATCH 07/20] broadcast_i32x4, broadcast_i64x4, broadcast_f32x4,
 broadcast_f64x4

---
 crates/core_arch/avx512f.md            |  24 +--
 crates/core_arch/src/x86/avx512f.rs    | 204 +++++++++++++++++++++++++
 crates/core_arch/src/x86_64/avx512f.rs |  70 +++++++++
 3 files changed, 286 insertions(+), 12 deletions(-)

diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index fd4916c930..815fc1091d 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -18,10 +18,10 @@
  * [ ] [`_mm512_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi32&expand=5236)
  * [ ] [`_mm512_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi64&expand=5236)
  * [ ] [`_mm512_andnot_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_si512&expand=5236)
- * [ ] [`_mm512_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f32x4&expand=5236)
- * [ ] [`_mm512_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f64x4&expand=5236)
- * [ ]
[`_mm512_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i32x4&expand=5236) - * [ ] [`_mm512_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i64x4&expand=5236) + * [x] [`_mm512_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f32x4&expand=5236) + * [x] [`_mm512_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f64x4&expand=5236) + * [x] [`_mm512_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i32x4&expand=5236) + * [x] [`_mm512_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i64x4&expand=5236) * [x] [`_mm512_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=5236) * [x] [`_mm512_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=5236) * [x] [`_mm512_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=5236) @@ -294,10 +294,10 @@ * [ ] [`_mm512_mask_blend_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi64&expand=5236) * [ ] [`_mm512_mask_blend_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_pd&expand=5236) * [ ] [`_mm512_mask_blend_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ps&expand=5236) - * [ ] [`_mm512_mask_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f32x4&expand=5236) - * [ ] [`_mm512_mask_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f64x4&expand=5236) - * [ ] [`_mm512_mask_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i32x4&expand=5236) - * [ ] [`_mm512_mask_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i64x4&expand=5236) + * [x] [`_mm512_mask_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f32x4&expand=5236) + * [x] [`_mm512_mask_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f64x4&expand=5236) + * [x] [`_mm512_mask_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i32x4&expand=5236) + * [x] [`_mm512_mask_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i64x4&expand=5236) * [x] [`_mm512_mask_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastd_epi32&expand=5236) * [x] [`_mm512_mask_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastq_epi64&expand=5236) * [x] [`_mm512_mask_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastsd_pd&expand=5236) @@ -690,10 +690,10 @@ * [x] [`_mm512_maskz_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_and_epi64&expand=5236) * [ ] 
[`_mm512_maskz_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi32&expand=5236)
 * [ ] [`_mm512_maskz_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi64&expand=5236)
- * [ ] [`_mm512_maskz_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f32x4&expand=5236)
- * [ ] [`_mm512_maskz_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f64x4&expand=5236)
- * [ ] [`_mm512_maskz_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i32x4&expand=5236)
- * [ ] [`_mm512_maskz_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i64x4&expand=5236)
+ * [x] [`_mm512_maskz_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f32x4&expand=5236)
+ * [x] [`_mm512_maskz_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f64x4&expand=5236)
+ * [x] [`_mm512_maskz_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i32x4&expand=5236)
+ * [x] [`_mm512_maskz_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i64x4&expand=5236)
 * [x] [`_mm512_maskz_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastd_epi32&expand=5236)
 * [x] [`_mm512_maskz_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastq_epi64&expand=5236)
 * [x] [`_mm512_maskz_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastsd_pd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 5f6a06705e..099883e606 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -10542,6 +10542,140 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d {
     transmute(simd_select_bitmask(k, broadcast, zero))
 }
 
+/// Broadcast the 4 packed 32-bit integers from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinsert))] //should be vbroadcasti32x4
+pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i {
+    let a = _mm512_castsi128_si512(a).as_i32x16();
+    let ret = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
+    transmute::<i32x16, _>(ret)
+}
+
+/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
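+///
+/// An illustrative sketch of the writemask behavior (values are arbitrary and
+/// mirror the unit tests added later in this patch):
+///
+/// ```ignore
+/// let src = _mm512_set1_epi32(20);
+/// let a = _mm_set_epi32(17, 18, 19, 20);
+/// // k = 0: nothing is broadcast, every element is copied from src.
+/// let r = _mm512_mask_broadcast_i32x4(src, 0, a);
+/// // r == src
+/// // k = all ones: every element takes the broadcast of a.
+/// let r = _mm512_mask_broadcast_i32x4(src, 0b11111111_11111111, a);
+/// // r == _mm512_set_epi32(17, 18, 19, 20, 17, 18, 19, 20,
+/// //                       17, 18, 19, 20, 17, 18, 19, 20)
+/// ```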
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i32x4&expand=511) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshufi32x4))] //should be vbroadcasti32x4 +pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i32x4&expand=512) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshufi32x4))] //should be vbroadcasti32x4 +pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { + let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + +/// Broadcast the 4 packed 64-bit integers from a to all elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4))] //should be vbroadcasti64x4 +pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { + simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) +} + +/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i64x4&expand=523) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinserti64x4))] //should be vbroadcasti64x4 +pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { + let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) +} + +/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i64x4&expand=524) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinserti64x4))] //should be vbroadcasti64x4 +pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { + let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst. 
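+///
+/// Conceptually (a sketch, not Intel's text): result element `j` equals `a`
+/// element `j % 4`, so each of the four 128-bit lanes repeats `a`.
+///
+/// ```ignore
+/// let a = _mm_set_ps(17., 18., 19., 20.);
+/// let r = _mm512_broadcast_f32x4(a);
+/// // r == _mm512_set_ps(17., 18., 19., 20., 17., 18., 19., 20.,
+/// //                    17., 18., 19., 20., 17., 18., 19., 20.)
+/// ```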
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f32x4&expand=483) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf128))] //should be vbroadcastf32x4 +pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { + simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f32x4&expand=484) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf128))] //should be vbroadcastf32x4 +pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { + let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f32x4&expand=485) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf128))] //should be vbroadcastf32x4 +pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { + let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + +/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4))] //should be vbroadcastf64x4 +pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { + simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) +} + +/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f64x4&expand=496) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4))] //should be vbroadcastf64x4 +pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { + let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) +} + +/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
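+///
+/// Zeromask sketch (illustrative values, mirroring this patch's tests): only
+/// the lanes whose mask bit is set receive the broadcast; the rest become 0.
+///
+/// ```ignore
+/// let a = _mm256_set_pd(17., 18., 19., 20.);
+/// let r = _mm512_maskz_broadcast_f64x4(0b00001111, a);
+/// // r == _mm512_set_pd(0., 0., 0., 0., 17., 18., 19., 20.)
+/// ```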
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f64x4&expand=497) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4))] //should be vbroadcastf64x4 +pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { + let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi32&expand=272) @@ -18832,6 +18966,76 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcast_i32x4() { + let a = _mm_set_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_broadcast_i32x4(a); + let e = _mm512_set_epi32(17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcast_i32x4() { + let src = _mm512_set1_epi32(20); + let a = _mm_set_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_mask_broadcast_i32x4(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcast_i32x4(src, 0b11111111_11111111, a); + let e = _mm512_set_epi32(17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcast_i32x4() { + let a = _mm_set_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_maskz_broadcast_i32x4(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_broadcast_i32x4(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcast_f32x4() { + let a = _mm_set_ps( + 17., 18., 19., 20., + ); + let r = _mm512_broadcast_f32x4(a); + let e = _mm512_set_ps(17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcast_f32x4() { + let src = _mm512_set1_ps(20.); + let a = _mm_set_ps( + 17., 18., 19., 20., + ); + let r = _mm512_mask_broadcast_f32x4(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_broadcast_f32x4(src, 0b11111111_11111111, a); + let e = _mm512_set_ps(17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcast_f32x4() { + let a = _mm_set_ps( + 17., 18., 19., 20., + ); + let r = _mm512_maskz_broadcast_f32x4(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_broadcast_f32x4(0b00000000_11111111, a); + let e = _mm512_set_ps(0., 0., 0., 0., 0., 0., 0., 0., 17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 8f92b73e34..34f6b009b3 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4476,6 +4476,76 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcast_i64x4() { + let a = _mm256_set_epi64x( + 17, 18, 
19, 20
+        );
+        let r = _mm512_broadcast_i64x4(a);
+        let e = _mm512_set_epi64(17, 18, 19, 20, 17, 18, 19, 20);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_broadcast_i64x4() {
+        let src = _mm512_set1_epi64(18);
+        let a = _mm256_set_epi64x(
+            17, 18, 19, 20
+        );
+        let r = _mm512_mask_broadcast_i64x4(src, 0, a);
+        assert_eq_m512i(r, src);
+        let r = _mm512_mask_broadcast_i64x4(src, 0b11111111, a);
+        let e = _mm512_set_epi64(17, 18, 19, 20, 17, 18, 19, 20);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_maskz_broadcast_i64x4() {
+        let a = _mm256_set_epi64x(
+            17, 18, 19, 20
+        );
+        let r = _mm512_maskz_broadcast_i64x4(0, a);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_broadcast_i64x4(0b00001111, a);
+        let e = _mm512_set_epi64(0, 0, 0, 0, 17, 18, 19, 20);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_broadcast_f64x4() {
+        let a = _mm256_set_pd(
+            17., 18., 19., 20.
+        );
+        let r = _mm512_broadcast_f64x4(a);
+        let e = _mm512_set_pd(17., 18., 19., 20., 17., 18., 19., 20.);
+        assert_eq_m512d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_broadcast_f64x4() {
+        let src = _mm512_set1_pd(18.);
+        let a = _mm256_set_pd(
+            17., 18., 19., 20.
+        );
+        let r = _mm512_mask_broadcast_f64x4(src, 0, a);
+        assert_eq_m512d(r, src);
+        let r = _mm512_mask_broadcast_f64x4(src, 0b11111111, a);
+        let e = _mm512_set_pd(17., 18., 19., 20., 17., 18., 19., 20.);
+        assert_eq_m512d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_maskz_broadcast_f64x4() {
+        let a = _mm256_set_pd(
+            17., 18., 19., 20.
+        );
+        let r = _mm512_maskz_broadcast_f64x4(0, a);
+        assert_eq_m512d(r, _mm512_setzero_pd());
+        let r = _mm512_maskz_broadcast_f64x4(0b00001111, a);
+        let e = _mm512_set_pd(0., 0., 0., 0., 17., 18., 19., 20.);
+        assert_eq_m512d(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_and_epi64() {
         let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);

From 757df4162042c8cdc583c2ee93c9d1012c6dcc9f Mon Sep 17 00:00:00 2001
From: jironglin
Date: Wed, 23 Sep 2020 01:27:34 +0000
Subject: [PATCH 08/20] andnot: epi32, epi64

---
 crates/core_arch/avx512f.md            |  14 ++--
 crates/core_arch/src/x86/avx512f.rs    | 109 +++++++++++++++++++++++++
 crates/core_arch/src/x86_64/avx512f.rs |  42 ++++++++++
 3 files changed, 158 insertions(+), 7 deletions(-)

diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 815fc1091d..2cd567cce0 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -15,9 +15,9 @@
 * [x] [`_mm512_and_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_epi32&expand=5236)
 * [x] [`_mm512_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_epi64&expand=5236)
 * [x] [`_mm512_and_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_si512&expand=5236)
- * [ ] [`_mm512_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi32&expand=5236)
- * [ ] [`_mm512_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi64&expand=5236)
- * [ ] [`_mm512_andnot_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_si512&expand=5236)
+ * [x]
[`_mm512_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi32&expand=5236) + * [x] [`_mm512_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi64&expand=5236) + * [x] [`_mm512_andnot_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_si512&expand=5236) * [x] [`_mm512_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f32x4&expand=5236) * [x] [`_mm512_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f64x4&expand=5236) * [x] [`_mm512_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i32x4&expand=5236) @@ -288,8 +288,8 @@ * [ ] [`_mm512_mask_alignr_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_alignr_epi64&expand=5236) * [x] [`_mm512_mask_and_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_and_epi32&expand=5236) * [x] [`_mm512_mask_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_and_epi64&expand=5236) - * [ ] [`_mm512_mask_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi32&expand=5236) - * [ ] [`_mm512_mask_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi64&expand=5236) + * [x] [`_mm512_mask_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi32&expand=5236) + * [x] [`_mm512_mask_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi64&expand=5236) * [ ] [`_mm512_mask_blend_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi32&expand=5236) * [ ] [`_mm512_mask_blend_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi64&expand=5236) * [ ] [`_mm512_mask_blend_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_pd&expand=5236) @@ -688,8 +688,8 @@ * [ ] [`_mm512_maskz_alignr_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_alignr_epi64&expand=5236) * [x] [`_mm512_maskz_and_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_and_epi32&expand=5236) * [x] [`_mm512_maskz_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_and_epi64&expand=5236) - * [ ] [`_mm512_maskz_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi32&expand=5236) - * [ ] [`_mm512_maskz_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi64&expand=5236) + * [x] [`_mm512_maskz_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi32&expand=5236) + * [x] [`_mm512_maskz_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi64&expand=5236) * [x] [`_mm512_maskz_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f32x4&expand=5236) * [x] [`_mm512_maskz_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f64x4&expand=5236) * [x] 
[`_mm512_maskz_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i32x4&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 099883e606..231856e268 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -10904,6 +10904,82 @@ pub unsafe fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i {
     transmute(simd_xor(a.as_i32x16(), b.as_i32x16()))
 }
 
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_andnot_epi32&expand=310)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnq))] //should be vpandnd
+pub unsafe fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i {
+    _mm512_and_epi32(_mm512_xor_epi32(a, _mm512_set1_epi32(u32::MAX as i32)), b)
+}
+
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_andnot_epi32&expand=311)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnd))]
+pub unsafe fn _mm512_mask_andnot_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let andnot = _mm512_andnot_epi32(a, b).as_i32x16();
+    transmute(simd_select_bitmask(k, andnot, src.as_i32x16()))
+}
+
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_andnot_epi32&expand=312)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnd))]
+pub unsafe fn _mm512_maskz_andnot_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let andnot = _mm512_andnot_epi32(a, b).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in a and then AND with b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_andnot_epi64&expand=317)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnq))]
+pub unsafe fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i {
+    _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b)
+}
+
+/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
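+///
+/// The underlying identity (a sketch matching the implementation above):
+/// `andnot(a, b) == (!a) & b`, computed per 64-bit element and then blended
+/// with `src` under the writemask.
+///
+/// ```ignore
+/// let a = _mm512_set1_epi64(0b0110);
+/// let b = _mm512_set1_epi64(0b1100);
+/// let r = _mm512_mask_andnot_epi64(a, 0b11111111, a, b);
+/// // every element: (!0b0110) & 0b1100 == 0b1000
+/// ```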
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_andnot_epi64&expand=318) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub unsafe fn _mm512_mask_andnot_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, andnot, src.as_i64x8())) +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_andnot_epi64&expand=319) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub unsafe fn _mm512_maskz_andnot_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, andnot, zero)) +} + +/// Compute the bitwise NOT of 512 bits (representing integer data) in a and then AND with b, and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_andnot_si512&expand=340) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i { + _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b) +} + /// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kand_mask16&expand=3212) @@ -19706,6 +19782,39 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_andnot_epi32() { + let a = _mm512_set1_epi32(0); + let b = _mm512_set1_epi32(1<<3 | 1<< 4); + let r = _mm512_andnot_epi32(a, b); + let e = _mm512_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_andnot_epi32() { + let a = _mm512_set1_epi32(1<<1 | 1<< 2); + let b = _mm512_set1_epi32(1<<3 | 1<< 4); + let r = _mm512_mask_andnot_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + + let r = _mm512_mask_andnot_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_andnot_epi32() { + let a = _mm512_set1_epi32(1<<1 | 1<< 2); + let b = _mm512_set1_epi32(1<<3 | 1<< 4); + let r = _mm512_maskz_andnot_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + + let r = _mm512_maskz_andnot_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_kand() { let a: u16 = 0b11001100_00110011; diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 34f6b009b3..a9692b358d 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4704,4 +4704,46 @@ mod tests { let e = _mm512_set_epi64(1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } + + #[simd_test(enable 
= "avx512f")] + unsafe fn test_mm512_andnot_epi64() { + let a = _mm512_set1_epi64(0); + let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let r = _mm512_andnot_epi64(a, b); + let e = _mm512_set1_epi64(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_andnot_epi64() { + let a = _mm512_set1_epi64(1<<1 | 1<< 2); + let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let r = _mm512_mask_andnot_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + + let r = _mm512_mask_andnot_epi64(a, 0b11111111, a, b); + let e = _mm512_set1_epi64(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_andnot_epi64() { + let a = _mm512_set1_epi64(1<<1 | 1<< 2); + let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let r = _mm512_maskz_andnot_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + + let r = _mm512_maskz_andnot_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_andnot_si512() { + let a = _mm512_set1_epi64(0); + let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let r = _mm512_andnot_si512(a, b); + let e = _mm512_set1_epi64(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } } From 2370f24a05ae89263c9a27f07765ce6e0526b28b Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 23 Sep 2020 15:05:27 +0000 Subject: [PATCH 09/20] insertf32x4, insertf64x4 --- crates/core_arch/avx512f.md | 12 +-- crates/core_arch/src/x86/avx512f.rs | 143 ++++++++++++++++++++++--- crates/core_arch/src/x86_64/avx512f.rs | 49 +++++++++ 3 files changed, 186 insertions(+), 18 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 2cd567cce0..e74d4e193a 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -222,8 +222,8 @@ * [x] [`_mm512_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi64&expand=5236) * [x] [`_mm512_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_pd&expand=5236) * [x] [`_mm512_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_ps&expand=5236) - * [ ] [`_mm512_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf32x4&expand=5236) - * [ ] [`_mm512_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf64x4&expand=5236) + * [x] [`_mm512_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf32x4&expand=5236) + * [x] [`_mm512_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf64x4&expand=5236) * [ ] [`_mm512_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti32x4&expand=5236) * [ ] [`_mm512_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti64x4&expand=5236) * [ ] [`_mm512_int2mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_int2mask&expand=5236) @@ -511,8 +511,8 @@ * [x] [`_mm512_mask_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_epi64&expand=5236) * [x] [`_mm512_mask_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_pd&expand=5236) * [x] 
[`_mm512_mask_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_ps&expand=5236) - * [ ] [`_mm512_mask_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf32x4&expand=5236) - * [ ] [`_mm512_mask_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf64x4&expand=5236) + * [x] [`_mm512_mask_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf32x4&expand=5236) + * [x] [`_mm512_mask_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf64x4&expand=5236) * [ ] [`_mm512_mask_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti32x4&expand=5236) * [ ] [`_mm512_mask_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti64x4&expand=5236) * [ ] [`_mm512_mask_load_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi32&expand=5236) @@ -809,8 +809,8 @@ * [x] [`_mm512_maskz_getmant_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ps&expand=5236) * [x] [`_mm512_maskz_getmant_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_pd&expand=5236) * [x] [`_mm512_maskz_getmant_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ps&expand=5236) - * [ ] [`_mm512_maskz_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf32x4&expand=5236) - * [ ] [`_mm512_maskz_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf64x4&expand=5236) + * [x] [`_mm512_maskz_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf32x4&expand=5236) + * [x] [`_mm512_maskz_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf64x4&expand=5236) * [ ] [`_mm512_maskz_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti32x4&expand=5236) * [ ] [`_mm512_maskz_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti64x4&expand=5236) * [ ] [`_mm512_maskz_load_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi32&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 231856e268..7fc2436810 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10228,24 +10228,109 @@ pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { let zero = _mm512_setzero_pd().as_f64x8(); transmute(simd_select_bitmask(k, mov, zero)) } -/* + /// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8. 
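+///
+/// Sketch of the imm8 selection (illustrative values; imm8 & 3 picks which
+/// 128-bit lane of the result is replaced by b):
+///
+/// ```ignore
+/// let a = _mm512_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.,
+///                        9., 10., 11., 12., 13., 14., 15., 16.);
+/// let b = _mm_setr_ps(17., 18., 19., 20.);
+/// let r = _mm512_insertf32x4(a, b, 0); // replace lane 0
+/// // r == _mm512_setr_ps(17., 18., 19., 20., 5., 6., 7., 8.,
+/// //                     9., 10., 11., 12., 13., 14., 15., 16.)
+/// ```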
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_insertf32x4&expand=3155) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf32x4))] +#[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] +#[rustc_args_required_const(2)] pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128, imm8: i32) -> __m512 { - let a = a.as_f32x16(); - let b = b.as_f32x4(); - match imm8 & 3 { - 0 => transmute(simd_insert(a, 0, b)), - 1 => transmute(simd_insert(a, 1, b)), - 2 => transmute(simd_insert(a, 2, b)), - _ => transmute(simd_insert(a, 3, b)), + let b = _mm512_castps128_ps512(b); + match imm8 & 0b11 { + 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), + 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_insertf32x4&expand=3156) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_insertf32x4(src: __m512, k: __mmask16, a: __m512, b: __m128, imm8: i32) -> __m512 { + let b = _mm512_castps128_ps512(b); + let insert = match imm8 & 0b11 { + 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), + 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + }; + transmute(simd_select_bitmask(k, insert, src.as_f32x16())) +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_insertf32x4&expand=3157) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128, imm8: i32) -> __m512 { + let b = _mm512_castps128_ps512(b); + let insert = match imm8 & 0b11 { + 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), + 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + }; + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, insert, zero)) +} + +/// Copy a to dst, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8. 
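+///
+/// Here imm8 & 1 selects which 256-bit half is replaced (a sketch with
+/// arbitrary values):
+///
+/// ```ignore
+/// let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+/// let b = _mm256_setr_pd(17., 18., 19., 20.);
+/// let r = _mm512_insertf64x4(a, b, 1); // replace the upper half
+/// // r == _mm512_setr_pd(1., 2., 3., 4., 17., 18., 19., 20.)
+/// ```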
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_insertf64x4&expand=3167) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_insertf64x4(a: __m512d, b: __m256d, imm8: i32) -> __m512d { + let b = _mm512_castpd256_pd512(b); + match imm8 & 0b1 { + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), } } -*/ + +/// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_insertf64x4&expand=3168) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_insertf64x4(src: __m512d, k: __mmask8, a: __m512d, b: __m256d, imm8: i32) -> __m512d { + let b = _mm512_castpd256_pd512(b); + let insert = match imm8 & 0b1 { + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + }; + transmute(simd_select_bitmask(k, insert, src.as_f64x8())) +} + +/// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_insertf64x4&expand=3169) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_insertf64x4(k: __mmask8, a: __m512d, b: __m256d, imm8: i32) -> __m512d { + let b = _mm512_castpd256_pd512(b); + let insert = match imm8 & 0b1 { + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + }; + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, insert, zero)) +} + /// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
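+///
+/// A usage sketch (not Intel text): only the low 128 bits of the result are
+/// meaningful. The present implementation happens to fill the upper lanes with
+/// -1. (as the test below expects), but callers must not rely on that; the
+/// upper 384 bits are formally undefined.
+///
+/// ```ignore
+/// let a = _mm_setr_ps(17., 18., 19., 20.);
+/// let r = _mm512_castps128_ps512(a);
+/// // elements 0..4 of r are 17., 18., 19., 20.; treat the rest as undefined
+/// ```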
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps128_ps512&expand=621) @@ -18888,7 +18973,7 @@ mod tests { ); assert_eq_m512(r, e); } -/* + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_insertf32x4() { let a = _mm512_setr_ps( @@ -18903,7 +18988,41 @@ mod tests { ); assert_eq_m512(r, e); } -*/ + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_insertf32x4() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm_setr_ps( + 17., 18., 19., 20., + ); + let r = _mm512_mask_insertf32x4(a, 0, a, b, 0); + assert_eq_m512(r, a); + let r = _mm512_mask_insertf32x4(a, 0b11111111_11111111, a, b, 0); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_insertf32x4() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm_setr_ps( + 17., 18., 19., 20., + ); + let r = _mm512_maskz_insertf32x4(0, a, b, 0); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_insertf32x4(0b00000000_11111111, a, b, 0); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castps128_ps512() { let a = _mm_setr_ps( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index a9692b358d..83b884dc8d 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4278,6 +4278,55 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_insertf64x4() { + let a = _mm512_setr_pd( + 1., 2., 3., 4., 5., 6., 7., 8., + ); + let b = _mm256_setr_pd( + 17., 18., 19., 20., + ); + let r = _mm512_insertf64x4(a, b, 1); + let e = _mm512_setr_pd( + 1., 2., 3., 4., 17., 18., 19., 20., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_insertf64x4() { + let a = _mm512_setr_pd( + 1., 2., 3., 4., 5., 6., 7., 8., + ); + let b = _mm256_setr_pd( + 17., 18., 19., 20., + ); + let r = _mm512_mask_insertf64x4(a, 0, a, b, 1); + assert_eq_m512d(r, a); + let r = _mm512_mask_insertf64x4(a, 0b11111111, a, b, 1); + let e = _mm512_setr_pd( + 1., 2., 3., 4., 17., 18., 19., 20., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_insertf64x4() { + let a = _mm512_setr_pd( + 1., 2., 3., 4., 5., 6., 7., 8., + ); + let b = _mm256_setr_pd( + 17., 18., 19., 20., + ); + let r = _mm512_maskz_insertf64x4(0, a, b, 1); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_insertf64x4(0b00001111, a, b, 1); + let e = _mm512_setr_pd( + 1., 2., 3., 4., 0., 0., 0., 0., + ); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castpd128_pd512() { let a = _mm_setr_pd( From bec0e99f063d952372e39e2f9c5853387de5088a Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 23 Sep 2020 16:51:08 +0000 Subject: [PATCH 10/20] inserti32x4, inserti64x4 --- crates/core_arch/avx512f.md | 12 +- crates/core_arch/src/x86/avx512f.rs | 163 ++++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 49 ++++++++ 3 files changed, 214 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 
e74d4e193a..df6170e076 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -224,8 +224,8 @@ * [x] [`_mm512_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_ps&expand=5236) * [x] [`_mm512_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf32x4&expand=5236) * [x] [`_mm512_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf64x4&expand=5236) - * [ ] [`_mm512_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti32x4&expand=5236) - * [ ] [`_mm512_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti64x4&expand=5236) + * [x] [`_mm512_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti32x4&expand=5236) + * [x] [`_mm512_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti64x4&expand=5236) * [ ] [`_mm512_int2mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_int2mask&expand=5236) * [x] [`_mm512_kand`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kand&expand=5236) * [x] [`_mm512_kandn`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kandn&expand=5236) @@ -513,8 +513,8 @@ * [x] [`_mm512_mask_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_ps&expand=5236) * [x] [`_mm512_mask_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf32x4&expand=5236) * [x] [`_mm512_mask_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf64x4&expand=5236) - * [ ] [`_mm512_mask_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti32x4&expand=5236) - * [ ] [`_mm512_mask_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti64x4&expand=5236) + * [x] [`_mm512_mask_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti32x4&expand=5236) + * [x] [`_mm512_mask_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti64x4&expand=5236) * [ ] [`_mm512_mask_load_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi32&expand=5236) * [ ] [`_mm512_mask_load_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi64&expand=5236) * [ ] [`_mm512_mask_load_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_pd&expand=5236) @@ -811,8 +811,8 @@ * [x] [`_mm512_maskz_getmant_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ps&expand=5236) * [x] [`_mm512_maskz_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf32x4&expand=5236) * [x] [`_mm512_maskz_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf64x4&expand=5236) - * [ ] [`_mm512_maskz_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti32x4&expand=5236) - * [ ] [`_mm512_maskz_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti64x4&expand=5236) + * [x] 
[`_mm512_maskz_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti32x4&expand=5236) + * [x] [`_mm512_maskz_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti64x4&expand=5236) * [ ] [`_mm512_maskz_load_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi32&expand=5236) * [ ] [`_mm512_maskz_load_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi64&expand=5236) * [ ] [`_mm512_maskz_load_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7fc2436810..60ebf8663b 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10229,6 +10229,112 @@ pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { transmute(simd_select_bitmask(k, mov, zero)) } +/// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_inserti32x4&expand=3174) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] //should be vinserti32x4 +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_inserti32x4(a: __m512i, b: __m128i, imm8: i32) -> __m512i { + let a = a.as_i32x16(); + let b = _mm512_castsi128_si512(b).as_i32x16(); + let ret: i32x16 = match imm8 & 0b11 { + 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), + 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + }; + transmute(ret) +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_inserti32x4&expand=3175) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinserti32x4, imm8 = 2))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_inserti32x4(src: __m512i, k: __mmask16, a: __m512i, b: __m128i, imm8: i32) -> __m512i { + let a = a.as_i32x16(); + let b = _mm512_castsi128_si512(b).as_i32x16(); + let insert: i32x16 = match imm8 & 0b11 { + 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), + 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + }; + transmute(simd_select_bitmask(k, insert, src.as_i32x16())) +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
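+///
+/// Zeromask sketch (illustrative values, matching this patch's tests):
+///
+/// ```ignore
+/// let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8,
+///                           9, 10, 11, 12, 13, 14, 15, 16);
+/// let b = _mm_setr_epi32(17, 18, 19, 20);
+/// let r = _mm512_maskz_inserti32x4(0b00000000_11111111, a, b, 0);
+/// // the low 8 lanes keep tmp, the high 8 lanes are zeroed:
+/// // r == _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0)
+/// ```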
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_inserti32x4&expand=3176) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinserti32x4, imm8 = 2))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i, imm8: i32) -> __m512i { + let a = a.as_i32x16(); + let b = _mm512_castsi128_si512(b).as_i32x16(); + let insert = match imm8 & 0b11 { + 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), + 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + }; + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, insert, zero)) +} + +/// Copy a to dst, then insert 256 bits (composed of 4 packed 64-bit integers) from b into dst at the location specified by imm8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_inserti64x4&expand=3186) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] //should be vinserti64x4 +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_inserti64x4(a: __m512i, b: __m256i, imm8: i32) -> __m512i { + let b = _mm512_castsi256_si512(b); + match imm8 & 0b1 { + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + } +} + +/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_inserti64x4&expand=3187) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinserti64x4, imm8 = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_inserti64x4(src: __m512i, k: __mmask8, a: __m512i, b: __m256i, imm8: i32) -> __m512i { + let b = _mm512_castsi256_si512(b); + let insert = match imm8 & 0b1 { + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + }; + transmute(simd_select_bitmask(k, insert, src.as_i64x8())) +} + +/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_inserti64x4&expand=3188)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinserti64x4, imm8 = 1))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i, imm8: i32) -> __m512i {
+    let b = _mm512_castsi256_si512(b);
+    let insert = match imm8 & 0b1 {
+        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+        _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+    };
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, insert, zero))
+}
+
 /// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_insertf32x4&expand=3155)
@@ -10501,8 +10607,8 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d {
 #[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd
 pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i {
     let a = _mm512_castsi128_si512(a).as_i32x16();
-    let ret = simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
-    transmute::<i32x16, _>(ret)
+    let ret: i32x16 = simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+    transmute(ret)
 }
 
 /// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -10635,8 +10741,8 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d {
 #[cfg_attr(test, assert_instr(vinsert))] //should be vbroadcasti32x4
 pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i {
     let a = _mm512_castsi128_si512(a).as_i32x16();
-    let ret = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
-    transmute::<i32x16, _>(ret)
+    let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
+    transmute(ret)
 }
 
 /// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -18974,6 +19080,55 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_inserti32x4() { + let a = _mm512_setr_epi32( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + let b = _mm_setr_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_inserti32x4(a, b, 0); + let e = _mm512_setr_epi32( + 17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_inserti32x4() { + let a = _mm512_setr_epi32( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + let b = _mm_setr_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_mask_inserti32x4(a, 0, a, b, 0); + assert_eq_m512i(r, a); + let r = _mm512_mask_inserti32x4(a, 0b11111111_11111111, a, b, 0); + let e = _mm512_setr_epi32( + 17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_inserti32x4() { + let a = _mm512_setr_epi32( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + let b = _mm_setr_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_maskz_inserti32x4(0, a, b, 0); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_inserti32x4(0b00000000_11111111, a, b, 0); + let e = _mm512_setr_epi32( + 17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_insertf32x4() { let a = _mm512_setr_ps( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 83b884dc8d..b767227b8a 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4278,6 +4278,55 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_inserti64x4() { + let a = _mm512_setr_epi64( + 1, 2, 3, 4, 5, 6, 7, 8, + ); + let b = _mm256_setr_epi64x( + 17, 18, 19, 20, + ); + let r = _mm512_inserti64x4(a, b, 1); + let e = _mm512_setr_epi64( + 1, 2, 3, 4, 17, 18, 19, 20, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_inserti64x4() { + let a = _mm512_setr_epi64( + 1, 2, 3, 4, 5, 6, 7, 8, + ); + let b = _mm256_setr_epi64x( + 17, 18, 19, 20, + ); + let r = _mm512_mask_inserti64x4(a, 0, a, b, 1); + assert_eq_m512i(r, a); + let r = _mm512_mask_inserti64x4(a, 0b11111111, a, b, 1); + let e = _mm512_setr_epi64( + 1, 2, 3, 4, 17, 18, 19, 20, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_inserti64x4() { + let a = _mm512_setr_epi64( + 1, 2, 3, 4, 5, 6, 7, 8, + ); + let b = _mm256_setr_epi64x( + 17, 18, 19, 20, + ); + let r = _mm512_maskz_inserti64x4(0, a, b, 1); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_inserti64x4(0b00001111, a, b, 1); + let e = _mm512_setr_epi64( + 1, 2, 3, 4, 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_insertf64x4() { let a = _mm512_setr_pd( From 907de2fd3b50de5765b925f4ef5a77faa74418da Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 23 Sep 2020 20:17:54 +0000 Subject: [PATCH 11/20] mask_blend: epi32,epi64,ps,pd --- crates/core_arch/avx512f.md | 8 ++-- crates/core_arch/src/x86/avx512f.rs | 58 ++++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 18 ++++++++ 3 files changed, 80 insertions(+), 4 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 
df6170e076..fb13b28524 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -290,10 +290,10 @@ * [x] [`_mm512_mask_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_and_epi64&expand=5236) * [x] [`_mm512_mask_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi32&expand=5236) * [x] [`_mm512_mask_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi64&expand=5236) - * [ ] [`_mm512_mask_blend_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi32&expand=5236) - * [ ] [`_mm512_mask_blend_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi64&expand=5236) - * [ ] [`_mm512_mask_blend_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_pd&expand=5236) - * [ ] [`_mm512_mask_blend_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ps&expand=5236) + * [x] [`_mm512_mask_blend_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi32&expand=5236) + * [x] [`_mm512_mask_blend_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi64&expand=5236) + * [x] [`_mm512_mask_blend_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_pd&expand=5236) + * [x] [`_mm512_mask_blend_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ps&expand=5236) * [x] [`_mm512_mask_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f32x4&expand=5236) * [x] [`_mm512_mask_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f64x4&expand=5236) * [x] [`_mm512_mask_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i32x4&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 60ebf8663b..07c3aebdad 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10867,6 +10867,46 @@ pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { transmute(simd_select_bitmask(k, broadcast, zero)) } +/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_epi32&expand=435) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd +pub unsafe fn _mm512_mask_blend_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + transmute(simd_select_bitmask(k, b.as_i32x16(), a.as_i32x16())) +} + +/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_epi64&expand=438) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq +pub unsafe fn _mm512_mask_blend_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + transmute(simd_select_bitmask(k, b.as_i64x8(), a.as_i64x8())) +} + +/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_ps&expand=451) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps +pub unsafe fn _mm512_mask_blend_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + transmute(simd_select_bitmask(k, b.as_f32x16(), a.as_f32x16())) +} + +/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_pd&expand=446) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd +pub unsafe fn _mm512_mask_blend_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + transmute(simd_select_bitmask(k, b.as_f64x8(), a.as_f64x8())) +} + /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi32&expand=272) @@ -19386,6 +19426,24 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(2); + let r = _mm512_mask_blend_epi32(0b11111111_00000000, a, b); + let e = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(2.); + let r = _mm512_mask_blend_ps(0b11111111_00000000, a, b); + let e = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index b767227b8a..796077f330 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4644,6 +4644,24 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(2); + let r = _mm512_mask_blend_epi64(0b11110000, a, b); + let e = _mm512_set_epi64(2, 2, 2, 2, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(2.); + let r = _mm512_mask_blend_pd(0b11110000, a, b); + let e = _mm512_set_pd(2., 2., 2., 2., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); From 6e08a62082d29d96dbdd110f2572cb720c63614c Mon Sep 17 00:00:00 2001 From: jironglin 
Date: Thu, 24 Sep 2020 00:06:07 +0000 Subject: [PATCH 12/20] unpackhi: epi32,epi64,ps,pd --- crates/core_arch/avx512f.md | 24 +- crates/core_arch/src/x86/avx512f.rs | 656 +++++++++++++++++++------ crates/core_arch/src/x86_64/avx512f.rs | 287 ++++++----- 3 files changed, 659 insertions(+), 308 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index fb13b28524..f7c19dfc43 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -666,10 +666,10 @@ * [ ] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236) * [ ] [`_mm512_mask_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5236) * [ ] [`_mm512_mask_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi64_mask&expand=5236) - * [ ] [`_mm512_mask_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi32&expand=5236) - * [ ] [`_mm512_mask_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=5236) - * [ ] [`_mm512_mask_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=5236) - * [ ] [`_mm512_mask_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_ps&expand=5236) + * [x] [`_mm512_mask_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi32&expand=5236) + * [x] [`_mm512_mask_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=5236) + * [x] [`_mm512_mask_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=5236) + * [x] [`_mm512_mask_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_ps&expand=5236) * [ ] [`_mm512_mask_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi32&expand=5236) * [ ] [`_mm512_mask_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi64&expand=5236) * [ ] [`_mm512_mask_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_pd&expand=5236) @@ -926,10 +926,10 @@ * [x] [`_mm512_maskz_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ps&expand=5236) * [ ] [`_mm512_maskz_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5236) * [ ] [`_mm512_maskz_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5236) - * [ ] [`_mm512_maskz_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=5236) - * [ ] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236) - * [ ] [`_mm512_maskz_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=5236) - * [ ] 
[`_mm512_maskz_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_ps&expand=5236) + * [x] [`_mm512_maskz_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=5236) + * [x] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236) + * [x] [`_mm512_maskz_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=5236) + * [x] [`_mm512_maskz_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_ps&expand=5236) * [ ] [`_mm512_maskz_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi32&expand=5236) * [ ] [`_mm512_maskz_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi64&expand=5236) * [ ] [`_mm512_maskz_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_pd&expand=5236) @@ -1112,10 +1112,10 @@ * [x] [`_mm512_undefined_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd&expand=5236) * [x] [`_mm512_undefined_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ps&expand=5236) * [ ] [`_mm512_undefined`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined&expand=5236) - * [ ] [`_mm512_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi32&expand=5236) - * [ ] [`_mm512_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi64&expand=5236) - * [ ] [`_mm512_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_pd&expand=5236) - * [ ] [`_mm512_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_ps&expand=5236) + * [x] [`_mm512_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi32&expand=5236) + * [x] [`_mm512_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi64&expand=5236) + * [x] [`_mm512_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_pd&expand=5236) + * [x] [`_mm512_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_ps&expand=5236) * [ ] [`_mm512_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi32&expand=5236) * [ ] [`_mm512_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi64&expand=5236) * [ ] [`_mm512_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 07c3aebdad..9963de1240 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10240,10 +10240,22 @@ pub unsafe fn _mm512_inserti32x4(a: __m512i, b: __m128i, imm8: i32) -> __m512i { let a = a.as_i32x16(); let b = _mm512_castsi128_si512(b).as_i32x16(); let ret: i32x16 = match imm8 & 0b11 { - 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16(a, b, 
[0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), - 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), - _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + 0 => simd_shuffle16( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 1 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 2 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), }; transmute(ret) } @@ -10255,14 +10267,32 @@ pub unsafe fn _mm512_inserti32x4(a: __m512i, b: __m128i, imm8: i32) -> __m512i { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vinserti32x4, imm8 = 2))] #[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_inserti32x4(src: __m512i, k: __mmask16, a: __m512i, b: __m128i, imm8: i32) -> __m512i { +pub unsafe fn _mm512_mask_inserti32x4( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m128i, + imm8: i32, +) -> __m512i { let a = a.as_i32x16(); let b = _mm512_castsi128_si512(b).as_i32x16(); let insert: i32x16 = match imm8 & 0b11 { - 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), - 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), - _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + 0 => simd_shuffle16( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 1 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 2 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), }; transmute(simd_select_bitmask(k, insert, src.as_i32x16())) } @@ -10278,10 +10308,22 @@ pub unsafe fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i, imm let a = a.as_i32x16(); let b = _mm512_castsi128_si512(b).as_i32x16(); let insert = match imm8 & 0b11 { - 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), - 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), - _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + 0 => simd_shuffle16( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 1 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 2 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), }; let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, insert, zero)) @@ -10297,8 +10339,8 @@ pub unsafe fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i, imm pub unsafe fn _mm512_inserti64x4(a: __m512i, b: __m256i, imm8: i32) -> __m512i { let b = _mm512_castsi256_si512(b); match imm8 & 0b1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), } } @@ -10309,11 +10351,17 
@@ pub unsafe fn _mm512_inserti64x4(a: __m512i, b: __m256i, imm8: i32) -> __m512i { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vinserti64x4, imm8 = 1))] #[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_inserti64x4(src: __m512i, k: __mmask8, a: __m512i, b: __m256i, imm8: i32) -> __m512i { +pub unsafe fn _mm512_mask_inserti64x4( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m256i, + imm8: i32, +) -> __m512i { let b = _mm512_castsi256_si512(b); let insert = match imm8 & 0b1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), }; transmute(simd_select_bitmask(k, insert, src.as_i64x8())) } @@ -10328,8 +10376,8 @@ pub unsafe fn _mm512_mask_inserti64x4(src: __m512i, k: __mmask8, a: __m512i, b: pub unsafe fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i, imm8: i32) -> __m512i { let b = _mm512_castsi256_si512(b); let insert = match imm8 & 0b1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), }; let zero = _mm512_setzero_si512().as_i64x8(); transmute(simd_select_bitmask(k, insert, zero)) @@ -10345,10 +10393,22 @@ pub unsafe fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i, imm8 pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128, imm8: i32) -> __m512 { let b = _mm512_castps128_ps512(b); match imm8 & 0b11 { - 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), - 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), - _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + 0 => simd_shuffle16( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 1 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 2 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), } } @@ -10359,13 +10419,31 @@ pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128, imm8: i32) -> __m512 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] #[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_insertf32x4(src: __m512, k: __mmask16, a: __m512, b: __m128, imm8: i32) -> __m512 { +pub unsafe fn _mm512_mask_insertf32x4( + src: __m512, + k: __mmask16, + a: __m512, + b: __m128, + imm8: i32, +) -> __m512 { let b = _mm512_castps128_ps512(b); let insert = match imm8 & 0b11 { - 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), - 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), - _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + 0 => simd_shuffle16( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 1 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 2 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], 
+ ), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), }; transmute(simd_select_bitmask(k, insert, src.as_f32x16())) } @@ -10380,10 +10458,22 @@ pub unsafe fn _mm512_mask_insertf32x4(src: __m512, k: __mmask16, a: __m512, b: _ pub unsafe fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128, imm8: i32) -> __m512 { let b = _mm512_castps128_ps512(b); let insert = match imm8 & 0b11 { - 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), - 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), - _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + 0 => simd_shuffle16( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 1 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 2 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), }; let zero = _mm512_setzero_ps().as_f32x16(); transmute(simd_select_bitmask(k, insert, zero)) @@ -10399,8 +10489,8 @@ pub unsafe fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128, imm8: pub unsafe fn _mm512_insertf64x4(a: __m512d, b: __m256d, imm8: i32) -> __m512d { let b = _mm512_castpd256_pd512(b); match imm8 & 0b1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), } } @@ -10411,11 +10501,17 @@ pub unsafe fn _mm512_insertf64x4(a: __m512d, b: __m256d, imm8: i32) -> __m512d { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] #[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_insertf64x4(src: __m512d, k: __mmask8, a: __m512d, b: __m256d, imm8: i32) -> __m512d { +pub unsafe fn _mm512_mask_insertf64x4( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m256d, + imm8: i32, +) -> __m512d { let b = _mm512_castpd256_pd512(b); let insert = match imm8 & 0b1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), }; transmute(simd_select_bitmask(k, insert, src.as_f64x8())) } @@ -10430,20 +10526,217 @@ pub unsafe fn _mm512_mask_insertf64x4(src: __m512d, k: __mmask8, a: __m512d, b: pub unsafe fn _mm512_maskz_insertf64x4(k: __mmask8, a: __m512d, b: __m256d, imm8: i32) -> __m512d { let b = _mm512_castpd256_pd512(b); let insert = match imm8 & 0b1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), }; let zero = _mm512_setzero_pd().as_f64x8(); transmute(simd_select_bitmask(k, insert, zero)) } +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_epi32&expand=6021)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhps))] //should be vpunpckhdq
+pub unsafe fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i {
+    let a = a.as_i32x16();
+    let b = b.as_i32x16();
+    let r: i32x16 = simd_shuffle16(
+        a,
+        b,
+        [
+            2,
+            18,
+            3,
+            19,
+            2 + 4,
+            18 + 4,
+            3 + 4,
+            19 + 4,
+            2 + 8,
+            18 + 8,
+            3 + 8,
+            19 + 8,
+            2 + 12,
+            18 + 12,
+            3 + 12,
+            19 + 12,
+        ],
+    );
+    transmute(r)
+}
+
+/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_epi32&expand=6019)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckhdq))]
+pub unsafe fn _mm512_mask_unpackhi_epi32(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16();
+    transmute(simd_select_bitmask(k, unpackhi, src.as_i32x16()))
+}
+
+/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_epi32&expand=6020)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckhdq))]
+pub unsafe fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_epi64&expand=6030)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq
+pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i {
+    simd_shuffle8(a, b, [1, 9, 3, 11, 1 + 4, 9 + 4, 3 + 4, 11 + 4])
+}
+
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_epi64&expand=6028)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckhqdq))]
+pub unsafe fn _mm512_mask_unpackhi_epi64(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8();
+    transmute(simd_select_bitmask(k, unpackhi, src.as_i64x8()))
+}
+
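The lane arithmetic in these shuffles is easier to audit against a scalar model. Per Intel's pseudocode, `unpackhi` interleaves the upper half of every 128-bit lane, taking the `a` element first and then the matching `b` element; for 64-bit elements that means indices `[1, 9, 3, 11, 1 + 4, 9 + 4, 3 + 4, 11 + 4]` into the concatenation of `a` and `b`. The sketch below is an editor's illustration, not part of the patch, and `unpackhi_model` is a hypothetical name.

```rust
// Scalar model of per-128-bit-lane unpackhi (illustration only).
// `per_lane` is the number of elements in one 128-bit lane:
// 4 for 32-bit elements, 2 for 64-bit elements.
fn unpackhi_model(a: &[i64], b: &[i64], per_lane: usize) -> Vec<i64> {
    let mut dst = Vec::with_capacity(a.len());
    for (la, lb) in a.chunks(per_lane).zip(b.chunks(per_lane)) {
        for i in per_lane / 2..per_lane {
            dst.push(la[i]); // high element of the lane from a
            dst.push(lb[i]); // then the matching element from b
        }
    }
    dst
}

fn main() {
    // 8 x i64 = four 128-bit lanes of two elements each.
    let a: Vec<i64> = (0..8).map(|i| 100 + i).collect();
    let b: Vec<i64> = (0..8).map(|i| 200 + i).collect();
    // dst = [a1, b1, a3, b3, a5, b5, a7, b7], i.e. shuffle indices
    // [1, 9, 3, 11, 1 + 4, 9 + 4, 3 + 4, 11 + 4] into concat(a, b).
    assert_eq!(
        unpackhi_model(&a, &b, 2),
        vec![101, 201, 103, 203, 105, 205, 107, 207]
    );
}
```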
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_epi64&expand=6029)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckhqdq))]
+pub unsafe fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8();
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_ps&expand=6060)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 {
+    simd_shuffle16(
+        a,
+        b,
+        [
+            2,
+            18,
+            3,
+            19,
+            2 + 4,
+            18 + 4,
+            3 + 4,
+            19 + 4,
+            2 + 8,
+            18 + 8,
+            3 + 8,
+            19 + 8,
+            2 + 12,
+            18 + 12,
+            3 + 12,
+            19 + 12,
+        ],
+    )
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_ps&expand=6058)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm512_mask_unpackhi_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16();
+    transmute(simd_select_bitmask(k, unpackhi, src.as_f32x16()))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_ps&expand=6059)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16();
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_pd&expand=6048)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d {
+    simd_shuffle8(a, b, [1, 9, 3, 11, 1 + 4, 9 + 4, 3 + 4, 11 + 4])
+}
+
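All of the mask and maskz variants above funnel through the same select step, so one scalar model covers them. The sketch below is an editor's illustration, not part of the patch; `select_bitmask_model` is a made-up name for what `simd_select_bitmask` contributes on 8 x f64, shown here with the same low-to-high element values the x86_64 tests further down produce.

```rust
// Bitmask select on 8 x f64 (illustration only): element i comes from
// `unpackhi` when bit i of k is set, and from `fallback` otherwise.
fn select_bitmask_model(k: u8, unpackhi: [f64; 8], fallback: [f64; 8]) -> [f64; 8] {
    let mut dst = [0.0f64; 8];
    for i in 0..8 {
        dst[i] = if (k >> i) & 1 != 0 {
            unpackhi[i]
        } else {
            fallback[i]
        };
    }
    dst
}

fn main() {
    // Low-to-high element order: unpackhi of a = set_pd(1., ..., 8.) and
    // b = set_pd(17., ..., 24.), as in the tests below.
    let unpackhi = [7., 23., 5., 21., 3., 19., 1., 17.];
    let zero = [0.0f64; 8];
    // maskz with k = 0b00001111 keeps only the low half of the result.
    assert_eq!(
        select_bitmask_model(0b0000_1111, unpackhi, zero),
        [7., 23., 5., 21., 0., 0., 0., 0.]
    );
}
```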
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_pd&expand=6046)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm512_mask_unpackhi_pd(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8();
+    transmute(simd_select_bitmask(k, unpackhi, src.as_f64x8()))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_pd&expand=6047)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8();
+    let zero = _mm512_setzero_pd().as_f64x8();
+    transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
 /// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps128_ps512&expand=621)
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
-    simd_shuffle16(a, _mm_set1_ps(-1.), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4])
+    simd_shuffle16(
+        a,
+        _mm_set1_ps(-1.),
+        [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
+    )
 }
 
 /// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -10452,7 +10745,11 @@ pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 {
-    simd_shuffle16(a, _mm256_set1_ps(-1.), [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8])
+    simd_shuffle16(
+        a,
+        _mm256_set1_ps(-1.),
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
+    )
 }
 
 /// Cast vector of type __m512 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -10611,7 +10908,7 @@ pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i {
     transmute(ret)
 }
 
-/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastd_epi32&expand=546)
 #[inline]
@@ -10733,7 +11030,7 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d {
     transmute(simd_select_bitmask(k, broadcast, zero))
 }
 
-/// Broadcast the 4 packed 32-bit integers from a to all elements of dst. 
+/// Broadcast the 4 packed 32-bit integers from a to all elements of dst.
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510) #[inline] @@ -11151,7 +11448,12 @@ pub unsafe fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpandnd))] -pub unsafe fn _mm512_mask_andnot_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { +pub unsafe fn _mm512_mask_andnot_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m512i, +) -> __m512i { let andnot = _mm512_andnot_epi32(a, b).as_i32x16(); transmute(simd_select_bitmask(k, andnot, src.as_i32x16())) } @@ -11184,7 +11486,12 @@ pub unsafe fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpandnq))] -pub unsafe fn _mm512_mask_andnot_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { +pub unsafe fn _mm512_mask_andnot_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m512i, +) -> __m512i { let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); transmute(simd_select_bitmask(k, andnot, src.as_i64x8())) } @@ -19122,50 +19429,32 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_inserti32x4() { - let a = _mm512_setr_epi32( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - ); - let b = _mm_setr_epi32( - 17, 18, 19, 20, - ); + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_setr_epi32(17, 18, 19, 20); let r = _mm512_inserti32x4(a, b, 0); - let e = _mm512_setr_epi32( - 17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - ); + let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_inserti32x4() { - let a = _mm512_setr_epi32( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - ); - let b = _mm_setr_epi32( - 17, 18, 19, 20, - ); + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_setr_epi32(17, 18, 19, 20); let r = _mm512_mask_inserti32x4(a, 0, a, b, 0); assert_eq_m512i(r, a); let r = _mm512_mask_inserti32x4(a, 0b11111111_11111111, a, b, 0); - let e = _mm512_setr_epi32( - 17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - ); + let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_inserti32x4() { - let a = _mm512_setr_epi32( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - ); - let b = _mm_setr_epi32( - 17, 18, 19, 20, - ); + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_setr_epi32(17, 18, 19, 20); let r = _mm512_maskz_inserti32x4(0, a, b, 0); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_inserti32x4(0b00000000_11111111, a, b, 0); - let e = _mm512_setr_epi32( - 17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, - ); + let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -19174,9 +19463,7 @@ mod tests { let a = _mm512_setr_ps( 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., ); - let b = _mm_setr_ps( - 17., 18., 19., 20., - ); + let b = _mm_setr_ps(17., 18., 19., 20.); let r = _mm512_insertf32x4(a, b, 0); let e = _mm512_setr_ps( 17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., @@ -19189,9 
+19476,7 @@ mod tests { let a = _mm512_setr_ps( 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., ); - let b = _mm_setr_ps( - 17., 18., 19., 20., - ); + let b = _mm_setr_ps(17., 18., 19., 20.); let r = _mm512_mask_insertf32x4(a, 0, a, b, 0); assert_eq_m512(r, a); let r = _mm512_mask_insertf32x4(a, 0b11111111_11111111, a, b, 0); @@ -19206,9 +19491,7 @@ mod tests { let a = _mm512_setr_ps( 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., ); - let b = _mm_setr_ps( - 17., 18., 19., 20., - ); + let b = _mm_setr_ps(17., 18., 19., 20.); let r = _mm512_maskz_insertf32x4(0, a, b, 0); assert_eq_m512(r, _mm512_setzero_ps()); let r = _mm512_maskz_insertf32x4(0b00000000_11111111, a, b, 0); @@ -19217,12 +19500,10 @@ mod tests { ); assert_eq_m512(r, e); } - + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castps128_ps512() { - let a = _mm_setr_ps( - 17., 18., 19., 20., - ); + let a = _mm_setr_ps(17., 18., 19., 20.); let r = _mm512_castps128_ps512(a); let e = _mm512_setr_ps( 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., @@ -19232,9 +19513,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castps256_ps512() { - let a = _mm256_setr_ps( - 17., 18., 19., 20., 21., 22., 23., 24., - ); + let a = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); let r = _mm512_castps256_ps512(a); let e = _mm512_setr_ps( 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1., @@ -19248,9 +19527,7 @@ mod tests { 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., ); let r = _mm512_castps512_ps128(a); - let e = _mm_setr_ps( - 17., 18., 19., 20., - ); + let e = _mm_setr_ps(17., 18., 19., 20.); assert_eq_m128(r, e); } @@ -19260,9 +19537,7 @@ mod tests { 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1., ); let r = _mm512_castps512_ps256(a); - let e = _mm256_setr_ps( - 17., 18., 19., 20., 21., 22., 23., 24., - ); + let e = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); assert_eq_m256(r, e); } @@ -19284,9 +19559,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcastd_epi32() { - let a = _mm_set_epi32( - 17, 18, 19, 20, - ); + let a = _mm_set_epi32(17, 18, 19, 20); let r = _mm512_broadcastd_epi32(a); let e = _mm512_set1_epi32(20); assert_eq_m512i(r, e); @@ -19295,9 +19568,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcastd_epi32() { let src = _mm512_set1_epi32(20); - let a = _mm_set_epi32( - 17, 18, 19, 20, - ); + let a = _mm_set_epi32(17, 18, 19, 20); let r = _mm512_mask_broadcastd_epi32(src, 0, a); assert_eq_m512i(r, src); let r = _mm512_mask_broadcastd_epi32(src, 0b11111111_11111111, a); @@ -19307,23 +19578,17 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcastd_epi32() { - let a = _mm_set_epi32( - 17, 18, 19, 20, - ); + let a = _mm_set_epi32(17, 18, 19, 20); let r = _mm512_maskz_broadcastd_epi32(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_broadcastd_epi32(0b00000000_11111111, a); - let e = _mm512_setr_epi32( - 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 0, - ); + let e = _mm512_setr_epi32(20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcastss_ps() { - let a = _mm_set_ps( - 17., 18., 19., 20., - ); + let a = _mm_set_ps(17., 18., 19., 20.); let r = _mm512_broadcastss_ps(a); let e = _mm512_set1_ps(20.); assert_eq_m512(r, 
e); @@ -19332,9 +19597,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcastss_ps() { let src = _mm512_set1_ps(20.); - let a = _mm_set_ps( - 17., 18., 19., 20., - ); + let a = _mm_set_ps(17., 18., 19., 20.); let r = _mm512_mask_broadcastss_ps(src, 0, a); assert_eq_m512(r, src); let r = _mm512_mask_broadcastss_ps(src, 0b11111111_11111111, a); @@ -19344,9 +19607,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcastss_ps() { - let a = _mm_set_ps( - 17., 18., 19., 20., - ); + let a = _mm_set_ps(17., 18., 19., 20.); let r = _mm512_maskz_broadcastss_ps(0, a); assert_eq_m512(r, _mm512_setzero_ps()); let r = _mm512_maskz_broadcastss_ps(0b00000000_11111111, a); @@ -19358,32 +19619,30 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcast_i32x4() { - let a = _mm_set_epi32( - 17, 18, 19, 20, - ); + let a = _mm_set_epi32(17, 18, 19, 20); let r = _mm512_broadcast_i32x4(a); - let e = _mm512_set_epi32(17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20); + let e = _mm512_set_epi32( + 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, + ); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcast_i32x4() { let src = _mm512_set1_epi32(20); - let a = _mm_set_epi32( - 17, 18, 19, 20, - ); + let a = _mm_set_epi32(17, 18, 19, 20); let r = _mm512_mask_broadcast_i32x4(src, 0, a); assert_eq_m512i(r, src); let r = _mm512_mask_broadcast_i32x4(src, 0b11111111_11111111, a); - let e = _mm512_set_epi32(17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20); + let e = _mm512_set_epi32( + 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, + ); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcast_i32x4() { - let a = _mm_set_epi32( - 17, 18, 19, 20, - ); + let a = _mm_set_epi32(17, 18, 19, 20); let r = _mm512_maskz_broadcast_i32x4(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_broadcast_i32x4(0b00000000_11111111, a); @@ -19393,36 +19652,36 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcast_f32x4() { - let a = _mm_set_ps( - 17., 18., 19., 20., - ); + let a = _mm_set_ps(17., 18., 19., 20.); let r = _mm512_broadcast_f32x4(a); - let e = _mm512_set_ps(17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.); + let e = _mm512_set_ps( + 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., + ); assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcast_f32x4() { let src = _mm512_set1_ps(20.); - let a = _mm_set_ps( - 17., 18., 19., 20., - ); + let a = _mm_set_ps(17., 18., 19., 20.); let r = _mm512_mask_broadcast_f32x4(src, 0, a); assert_eq_m512(r, src); let r = _mm512_mask_broadcast_f32x4(src, 0b11111111_11111111, a); - let e = _mm512_set_ps(17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.); + let e = _mm512_set_ps( + 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., + ); assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcast_f32x4() { - let a = _mm_set_ps( - 17., 18., 19., 20., - ); + let a = _mm_set_ps(17., 18., 19., 20.); let r = _mm512_maskz_broadcast_f32x4(0, a); assert_eq_m512(r, _mm512_setzero_ps()); let r = _mm512_maskz_broadcast_f32x4(0b00000000_11111111, a); - let e = _mm512_set_ps(0., 0., 0., 0., 0., 0., 0., 0., 17., 18., 19., 20., 17., 18., 19., 20.); + let e = _mm512_set_ps( + 
0., 0., 0., 0., 0., 0., 0., 0., 17., 18., 19., 20., 17., 18., 19., 20., + ); assert_eq_m512(r, e); } @@ -19440,7 +19699,95 @@ mod tests { let a = _mm512_set1_ps(1.); let b = _mm512_set1_ps(2.); let r = _mm512_mask_blend_ps(0b11111111_00000000, a, b); - let e = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.); + let e = _mm512_set_ps( + 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_unpackhi_epi32(a, b); + let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_mask_unpackhi_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpackhi_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_unpackhi_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpackhi_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 9, 26, 10, 29, 13, 30, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_unpackhi_ps(a, b); + let e = _mm512_set_ps( + 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_mask_unpackhi_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_unpackhi_ps(a, 0b11111111_11111111, a, b); + let e = _mm512_set_ps( + 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_maskz_unpackhi_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_unpackhi_ps(0b00000000_11111111, a, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 25., 9., 26., 10., 29., 13., 30., 14., + ); assert_eq_m512(r, e); } @@ 
-20117,7 +20464,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_andnot_epi32() { let a = _mm512_set1_epi32(0); - let b = _mm512_set1_epi32(1<<3 | 1<< 4); + let b = _mm512_set1_epi32(1 << 3 | 1 << 4); let r = _mm512_andnot_epi32(a, b); let e = _mm512_set1_epi32(1 << 3 | 1 << 4); assert_eq_m512i(r, e); @@ -20125,8 +20472,8 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_andnot_epi32() { - let a = _mm512_set1_epi32(1<<1 | 1<< 2); - let b = _mm512_set1_epi32(1<<3 | 1<< 4); + let a = _mm512_set1_epi32(1 << 1 | 1 << 2); + let b = _mm512_set1_epi32(1 << 3 | 1 << 4); let r = _mm512_mask_andnot_epi32(a, 0, a, b); assert_eq_m512i(r, a); @@ -20137,13 +20484,30 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_andnot_epi32() { - let a = _mm512_set1_epi32(1<<1 | 1<< 2); - let b = _mm512_set1_epi32(1<<3 | 1<< 4); + let a = _mm512_set1_epi32(1 << 1 | 1 << 2); + let b = _mm512_set1_epi32(1 << 3 | 1 << 4); let r = _mm512_maskz_andnot_epi32(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_andnot_epi32(0b00000000_11111111, a, b); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4); + let e = _mm512_set_epi32( + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + ); assert_eq_m512i(r, e); } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 796077f330..b005136ebe 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4280,147 +4280,95 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_inserti64x4() { - let a = _mm512_setr_epi64( - 1, 2, 3, 4, 5, 6, 7, 8, - ); - let b = _mm256_setr_epi64x( - 17, 18, 19, 20, - ); + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_setr_epi64x(17, 18, 19, 20); let r = _mm512_inserti64x4(a, b, 1); - let e = _mm512_setr_epi64( - 1, 2, 3, 4, 17, 18, 19, 20, - ); + let e = _mm512_setr_epi64(1, 2, 3, 4, 17, 18, 19, 20); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_inserti64x4() { - let a = _mm512_setr_epi64( - 1, 2, 3, 4, 5, 6, 7, 8, - ); - let b = _mm256_setr_epi64x( - 17, 18, 19, 20, - ); + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_setr_epi64x(17, 18, 19, 20); let r = _mm512_mask_inserti64x4(a, 0, a, b, 1); assert_eq_m512i(r, a); let r = _mm512_mask_inserti64x4(a, 0b11111111, a, b, 1); - let e = _mm512_setr_epi64( - 1, 2, 3, 4, 17, 18, 19, 20, - ); + let e = _mm512_setr_epi64(1, 2, 3, 4, 17, 18, 19, 20); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_inserti64x4() { - let a = _mm512_setr_epi64( - 1, 2, 3, 4, 5, 6, 7, 8, - ); - let b = _mm256_setr_epi64x( - 17, 18, 19, 20, - ); + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_setr_epi64x(17, 18, 19, 20); let r = _mm512_maskz_inserti64x4(0, a, b, 1); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_inserti64x4(0b00001111, a, b, 1); - let e = _mm512_setr_epi64( - 1, 2, 3, 4, 0, 0, 0, 0, - ); + let e = _mm512_setr_epi64(1, 2, 3, 4, 0, 0, 0, 0); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_insertf64x4() { - let a = _mm512_setr_pd( - 1., 2., 3., 4., 5., 6., 7., 8., - ); - 
let b = _mm256_setr_pd( - 17., 18., 19., 20., - ); + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_pd(17., 18., 19., 20.); let r = _mm512_insertf64x4(a, b, 1); - let e = _mm512_setr_pd( - 1., 2., 3., 4., 17., 18., 19., 20., - ); + let e = _mm512_setr_pd(1., 2., 3., 4., 17., 18., 19., 20.); assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_insertf64x4() { - let a = _mm512_setr_pd( - 1., 2., 3., 4., 5., 6., 7., 8., - ); - let b = _mm256_setr_pd( - 17., 18., 19., 20., - ); + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_pd(17., 18., 19., 20.); let r = _mm512_mask_insertf64x4(a, 0, a, b, 1); assert_eq_m512d(r, a); let r = _mm512_mask_insertf64x4(a, 0b11111111, a, b, 1); - let e = _mm512_setr_pd( - 1., 2., 3., 4., 17., 18., 19., 20., - ); + let e = _mm512_setr_pd(1., 2., 3., 4., 17., 18., 19., 20.); assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_insertf64x4() { - let a = _mm512_setr_pd( - 1., 2., 3., 4., 5., 6., 7., 8., - ); - let b = _mm256_setr_pd( - 17., 18., 19., 20., - ); + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_pd(17., 18., 19., 20.); let r = _mm512_maskz_insertf64x4(0, a, b, 1); assert_eq_m512d(r, _mm512_setzero_pd()); let r = _mm512_maskz_insertf64x4(0b00001111, a, b, 1); - let e = _mm512_setr_pd( - 1., 2., 3., 4., 0., 0., 0., 0., - ); + let e = _mm512_setr_pd(1., 2., 3., 4., 0., 0., 0., 0.); assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castpd128_pd512() { - let a = _mm_setr_pd( - 17., 18., - ); + let a = _mm_setr_pd(17., 18.); let r = _mm512_castpd128_pd512(a); - let e = _mm512_setr_pd( - 17., 18., -1., -1., -1., -1., -1., -1. - ); + let e = _mm512_setr_pd(17., 18., -1., -1., -1., -1., -1., -1.); assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castpd256_pd512() { - let a = _mm256_setr_pd( - 17., 18., 19., 20., - ); + let a = _mm256_setr_pd(17., 18., 19., 20.); let r = _mm512_castpd256_pd512(a); - let e = _mm512_setr_pd( - 17., 18., 19., 20., -1., -1., -1., -1., - ); + let e = _mm512_setr_pd(17., 18., 19., 20., -1., -1., -1., -1.); assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castpd512_pd128() { - let a = _mm512_setr_pd( - 17., 18., -1., -1., -1., -1., -1., -1., - ); + let a = _mm512_setr_pd(17., 18., -1., -1., -1., -1., -1., -1.); let r = _mm512_castpd512_pd128(a); - let e = _mm_setr_pd( - 17., 18., - ); + let e = _mm_setr_pd(17., 18.); assert_eq_m128d(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castpd512_pd256() { - let a = _mm512_setr_pd( - 17., 18., 19., 20., -1., -1., -1., -1., - ); + let a = _mm512_setr_pd(17., 18., 19., 20., -1., -1., -1., -1.); let r = _mm512_castpd512_pd256(a); - let e = _mm256_setr_pd( - 17., 18., 19., 20., - ); + let e = _mm256_setr_pd(17., 18., 19., 20.); assert_eq_m256d(r, e); } @@ -4428,7 +4376,10 @@ mod tests { unsafe fn test_mm512_castpd_ps() { let a = _mm512_set1_pd(1.); let r = _mm512_castpd_ps(a); - let e = _mm512_set_ps(1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0); + let e = _mm512_set_ps( + 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, + 1.875, 0.0, + ); assert_eq_m512(r, e); } @@ -4436,69 +4387,58 @@ mod tests { unsafe fn test_mm512_castpd_si512() { let a = _mm512_set1_pd(1.); let r = _mm512_castpd_si512(a); - let e = _mm512_set_epi32(1072693248, 0, 1072693248, 0, 
1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0); + let e = _mm512_set_epi32( + 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, + 0, 1072693248, 0, 1072693248, 0, + ); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castsi128_si512() { - let a = _mm_setr_epi64x( - 17, 18, - ); + let a = _mm_setr_epi64x(17, 18); let r = _mm512_castsi128_si512(a); - let e = _mm512_setr_epi64( - 17, 18, -1, -1, -1, -1, -1, -1, - ); + let e = _mm512_setr_epi64(17, 18, -1, -1, -1, -1, -1, -1); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castsi256_si512() { - let a = _mm256_setr_epi64x( - 17, 18, 19, 20, - ); + let a = _mm256_setr_epi64x(17, 18, 19, 20); let r = _mm512_castsi256_si512(a); - let e = _mm512_setr_epi64( - 17, 18, 19, 20, -1, -1, -1, -1, - ); + let e = _mm512_setr_epi64(17, 18, 19, 20, -1, -1, -1, -1); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castsi512_si128() { - let a = _mm512_setr_epi64( - 17, 18, -1, -1, -1, -1, -1, -1, - ); + let a = _mm512_setr_epi64(17, 18, -1, -1, -1, -1, -1, -1); let r = _mm512_castsi512_si128(a); - let e = _mm_setr_epi64x( - 17, 18, - ); + let e = _mm_setr_epi64x(17, 18); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castsi512_si256() { - let a = _mm512_setr_epi64( - 17, 18, 19, 20, -1, -1, -1, -1, - ); + let a = _mm512_setr_epi64(17, 18, 19, 20, -1, -1, -1, -1); let r = _mm512_castsi512_si256(a); - let e = _mm256_setr_epi64x( - 17, 18, 19, 20, - ); + let e = _mm256_setr_epi64x(17, 18, 19, 20); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castsi512_ps() { - let a = _mm512_set1_epi64(1<<62); + let a = _mm512_set1_epi64(1 << 62); let r = _mm512_castsi512_ps(a); - let e = _mm512_set_ps(2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0.); + let e = _mm512_set_ps( + 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., + ); assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castsi512_pd() { - let a = _mm512_set1_epi64(1<<62); + let a = _mm512_set1_epi64(1 << 62); let r = _mm512_castsi512_pd(a); let e = _mm512_set_pd(2., 2., 2., 2., 2., 2., 2., 2.); assert_eq_m512d(r, e); @@ -4506,9 +4446,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcastq_epi64() { - let a = _mm_setr_epi64x( - 17, 18, - ); + let a = _mm_setr_epi64x(17, 18); let r = _mm512_broadcastq_epi64(a); let e = _mm512_set1_epi64(18); assert_eq_m512i(r, e); @@ -4517,9 +4455,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcastq_epi64() { let src = _mm512_set1_epi64(18); - let a = _mm_setr_epi64x( - 17, 18, - ); + let a = _mm_setr_epi64x(17, 18); let r = _mm512_mask_broadcastq_epi64(src, 0, a); assert_eq_m512i(r, src); let r = _mm512_mask_broadcastq_epi64(src, 0b01111111, a); @@ -4529,9 +4465,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcastq_epi64() { - let a = _mm_setr_epi64x( - 17, 18, - ); + let a = _mm_setr_epi64x(17, 18); let r = _mm512_maskz_broadcastq_epi64(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_broadcastq_epi64(0b00001111, a); @@ -4541,9 +4475,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcastsd_pd() { - let a = _mm_setr_pd( - 17., 18., - ); + let a = _mm_setr_pd(17., 18.); let r = _mm512_broadcastsd_pd(a); let e = _mm512_set1_pd(18.); assert_eq_m512d(r, e); @@ 
-4552,9 +4484,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcastsd_pd() { let src = _mm512_set1_pd(18.); - let a = _mm_setr_pd( - 17., 18., - ); + let a = _mm_setr_pd(17., 18.); let r = _mm512_mask_broadcastsd_pd(src, 0, a); assert_eq_m512d(r, src); let r = _mm512_mask_broadcastsd_pd(src, 0b01111111, a); @@ -4564,9 +4494,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcastsd_pd() { - let a = _mm_setr_pd( - 17., 18., - ); + let a = _mm_setr_pd(17., 18.); let r = _mm512_maskz_broadcastsd_pd(0, a); assert_eq_m512d(r, _mm512_setzero_pd()); let r = _mm512_maskz_broadcastsd_pd(0b00001111, a); @@ -4576,9 +4504,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcast_i64x4() { - let a = _mm256_set_epi64x( - 17, 18, 19, 20 - ); + let a = _mm256_set_epi64x(17, 18, 19, 20); let r = _mm512_broadcast_i64x4(a); let e = _mm512_set_epi64(17, 18, 19, 20, 17, 18, 19, 20); assert_eq_m512i(r, e); @@ -4587,9 +4513,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcast_i64x4() { let src = _mm512_set1_epi64(18); - let a = _mm256_set_epi64x( - 17, 18, 19, 20 - ); + let a = _mm256_set_epi64x(17, 18, 19, 20); let r = _mm512_mask_broadcast_i64x4(src, 0, a); assert_eq_m512i(r, src); let r = _mm512_mask_broadcast_i64x4(src, 0b11111111, a); @@ -4599,9 +4523,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcast_i64x4() { - let a = _mm256_set_epi64x( - 17, 18, 19, 20 - ); + let a = _mm256_set_epi64x(17, 18, 19, 20); let r = _mm512_maskz_broadcast_i64x4(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_broadcast_i64x4(0b00001111, a); @@ -4611,9 +4533,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcast_f64x4() { - let a = _mm256_set_pd( - 17., 18., 19., 20. - ); + let a = _mm256_set_pd(17., 18., 19., 20.); let r = _mm512_broadcast_f64x4(a); let e = _mm512_set_pd(17., 18., 19., 20., 17., 18., 19., 20.); assert_eq_m512d(r, e); @@ -4622,9 +4542,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcast_f64x4() { let src = _mm512_set1_pd(18.); - let a = _mm256_set_pd( - 17., 18., 19., 20. - ); + let a = _mm256_set_pd(17., 18., 19., 20.); let r = _mm512_mask_broadcast_f64x4(src, 0, a); assert_eq_m512d(r, src); let r = _mm512_mask_broadcast_f64x4(src, 0b11111111, a); @@ -4634,9 +4552,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcast_f64x4() { - let a = _mm256_set_pd( - 17., 18., 19., 20. 
- ); + let a = _mm256_set_pd(17., 18., 19., 20.); let r = _mm512_maskz_broadcast_f64x4(0, a); assert_eq_m512d(r, _mm512_setzero_pd()); let r = _mm512_maskz_broadcast_f64x4(0b00001111, a); @@ -4662,6 +4578,68 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_unpackhi_epi64(a, b); + let e = _mm512_set_epi64(17, 1, 18, 2, 21, 5, 22, 6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_mask_unpackhi_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpackhi_epi64(a, 0b11111111, a, b); + let e = _mm512_set_epi64(17, 1, 18, 2, 21, 5, 22, 6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_maskz_unpackhi_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpackhi_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 21, 5, 22, 6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_unpackhi_pd(a, b); + let e = _mm512_set_pd(17., 1., 18., 2., 21., 5., 22., 6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_mask_unpackhi_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_unpackhi_pd(a, 0b11111111, a, b); + let e = _mm512_set_pd(17., 1., 18., 2., 21., 5., 22., 6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_maskz_unpackhi_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_unpackhi_pd(0b00001111, a, b); + let e = _mm512_set_pd(0., 0., 0., 0., 21., 5., 22., 6.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); @@ -4824,7 +4802,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_andnot_epi64() { let a = _mm512_set1_epi64(0); - let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let b = _mm512_set1_epi64(1 << 3 | 1 << 4); let r = _mm512_andnot_epi64(a, b); let e = _mm512_set1_epi64(1 << 3 | 1 << 4); assert_eq_m512i(r, e); @@ -4832,8 +4810,8 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_andnot_epi64() { - let a = _mm512_set1_epi64(1<<1 | 1<< 2); - let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let a = _mm512_set1_epi64(1 << 1 | 1 << 2); + let b = _mm512_set1_epi64(1 << 3 | 1 << 4); let r = _mm512_mask_andnot_epi64(a, 0, a, b); assert_eq_m512i(r, a); @@ -4844,20 +4822,29 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_andnot_epi64() { - let a = 
_mm512_set1_epi64(1<<1 | 1<< 2); - let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let a = _mm512_set1_epi64(1 << 1 | 1 << 2); + let b = _mm512_set1_epi64(1 << 3 | 1 << 4); let r = _mm512_maskz_andnot_epi64(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_andnot_epi64(0b00001111, a, b); - let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4); + let e = _mm512_set_epi64( + 0, + 0, + 0, + 0, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + ); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_andnot_si512() { let a = _mm512_set1_epi64(0); - let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let b = _mm512_set1_epi64(1 << 3 | 1 << 4); let r = _mm512_andnot_si512(a, b); let e = _mm512_set1_epi64(1 << 3 | 1 << 4); assert_eq_m512i(r, e); From 6876994931e6251e1f33ce90e55ca2b3df8c865c Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 24 Sep 2020 00:21:22 +0000 Subject: [PATCH 13/20] fix assert_instr v1 --- crates/core_arch/src/x86/avx512f.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 9963de1240..a5da0c42bc 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -11035,7 +11035,7 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsert))] //should be vpbroadcasti32x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { let a = _mm512_castsi128_si512(a).as_i32x16(); let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); @@ -11047,7 +11047,7 @@ pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i32x4&expand=511) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi32x4))] //should be vbroadcasti32x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) @@ -11058,7 +11058,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i32x4&expand=512) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi32x4))] //should be vbroadcasti32x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); let zero = _mm512_setzero_si512().as_i32x16(); @@ -11070,7 +11070,7 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf64x4))] //should 
be vbroadcasti64x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11080,7 +11080,7 @@ pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i64x4&expand=523) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinserti64x4))] //should be vbroadcasti64x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) @@ -11091,7 +11091,7 @@ pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i64x4&expand=524) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinserti64x4))] //should be vbroadcasti64x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); @@ -11103,7 +11103,7 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f32x4&expand=483) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf128))] //should be vbroadcastf32x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11113,7 +11113,7 @@ pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f32x4&expand=484) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf128))] //should be vbroadcastf32x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) @@ -11124,7 +11124,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f32x4&expand=485) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf128))] //should be vbroadcastf32x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -11136,7 +11136,7 @@ pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, 
assert_instr(vinsertf64x4))] //should be vbroadcastf64x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11146,7 +11146,7 @@ pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f64x4&expand=496) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf64x4))] //should be vbroadcastf64x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) @@ -11157,7 +11157,7 @@ pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f64x4&expand=497) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf64x4))] //should be vbroadcastf64x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); From 0629fe3ecc0e26a353ed5f263f427059b4a0c9a5 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 24 Sep 2020 13:55:40 +0000 Subject: [PATCH 14/20] comment out assert_instr for broadcast_X4 to test --- crates/core_arch/src/x86/avx512f.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a5da0c42bc..a4ee76e833 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -11035,7 +11035,7 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { let a = _mm512_castsi128_si512(a).as_i32x16(); let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); @@ -11047,7 +11047,7 @@ pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i32x4&expand=511) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) @@ -11058,7 +11058,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i32x4&expand=512) #[inline] #[target_feature(enable = "avx512f")] 
-#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); let zero = _mm512_setzero_si512().as_i32x16(); @@ -11070,7 +11070,7 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11080,7 +11080,7 @@ pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i64x4&expand=523) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) @@ -11091,7 +11091,7 @@ pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i64x4&expand=524) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); @@ -11103,7 +11103,7 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f32x4&expand=483) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11113,7 +11113,7 @@ pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f32x4&expand=484) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) @@ -11124,7 +11124,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f32x4&expand=485) #[inline] 
#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -11136,7 +11136,7 @@ pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11146,7 +11146,7 @@ pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f64x4&expand=496) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) @@ -11157,7 +11157,7 @@ pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f64x4&expand=497) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); From 79b5fb23192fb55d83f18a3c56164b8b43a2d7dc Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 24 Sep 2020 14:02:43 +0000 Subject: [PATCH 15/20] comment out assert_instr for broadcast_X4 to test v2 --- crates/core_arch/src/x86/avx512f.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a4ee76e833..21f10780c2 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -11035,7 +11035,7 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti32x4 pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { let a = _mm512_castsi128_si512(a).as_i32x16(); let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); @@ -11047,7 +11047,7 @@ pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i32x4&expand=511) #[inline] #[target_feature(enable = 
"avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti32x4 pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) @@ -11058,7 +11058,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i32x4&expand=512) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti32x4 pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); let zero = _mm512_setzero_si512().as_i32x16(); @@ -11070,7 +11070,7 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti64x4 pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11080,7 +11080,7 @@ pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i64x4&expand=523) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti64x4 pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) @@ -11091,7 +11091,7 @@ pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i64x4&expand=524) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti64x4 pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); @@ -11103,7 +11103,7 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f32x4&expand=483) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf32x4 pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11113,7 +11113,7 @@ pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f32x4&expand=484) #[inline] 
#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf32x4 pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) @@ -11124,7 +11124,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f32x4&expand=485) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf32x4 pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -11136,7 +11136,7 @@ pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf64x4 pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11146,7 +11146,7 @@ pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f64x4&expand=496) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf64x4 pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) @@ -11157,7 +11157,7 @@ pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f64x4&expand=497) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf64x4 pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); From 9dfe42f70fbf7cbba5824fcb849be317639d4aa7 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 24 Sep 2020 17:12:16 +0000 Subject: [PATCH 16/20] add const_fn_transmute --- crates/core_arch/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index cca9f7f323..5dc390753b 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -36,7 +36,8 @@ f16c_target_feature, external_doc, allow_internal_unstable, - decl_macro + decl_macro, + const_fn_transmute )] #![cfg_attr(test, feature(test, abi_vectorcall, untagged_unions))] #![cfg_attr(all(test, target_arch = "wasm32"), feature(wasm_simd))] From 11a64fb15d12badbf2466aec674a855dccec35b0 Mon Sep 17 00:00:00 2001 From: 
jironglin Date: Thu, 24 Sep 2020 17:22:43 +0000 Subject: [PATCH 17/20] fix assert_instr for unpackhi --- crates/core_arch/src/x86/avx512f.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 21f10780c2..ba711f4766 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10601,7 +10601,7 @@ pub unsafe fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_epi64&expand=6030) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermt2q))] //should be vpunpckhqdq +#[cfg_attr(test, assert_instr(vperm))] //should be vpunpckhqdq pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { simd_shuffle8(a, b, [2, 10, 3, 11, 2 + 4, 10 + 4, 3 + 4, 11 + 4]) } @@ -10611,7 +10611,7 @@ pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_epi64&expand=6028) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermi2q))] //should be vpunpckhqdq +#[cfg_attr(test, assert_instr(vperm))] //should be vpunpckhqdq pub unsafe fn _mm512_mask_unpackhi_epi64( src: __m512i, k: __mmask8, @@ -10627,7 +10627,7 @@ pub unsafe fn _mm512_mask_unpackhi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_epi64&expand=6029) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermt2q))] //should be vpunpckhqdq +#[cfg_attr(test, assert_instr(vperm))] //should be vpunpckhqdq pub unsafe fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); @@ -10693,7 +10693,7 @@ pub unsafe fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_pd&expand=6048) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermt2pd))] //should be vunpackhpd +#[cfg_attr(test, assert_instr(vperm))] //should be vunpackhpd pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { simd_shuffle8(a, b, [2, 10, 3, 11, 2 + 4, 10 + 4, 3 + 4, 11 + 4]) } @@ -10703,7 +10703,7 @@ pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_pd&expand=6046) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermi2pd))] //should be vunpackhpd +#[cfg_attr(test, assert_instr(vperm))] //should be vunpackhpd pub unsafe fn _mm512_mask_unpackhi_pd( src: __m512d, k: __mmask8, @@ -10719,7 +10719,7 @@ pub unsafe fn _mm512_mask_unpackhi_pd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_pd&expand=6047) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermt2pd))] //should be vunpackhpd +#[cfg_attr(test, assert_instr(vperm))] //should be vunpackhpd pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); let zero = 
_mm512_setzero_pd().as_f64x8(); From 90f2ae6bdac2e47f099ba5789f6ac3c4dc63de8b Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 24 Sep 2020 21:00:35 +0000 Subject: [PATCH 18/20] Don't fix const check. Fix mistakes and remove comments --- crates/core_arch/src/x86/avx512f.rs | 54 ++++++++++---------------- crates/core_arch/src/x86_64/avx512f.rs | 20 +++++----- 2 files changed, 31 insertions(+), 43 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index ba711f4766..055767e051 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10601,9 +10601,9 @@ pub unsafe fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_epi64&expand=6030) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpunpckhqdq +#[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { - simd_shuffle8(a, b, [2, 10, 3, 11, 2 + 4, 10 + 4, 3 + 4, 11 + 4]) + simd_shuffle8(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) } /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10611,7 +10611,7 @@ pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_epi64&expand=6028) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpunpckhqdq +#[cfg_attr(test, assert_instr(vpunpckhqdq))] pub unsafe fn _mm512_mask_unpackhi_epi64( src: __m512i, k: __mmask8, a: __m512i, b: __m512i, ) -> __m512i { let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); transmute(simd_select_bitmask(k, unpackhi, src.as_i64x8())) @@ -10627,7 +10627,7 @@ pub unsafe fn _mm512_mask_unpackhi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_epi64&expand=6029) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpunpckhqdq +#[cfg_attr(test, assert_instr(vpunpckhqdq))] pub unsafe fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); @@ -10693,9 +10693,9 @@ pub unsafe fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_pd&expand=6048) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vperm))] //should be vunpackhpd +#[cfg_attr(test, assert_instr(vunpckhpd))] pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { - simd_shuffle8(a, b, [2, 10, 3, 11, 2 + 4, 10 + 4, 3 + 4, 11 + 4]) + simd_shuffle8(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) } /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
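
For intuition, a quick scalar sketch of the corrected shuffle (illustrative only, not part of the patch; unpackhi_epi64_model is a hypothetical stand-in that treats the vectors as plain arrays). With two 64-bit elements per 128-bit lane, the high element of lane l is index 2*l + 1, and b's elements follow a's at offset 8 in simd_shuffle8's combined index space, giving [1, 9, 3, 11, 5, 13, 7, 15]; the old [2, 10, 3, 11, ...] pattern instead took both elements from the high half of each 256-bit chunk, which the updated x86_64 tests below now catch.

fn unpackhi_epi64_model(a: [i64; 8], b: [i64; 8]) -> [i64; 8] {
    // Index 0 is the lowest element, matching simd_shuffle8's view of a and b.
    let mut dst = [0i64; 8];
    for lane in 0..4 {
        dst[2 * lane] = a[2 * lane + 1]; // high element of a's 128-bit lane
        dst[2 * lane + 1] = b[2 * lane + 1]; // high element of b's 128-bit lane
    }
    dst
}

fn main() {
    // _mm512_set_epi64 lists elements high-to-low, so set_epi64(1, ..., 8)
    // stores 8 at index 0; these arrays are written in index order.
    let a = [8, 7, 6, 5, 4, 3, 2, 1];
    let b = [24, 23, 22, 21, 20, 19, 18, 17];
    // Index-order form of _mm512_set_epi64(17, 1, 19, 3, 21, 5, 23, 7),
    // the expectation in the updated tests.
    assert_eq!(unpackhi_epi64_model(a, b), [7, 23, 5, 21, 3, 19, 1, 17]);
}
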
@@ -10703,7 +10703,7 @@ pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_pd&expand=6046) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vperm))] //should be vunpackhpd +#[cfg_attr(test, assert_instr(vunpckhpd))] pub unsafe fn _mm512_mask_unpackhi_pd( src: __m512d, k: __mmask8, @@ -10719,7 +10719,7 @@ pub unsafe fn _mm512_mask_unpackhi_pd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_pd&expand=6047) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vperm))] //should be vunpackhpd +#[cfg_attr(test, assert_instr(vunpckhpd))] pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); @@ -10938,7 +10938,7 @@ pub unsafe fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vbroadcas))] //should be vpbroadcastq pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i { - simd_shuffle8(a, a, [1, 1, 1, 1, 1, 1, 1, 1]) + simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) } /// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11034,8 +11034,7 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti32x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { let a = _mm512_castsi128_si512(a).as_i32x16(); let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); @@ -11046,8 +11045,7 @@ pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i32x4&expand=511) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti32x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) @@ -11057,8 +11055,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i32x4&expand=512) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti32x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); let zero = _mm512_setzero_si512().as_i32x16(); @@ -11069,8 +11066,7 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i /// /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti64x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11079,8 +11075,7 @@ pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i64x4&expand=523) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti64x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) @@ -11090,8 +11085,7 @@ pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i64x4&expand=524) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti64x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); @@ -11102,8 +11096,7 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f32x4&expand=483) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf32x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11112,8 +11105,7 @@ pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f32x4&expand=484) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf32x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) @@ -11123,8 +11115,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f32x4&expand=485) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf32x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -11135,8 +11126,7 @@ pub unsafe fn 
_mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf64x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11145,8 +11135,7 @@ pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f64x4&expand=496) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf64x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vper pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) @@ -11156,8 +11145,7 @@ pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f64x4&expand=497) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf64x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vper pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index b005136ebe..f160011122 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4448,7 +4448,7 @@ mod tests { unsafe fn test_mm512_broadcastq_epi64() { let a = _mm_setr_epi64x(17, 18); let r = _mm512_broadcastq_epi64(a); - let e = _mm512_set1_epi64(18); + let e = _mm512_set1_epi64(17); assert_eq_m512i(r, e); } @@ -4458,8 +4458,8 @@ mod tests { let a = _mm_setr_epi64x(17, 18); let r = _mm512_mask_broadcastq_epi64(src, 0, a); assert_eq_m512i(r, src); - let r = _mm512_mask_broadcastq_epi64(src, 0b01111111, a); - let e = _mm512_set1_epi64(18); + let r = _mm512_mask_broadcastq_epi64(src, 0b11111111, a); + let e = _mm512_set1_epi64(17); assert_eq_m512i(r, e); } @@ -4469,7 +4469,7 @@ mod tests { let r = _mm512_maskz_broadcastq_epi64(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_broadcastq_epi64(0b00001111, a); - let e = _mm512_set_epi64(0, 0, 0, 0, 18, 18, 18, 18); + let e = _mm512_set_epi64(0, 0, 0, 0, 17, 17, 17, 17); assert_eq_m512i(r, e); } @@ -4583,7 +4583,7 @@ mod tests { let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); let r = _mm512_unpackhi_epi64(a, b); - let e = _mm512_set_epi64(17, 1, 18, 2, 21, 5, 22, 6); + let e = _mm512_set_epi64(17, 1, 19, 3, 21, 5, 23, 7); assert_eq_m512i(r, e); } @@ -4594,7 +4594,7 @@ mod tests { let r = _mm512_mask_unpackhi_epi64(a, 0, a, b); assert_eq_m512i(r, a); let r = _mm512_mask_unpackhi_epi64(a, 0b11111111, a, b); - let e = _mm512_set_epi64(17, 1, 18, 2, 21, 5, 22, 6); + let e = _mm512_set_epi64(17, 1, 19, 3, 21, 5, 23, 7); assert_eq_m512i(r, e); } @@ -4605,7 +4605,7 @@ mod tests { let r = 
_mm512_maskz_unpackhi_epi64(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_unpackhi_epi64(0b00001111, a, b); - let e = _mm512_set_epi64(0, 0, 0, 0, 21, 5, 22, 6); + let e = _mm512_set_epi64(0, 0, 0, 0, 21, 5, 23, 7); assert_eq_m512i(r, e); } @@ -4614,7 +4614,7 @@ mod tests { let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); let r = _mm512_unpackhi_pd(a, b); - let e = _mm512_set_pd(17., 1., 18., 2., 21., 5., 22., 6.); + let e = _mm512_set_pd(17., 1., 19., 3., 21., 5., 23., 7.); assert_eq_m512d(r, e); } @@ -4625,7 +4625,7 @@ mod tests { let r = _mm512_mask_unpackhi_pd(a, 0, a, b); assert_eq_m512d(r, a); let r = _mm512_mask_unpackhi_pd(a, 0b11111111, a, b); - let e = _mm512_set_pd(17., 1., 18., 2., 21., 5., 22., 6.); + let e = _mm512_set_pd(17., 1., 19., 3., 21., 5., 23., 7.); assert_eq_m512d(r, e); } @@ -4636,7 +4636,7 @@ mod tests { let r = _mm512_maskz_unpackhi_pd(0, a, b); assert_eq_m512d(r, _mm512_setzero_pd()); let r = _mm512_maskz_unpackhi_pd(0b00001111, a, b); - let e = _mm512_set_pd(0., 0., 0., 0., 21., 5., 22., 6.); + let e = _mm512_set_pd(0., 0., 0., 0., 21., 5., 23., 7.); assert_eq_m512d(r, e); } From 6a9d0c9bf84c69f9ebf1f72fad2d42705e25ce88 Mon Sep 17 00:00:00 2001 From: jironglin Date: Fri, 25 Sep 2020 17:44:08 +0000 Subject: [PATCH 19/20] unpacklo: epi32,epi64,ps,pd --- crates/core_arch/avx512f.md | 24 +-- crates/core_arch/src/x86/avx512f.rs | 278 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 62 ++++++ 3 files changed, 352 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index f7c19dfc43..f8612abba3 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -670,10 +670,10 @@ * [x] [`_mm512_mask_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=5236) * [x] [`_mm512_mask_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=5236) * [x] [`_mm512_mask_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_ps&expand=5236) - * [ ] [`_mm512_mask_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi32&expand=5236) - * [ ] [`_mm512_mask_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi64&expand=5236) - * [ ] [`_mm512_mask_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_pd&expand=5236) - * [ ] [`_mm512_mask_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_ps&expand=5236) + * [x] [`_mm512_mask_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi32&expand=5236) + * [x] [`_mm512_mask_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi64&expand=5236) + * [x] [`_mm512_mask_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_pd&expand=5236) + * [x] [`_mm512_mask_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_ps&expand=5236) * [x] [`_mm512_mask_xor_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_xor_epi32&expand=5236) * [x] 
[`_mm512_mask_xor_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_xor_epi64&expand=5236) * [x] [`_mm512_maskz_abs_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_abs_epi32&expand=5236) @@ -930,10 +930,10 @@ * [x] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236) * [x] [`_mm512_maskz_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=5236) * [x] [`_mm512_maskz_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_ps&expand=5236) - * [ ] [`_mm512_maskz_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi32&expand=5236) - * [ ] [`_mm512_maskz_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi64&expand=5236) - * [ ] [`_mm512_maskz_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_pd&expand=5236) - * [ ] [`_mm512_maskz_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_ps&expand=5236) + * [x] [`_mm512_maskz_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi32&expand=5236) + * [x] [`_mm512_maskz_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi64&expand=5236) + * [x] [`_mm512_maskz_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_pd&expand=5236) + * [x] [`_mm512_maskz_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_ps&expand=5236) * [x] [`_mm512_maskz_xor_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_xor_epi32&expand=5236) * [x] [`_mm512_maskz_xor_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_xor_epi64&expand=5236) * [x] [`_mm512_max_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epi32&expand=5236) @@ -1116,10 +1116,10 @@ * [x] [`_mm512_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi64&expand=5236) * [x] [`_mm512_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_pd&expand=5236) * [x] [`_mm512_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_ps&expand=5236) - * [ ] [`_mm512_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi32&expand=5236) - * [ ] [`_mm512_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi64&expand=5236) - * [ ] [`_mm512_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_pd&expand=5236) - * [ ] [`_mm512_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_ps&expand=5236) + * [x] [`_mm512_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi32&expand=5236) + * [x] [`_mm512_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi64&expand=5236) + * [x] 
[`_mm512_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_pd&expand=5236) + * [x] [`_mm512_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_ps&expand=5236) * [x] [`_mm512_xor_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_xor_epi32&expand=5236) * [x] [`_mm512_xor_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_xor_epi64&expand=5236) * [x] [`_mm512_xor_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_xor_si512&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 055767e051..e38ac8c85b 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10726,6 +10726,198 @@ pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> _ transmute(simd_select_bitmask(k, unpackhi, zero)) } +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_epi32&expand=6078) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vunpcklps))] //should be vpunpckldq +pub unsafe fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i { + let a = a.as_i32x16(); + let b = b.as_i32x16(); + let r: i32x16 = simd_shuffle16( + a, + b, + [ + 0, + 16, + 1, + 17, + 0 + 4, + 16 + 4, + 1 + 4, + 17 + 4, + 0 + 8, + 16 + 8, + 1 + 8, + 17 + 8, + 0 + 12, + 16 + 12, + 1 + 12, + 17 + 12, + ], + ); + transmute(r) +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_epi32&expand=6076) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub unsafe fn _mm512_mask_unpacklo_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m512i, +) -> __m512i { + let unpackhi = _mm512_unpacklo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x16())) +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_epi32&expand=6077) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub unsafe fn _mm512_maskz_unpacklo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + let unpackhi = _mm512_unpacklo_epi32(a, b).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, unpackhi, zero)) +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_epi64&expand=6087)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklpd))] //should be vpunpcklqdq
+pub unsafe fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i {
+    simd_shuffle8(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
+}
+
+/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_epi64&expand=6085)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpcklqdq))]
+pub unsafe fn _mm512_mask_unpacklo_epi64(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8();
+    transmute(simd_select_bitmask(k, unpacklo, src.as_i64x8()))
+}
+
+/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_epi64&expand=6086)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpcklqdq))]
+pub unsafe fn _mm512_maskz_unpacklo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8();
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_ps&expand=6117)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 {
+    simd_shuffle16(
+        a,
+        b,
+        [
+            0,
+            16,
+            1,
+            17,
+            0 + 4,
+            16 + 4,
+            1 + 4,
+            17 + 4,
+            0 + 8,
+            16 + 8,
+            1 + 8,
+            17 + 8,
+            0 + 12,
+            16 + 12,
+            1 + 12,
+            17 + 12,
+        ],
+    )
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_ps&expand=6115)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm512_mask_unpacklo_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16();
+    transmute(simd_select_bitmask(k, unpacklo, src.as_f32x16()))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_ps&expand=6116)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm512_maskz_unpacklo_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16();
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_pd&expand=6105)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d {
+    simd_shuffle8(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_pd&expand=6103)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm512_mask_unpacklo_pd(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8();
+    transmute(simd_select_bitmask(k, unpacklo, src.as_f64x8()))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_pd&expand=6104)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm512_maskz_unpacklo_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8();
+    let zero = _mm512_setzero_pd().as_f64x8();
+    transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
 /// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps128_ps512&expand=621) @@ -19779,6 +19971,92 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_unpacklo_epi32(a, b); + let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_mask_unpacklo_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpacklo_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_unpacklo_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpacklo_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 27, 11, 28, 12, 31, 15, 32, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_unpacklo_ps(a, b); + let e = _mm512_set_ps( + 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_mask_unpacklo_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_unpacklo_ps(a, 0b11111111_11111111, a, b); + let e = _mm512_set_ps( + 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_maskz_unpacklo_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_unpacklo_ps(0b00000000_11111111, a, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 27., 11., 28., 12., 31., 15., 32., 16., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index f160011122..54291877a0 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs 
+++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4640,6 +4640,68 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_unpacklo_epi64(a, b); + let e = _mm512_set_epi64(18, 2, 20, 4, 22, 6, 24, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_mask_unpacklo_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpacklo_epi64(a, 0b11111111, a, b); + let e = _mm512_set_epi64(18, 2, 20, 4, 22, 6, 24, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_maskz_unpacklo_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpacklo_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 22, 6, 24, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_unpacklo_pd(a, b); + let e = _mm512_set_pd(18., 2., 20., 4., 22., 6., 24., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_mask_unpacklo_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_unpacklo_pd(a, 0b11111111, a, b); + let e = _mm512_set_pd(18., 2., 20., 4., 22., 6., 24., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_maskz_unpacklo_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_unpacklo_pd(0b00001111, a, b); + let e = _mm512_set_pd(0., 0., 0., 0., 22., 6., 24., 8.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); From 34723b1c90a76bdb7e4a04e3fabd5a5ef3f408dd Mon Sep 17 00:00:00 2001 From: jironglin Date: Fri, 25 Sep 2020 18:46:29 +0000 Subject: [PATCH 20/20] use assert to check imm8 boundary --- crates/core_arch/src/x86/avx512f.rs | 34 ++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index e38ac8c85b..32724bb292 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -8755,6 +8755,7 @@ pub unsafe fn _mm512_maskz_shuffle_epi32(k: __mmask16, a: __m512i, imm8: _MM_PER #[cfg_attr(test, assert_instr(vshufps, imm8 = 0))] #[rustc_args_required_const(2)] pub unsafe fn _mm512_shuffle_ps(a: __m512, b: __m512, imm8: i32) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! 
shuffle4 { ( @@ -8836,6 +8837,7 @@ pub unsafe fn _mm512_mask_shuffle_ps( b: __m512, imm8: i32, ) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -8913,6 +8915,7 @@ pub unsafe fn _mm512_mask_shuffle_ps( #[cfg_attr(test, assert_instr(vshufps, imm8 = 0))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_shuffle_ps(k: __mmask16, a: __m512, b: __m512, imm8: i32) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -8991,6 +8994,7 @@ pub unsafe fn _mm512_maskz_shuffle_ps(k: __mmask16, a: __m512, b: __m512, imm8: #[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))] #[rustc_args_required_const(2)] pub unsafe fn _mm512_shuffle_pd(a: __m512d, b: __m512d, imm8: i32) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle8 { ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => { @@ -9073,6 +9077,7 @@ pub unsafe fn _mm512_mask_shuffle_pd( b: __m512d, imm8: i32, ) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle8 { ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => { @@ -9151,6 +9156,7 @@ pub unsafe fn _mm512_mask_shuffle_pd( #[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_shuffle_pd(k: __mmask8, a: __m512d, b: __m512d, imm8: i32) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle8 { ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => { @@ -9230,8 +9236,8 @@ pub unsafe fn _mm512_maskz_shuffle_pd(k: __mmask8, a: __m512d, b: __m512d, imm8: #[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] //should be vshufi32x4, but generate vshufi64x2 #[rustc_args_required_const(2)] pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; - let a = a.as_i32x16(); let b = b.as_i32x16(); macro_rules! shuffle4 { @@ -9316,8 +9322,8 @@ pub unsafe fn _mm512_mask_shuffle_i32x4( b: __m512i, imm8: i32, ) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; - let a = a.as_i32x16(); let b = b.as_i32x16(); macro_rules! shuffle4 { @@ -9401,8 +9407,8 @@ pub unsafe fn _mm512_maskz_shuffle_i32x4( b: __m512i, imm8: i32, ) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; - let a = a.as_i32x16(); let b = b.as_i32x16(); macro_rules! shuffle4 { @@ -9482,6 +9488,7 @@ pub unsafe fn _mm512_maskz_shuffle_i32x4( #[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] #[rustc_args_required_const(2)] pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -9549,6 +9556,7 @@ pub unsafe fn _mm512_mask_shuffle_i64x2( b: __m512i, imm8: i32, ) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -9617,6 +9625,7 @@ pub unsafe fn _mm512_maskz_shuffle_i64x2( b: __m512i, imm8: i32, ) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! 
shuffle4 { ( @@ -9681,6 +9690,7 @@ pub unsafe fn _mm512_maskz_shuffle_i64x2( #[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] //should be vshuff32x4, but generate vshuff64x2 #[rustc_args_required_const(2)] pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512, imm8: i32) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -9762,6 +9772,7 @@ pub unsafe fn _mm512_mask_shuffle_f32x4( b: __m512, imm8: i32, ) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -9839,6 +9850,7 @@ pub unsafe fn _mm512_mask_shuffle_f32x4( #[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512, imm8: i32) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -9917,6 +9929,7 @@ pub unsafe fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512, imm #[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] #[rustc_args_required_const(2)] pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d, imm8: i32) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -9984,6 +9997,7 @@ pub unsafe fn _mm512_mask_shuffle_f64x2( b: __m512d, imm8: i32, ) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -10052,6 +10066,7 @@ pub unsafe fn _mm512_maskz_shuffle_f64x2( b: __m512d, imm8: i32, ) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -10119,6 +10134,7 @@ pub unsafe fn _mm512_maskz_shuffle_f64x2( )] #[rustc_args_required_const(1)] pub unsafe fn _mm512_extractf32x4_ps(a: __m512, imm8: i32) -> __m128 { + assert!(imm8 >= 0 && imm8 <= 3); match imm8 & 0x3 { 0 => simd_shuffle4(a, _mm512_undefined_ps(), [0, 1, 2, 3]), 1 => simd_shuffle4(a, _mm512_undefined_ps(), [4, 5, 6, 7]), @@ -10237,6 +10253,7 @@ pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] //should be vinserti32x4 #[rustc_args_required_const(2)] pub unsafe fn _mm512_inserti32x4(a: __m512i, b: __m128i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 3); let a = a.as_i32x16(); let b = _mm512_castsi128_si512(b).as_i32x16(); let ret: i32x16 = match imm8 & 0b11 { @@ -10274,6 +10291,7 @@ pub unsafe fn _mm512_mask_inserti32x4( b: __m128i, imm8: i32, ) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 3); let a = a.as_i32x16(); let b = _mm512_castsi128_si512(b).as_i32x16(); let insert: i32x16 = match imm8 & 0b11 { @@ -10305,6 +10323,7 @@ pub unsafe fn _mm512_mask_inserti32x4( #[cfg_attr(test, assert_instr(vinserti32x4, imm8 = 2))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 3); let a = a.as_i32x16(); let b = _mm512_castsi128_si512(b).as_i32x16(); let insert = match imm8 & 0b11 { @@ -10337,6 +10356,7 @@ pub unsafe fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i, imm #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] //should be vinserti64x4 #[rustc_args_required_const(2)] pub unsafe fn _mm512_inserti64x4(a: __m512i, b: __m256i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 1); let b = _mm512_castsi256_si512(b); match imm8 & 0b1 { 0 => simd_shuffle8(a, b, [8, 9, 
10, 11, 4, 5, 6, 7]), @@ -10358,6 +10378,7 @@ pub unsafe fn _mm512_mask_inserti64x4( b: __m256i, imm8: i32, ) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 1); let b = _mm512_castsi256_si512(b); let insert = match imm8 & 0b1 { 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), @@ -10374,6 +10395,7 @@ pub unsafe fn _mm512_mask_inserti64x4( #[cfg_attr(test, assert_instr(vinserti64x4, imm8 = 1))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 1); let b = _mm512_castsi256_si512(b); let insert = match imm8 & 0b1 { 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), @@ -10391,6 +10413,7 @@ pub unsafe fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i, imm8 #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] #[rustc_args_required_const(2)] pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128, imm8: i32) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 3); let b = _mm512_castps128_ps512(b); match imm8 & 0b11 { 0 => simd_shuffle16( @@ -10426,6 +10449,7 @@ pub unsafe fn _mm512_mask_insertf32x4( b: __m128, imm8: i32, ) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 3); let b = _mm512_castps128_ps512(b); let insert = match imm8 & 0b11 { 0 => simd_shuffle16( @@ -10456,6 +10480,7 @@ pub unsafe fn _mm512_mask_insertf32x4( #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128, imm8: i32) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 3); let b = _mm512_castps128_ps512(b); let insert = match imm8 & 0b11 { 0 => simd_shuffle16( @@ -10487,6 +10512,7 @@ pub unsafe fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128, imm8: #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] #[rustc_args_required_const(2)] pub unsafe fn _mm512_insertf64x4(a: __m512d, b: __m256d, imm8: i32) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 1); let b = _mm512_castpd256_pd512(b); match imm8 & 0b1 { 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), @@ -10508,6 +10534,7 @@ pub unsafe fn _mm512_mask_insertf64x4( b: __m256d, imm8: i32, ) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 1); let b = _mm512_castpd256_pd512(b); let insert = match imm8 & 0b1 { 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), @@ -10524,6 +10551,7 @@ pub unsafe fn _mm512_mask_insertf64x4( #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_insertf64x4(k: __mmask8, a: __m512d, b: __m256d, imm8: i32) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 1); let b = _mm512_castpd256_pd512(b); let insert = match imm8 & 0b1 { 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
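
Reviewer's note (not part of the patches): the `simd_shuffle16` index pattern used by `_mm512_unpacklo_epi32` and `_mm512_unpacklo_ps` above is easy to misread, so here is a minimal scalar sketch of what it computes. The function `unpacklo_epi32_model` is a hypothetical name introduced purely for illustration.

```rust
// Scalar model of the per-128-bit-lane unpacklo pattern implemented above.
// A 512-bit vector is treated as 16 i32 elements = four 128-bit lanes.
fn unpacklo_epi32_model(a: [i32; 16], b: [i32; 16]) -> [i32; 16] {
    let mut dst = [0i32; 16];
    for lane in 0..4 {
        let base = lane * 4;
        // The low half of each lane is elements `base` and `base + 1`;
        // interleave them as a, b, a, b — matching the shuffle indices
        // [0, 16, 1, 17, 0 + 4, 16 + 4, 1 + 4, 17 + 4, ...].
        dst[base] = a[base];
        dst[base + 1] = b[base];
        dst[base + 2] = a[base + 1];
        dst[base + 3] = b[base + 1];
    }
    dst
}

fn main() {
    let a: [i32; 16] = core::array::from_fn(|i| i as i32 + 1); // 1..=16
    let b: [i32; 16] = core::array::from_fn(|i| i as i32 + 17); // 17..=32
    let dst = unpacklo_epi32_model(a, b);
    // The first lane interleaves a[0..2] with b[0..2].
    assert_eq!(dst[..4], [1, 17, 2, 18]);
}
```

The unit tests above build their vectors with `_mm512_set_epi32`, which takes arguments from the highest element down, so their expected values read in the reverse order of this model.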
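A note on PATCH 20: every `assert!` it adds follows the same guard-then-mask idiom, shown below in isolation. This is an illustrative sketch, not code from the patch; `checked_imm8` is a hypothetical helper, and each intrinsic actually inlines the check directly.

```rust
// The guard-then-mask idiom PATCH 20 applies to the shuffle, insert, and
// extract intrinsics: reject values the instruction cannot encode up front,
// instead of letting the mask truncate them silently.
fn checked_imm8(imm8: i32, max: i32) -> u8 {
    assert!(imm8 >= 0 && imm8 <= max);
    (imm8 & 0xFF) as u8
}

fn main() {
    // An 8-bit control byte (e.g. _mm512_shuffle_ps) allows 0..=255.
    assert_eq!(checked_imm8(0b1011_1111, 255), 0b1011_1111);
    // A 2-bit lane selector (e.g. _mm512_insertf32x4) allows only 0..=3;
    // checked_imm8(4, 3) would panic here, whereas the intrinsic's
    // `match imm8 & 0b11` alone would silently select lane 0.
    assert_eq!(checked_imm8(3, 3), 3);
}
```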