From 45899dcd62cb653c39eb7bbdec37b7eb53491459 Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 20 Sep 2020 21:57:15 +0000 Subject: [PATCH 01/20] castps128_512 --- crates/core_arch/avx512f.md | 2 +- crates/core_arch/src/x86/avx512f.rs | 53 +++++++++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 18 ++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index c978a63461..353dab7a3e 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -32,7 +32,7 @@ * [ ] [`_mm512_castpd512_pd256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd256&expand=5236) * [ ] [`_mm512_castpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ps&expand=5236) * [ ] [`_mm512_castpd_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_si512&expand=5236) - * [ ] [`_mm512_castps128_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps128_ps512&expand=5236) + * [x] [`_mm512_castps128_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps128_ps512&expand=5236) * [ ] [`_mm512_castps256_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps256_ps512&expand=5236) * [ ] [`_mm512_castps512_ps128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps128&expand=5236) * [ ] [`_mm512_castps512_ps256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps256&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 3f9bbfb3e1..80cfe54a58 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10228,6 +10228,32 @@ pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { let zero = _mm512_setzero_pd().as_f64x8(); transmute(simd_select_bitmask(k, mov, zero)) } +/* +/// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_insertf32x4&expand=3155) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf32x4))] +pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128, imm8: i32) -> __m512 { + let a = a.as_f32x16(); + let b = b.as_f32x4(); + match imm8 & 3 { + 0 => transmute(simd_insert(a, 0, b)), + 1 => transmute(simd_insert(a, 1, b)), + 2 => transmute(simd_insert(a, 2, b)), + _ => transmute(simd_insert(a, 3, b)), + } +} +*/ +/// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps128_ps512&expand=621) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 { + simd_shuffle16(a, _mm_set1_ps(-1.), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]) +} /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
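// ---------------------------------------------------------------------------
// Reviewer note (not part of the patch): a minimal usage sketch for the cast
// added above. `_mm512_castps128_ps512` is zero-cost; lanes 4..15 of the
// result are unspecified (this implementation happens to fill them with -1.0),
// so callers must not rely on their contents. The helper name below is
// hypothetical, and `_mm512_castps512_ps128` is only added later in this
// series.
#[target_feature(enable = "avx512f")]
unsafe fn widen_then_narrow(x: std::arch::x86_64::__m128) -> std::arch::x86_64::__m128 {
    use std::arch::x86_64::*;
    let wide: __m512 = _mm512_castps128_ps512(x); // lanes 0..3 == x
    _mm512_castps512_ps128(wide) // recovers exactly the original 4 lanes
}
// ---------------------------------------------------------------------------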
 ///
@@ -18367,6 +18393,33 @@ mod tests {
         );
         assert_eq_m512(r, e);
     }
+/*
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_insertf32x4() {
+        let a = _mm512_setr_ps(
+            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+        );
+        let b = _mm_setr_ps(
+            17., 18., 19., 20.,
+        );
+        let r = _mm512_insertf32x4(a, b, 0);
+        let e = _mm512_setr_ps(
+            17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
+        );
+        assert_eq_m512(r, e);
+    }
+*/
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_castps128_ps512() {
+        let a = _mm_setr_ps(
+            17., 18., 19., 20.,
+        );
+        let r = _mm512_castps128_ps512(a);
+        let e = _mm512_setr_ps(
+            17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
+        );
+        assert_eq_m512(r, e);
+    }
 
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_and_epi32() {
diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs
index abe99f23a2..a649cf3e2d 100644
--- a/crates/core_arch/src/x86/mod.rs
+++ b/crates/core_arch/src/x86/mod.rs
@@ -433,6 +433,24 @@ impl m256iExt for __m256i {
     }
 }
 
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m128Ext: Sized {
+    fn as_m128(self) -> __m128;
+
+    #[inline]
+    fn as_f32x4(self) -> crate::core_arch::simd::f32x4 {
+        unsafe { transmute(self.as_m128()) }
+    }
+}
+
+impl m128Ext for __m128 {
+    #[inline]
+    fn as_m128(self) -> Self {
+        self
+    }
+}
+
 #[allow(non_camel_case_types)]
 #[unstable(feature = "stdsimd_internal", issue = "none")]
 pub(crate) trait m256Ext: Sized {
From c73dd0f9ed175a5a3ecad1a84887ef95cee7bb35 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Mon, 21 Sep 2020 00:55:17 +0000
Subject: [PATCH 02/20] castps_pd, castps_si512, castps512_ps256,
 castps512_ps128, castps128_ps512, castps256_ps512

---
 crates/core_arch/avx512f.md         | 10 +-
 crates/core_arch/src/x86/avx512f.rs | 97 +++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 353dab7a3e..4fb16cd404 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -33,11 +33,11 @@
  * [ ] [`_mm512_castpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ps&expand=5236)
  * [ ] [`_mm512_castpd_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_si512&expand=5236)
  * [x] [`_mm512_castps128_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps128_ps512&expand=5236)
- * [ ] [`_mm512_castps256_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps256_ps512&expand=5236)
- * [ ] [`_mm512_castps512_ps128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps128&expand=5236)
- * [ ] [`_mm512_castps512_ps256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps256&expand=5236)
- * [ ] [`_mm512_castps_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_pd&expand=5236)
- * [ ] [`_mm512_castps_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_si512&expand=5236)
+ * [x] [`_mm512_castps256_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps256_ps512&expand=5236)
+ * [x] [`_mm512_castps512_ps128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps128&expand=5236)
+ * [x]
[`_mm512_castps512_ps256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps256&expand=5236) + * [x] [`_mm512_castps_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_pd&expand=5236) + * [x] [`_mm512_castps_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_si512&expand=5236) * [ ] [`_mm512_castsi128_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi128_si512&expand=5236) * [ ] [`_mm512_castsi256_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi256_si512&expand=5236) * [ ] [`_mm512_castsi512_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 80cfe54a58..01bc8b1ad0 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10255,6 +10255,51 @@ pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 { simd_shuffle16(a, _mm_set1_ps(-1.), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]) } +/// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps256_ps512&expand=623) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 { + simd_shuffle16(a, _mm256_set1_ps(-1.), [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]) +} + +/// Cast vector of type __m512 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps512_ps128&expand=624) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 { + simd_shuffle4(a, a, [0, 1, 2, 3]) +} + +/// Cast vector of type __m512 to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps512_ps256&expand=625) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castps512_ps256(a: __m512) -> __m256 { + simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +/// Cast vector of type __m512 to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps_pd&expand=616) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castps_pd(a: __m512) -> __m512d { + transmute(a.as_m512()) +} + +/// Cast vector of type __m512 to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps_si512&expand=619) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castps_si512(a: __m512) -> __m512i { + transmute(a.as_m512()) +} + /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. @@ -18421,6 +18466,58 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps256_ps512() { + let a = _mm256_setr_ps( + 17., 18., 19., 20., 21., 22., 23., 24., + ); + let r = _mm512_castps256_ps512(a); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps512_ps128() { + let a = _mm512_setr_ps( + 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., + ); + let r = _mm512_castps512_ps128(a); + let e = _mm_setr_ps( + 17., 18., 19., 20., + ); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps512_ps256() { + let a = _mm512_setr_ps( + 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1., + ); + let r = _mm512_castps512_ps256(a); + let e = _mm256_setr_ps( + 17., 18., 19., 20., 21., 22., 23., 24., + ); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps_pd() { + let a = _mm512_set1_ps(1.); + let r = _mm512_castps_pd(a); + let e = _mm512_set1_pd(0.007812501848093234); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castps_si512() { + let a = _mm512_set1_ps(1.); + let r = _mm512_castps_si512(a); + let e = _mm512_set1_epi32(1065353216); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( From 4e9d4d5f33617a24831c344ff2f51d1756e2060e Mon Sep 17 00:00:00 2001 From: jironglin Date: Mon, 21 Sep 2020 21:46:32 +0000 Subject: [PATCH 03/20] castpd_ps, castpd_si512, castpd512_pd256, castpd512_pd128, castpd128_pd512, castpd256_pd512 --- crates/core_arch/avx512f.md | 8 +-- crates/core_arch/src/x86/avx512f.rs | 82 +++++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 64 ++++++++++++++++++++ 3 files changed, 149 insertions(+), 5 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 4fb16cd404..7cf023cbf5 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -26,10 +26,10 @@ * [ ] [`_mm512_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=5236) * [ ] [`_mm512_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=5236) * [ ] [`_mm512_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastss_ps&expand=5236) - * [ ] [`_mm512_castpd128_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd128_pd512&expand=5236) - * [ ] [`_mm512_castpd256_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd256_pd512&expand=5236) - * [ ] 
[`_mm512_castpd512_pd128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd128&expand=5236) - * [ ] [`_mm512_castpd512_pd256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd256&expand=5236) + * [x] [`_mm512_castpd128_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd128_pd512&expand=5236) + * [x] [`_mm512_castpd256_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd256_pd512&expand=5236) + * [x] [`_mm512_castpd512_pd128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd128&expand=5236) + * [x] [`_mm512_castpd512_pd256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd256&expand=5236) * [ ] [`_mm512_castpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ps&expand=5236) * [ ] [`_mm512_castpd_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_si512&expand=5236) * [x] [`_mm512_castps128_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps128_ps512&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 01bc8b1ad0..4f4b11e762 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10300,6 +10300,74 @@ pub unsafe fn _mm512_castps_si512(a: __m512) -> __m512i { transmute(a.as_m512()) } +/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd128_pd512&expand=609) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d { + simd_shuffle8(a, _mm_set1_pd(-1.), [0, 1, 2, 2, 2, 2, 2, 2]) +} + +/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd256_pd512&expand=611) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d { + simd_shuffle8(a, _mm256_set1_pd(-1.), [0, 1, 2, 3, 4, 4, 4, 4]) +} + +/// Cast vector of type __m512d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd512_pd128&expand=612) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d { + simd_shuffle2(a, a, [0, 1]) +} + +/// Cast vector of type __m512d to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd512_pd256&expand=613) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castpd512_pd256(a: __m512d) -> __m256d { + simd_shuffle4(a, a, [0, 1, 2, 3]) +} + +/// Cast vector of type __m512d to type __m512. 
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd_ps&expand=604) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castpd_ps(a: __m512d) -> __m512 { + transmute(a.as_m512d()) +} + +/// Cast vector of type __m512d to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd_si512&expand=607) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castpd_si512(a: __m512d) -> __m512i { + transmute(a.as_m512d()) +} + +/* +/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi128_si512&expand=629) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i { + let a = a.as_m128i(); + transmute(simd_shuffle16(a, _mm_set1_epi32(-1).as_m128i(), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]).as_m512i()) +//pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 { + //simd_shuffle16(a, _mm_set1_ps(-1.), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]) +} +*/ + /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. @@ -18517,7 +18585,19 @@ mod tests { let e = _mm512_set1_epi32(1065353216); assert_eq_m512i(r, e); } - +/* + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi128_si512() { + let a = _mm_setr_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_castsi128_si512(a); + let e = _mm512_setr_epi32( + 17, 18, 19, 20, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + ); + assert_eq_m512i(r, e); + } +*/ #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 036cf36c74..c59c84ff6e 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4278,6 +4278,70 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd128_pd512() { + let a = _mm_setr_pd( + 17., 18., + ); + let r = _mm512_castpd128_pd512(a); + let e = _mm512_setr_pd( + 17., 18., -1., -1., -1., -1., -1., -1. 
+ ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd256_pd512() { + let a = _mm256_setr_pd( + 17., 18., 19., 20., + ); + let r = _mm512_castpd256_pd512(a); + let e = _mm512_setr_pd( + 17., 18., 19., 20., -1., -1., -1., -1., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd512_pd128() { + let a = _mm512_setr_pd( + 17., 18., -1., -1., -1., -1., -1., -1., + ); + let r = _mm512_castpd512_pd128(a); + let e = _mm_setr_pd( + 17., 18., + ); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd512_pd256() { + let a = _mm512_setr_pd( + 17., 18., 19., 20., -1., -1., -1., -1., + ); + let r = _mm512_castpd512_pd256(a); + let e = _mm256_setr_pd( + 17., 18., 19., 20., + ); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd_ps() { + let a = _mm512_set1_pd(1.); + let r = _mm512_castpd_ps(a); + let e = _mm512_set_ps(1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castpd_si512() { + let a = _mm512_set1_pd(1.); + let r = _mm512_castpd_si512(a); + let e = _mm512_set_epi32(1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); From 1d18142476efd28cc20333473bb010a7c4981c43 Mon Sep 17 00:00:00 2001 From: jironglin Date: Tue, 22 Sep 2020 00:52:56 +0000 Subject: [PATCH 04/20] castsi512_si128, castsi512_si256, castsi128_si512, castsi256_si512, castsi512_ps, castsi512_pd --- crates/core_arch/avx512f.md | 16 +++---- crates/core_arch/src/x86/avx512f.rs | 66 ++++++++++++++++++-------- crates/core_arch/src/x86_64/avx512f.rs | 64 +++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 27 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 7cf023cbf5..a5f95b23ef 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -30,20 +30,20 @@ * [x] [`_mm512_castpd256_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd256_pd512&expand=5236) * [x] [`_mm512_castpd512_pd128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd128&expand=5236) * [x] [`_mm512_castpd512_pd256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd256&expand=5236) - * [ ] [`_mm512_castpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ps&expand=5236) - * [ ] [`_mm512_castpd_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_si512&expand=5236) + * [x] [`_mm512_castpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ps&expand=5236) + * [x] [`_mm512_castpd_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_si512&expand=5236) * [x] [`_mm512_castps128_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps128_ps512&expand=5236) * [x] [`_mm512_castps256_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps256_ps512&expand=5236) * [x] 
[`_mm512_castps512_ps128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps128&expand=5236) * [x] [`_mm512_castps512_ps256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps512_ps256&expand=5236) * [x] [`_mm512_castps_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_pd&expand=5236) * [x] [`_mm512_castps_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_si512&expand=5236) - * [ ] [`_mm512_castsi128_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi128_si512&expand=5236) - * [ ] [`_mm512_castsi256_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi256_si512&expand=5236) - * [ ] [`_mm512_castsi512_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_pd&expand=5236) - * [ ] [`_mm512_castsi512_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ps&expand=5236) - * [ ] [`_mm512_castsi512_si128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_si128&expand=5236) - * [ ] [`_mm512_castsi512_si256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_si256&expand=5236) + * [x] [`_mm512_castsi128_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi128_si512&expand=5236) + * [x] [`_mm512_castsi256_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi256_si512&expand=5236) + * [x] [`_mm512_castsi512_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_pd&expand=5236) + * [x] [`_mm512_castsi512_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ps&expand=5236) + * [x] [`_mm512_castsi512_si128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_si128&expand=5236) + * [x] [`_mm512_castsi512_si256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_si256&expand=5236) * [x] [`_mm512_cmp_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi32_mask&expand=5236) * [x] [`_mm512_cmp_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi64_mask&expand=5236) * [x] [`_mm512_cmp_epu32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epu32_mask&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 4f4b11e762..7ae09d463e 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10354,19 +10354,59 @@ pub unsafe fn _mm512_castpd_si512(a: __m512d) -> __m512i { transmute(a.as_m512d()) } -/* /// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi128_si512&expand=629) #[inline] #[target_feature(enable = "avx512f")] pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i { - let a = a.as_m128i(); - transmute(simd_shuffle16(a, _mm_set1_epi32(-1).as_m128i(), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]).as_m512i()) -//pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 { - //simd_shuffle16(a, _mm_set1_ps(-1.), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]) + simd_shuffle8(a, _mm_set1_epi64x(-1), [0, 1, 2, 2, 2, 2, 2, 2]) +} + +/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi256_si512&expand=633) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i { + simd_shuffle8(a, _mm256_set1_epi64x(-1), [0, 1, 2, 3, 4, 4, 4, 4]) +} + +/// Cast vector of type __m512i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_si128&expand=636) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i { + simd_shuffle2(a, a, [0, 1]) +} + +/// Cast vector of type __m512i to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_si256&expand=637) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castsi512_si256(a: __m512i) -> __m256i { + simd_shuffle4(a, a, [0, 1, 2, 3]) +} + +/// Cast vector of type __m512i to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_ps&expand=635) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castsi512_ps(a: __m512i) -> __m512 { + transmute(a) +} + +/// Cast vector of type __m512i to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_pd&expand=634) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { + transmute(a) } -*/ /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
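// ---------------------------------------------------------------------------
// Reviewer note (not part of the patch): a sketch of how the si512 casts added
// above compose. Widening leaves the new upper lanes unspecified (this
// implementation fills them with -1), narrowing drops the upper lanes, and the
// ps/pd/si512 casts are pure bit reinterpretations, so a round trip through
// them is lossless. Hypothetical helper, assuming std::arch with AVX512F:
#[target_feature(enable = "avx512f")]
unsafe fn cast_roundtrip(v: std::arch::x86_64::__m128i) -> std::arch::x86_64::__m128i {
    use std::arch::x86_64::*;
    let wide = _mm512_castsi128_si512(v); // upper 384 bits unspecified
    let as_ps = _mm512_castsi512_ps(wide); // reinterpret the same 512 bits as f32x16
    let back = _mm512_castps_si512(as_ps); // reinterpret back; no bits change
    _mm512_castsi512_si128(back) // low 128 bits are exactly `v`
}
// ---------------------------------------------------------------------------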
/// @@ -18585,19 +18625,7 @@ mod tests { let e = _mm512_set1_epi32(1065353216); assert_eq_m512i(r, e); } -/* - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_castsi128_si512() { - let a = _mm_setr_epi32( - 17, 18, 19, 20, - ); - let r = _mm512_castsi128_si512(a); - let e = _mm512_setr_epi32( - 17, 18, 19, 20, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - ); - assert_eq_m512i(r, e); - } -*/ + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index c59c84ff6e..327d78e2ec 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4342,6 +4342,70 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi128_si512() { + let a = _mm_setr_epi64x( + 17, 18, + ); + let r = _mm512_castsi128_si512(a); + let e = _mm512_setr_epi64( + 17, 18, -1, -1, -1, -1, -1, -1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi256_si512() { + let a = _mm256_setr_epi64x( + 17, 18, 19, 20, + ); + let r = _mm512_castsi256_si512(a); + let e = _mm512_setr_epi64( + 17, 18, 19, 20, -1, -1, -1, -1, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi512_si128() { + let a = _mm512_setr_epi64( + 17, 18, -1, -1, -1, -1, -1, -1, + ); + let r = _mm512_castsi512_si128(a); + let e = _mm_setr_epi64x( + 17, 18, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi512_si256() { + let a = _mm512_setr_epi64( + 17, 18, 19, 20, -1, -1, -1, -1, + ); + let r = _mm512_castsi512_si256(a); + let e = _mm256_setr_epi64x( + 17, 18, 19, 20, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi512_ps() { + let a = _mm512_set1_epi64(1<<62); + let r = _mm512_castsi512_ps(a); + let e = _mm512_set_ps(2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_castsi512_pd() { + let a = _mm512_set1_epi64(1<<62); + let r = _mm512_castsi512_pd(a); + let e = _mm512_set_pd(2., 2., 2., 2., 2., 2., 2., 2.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); From f2723d8d76c9d69eccd42473addc7eddb4d54005 Mon Sep 17 00:00:00 2001 From: jironglin Date: Tue, 22 Sep 2020 01:34:58 +0000 Subject: [PATCH 05/20] broadcastsd_pd, broadcastss_ps --- crates/core_arch/avx512f.md | 4 ++-- crates/core_arch/src/x86/avx512f.rs | 30 +++++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 10 +++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index a5f95b23ef..7118655272 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -24,8 +24,8 @@ * [ ] [`_mm512_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i64x4&expand=5236) * [ ] [`_mm512_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=5236) * [ ] [`_mm512_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=5236) - * [ ] 
[`_mm512_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=5236)
- * [ ] [`_mm512_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastss_ps&expand=5236)
+ * [x] [`_mm512_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=5236)
+ * [x] [`_mm512_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastss_ps&expand=5236)
  * [x] [`_mm512_castpd128_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd128_pd512&expand=5236)
  * [x] [`_mm512_castpd256_pd512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd256_pd512&expand=5236)
  * [x] [`_mm512_castpd512_pd128`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd128&expand=5236)
  * [x] [`_mm512_castpd512_pd256`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd512_pd256&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 7ae09d463e..fcc58ff560 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -10408,8 +10408,26 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d {
     transmute(a)
 }
 
-/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.
 ///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastsd_pd&expand=567)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d {
+    simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastss_ps&expand=578)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 {
+    simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+}
+
 /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst.
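// ---------------------------------------------------------------------------
// Reviewer note (not part of the patch): usage sketch for the two element
// broadcasts above — both splat lane 0 of their 128-bit source across the full
// 512-bit register. Hypothetical helper, assuming std::arch with AVX512F:
#[target_feature(enable = "avx512f")]
unsafe fn splat_f32(x: f32) -> std::arch::x86_64::__m512 {
    use std::arch::x86_64::*;
    let v = _mm_set_ss(x); // x in lane 0, zeros elsewhere
    _mm512_broadcastss_ps(v) // all 16 lanes now hold x
}
// ---------------------------------------------------------------------------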
 ///
@@ -18644,6 +18758,16 @@ mod tests {
         assert_eq_m512i(r, e);
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_broadcastss_ps() {
+        let a = _mm_set_ps(
+            17., 18., 19., 20.,
+        );
+        let r = _mm512_broadcastss_ps(a);
+        let e = _mm512_set1_ps(20.);
+        assert_eq_m512(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_and_epi32() {
         let a = _mm512_set_epi32(
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 327d78e2ec..7401a96ecf 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -4406,6 +4406,16 @@ mod tests {
         assert_eq_m512d(r, e);
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_broadcastsd_pd() {
+        let a = _mm_set_pd(
+            17., 18.,
+        );
+        let r = _mm512_broadcastsd_pd(a);
+        let e = _mm512_set1_pd(18.);
+        assert_eq_m512d(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_and_epi64() {
         let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
From def78a5eab34706f65f036d7b90148f742481a36 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Tue, 22 Sep 2020 15:22:48 +0000
Subject: [PATCH 06/20] broadcastd_epi32, broadcastq_epi64

---
 crates/core_arch/avx512f.md            |  20 +--
 crates/core_arch/src/x86/avx512f.rs    | 186 ++++++++++++++++++++++++-
 crates/core_arch/src/x86_64/avx512f.rs |  60 ++++++++
 3 files changed, 252 insertions(+), 14 deletions(-)

diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 7118655272..fd4916c930 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -22,8 +22,8 @@
  * [ ] [`_mm512_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f64x4&expand=5236)
  * [ ] [`_mm512_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i32x4&expand=5236)
  * [ ] [`_mm512_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i64x4&expand=5236)
- * [ ] [`_mm512_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=5236)
- * [ ] [`_mm512_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=5236)
+ * [x] [`_mm512_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=5236)
+ * [x] [`_mm512_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=5236)
  * [x] [`_mm512_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=5236)
  * [x] [`_mm512_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastss_ps&expand=5236)
@@ -298,10 +298,10 @@
  * [ ] [`_mm512_mask_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f64x4&expand=5236)
  * [ ] [`_mm512_mask_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i32x4&expand=5236)
  * [ ]
[`_mm512_mask_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i64x4&expand=5236) - * [ ] [`_mm512_mask_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastd_epi32&expand=5236) - * [ ] [`_mm512_mask_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastq_epi64&expand=5236) - * [ ] [`_mm512_mask_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastsd_pd&expand=5236) - * [ ] [`_mm512_mask_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastss_ps&expand=5236) + * [x] [`_mm512_mask_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastd_epi32&expand=5236) + * [x] [`_mm512_mask_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastq_epi64&expand=5236) + * [x] [`_mm512_mask_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastsd_pd&expand=5236) + * [x] [`_mm512_mask_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastss_ps&expand=5236) * [x] [`_mm512_mask_cmp_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi32_mask&expand=5236) * [x] [`_mm512_mask_cmp_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi64_mask&expand=5236) * [x] [`_mm512_mask_cmp_epu32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epu32_mask&expand=5236) @@ -694,10 +694,10 @@ * [ ] [`_mm512_maskz_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f64x4&expand=5236) * [ ] [`_mm512_maskz_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i32x4&expand=5236) * [ ] [`_mm512_maskz_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i64x4&expand=5236) - * [ ] [`_mm512_maskz_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastd_epi32&expand=5236) - * [ ] [`_mm512_maskz_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastq_epi64&expand=5236) - * [ ] [`_mm512_maskz_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastsd_pd&expand=5236) - * [ ] [`_mm512_maskz_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastss_ps&expand=5236) + * [x] [`_mm512_maskz_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastd_epi32&expand=5236) + * [x] [`_mm512_maskz_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastq_epi64&expand=5236) + * [x] [`_mm512_maskz_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastsd_pd&expand=5236) + * [x] [`_mm512_maskz_broadcastss_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastss_ps&expand=5236) * [ ] 
[`_mm512_maskz_compress_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi32&expand=5236)
  * [ ] [`_mm512_maskz_compress_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi64&expand=5236)
  * [ ] [`_mm512_maskz_compress_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_pd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index fcc58ff560..5f6a06705e 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -10408,16 +10408,74 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d {
     transmute(a)
 }
 
-/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.
+/// Broadcast the low packed 32-bit integer from a to all elements of dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastsd_pd&expand=567)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastd_epi32&expand=545)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vbroadcastsd))]
-pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d {
+#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i {
+    let a = _mm512_castsi128_si512(a).as_i32x16();
+    let ret = simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+    transmute::<i32x16, _>(ret)
+}
+
+/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastd_epi32&expand=546)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm512_mask_broadcastd_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
+    let broadcast = _mm512_broadcastd_epi32(a).as_i32x16();
+    transmute(simd_select_bitmask(k, broadcast, src.as_i32x16()))
+}
+
+/// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastd_epi32&expand=547)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd
+pub unsafe fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i {
+    let broadcast = _mm512_broadcastd_epi32(a).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastq_epi64&expand=560)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastq
+pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i {
+    simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastq_epi64&expand=561)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq
+pub unsafe fn _mm512_mask_broadcastq_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
+    let broadcast = _mm512_broadcastq_epi64(a).as_i64x8();
+    transmute(simd_select_bitmask(k, broadcast, src.as_i64x8()))
+}
+
+/// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastq_epi64&expand=562)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq
+pub unsafe fn _mm512_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m512i {
+    let broadcast = _mm512_broadcastq_epi64(a).as_i64x8();
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
 /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastss_ps&expand=578)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 {
     simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 }
 
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastss_ps&expand=579)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm512_mask_broadcastss_ps(src: __m512, k: __mmask16, a: __m128) -> __m512 {
+    let broadcast = _mm512_broadcastss_ps(a).as_f32x16();
+    transmute(simd_select_bitmask(k, broadcast, src.as_f32x16()))
+}
+
+/// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastss_ps&expand=580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastss))]
+pub unsafe fn _mm512_maskz_broadcastss_ps(k: __mmask16, a: __m128) -> __m512 {
+    let broadcast = _mm512_broadcastss_ps(a).as_f32x16();
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, broadcast, zero))
+}
+
+/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastsd_pd&expand=567)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vbroadcastsd))]
+pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d {
+    simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
+}
+
+/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
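// ---------------------------------------------------------------------------
// Reviewer note (not part of the patch): the mask_/maskz_ pairs added in this
// patch differ only in what lands in lanes whose mask bit is clear — the
// corresponding lane of `src` (writemask) versus zero (zeromask). A minimal
// sketch with a hypothetical helper, assuming AVX512F:
#[target_feature(enable = "avx512f")]
unsafe fn masked_broadcast_demo(src: std::arch::x86_64::__m512, a: std::arch::x86_64::__m128) {
    use std::arch::x86_64::*;
    // lanes 0..7 take a[0]; lanes 8..15 are copied from src
    let merged = _mm512_mask_broadcastss_ps(src, 0b00000000_11111111, a);
    // lanes 0..7 take a[0]; lanes 8..15 are zeroed
    let zeroed = _mm512_maskz_broadcastss_ps(0b00000000_11111111, a);
    let _ = (merged, zeroed);
}
// ---------------------------------------------------------------------------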
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastsd_pd&expand=568) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub unsafe fn _mm512_mask_broadcastsd_pd(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { + let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) +} + +/// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastsd_pd&expand=569) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vbroadcastsd))] +pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { + let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi32&expand=272) @@ -18644,6 +18758,43 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcastd_epi32() { + let a = _mm_set_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_broadcastd_epi32(a); + let e = _mm512_set1_epi32(20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcastd_epi32() { + let src = _mm512_set1_epi32(20); + let a = _mm_set_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_mask_broadcastd_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcastd_epi32(src, 0b11111111_11111111, a); + let e = _mm512_set1_epi32(20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcastd_epi32() { + let a = _mm_set_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_maskz_broadcastd_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_broadcastd_epi32(0b00000000_11111111, a); + let e = _mm512_setr_epi32( + 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcastss_ps() { let a = _mm_set_ps( @@ -18654,6 +18805,33 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcastss_ps() { + let src = _mm512_set1_ps(20.); + let a = _mm_set_ps( + 17., 18., 19., 20., + ); + let r = _mm512_mask_broadcastss_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_broadcastss_ps(src, 0b11111111_11111111, a); + let e = _mm512_set1_ps(20.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcastss_ps() { + let a = _mm_set_ps( + 17., 18., 19., 20., + ); + let r = _mm512_maskz_broadcastss_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_broadcastss_ps(0b00000000_11111111, a); + let e = _mm512_setr_ps( + 20., 20., 20., 20., 20., 20., 20., 20., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 
7401a96ecf..8f92b73e34 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -4406,6 +4406,41 @@ mod tests {
         assert_eq_m512d(r, e);
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_broadcastq_epi64() {
+        let a = _mm_set_epi64x(
+            17, 18,
+        );
+        let r = _mm512_broadcastq_epi64(a);
+        let e = _mm512_set1_epi64(18);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_broadcastq_epi64() {
+        let src = _mm512_set1_epi64(18);
+        let a = _mm_set_epi64x(
+            17, 18,
+        );
+        let r = _mm512_mask_broadcastq_epi64(src, 0, a);
+        assert_eq_m512i(r, src);
+        let r = _mm512_mask_broadcastq_epi64(src, 0b01111111, a);
+        let e = _mm512_set1_epi64(18);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_maskz_broadcastq_epi64() {
+        let a = _mm_set_epi64x(
+            17, 18,
+        );
+        let r = _mm512_maskz_broadcastq_epi64(0, a);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_broadcastq_epi64(0b00001111, a);
+        let e = _mm512_set_epi64(0, 0, 0, 0, 18, 18, 18, 18);
+        assert_eq_m512i(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_broadcastsd_pd() {
         let a = _mm_set_pd(
@@ -4416,6 +4451,31 @@ mod tests {
         assert_eq_m512d(r, e);
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_broadcastsd_pd() {
+        let src = _mm512_set1_pd(18.);
+        let a = _mm_set_pd(
+            17., 18.,
+        );
+        let r = _mm512_mask_broadcastsd_pd(src, 0, a);
+        assert_eq_m512d(r, src);
+        let r = _mm512_mask_broadcastsd_pd(src, 0b01111111, a);
+        let e = _mm512_set1_pd(18.);
+        assert_eq_m512d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_maskz_broadcastsd_pd() {
+        let a = _mm_set_pd(
+            17., 18.,
+        );
+        let r = _mm512_maskz_broadcastsd_pd(0, a);
+        assert_eq_m512d(r, _mm512_setzero_pd());
+        let r = _mm512_maskz_broadcastsd_pd(0b00001111, a);
+        let e = _mm512_set_pd(0., 0., 0., 0., 18., 18., 18., 18.);
+        assert_eq_m512d(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_and_epi64() {
         let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
From bc95e37b21a3a830851ee9e0e2b639b8906e779b Mon Sep 17 00:00:00 2001
From: jironglin
Date: Tue, 22 Sep 2020 20:50:26 +0000
Subject: [PATCH 07/20] broadcast_i32x4, broadcast_i64x4, broadcast_f32x4,
 broadcast_f64x4

---
 crates/core_arch/avx512f.md            |  24 +--
 crates/core_arch/src/x86/avx512f.rs    | 204 +++++++++++++++++++++++++
 crates/core_arch/src/x86_64/avx512f.rs |  70 +++++++++
 3 files changed, 286 insertions(+), 12 deletions(-)

diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index fd4916c930..815fc1091d 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -18,10 +18,10 @@
  * [ ] [`_mm512_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi32&expand=5236)
  * [ ] [`_mm512_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi64&expand=5236)
  * [ ] [`_mm512_andnot_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_si512&expand=5236)
- * [ ] [`_mm512_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f32x4&expand=5236)
- * [ ] [`_mm512_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f64x4&expand=5236)
- * [ ]
[`_mm512_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i32x4&expand=5236) - * [ ] [`_mm512_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i64x4&expand=5236) + * [x] [`_mm512_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f32x4&expand=5236) + * [x] [`_mm512_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f64x4&expand=5236) + * [x] [`_mm512_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i32x4&expand=5236) + * [x] [`_mm512_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i64x4&expand=5236) * [x] [`_mm512_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=5236) * [x] [`_mm512_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=5236) * [x] [`_mm512_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=5236) @@ -294,10 +294,10 @@ * [ ] [`_mm512_mask_blend_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi64&expand=5236) * [ ] [`_mm512_mask_blend_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_pd&expand=5236) * [ ] [`_mm512_mask_blend_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ps&expand=5236) - * [ ] [`_mm512_mask_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f32x4&expand=5236) - * [ ] [`_mm512_mask_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f64x4&expand=5236) - * [ ] [`_mm512_mask_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i32x4&expand=5236) - * [ ] [`_mm512_mask_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i64x4&expand=5236) + * [x] [`_mm512_mask_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f32x4&expand=5236) + * [x] [`_mm512_mask_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f64x4&expand=5236) + * [x] [`_mm512_mask_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i32x4&expand=5236) + * [x] [`_mm512_mask_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i64x4&expand=5236) * [x] [`_mm512_mask_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastd_epi32&expand=5236) * [x] [`_mm512_mask_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastq_epi64&expand=5236) * [x] [`_mm512_mask_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastsd_pd&expand=5236) @@ -690,10 +690,10 @@ * [x] [`_mm512_maskz_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_and_epi64&expand=5236) * [ ] 
[`_mm512_maskz_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi32&expand=5236)
 * [ ] [`_mm512_maskz_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi64&expand=5236)
- * [ ] [`_mm512_maskz_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f32x4&expand=5236)
- * [ ] [`_mm512_maskz_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f64x4&expand=5236)
- * [ ] [`_mm512_maskz_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i32x4&expand=5236)
- * [ ] [`_mm512_maskz_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i64x4&expand=5236)
+ * [x] [`_mm512_maskz_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f32x4&expand=5236)
+ * [x] [`_mm512_maskz_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f64x4&expand=5236)
+ * [x] [`_mm512_maskz_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i32x4&expand=5236)
+ * [x] [`_mm512_maskz_broadcast_i64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i64x4&expand=5236)
 * [x] [`_mm512_maskz_broadcastd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastd_epi32&expand=5236)
 * [x] [`_mm512_maskz_broadcastq_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastq_epi64&expand=5236)
 * [x] [`_mm512_maskz_broadcastsd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastsd_pd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 5f6a06705e..099883e606 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -10542,6 +10542,140 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d {
     transmute(simd_select_bitmask(k, broadcast, zero))
 }
 
+/// Broadcast the 4 packed 32-bit integers from a to all elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinsert))] //should be vbroadcasti32x4
+pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i {
+    let a = _mm512_castsi128_si512(a).as_i32x16();
+    let ret = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
+    transmute::<i32x16, _>(ret)
+}
+
+/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
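+///
+/// An illustrative sketch of the writemask behavior (values are arbitrary and
+/// mirror the unit tests added later in this patch):
+///
+/// ```ignore
+/// let src = _mm512_set1_epi32(20);
+/// let a = _mm_set_epi32(17, 18, 19, 20);
+/// // k = 0: nothing is broadcast, every element is copied from src.
+/// let r = _mm512_mask_broadcast_i32x4(src, 0, a);
+/// // r == src
+/// // k = all ones: every element takes the broadcast of a.
+/// let r = _mm512_mask_broadcast_i32x4(src, 0b11111111_11111111, a);
+/// // r == _mm512_set_epi32(17, 18, 19, 20, 17, 18, 19, 20,
+/// //                       17, 18, 19, 20, 17, 18, 19, 20)
+/// ```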
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i32x4&expand=511) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshufi32x4))] //should be vbroadcasti32x4 +pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) +} + +/// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i32x4&expand=512) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vshufi32x4))] //should be vbroadcasti32x4 +pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { + let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + +/// Broadcast the 4 packed 64-bit integers from a to all elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4))] //should be vbroadcasti64x4 +pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { + simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) +} + +/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i64x4&expand=523) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinserti64x4))] //should be vbroadcasti64x4 +pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { + let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) +} + +/// Broadcast the 4 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i64x4&expand=524) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinserti64x4))] //should be vbroadcasti64x4 +pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { + let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst. 
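+///
+/// Conceptually (a sketch, not Intel's text): result element `j` equals `a`
+/// element `j % 4`, so each of the four 128-bit lanes repeats `a`.
+///
+/// ```ignore
+/// let a = _mm_set_ps(17., 18., 19., 20.);
+/// let r = _mm512_broadcast_f32x4(a);
+/// // r == _mm512_set_ps(17., 18., 19., 20., 17., 18., 19., 20.,
+/// //                    17., 18., 19., 20., 17., 18., 19., 20.)
+/// ```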
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f32x4&expand=483) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf128))] //should be vbroadcastf32x4 +pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { + simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f32x4&expand=484) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf128))] //should be vbroadcastf32x4 +pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { + let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) +} + +/// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f32x4&expand=485) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf128))] //should be vbroadcastf32x4 +pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { + let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + +/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4))] //should be vbroadcastf64x4 +pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { + simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) +} + +/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f64x4&expand=496) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4))] //should be vbroadcastf64x4 +pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { + let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) +} + +/// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
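+///
+/// Zeromask sketch (illustrative values, mirroring this patch's tests): only
+/// the lanes whose mask bit is set receive the broadcast; the rest become 0.
+///
+/// ```ignore
+/// let a = _mm256_set_pd(17., 18., 19., 20.);
+/// let r = _mm512_maskz_broadcast_f64x4(0b00001111, a);
+/// // r == _mm512_set_pd(0., 0., 0., 0., 17., 18., 19., 20.)
+/// ```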
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f64x4&expand=497) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4))] //should be vbroadcastf64x4 +pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { + let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, zero)) +} + /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi32&expand=272) @@ -18832,6 +18966,76 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcast_i32x4() { + let a = _mm_set_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_broadcast_i32x4(a); + let e = _mm512_set_epi32(17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcast_i32x4() { + let src = _mm512_set1_epi32(20); + let a = _mm_set_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_mask_broadcast_i32x4(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_broadcast_i32x4(src, 0b11111111_11111111, a); + let e = _mm512_set_epi32(17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcast_i32x4() { + let a = _mm_set_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_maskz_broadcast_i32x4(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_broadcast_i32x4(0b00000000_11111111, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 17, 18, 19, 20, 17, 18, 19, 20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcast_f32x4() { + let a = _mm_set_ps( + 17., 18., 19., 20., + ); + let r = _mm512_broadcast_f32x4(a); + let e = _mm512_set_ps(17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_broadcast_f32x4() { + let src = _mm512_set1_ps(20.); + let a = _mm_set_ps( + 17., 18., 19., 20., + ); + let r = _mm512_mask_broadcast_f32x4(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_broadcast_f32x4(src, 0b11111111_11111111, a); + let e = _mm512_set_ps(17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_broadcast_f32x4() { + let a = _mm_set_ps( + 17., 18., 19., 20., + ); + let r = _mm512_maskz_broadcast_f32x4(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_broadcast_f32x4(0b00000000_11111111, a); + let e = _mm512_set_ps(0., 0., 0., 0., 0., 0., 0., 0., 17., 18., 19., 20., 17., 18., 19., 20.); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 8f92b73e34..34f6b009b3 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4476,6 +4476,76 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_broadcast_i64x4() { + let a = _mm256_set_epi64x( + 17, 18, 
19, 20
+        );
+        let r = _mm512_broadcast_i64x4(a);
+        let e = _mm512_set_epi64(17, 18, 19, 20, 17, 18, 19, 20);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_broadcast_i64x4() {
+        let src = _mm512_set1_epi64(18);
+        let a = _mm256_set_epi64x(
+            17, 18, 19, 20
+        );
+        let r = _mm512_mask_broadcast_i64x4(src, 0, a);
+        assert_eq_m512i(r, src);
+        let r = _mm512_mask_broadcast_i64x4(src, 0b11111111, a);
+        let e = _mm512_set_epi64(17, 18, 19, 20, 17, 18, 19, 20);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_maskz_broadcast_i64x4() {
+        let a = _mm256_set_epi64x(
+            17, 18, 19, 20
+        );
+        let r = _mm512_maskz_broadcast_i64x4(0, a);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_broadcast_i64x4(0b00001111, a);
+        let e = _mm512_set_epi64(0, 0, 0, 0, 17, 18, 19, 20);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_broadcast_f64x4() {
+        let a = _mm256_set_pd(
+            17., 18., 19., 20.
+        );
+        let r = _mm512_broadcast_f64x4(a);
+        let e = _mm512_set_pd(17., 18., 19., 20., 17., 18., 19., 20.);
+        assert_eq_m512d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_broadcast_f64x4() {
+        let src = _mm512_set1_pd(18.);
+        let a = _mm256_set_pd(
+            17., 18., 19., 20.
+        );
+        let r = _mm512_mask_broadcast_f64x4(src, 0, a);
+        assert_eq_m512d(r, src);
+        let r = _mm512_mask_broadcast_f64x4(src, 0b11111111, a);
+        let e = _mm512_set_pd(17., 18., 19., 20., 17., 18., 19., 20.);
+        assert_eq_m512d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_maskz_broadcast_f64x4() {
+        let a = _mm256_set_pd(
+            17., 18., 19., 20.
+        );
+        let r = _mm512_maskz_broadcast_f64x4(0, a);
+        assert_eq_m512d(r, _mm512_setzero_pd());
+        let r = _mm512_maskz_broadcast_f64x4(0b00001111, a);
+        let e = _mm512_set_pd(0., 0., 0., 0., 17., 18., 19., 20.);
+        assert_eq_m512d(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_and_epi64() {
         let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);

From 757df4162042c8cdc583c2ee93c9d1012c6dcc9f Mon Sep 17 00:00:00 2001
From: jironglin
Date: Wed, 23 Sep 2020 01:27:34 +0000
Subject: [PATCH 08/20] andnot: epi32, epi64

---
 crates/core_arch/avx512f.md            |  14 ++--
 crates/core_arch/src/x86/avx512f.rs    | 109 +++++++++++++++++++++++++
 crates/core_arch/src/x86_64/avx512f.rs |  42 ++++++++++
 3 files changed, 158 insertions(+), 7 deletions(-)

diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 815fc1091d..2cd567cce0 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -15,9 +15,9 @@
 * [x] [`_mm512_and_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_epi32&expand=5236)
 * [x] [`_mm512_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_epi64&expand=5236)
 * [x] [`_mm512_and_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_si512&expand=5236)
- * [ ] [`_mm512_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi32&expand=5236)
- * [ ] [`_mm512_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi64&expand=5236)
- * [ ] [`_mm512_andnot_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_si512&expand=5236)
+ * [x]
[`_mm512_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi32&expand=5236) + * [x] [`_mm512_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi64&expand=5236) + * [x] [`_mm512_andnot_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_si512&expand=5236) * [x] [`_mm512_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f32x4&expand=5236) * [x] [`_mm512_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f64x4&expand=5236) * [x] [`_mm512_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i32x4&expand=5236) @@ -288,8 +288,8 @@ * [ ] [`_mm512_mask_alignr_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_alignr_epi64&expand=5236) * [x] [`_mm512_mask_and_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_and_epi32&expand=5236) * [x] [`_mm512_mask_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_and_epi64&expand=5236) - * [ ] [`_mm512_mask_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi32&expand=5236) - * [ ] [`_mm512_mask_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi64&expand=5236) + * [x] [`_mm512_mask_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi32&expand=5236) + * [x] [`_mm512_mask_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi64&expand=5236) * [ ] [`_mm512_mask_blend_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi32&expand=5236) * [ ] [`_mm512_mask_blend_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi64&expand=5236) * [ ] [`_mm512_mask_blend_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_pd&expand=5236) @@ -688,8 +688,8 @@ * [ ] [`_mm512_maskz_alignr_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_alignr_epi64&expand=5236) * [x] [`_mm512_maskz_and_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_and_epi32&expand=5236) * [x] [`_mm512_maskz_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_and_epi64&expand=5236) - * [ ] [`_mm512_maskz_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi32&expand=5236) - * [ ] [`_mm512_maskz_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi64&expand=5236) + * [x] [`_mm512_maskz_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi32&expand=5236) + * [x] [`_mm512_maskz_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi64&expand=5236) * [x] [`_mm512_maskz_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f32x4&expand=5236) * [x] [`_mm512_maskz_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f64x4&expand=5236) * [x] 
[`_mm512_maskz_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i32x4&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 099883e606..231856e268 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -10904,6 +10904,82 @@ pub unsafe fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i {
     transmute(simd_xor(a.as_i32x16(), b.as_i32x16()))
 }
 
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_andnot_epi32&expand=310)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnq))] //should be vpandnd
+pub unsafe fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i {
+    _mm512_and_epi32(_mm512_xor_epi32(a, _mm512_set1_epi32(u32::MAX as i32)), b)
+}
+
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_andnot_epi32&expand=311)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnd))]
+pub unsafe fn _mm512_mask_andnot_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let andnot = _mm512_andnot_epi32(a, b).as_i32x16();
+    transmute(simd_select_bitmask(k, andnot, src.as_i32x16()))
+}
+
+/// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_andnot_epi32&expand=312)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnd))]
+pub unsafe fn _mm512_maskz_andnot_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let andnot = _mm512_andnot_epi32(a, b).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, andnot, zero))
+}
+
+/// Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in a and then AND with b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_andnot_epi64&expand=317)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpandnq))]
+pub unsafe fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i {
+    _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b)
+}
+
+/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
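+///
+/// The underlying identity (a sketch matching the implementation above):
+/// `andnot(a, b) == (!a) & b`, computed per 64-bit element and then blended
+/// with `src` under the writemask.
+///
+/// ```ignore
+/// let a = _mm512_set1_epi64(0b0110);
+/// let b = _mm512_set1_epi64(0b1100);
+/// let r = _mm512_mask_andnot_epi64(a, 0b11111111, a, b);
+/// // every element: (!0b0110) & 0b1100 == 0b1000
+/// ```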
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_andnot_epi64&expand=318) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub unsafe fn _mm512_mask_andnot_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, andnot, src.as_i64x8())) +} + +/// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_andnot_epi64&expand=319) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub unsafe fn _mm512_maskz_andnot_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, andnot, zero)) +} + +/// Compute the bitwise NOT of 512 bits (representing integer data) in a and then AND with b, and store the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_andnot_si512&expand=340) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpandnq))] +pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i { + _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b) +} + /// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kand_mask16&expand=3212) @@ -19706,6 +19782,39 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_andnot_epi32() { + let a = _mm512_set1_epi32(0); + let b = _mm512_set1_epi32(1<<3 | 1<< 4); + let r = _mm512_andnot_epi32(a, b); + let e = _mm512_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_andnot_epi32() { + let a = _mm512_set1_epi32(1<<1 | 1<< 2); + let b = _mm512_set1_epi32(1<<3 | 1<< 4); + let r = _mm512_mask_andnot_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + + let r = _mm512_mask_andnot_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set1_epi32(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_andnot_epi32() { + let a = _mm512_set1_epi32(1<<1 | 1<< 2); + let b = _mm512_set1_epi32(1<<3 | 1<< 4); + let r = _mm512_maskz_andnot_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + + let r = _mm512_maskz_andnot_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_kand() { let a: u16 = 0b11001100_00110011; diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 34f6b009b3..a9692b358d 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4704,4 +4704,46 @@ mod tests { let e = _mm512_set_epi64(1 << 0 | 1 << 13 | 1 << 15, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } + + #[simd_test(enable 
= "avx512f")] + unsafe fn test_mm512_andnot_epi64() { + let a = _mm512_set1_epi64(0); + let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let r = _mm512_andnot_epi64(a, b); + let e = _mm512_set1_epi64(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_andnot_epi64() { + let a = _mm512_set1_epi64(1<<1 | 1<< 2); + let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let r = _mm512_mask_andnot_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + + let r = _mm512_mask_andnot_epi64(a, 0b11111111, a, b); + let e = _mm512_set1_epi64(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_andnot_epi64() { + let a = _mm512_set1_epi64(1<<1 | 1<< 2); + let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let r = _mm512_maskz_andnot_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + + let r = _mm512_maskz_andnot_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_andnot_si512() { + let a = _mm512_set1_epi64(0); + let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let r = _mm512_andnot_si512(a, b); + let e = _mm512_set1_epi64(1 << 3 | 1 << 4); + assert_eq_m512i(r, e); + } } From 2370f24a05ae89263c9a27f07765ce6e0526b28b Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 23 Sep 2020 15:05:27 +0000 Subject: [PATCH 09/20] insertf32x4, insertf64x4 --- crates/core_arch/avx512f.md | 12 +-- crates/core_arch/src/x86/avx512f.rs | 143 ++++++++++++++++++++++--- crates/core_arch/src/x86_64/avx512f.rs | 49 +++++++++ 3 files changed, 186 insertions(+), 18 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 2cd567cce0..e74d4e193a 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -222,8 +222,8 @@ * [x] [`_mm512_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi64&expand=5236) * [x] [`_mm512_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_pd&expand=5236) * [x] [`_mm512_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_ps&expand=5236) - * [ ] [`_mm512_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf32x4&expand=5236) - * [ ] [`_mm512_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf64x4&expand=5236) + * [x] [`_mm512_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf32x4&expand=5236) + * [x] [`_mm512_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf64x4&expand=5236) * [ ] [`_mm512_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti32x4&expand=5236) * [ ] [`_mm512_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti64x4&expand=5236) * [ ] [`_mm512_int2mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_int2mask&expand=5236) @@ -511,8 +511,8 @@ * [x] [`_mm512_mask_i64scatter_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_epi64&expand=5236) * [x] [`_mm512_mask_i64scatter_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_pd&expand=5236) * [x] 
[`_mm512_mask_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_ps&expand=5236) - * [ ] [`_mm512_mask_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf32x4&expand=5236) - * [ ] [`_mm512_mask_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf64x4&expand=5236) + * [x] [`_mm512_mask_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf32x4&expand=5236) + * [x] [`_mm512_mask_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf64x4&expand=5236) * [ ] [`_mm512_mask_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti32x4&expand=5236) * [ ] [`_mm512_mask_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti64x4&expand=5236) * [ ] [`_mm512_mask_load_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi32&expand=5236) @@ -809,8 +809,8 @@ * [x] [`_mm512_maskz_getmant_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ps&expand=5236) * [x] [`_mm512_maskz_getmant_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_pd&expand=5236) * [x] [`_mm512_maskz_getmant_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ps&expand=5236) - * [ ] [`_mm512_maskz_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf32x4&expand=5236) - * [ ] [`_mm512_maskz_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf64x4&expand=5236) + * [x] [`_mm512_maskz_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf32x4&expand=5236) + * [x] [`_mm512_maskz_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf64x4&expand=5236) * [ ] [`_mm512_maskz_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti32x4&expand=5236) * [ ] [`_mm512_maskz_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti64x4&expand=5236) * [ ] [`_mm512_maskz_load_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi32&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 231856e268..7fc2436810 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10228,24 +10228,109 @@ pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { let zero = _mm512_setzero_pd().as_f64x8(); transmute(simd_select_bitmask(k, mov, zero)) } -/* + /// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8. 
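+///
+/// Sketch of the imm8 selection (illustrative values; imm8 & 3 picks which
+/// 128-bit lane of the result is replaced by b):
+///
+/// ```ignore
+/// let a = _mm512_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.,
+///                        9., 10., 11., 12., 13., 14., 15., 16.);
+/// let b = _mm_setr_ps(17., 18., 19., 20.);
+/// let r = _mm512_insertf32x4(a, b, 0); // replace lane 0
+/// // r == _mm512_setr_ps(17., 18., 19., 20., 5., 6., 7., 8.,
+/// //                     9., 10., 11., 12., 13., 14., 15., 16.)
+/// ```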
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_insertf32x4&expand=3155) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf32x4))] +#[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] +#[rustc_args_required_const(2)] pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128, imm8: i32) -> __m512 { - let a = a.as_f32x16(); - let b = b.as_f32x4(); - match imm8 & 3 { - 0 => transmute(simd_insert(a, 0, b)), - 1 => transmute(simd_insert(a, 1, b)), - 2 => transmute(simd_insert(a, 2, b)), - _ => transmute(simd_insert(a, 3, b)), + let b = _mm512_castps128_ps512(b); + match imm8 & 0b11 { + 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), + 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + } +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_insertf32x4&expand=3156) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_insertf32x4(src: __m512, k: __mmask16, a: __m512, b: __m128, imm8: i32) -> __m512 { + let b = _mm512_castps128_ps512(b); + let insert = match imm8 & 0b11 { + 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), + 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + }; + transmute(simd_select_bitmask(k, insert, src.as_f32x16())) +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_insertf32x4&expand=3157) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128, imm8: i32) -> __m512 { + let b = _mm512_castps128_ps512(b); + let insert = match imm8 & 0b11 { + 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), + 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + }; + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, insert, zero)) +} + +/// Copy a to dst, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8. 
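+///
+/// Here imm8 & 1 selects which 256-bit half is replaced (a sketch with
+/// arbitrary values):
+///
+/// ```ignore
+/// let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
+/// let b = _mm256_setr_pd(17., 18., 19., 20.);
+/// let r = _mm512_insertf64x4(a, b, 1); // replace the upper half
+/// // r == _mm512_setr_pd(1., 2., 3., 4., 17., 18., 19., 20.)
+/// ```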
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_insertf64x4&expand=3167) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_insertf64x4(a: __m512d, b: __m256d, imm8: i32) -> __m512d { + let b = _mm512_castpd256_pd512(b); + match imm8 & 0b1 { + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), } } -*/ + +/// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_insertf64x4&expand=3168) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_insertf64x4(src: __m512d, k: __mmask8, a: __m512d, b: __m256d, imm8: i32) -> __m512d { + let b = _mm512_castpd256_pd512(b); + let insert = match imm8 & 0b1 { + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + }; + transmute(simd_select_bitmask(k, insert, src.as_f64x8())) +} + +/// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_insertf64x4&expand=3169) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_insertf64x4(k: __mmask8, a: __m512d, b: __m256d, imm8: i32) -> __m512d { + let b = _mm512_castpd256_pd512(b); + let insert = match imm8 & 0b1 { + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + }; + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, insert, zero)) +} + /// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
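+///
+/// A usage sketch (not Intel text): only the low 128 bits of the result are
+/// meaningful. The present implementation happens to fill the upper lanes with
+/// -1. (as the test below expects), but callers must not rely on that; the
+/// upper 384 bits are formally undefined.
+///
+/// ```ignore
+/// let a = _mm_setr_ps(17., 18., 19., 20.);
+/// let r = _mm512_castps128_ps512(a);
+/// // elements 0..4 of r are 17., 18., 19., 20.; treat the rest as undefined
+/// ```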
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps128_ps512&expand=621) @@ -18888,7 +18973,7 @@ mod tests { ); assert_eq_m512(r, e); } -/* + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_insertf32x4() { let a = _mm512_setr_ps( @@ -18903,7 +18988,41 @@ mod tests { ); assert_eq_m512(r, e); } -*/ + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_insertf32x4() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm_setr_ps( + 17., 18., 19., 20., + ); + let r = _mm512_mask_insertf32x4(a, 0, a, b, 0); + assert_eq_m512(r, a); + let r = _mm512_mask_insertf32x4(a, 0b11111111_11111111, a, b, 0); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_insertf32x4() { + let a = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm_setr_ps( + 17., 18., 19., 20., + ); + let r = _mm512_maskz_insertf32x4(0, a, b, 0); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_insertf32x4(0b00000000_11111111, a, b, 0); + let e = _mm512_setr_ps( + 17., 18., 19., 20., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castps128_ps512() { let a = _mm_setr_ps( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index a9692b358d..83b884dc8d 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4278,6 +4278,55 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_insertf64x4() { + let a = _mm512_setr_pd( + 1., 2., 3., 4., 5., 6., 7., 8., + ); + let b = _mm256_setr_pd( + 17., 18., 19., 20., + ); + let r = _mm512_insertf64x4(a, b, 1); + let e = _mm512_setr_pd( + 1., 2., 3., 4., 17., 18., 19., 20., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_insertf64x4() { + let a = _mm512_setr_pd( + 1., 2., 3., 4., 5., 6., 7., 8., + ); + let b = _mm256_setr_pd( + 17., 18., 19., 20., + ); + let r = _mm512_mask_insertf64x4(a, 0, a, b, 1); + assert_eq_m512d(r, a); + let r = _mm512_mask_insertf64x4(a, 0b11111111, a, b, 1); + let e = _mm512_setr_pd( + 1., 2., 3., 4., 17., 18., 19., 20., + ); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_insertf64x4() { + let a = _mm512_setr_pd( + 1., 2., 3., 4., 5., 6., 7., 8., + ); + let b = _mm256_setr_pd( + 17., 18., 19., 20., + ); + let r = _mm512_maskz_insertf64x4(0, a, b, 1); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_insertf64x4(0b00001111, a, b, 1); + let e = _mm512_setr_pd( + 1., 2., 3., 4., 0., 0., 0., 0., + ); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castpd128_pd512() { let a = _mm_setr_pd( From bec0e99f063d952372e39e2f9c5853387de5088a Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 23 Sep 2020 16:51:08 +0000 Subject: [PATCH 10/20] inserti32x4, inserti64x4 --- crates/core_arch/avx512f.md | 12 +- crates/core_arch/src/x86/avx512f.rs | 163 ++++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 49 ++++++++ 3 files changed, 214 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 
e74d4e193a..df6170e076 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -224,8 +224,8 @@ * [x] [`_mm512_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_ps&expand=5236) * [x] [`_mm512_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf32x4&expand=5236) * [x] [`_mm512_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf64x4&expand=5236) - * [ ] [`_mm512_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti32x4&expand=5236) - * [ ] [`_mm512_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti64x4&expand=5236) + * [x] [`_mm512_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti32x4&expand=5236) + * [x] [`_mm512_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti64x4&expand=5236) * [ ] [`_mm512_int2mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_int2mask&expand=5236) * [x] [`_mm512_kand`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kand&expand=5236) * [x] [`_mm512_kandn`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kandn&expand=5236) @@ -513,8 +513,8 @@ * [x] [`_mm512_mask_i64scatter_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_ps&expand=5236) * [x] [`_mm512_mask_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf32x4&expand=5236) * [x] [`_mm512_mask_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf64x4&expand=5236) - * [ ] [`_mm512_mask_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti32x4&expand=5236) - * [ ] [`_mm512_mask_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti64x4&expand=5236) + * [x] [`_mm512_mask_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti32x4&expand=5236) + * [x] [`_mm512_mask_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti64x4&expand=5236) * [ ] [`_mm512_mask_load_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi32&expand=5236) * [ ] [`_mm512_mask_load_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi64&expand=5236) * [ ] [`_mm512_mask_load_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_pd&expand=5236) @@ -811,8 +811,8 @@ * [x] [`_mm512_maskz_getmant_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ps&expand=5236) * [x] [`_mm512_maskz_insertf32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf32x4&expand=5236) * [x] [`_mm512_maskz_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf64x4&expand=5236) - * [ ] [`_mm512_maskz_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti32x4&expand=5236) - * [ ] [`_mm512_maskz_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti64x4&expand=5236) + * [x] 
[`_mm512_maskz_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti32x4&expand=5236) + * [x] [`_mm512_maskz_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti64x4&expand=5236) * [ ] [`_mm512_maskz_load_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi32&expand=5236) * [ ] [`_mm512_maskz_load_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi64&expand=5236) * [ ] [`_mm512_maskz_load_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7fc2436810..60ebf8663b 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10229,6 +10229,112 @@ pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { transmute(simd_select_bitmask(k, mov, zero)) } +/// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_inserti32x4&expand=3174) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] //should be vinserti32x4 +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_inserti32x4(a: __m512i, b: __m128i, imm8: i32) -> __m512i { + let a = a.as_i32x16(); + let b = _mm512_castsi128_si512(b).as_i32x16(); + let ret: i32x16 = match imm8 & 0b11 { + 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), + 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + }; + transmute(ret) +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_inserti32x4&expand=3175) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinserti32x4, imm8 = 2))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_inserti32x4(src: __m512i, k: __mmask16, a: __m512i, b: __m128i, imm8: i32) -> __m512i { + let a = a.as_i32x16(); + let b = _mm512_castsi128_si512(b).as_i32x16(); + let insert: i32x16 = match imm8 & 0b11 { + 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), + 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + }; + transmute(simd_select_bitmask(k, insert, src.as_i32x16())) +} + +/// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
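+///
+/// Zeromask sketch (illustrative values, matching this patch's tests):
+///
+/// ```ignore
+/// let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8,
+///                           9, 10, 11, 12, 13, 14, 15, 16);
+/// let b = _mm_setr_epi32(17, 18, 19, 20);
+/// let r = _mm512_maskz_inserti32x4(0b00000000_11111111, a, b, 0);
+/// // the low 8 lanes keep tmp, the high 8 lanes are zeroed:
+/// // r == _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0)
+/// ```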
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_inserti32x4&expand=3176) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinserti32x4, imm8 = 2))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i, imm8: i32) -> __m512i { + let a = a.as_i32x16(); + let b = _mm512_castsi128_si512(b).as_i32x16(); + let insert = match imm8 & 0b11 { + 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), + 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + }; + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, insert, zero)) +} + +/// Copy a to dst, then insert 256 bits (composed of 4 packed 64-bit integers) from b into dst at the location specified by imm8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_inserti64x4&expand=3186) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] //should be vinserti64x4 +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_inserti64x4(a: __m512i, b: __m256i, imm8: i32) -> __m512i { + let b = _mm512_castsi256_si512(b); + match imm8 & 0b1 { + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + } +} + +/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_inserti64x4&expand=3187) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vinserti64x4, imm8 = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_inserti64x4(src: __m512i, k: __mmask8, a: __m512i, b: __m256i, imm8: i32) -> __m512i { + let b = _mm512_castsi256_si512(b); + let insert = match imm8 & 0b1 { + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + }; + transmute(simd_select_bitmask(k, insert, src.as_i64x8())) +} + +/// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_inserti64x4&expand=3188)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vinserti64x4, imm8 = 1))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i, imm8: i32) -> __m512i {
+    let b = _mm512_castsi256_si512(b);
+    let insert = match imm8 & 0b1 {
+        0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
+        _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
+    };
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, insert, zero))
+}
+
 /// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_insertf32x4&expand=3155)
@@ -10501,8 +10607,8 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d {
 #[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd
 pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i {
     let a = _mm512_castsi128_si512(a).as_i32x16();
-    let ret = simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
-    transmute::<i32x16, _>(ret)
+    let ret: i32x16 = simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+    transmute(ret)
 }
 
 /// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -10635,8 +10741,8 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d {
 #[cfg_attr(test, assert_instr(vinsert))] //should be vbroadcasti32x4
 pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i {
     let a = _mm512_castsi128_si512(a).as_i32x16();
-    let ret = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
-    transmute::<i32x16, _>(ret)
+    let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
+    transmute(ret)
 }
 
 /// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -18974,6 +19080,55 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_inserti32x4() { + let a = _mm512_setr_epi32( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + let b = _mm_setr_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_inserti32x4(a, b, 0); + let e = _mm512_setr_epi32( + 17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_inserti32x4() { + let a = _mm512_setr_epi32( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + let b = _mm_setr_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_mask_inserti32x4(a, 0, a, b, 0); + assert_eq_m512i(r, a); + let r = _mm512_mask_inserti32x4(a, 0b11111111_11111111, a, b, 0); + let e = _mm512_setr_epi32( + 17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_inserti32x4() { + let a = _mm512_setr_epi32( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + ); + let b = _mm_setr_epi32( + 17, 18, 19, 20, + ); + let r = _mm512_maskz_inserti32x4(0, a, b, 0); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_inserti32x4(0b00000000_11111111, a, b, 0); + let e = _mm512_setr_epi32( + 17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_insertf32x4() { let a = _mm512_setr_ps( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 83b884dc8d..b767227b8a 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4278,6 +4278,55 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_inserti64x4() { + let a = _mm512_setr_epi64( + 1, 2, 3, 4, 5, 6, 7, 8, + ); + let b = _mm256_setr_epi64x( + 17, 18, 19, 20, + ); + let r = _mm512_inserti64x4(a, b, 1); + let e = _mm512_setr_epi64( + 1, 2, 3, 4, 17, 18, 19, 20, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_inserti64x4() { + let a = _mm512_setr_epi64( + 1, 2, 3, 4, 5, 6, 7, 8, + ); + let b = _mm256_setr_epi64x( + 17, 18, 19, 20, + ); + let r = _mm512_mask_inserti64x4(a, 0, a, b, 1); + assert_eq_m512i(r, a); + let r = _mm512_mask_inserti64x4(a, 0b11111111, a, b, 1); + let e = _mm512_setr_epi64( + 1, 2, 3, 4, 17, 18, 19, 20, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_inserti64x4() { + let a = _mm512_setr_epi64( + 1, 2, 3, 4, 5, 6, 7, 8, + ); + let b = _mm256_setr_epi64x( + 17, 18, 19, 20, + ); + let r = _mm512_maskz_inserti64x4(0, a, b, 1); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_inserti64x4(0b00001111, a, b, 1); + let e = _mm512_setr_epi64( + 1, 2, 3, 4, 0, 0, 0, 0, + ); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_insertf64x4() { let a = _mm512_setr_pd( From 907de2fd3b50de5765b925f4ef5a77faa74418da Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 23 Sep 2020 20:17:54 +0000 Subject: [PATCH 11/20] mask_blend: epi32,epi64,ps,pd --- crates/core_arch/avx512f.md | 8 ++-- crates/core_arch/src/x86/avx512f.rs | 58 ++++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 18 ++++++++ 3 files changed, 80 insertions(+), 4 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 
df6170e076..fb13b28524 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -290,10 +290,10 @@ * [x] [`_mm512_mask_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_and_epi64&expand=5236) * [x] [`_mm512_mask_andnot_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi32&expand=5236) * [x] [`_mm512_mask_andnot_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi64&expand=5236) - * [ ] [`_mm512_mask_blend_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi32&expand=5236) - * [ ] [`_mm512_mask_blend_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi64&expand=5236) - * [ ] [`_mm512_mask_blend_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_pd&expand=5236) - * [ ] [`_mm512_mask_blend_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ps&expand=5236) + * [x] [`_mm512_mask_blend_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi32&expand=5236) + * [x] [`_mm512_mask_blend_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi64&expand=5236) + * [x] [`_mm512_mask_blend_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_pd&expand=5236) + * [x] [`_mm512_mask_blend_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ps&expand=5236) * [x] [`_mm512_mask_broadcast_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f32x4&expand=5236) * [x] [`_mm512_mask_broadcast_f64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f64x4&expand=5236) * [x] [`_mm512_mask_broadcast_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i32x4&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 60ebf8663b..07c3aebdad 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10867,6 +10867,46 @@ pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { transmute(simd_select_bitmask(k, broadcast, zero)) } +/// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_epi32&expand=435) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd +pub unsafe fn _mm512_mask_blend_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + transmute(simd_select_bitmask(k, b.as_i32x16(), a.as_i32x16())) +} + +/// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_epi64&expand=438) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq +pub unsafe fn _mm512_mask_blend_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + transmute(simd_select_bitmask(k, b.as_i64x8(), a.as_i64x8())) +} + +/// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_ps&expand=451) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps +pub unsafe fn _mm512_mask_blend_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + transmute(simd_select_bitmask(k, b.as_f32x16(), a.as_f32x16())) +} + +/// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_pd&expand=446) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd +pub unsafe fn _mm512_mask_blend_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + transmute(simd_select_bitmask(k, b.as_f64x8(), a.as_f64x8())) +} + /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi32&expand=272) @@ -19386,6 +19426,24 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_epi32() { + let a = _mm512_set1_epi32(1); + let b = _mm512_set1_epi32(2); + let r = _mm512_mask_blend_epi32(0b11111111_00000000, a, b); + let e = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(2.); + let r = _mm512_mask_blend_ps(0b11111111_00000000, a, b); + let e = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index b767227b8a..796077f330 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4644,6 +4644,24 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_epi64() { + let a = _mm512_set1_epi64(1); + let b = _mm512_set1_epi64(2); + let r = _mm512_mask_blend_epi64(0b11110000, a, b); + let e = _mm512_set_epi64(2, 2, 2, 2, 1, 1, 1, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_blend_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(2.); + let r = _mm512_mask_blend_pd(0b11110000, a, b); + let e = _mm512_set_pd(2., 2., 2., 2., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); From 6e08a62082d29d96dbdd110f2572cb720c63614c Mon Sep 17 00:00:00 2001 From: jironglin 
Date: Thu, 24 Sep 2020 00:06:07 +0000 Subject: [PATCH 12/20] unpackhi: epi32,epi64,ps,pd --- crates/core_arch/avx512f.md | 24 +- crates/core_arch/src/x86/avx512f.rs | 656 +++++++++++++++++++------ crates/core_arch/src/x86_64/avx512f.rs | 287 ++++++----- 3 files changed, 659 insertions(+), 308 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index fb13b28524..f7c19dfc43 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -666,10 +666,10 @@ * [ ] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236) * [ ] [`_mm512_mask_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5236) * [ ] [`_mm512_mask_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi64_mask&expand=5236) - * [ ] [`_mm512_mask_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi32&expand=5236) - * [ ] [`_mm512_mask_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=5236) - * [ ] [`_mm512_mask_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=5236) - * [ ] [`_mm512_mask_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_ps&expand=5236) + * [x] [`_mm512_mask_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi32&expand=5236) + * [x] [`_mm512_mask_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=5236) + * [x] [`_mm512_mask_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=5236) + * [x] [`_mm512_mask_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_ps&expand=5236) * [ ] [`_mm512_mask_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi32&expand=5236) * [ ] [`_mm512_mask_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi64&expand=5236) * [ ] [`_mm512_mask_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_pd&expand=5236) @@ -926,10 +926,10 @@ * [x] [`_mm512_maskz_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ps&expand=5236) * [ ] [`_mm512_maskz_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5236) * [ ] [`_mm512_maskz_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5236) - * [ ] [`_mm512_maskz_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=5236) - * [ ] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236) - * [ ] [`_mm512_maskz_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=5236) - * [ ] 
[`_mm512_maskz_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_ps&expand=5236) + * [x] [`_mm512_maskz_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=5236) + * [x] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236) + * [x] [`_mm512_maskz_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=5236) + * [x] [`_mm512_maskz_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_ps&expand=5236) * [ ] [`_mm512_maskz_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi32&expand=5236) * [ ] [`_mm512_maskz_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi64&expand=5236) * [ ] [`_mm512_maskz_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_pd&expand=5236) @@ -1112,10 +1112,10 @@ * [x] [`_mm512_undefined_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd&expand=5236) * [x] [`_mm512_undefined_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ps&expand=5236) * [ ] [`_mm512_undefined`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined&expand=5236) - * [ ] [`_mm512_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi32&expand=5236) - * [ ] [`_mm512_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi64&expand=5236) - * [ ] [`_mm512_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_pd&expand=5236) - * [ ] [`_mm512_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_ps&expand=5236) + * [x] [`_mm512_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi32&expand=5236) + * [x] [`_mm512_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi64&expand=5236) + * [x] [`_mm512_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_pd&expand=5236) + * [x] [`_mm512_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_ps&expand=5236) * [ ] [`_mm512_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi32&expand=5236) * [ ] [`_mm512_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi64&expand=5236) * [ ] [`_mm512_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 07c3aebdad..9963de1240 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10240,10 +10240,22 @@ pub unsafe fn _mm512_inserti32x4(a: __m512i, b: __m128i, imm8: i32) -> __m512i { let a = a.as_i32x16(); let b = _mm512_castsi128_si512(b).as_i32x16(); let ret: i32x16 = match imm8 & 0b11 { - 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16(a, b, 
[0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), - 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), - _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + 0 => simd_shuffle16( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 1 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 2 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), }; transmute(ret) } @@ -10255,14 +10267,32 @@ pub unsafe fn _mm512_inserti32x4(a: __m512i, b: __m128i, imm8: i32) -> __m512i { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vinserti32x4, imm8 = 2))] #[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_inserti32x4(src: __m512i, k: __mmask16, a: __m512i, b: __m128i, imm8: i32) -> __m512i { +pub unsafe fn _mm512_mask_inserti32x4( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m128i, + imm8: i32, +) -> __m512i { let a = a.as_i32x16(); let b = _mm512_castsi128_si512(b).as_i32x16(); let insert: i32x16 = match imm8 & 0b11 { - 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), - 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), - _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + 0 => simd_shuffle16( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 1 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 2 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), }; transmute(simd_select_bitmask(k, insert, src.as_i32x16())) } @@ -10278,10 +10308,22 @@ pub unsafe fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i, imm let a = a.as_i32x16(); let b = _mm512_castsi128_si512(b).as_i32x16(); let insert = match imm8 & 0b11 { - 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), - 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), - _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + 0 => simd_shuffle16( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 1 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 2 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), }; let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, insert, zero)) @@ -10297,8 +10339,8 @@ pub unsafe fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i, imm pub unsafe fn _mm512_inserti64x4(a: __m512i, b: __m256i, imm8: i32) -> __m512i { let b = _mm512_castsi256_si512(b); match imm8 & 0b1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), } } @@ -10309,11 +10351,17 
@@ pub unsafe fn _mm512_inserti64x4(a: __m512i, b: __m256i, imm8: i32) -> __m512i { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vinserti64x4, imm8 = 1))] #[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_inserti64x4(src: __m512i, k: __mmask8, a: __m512i, b: __m256i, imm8: i32) -> __m512i { +pub unsafe fn _mm512_mask_inserti64x4( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m256i, + imm8: i32, +) -> __m512i { let b = _mm512_castsi256_si512(b); let insert = match imm8 & 0b1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), }; transmute(simd_select_bitmask(k, insert, src.as_i64x8())) } @@ -10328,8 +10376,8 @@ pub unsafe fn _mm512_mask_inserti64x4(src: __m512i, k: __mmask8, a: __m512i, b: pub unsafe fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i, imm8: i32) -> __m512i { let b = _mm512_castsi256_si512(b); let insert = match imm8 & 0b1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), }; let zero = _mm512_setzero_si512().as_i64x8(); transmute(simd_select_bitmask(k, insert, zero)) @@ -10345,10 +10393,22 @@ pub unsafe fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i, imm8 pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128, imm8: i32) -> __m512 { let b = _mm512_castps128_ps512(b); match imm8 & 0b11 { - 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), - 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), - _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + 0 => simd_shuffle16( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 1 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 2 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), } } @@ -10359,13 +10419,31 @@ pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128, imm8: i32) -> __m512 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] #[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_insertf32x4(src: __m512, k: __mmask16, a: __m512, b: __m128, imm8: i32) -> __m512 { +pub unsafe fn _mm512_mask_insertf32x4( + src: __m512, + k: __mmask16, + a: __m512, + b: __m128, + imm8: i32, +) -> __m512 { let b = _mm512_castps128_ps512(b); let insert = match imm8 & 0b11 { - 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), - 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), - _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + 0 => simd_shuffle16( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 1 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 2 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], 
+ ), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), }; transmute(simd_select_bitmask(k, insert, src.as_f32x16())) } @@ -10380,10 +10458,22 @@ pub unsafe fn _mm512_mask_insertf32x4(src: __m512, k: __mmask16, a: __m512, b: _ pub unsafe fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128, imm8: i32) -> __m512 { let b = _mm512_castps128_ps512(b); let insert = match imm8 & 0b11 { - 0 => simd_shuffle16(a, b, [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle16(a, b, [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15]), - 2 => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15]), - _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), + 0 => simd_shuffle16( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 1 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ), + 2 => simd_shuffle16( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ), + _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), }; let zero = _mm512_setzero_ps().as_f32x16(); transmute(simd_select_bitmask(k, insert, zero)) @@ -10399,8 +10489,8 @@ pub unsafe fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128, imm8: pub unsafe fn _mm512_insertf64x4(a: __m512d, b: __m256d, imm8: i32) -> __m512d { let b = _mm512_castpd256_pd512(b); match imm8 & 0b1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), } } @@ -10411,11 +10501,17 @@ pub unsafe fn _mm512_insertf64x4(a: __m512d, b: __m256d, imm8: i32) -> __m512d { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] #[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_insertf64x4(src: __m512d, k: __mmask8, a: __m512d, b: __m256d, imm8: i32) -> __m512d { +pub unsafe fn _mm512_mask_insertf64x4( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m256d, + imm8: i32, +) -> __m512d { let b = _mm512_castpd256_pd512(b); let insert = match imm8 & 0b1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), }; transmute(simd_select_bitmask(k, insert, src.as_f64x8())) } @@ -10430,20 +10526,217 @@ pub unsafe fn _mm512_mask_insertf64x4(src: __m512d, k: __mmask8, a: __m512d, b: pub unsafe fn _mm512_maskz_insertf64x4(k: __mmask8, a: __m512d, b: __m256d, imm8: i32) -> __m512d { let b = _mm512_castpd256_pd512(b); let insert = match imm8 & 0b1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7 ]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), }; let zero = _mm512_setzero_pd().as_f64x8(); transmute(simd_select_bitmask(k, insert, zero)) } +/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_epi32&expand=6021)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhps))] //should be vpunpckhdq
+pub unsafe fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i {
+    let a = a.as_i32x16();
+    let b = b.as_i32x16();
+    let r: i32x16 = simd_shuffle16(
+        a,
+        b,
+        [
+            2,
+            18,
+            3,
+            19,
+            2 + 4,
+            18 + 4,
+            3 + 4,
+            19 + 4,
+            2 + 8,
+            18 + 8,
+            3 + 8,
+            19 + 8,
+            2 + 12,
+            18 + 12,
+            3 + 12,
+            19 + 12,
+        ],
+    );
+    transmute(r)
+}
+
+/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_epi32&expand=6019)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckhdq))]
+pub unsafe fn _mm512_mask_unpackhi_epi32(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16();
+    transmute(simd_select_bitmask(k, unpackhi, src.as_i32x16()))
+}
+
+/// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_epi32&expand=6020)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckhdq))]
+pub unsafe fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and
+/// store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_epi64&expand=6030)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq
+pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i {
+    simd_shuffle8(a, b, [1, 9, 3, 11, 1 + 4, 9 + 4, 3 + 4, 11 + 4])
+}
+
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_epi64&expand=6028)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckhqdq))]
+pub unsafe fn _mm512_mask_unpackhi_epi64(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8();
+    transmute(simd_select_bitmask(k, unpackhi, src.as_i64x8()))
+}
+
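The lane arithmetic in these shuffles is easier to audit against a scalar model. Per Intel's pseudocode, `unpackhi` interleaves the upper half of every 128-bit lane, taking the `a` element first and then the matching `b` element; for 64-bit elements that means indices `[1, 9, 3, 11, 1 + 4, 9 + 4, 3 + 4, 11 + 4]` into the concatenation of `a` and `b`. The sketch below is an editor's illustration, not part of the patch, and `unpackhi_model` is a hypothetical name.

```rust
// Scalar model of per-128-bit-lane unpackhi (illustration only).
// `per_lane` is the number of elements in one 128-bit lane:
// 4 for 32-bit elements, 2 for 64-bit elements.
fn unpackhi_model(a: &[i64], b: &[i64], per_lane: usize) -> Vec<i64> {
    let mut dst = Vec::with_capacity(a.len());
    for (la, lb) in a.chunks(per_lane).zip(b.chunks(per_lane)) {
        for i in per_lane / 2..per_lane {
            dst.push(la[i]); // high element of the lane from a
            dst.push(lb[i]); // then the matching element from b
        }
    }
    dst
}

fn main() {
    // 8 x i64 = four 128-bit lanes of two elements each.
    let a: Vec<i64> = (0..8).map(|i| 100 + i).collect();
    let b: Vec<i64> = (0..8).map(|i| 200 + i).collect();
    // dst = [a1, b1, a3, b3, a5, b5, a7, b7], i.e. shuffle indices
    // [1, 9, 3, 11, 1 + 4, 9 + 4, 3 + 4, 11 + 4] into concat(a, b).
    assert_eq!(
        unpackhi_model(&a, &b, 2),
        vec![101, 201, 103, 203, 105, 205, 107, 207]
    );
}
```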
+/// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_epi64&expand=6029)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpckhqdq))]
+pub unsafe fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8();
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_ps&expand=6060)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 {
+    simd_shuffle16(
+        a,
+        b,
+        [
+            2,
+            18,
+            3,
+            19,
+            2 + 4,
+            18 + 4,
+            3 + 4,
+            19 + 4,
+            2 + 8,
+            18 + 8,
+            3 + 8,
+            19 + 8,
+            2 + 12,
+            18 + 12,
+            3 + 12,
+            19 + 12,
+        ],
+    )
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_ps&expand=6058)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm512_mask_unpackhi_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16();
+    transmute(simd_select_bitmask(k, unpackhi, src.as_f32x16()))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_ps&expand=6059)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhps))]
+pub unsafe fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16();
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_pd&expand=6048)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d {
+    simd_shuffle8(a, b, [1, 9, 3, 11, 1 + 4, 9 + 4, 3 + 4, 11 + 4])
+}
+
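All of the mask and maskz variants above funnel through the same select step, so one scalar model covers them. The sketch below is an editor's illustration, not part of the patch; `select_bitmask_model` is a made-up name for what `simd_select_bitmask` contributes on 8 x f64, shown here with the same low-to-high element values the x86_64 tests further down produce.

```rust
// Bitmask select on 8 x f64 (illustration only): element i comes from
// `unpackhi` when bit i of k is set, and from `fallback` otherwise.
fn select_bitmask_model(k: u8, unpackhi: [f64; 8], fallback: [f64; 8]) -> [f64; 8] {
    let mut dst = [0.0f64; 8];
    for i in 0..8 {
        dst[i] = if (k >> i) & 1 != 0 {
            unpackhi[i]
        } else {
            fallback[i]
        };
    }
    dst
}

fn main() {
    // Low-to-high element order: unpackhi of a = set_pd(1., ..., 8.) and
    // b = set_pd(17., ..., 24.), as in the tests below.
    let unpackhi = [7., 23., 5., 21., 3., 19., 1., 17.];
    let zero = [0.0f64; 8];
    // maskz with k = 0b00001111 keeps only the low half of the result.
    assert_eq!(
        select_bitmask_model(0b0000_1111, unpackhi, zero),
        [7., 23., 5., 21., 0., 0., 0., 0.]
    );
}
```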
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_pd&expand=6046)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm512_mask_unpackhi_pd(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8();
+    transmute(simd_select_bitmask(k, unpackhi, src.as_f64x8()))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_pd&expand=6047)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpckhpd))]
+pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8();
+    let zero = _mm512_setzero_pd().as_f64x8();
+    transmute(simd_select_bitmask(k, unpackhi, zero))
+}
+
 /// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps128_ps512&expand=621)
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
-    simd_shuffle16(a, _mm_set1_ps(-1.), [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4])
+    simd_shuffle16(
+        a,
+        _mm_set1_ps(-1.),
+        [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
+    )
 }
 
 /// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -10452,7 +10745,11 @@ pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
 #[inline]
 #[target_feature(enable = "avx512f")]
 pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 {
-    simd_shuffle16(a, _mm256_set1_ps(-1.), [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8])
+    simd_shuffle16(
+        a,
+        _mm256_set1_ps(-1.),
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
+    )
 }
 
 /// Cast vector of type __m512 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
@@ -10611,7 +10908,7 @@ pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i {
     transmute(ret)
 }
 
-/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastd_epi32&expand=546)
 #[inline]
@@ -10733,7 +11030,7 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d {
     transmute(simd_select_bitmask(k, broadcast, zero))
 }
 
-/// Broadcast the 4 packed 32-bit integers from a to all elements of dst. 
+/// Broadcast the 4 packed 32-bit integers from a to all elements of dst.
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510) #[inline] @@ -11151,7 +11448,12 @@ pub unsafe fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpandnd))] -pub unsafe fn _mm512_mask_andnot_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { +pub unsafe fn _mm512_mask_andnot_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m512i, +) -> __m512i { let andnot = _mm512_andnot_epi32(a, b).as_i32x16(); transmute(simd_select_bitmask(k, andnot, src.as_i32x16())) } @@ -11184,7 +11486,12 @@ pub unsafe fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpandnq))] -pub unsafe fn _mm512_mask_andnot_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { +pub unsafe fn _mm512_mask_andnot_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m512i, +) -> __m512i { let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); transmute(simd_select_bitmask(k, andnot, src.as_i64x8())) } @@ -19122,50 +19429,32 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_inserti32x4() { - let a = _mm512_setr_epi32( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - ); - let b = _mm_setr_epi32( - 17, 18, 19, 20, - ); + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_setr_epi32(17, 18, 19, 20); let r = _mm512_inserti32x4(a, b, 0); - let e = _mm512_setr_epi32( - 17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - ); + let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_inserti32x4() { - let a = _mm512_setr_epi32( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - ); - let b = _mm_setr_epi32( - 17, 18, 19, 20, - ); + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_setr_epi32(17, 18, 19, 20); let r = _mm512_mask_inserti32x4(a, 0, a, b, 0); assert_eq_m512i(r, a); let r = _mm512_mask_inserti32x4(a, 0b11111111_11111111, a, b, 0); - let e = _mm512_setr_epi32( - 17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - ); + let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_inserti32x4() { - let a = _mm512_setr_epi32( - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - ); - let b = _mm_setr_epi32( - 17, 18, 19, 20, - ); + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm_setr_epi32(17, 18, 19, 20); let r = _mm512_maskz_inserti32x4(0, a, b, 0); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_inserti32x4(0b00000000_11111111, a, b, 0); - let e = _mm512_setr_epi32( - 17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, - ); + let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -19174,9 +19463,7 @@ mod tests { let a = _mm512_setr_ps( 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., ); - let b = _mm_setr_ps( - 17., 18., 19., 20., - ); + let b = _mm_setr_ps(17., 18., 19., 20.); let r = _mm512_insertf32x4(a, b, 0); let e = _mm512_setr_ps( 17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., @@ -19189,9 
+19476,7 @@ mod tests { let a = _mm512_setr_ps( 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., ); - let b = _mm_setr_ps( - 17., 18., 19., 20., - ); + let b = _mm_setr_ps(17., 18., 19., 20.); let r = _mm512_mask_insertf32x4(a, 0, a, b, 0); assert_eq_m512(r, a); let r = _mm512_mask_insertf32x4(a, 0b11111111_11111111, a, b, 0); @@ -19206,9 +19491,7 @@ mod tests { let a = _mm512_setr_ps( 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., ); - let b = _mm_setr_ps( - 17., 18., 19., 20., - ); + let b = _mm_setr_ps(17., 18., 19., 20.); let r = _mm512_maskz_insertf32x4(0, a, b, 0); assert_eq_m512(r, _mm512_setzero_ps()); let r = _mm512_maskz_insertf32x4(0b00000000_11111111, a, b, 0); @@ -19217,12 +19500,10 @@ mod tests { ); assert_eq_m512(r, e); } - + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castps128_ps512() { - let a = _mm_setr_ps( - 17., 18., 19., 20., - ); + let a = _mm_setr_ps(17., 18., 19., 20.); let r = _mm512_castps128_ps512(a); let e = _mm512_setr_ps( 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., @@ -19232,9 +19513,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castps256_ps512() { - let a = _mm256_setr_ps( - 17., 18., 19., 20., 21., 22., 23., 24., - ); + let a = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); let r = _mm512_castps256_ps512(a); let e = _mm512_setr_ps( 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1., @@ -19248,9 +19527,7 @@ mod tests { 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., ); let r = _mm512_castps512_ps128(a); - let e = _mm_setr_ps( - 17., 18., 19., 20., - ); + let e = _mm_setr_ps(17., 18., 19., 20.); assert_eq_m128(r, e); } @@ -19260,9 +19537,7 @@ mod tests { 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1., ); let r = _mm512_castps512_ps256(a); - let e = _mm256_setr_ps( - 17., 18., 19., 20., 21., 22., 23., 24., - ); + let e = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.); assert_eq_m256(r, e); } @@ -19284,9 +19559,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcastd_epi32() { - let a = _mm_set_epi32( - 17, 18, 19, 20, - ); + let a = _mm_set_epi32(17, 18, 19, 20); let r = _mm512_broadcastd_epi32(a); let e = _mm512_set1_epi32(20); assert_eq_m512i(r, e); @@ -19295,9 +19568,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcastd_epi32() { let src = _mm512_set1_epi32(20); - let a = _mm_set_epi32( - 17, 18, 19, 20, - ); + let a = _mm_set_epi32(17, 18, 19, 20); let r = _mm512_mask_broadcastd_epi32(src, 0, a); assert_eq_m512i(r, src); let r = _mm512_mask_broadcastd_epi32(src, 0b11111111_11111111, a); @@ -19307,23 +19578,17 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcastd_epi32() { - let a = _mm_set_epi32( - 17, 18, 19, 20, - ); + let a = _mm_set_epi32(17, 18, 19, 20); let r = _mm512_maskz_broadcastd_epi32(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_broadcastd_epi32(0b00000000_11111111, a); - let e = _mm512_setr_epi32( - 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 0, - ); + let e = _mm512_setr_epi32(20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcastss_ps() { - let a = _mm_set_ps( - 17., 18., 19., 20., - ); + let a = _mm_set_ps(17., 18., 19., 20.); let r = _mm512_broadcastss_ps(a); let e = _mm512_set1_ps(20.); assert_eq_m512(r, 
e); @@ -19332,9 +19597,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcastss_ps() { let src = _mm512_set1_ps(20.); - let a = _mm_set_ps( - 17., 18., 19., 20., - ); + let a = _mm_set_ps(17., 18., 19., 20.); let r = _mm512_mask_broadcastss_ps(src, 0, a); assert_eq_m512(r, src); let r = _mm512_mask_broadcastss_ps(src, 0b11111111_11111111, a); @@ -19344,9 +19607,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcastss_ps() { - let a = _mm_set_ps( - 17., 18., 19., 20., - ); + let a = _mm_set_ps(17., 18., 19., 20.); let r = _mm512_maskz_broadcastss_ps(0, a); assert_eq_m512(r, _mm512_setzero_ps()); let r = _mm512_maskz_broadcastss_ps(0b00000000_11111111, a); @@ -19358,32 +19619,30 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcast_i32x4() { - let a = _mm_set_epi32( - 17, 18, 19, 20, - ); + let a = _mm_set_epi32(17, 18, 19, 20); let r = _mm512_broadcast_i32x4(a); - let e = _mm512_set_epi32(17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20); + let e = _mm512_set_epi32( + 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, + ); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcast_i32x4() { let src = _mm512_set1_epi32(20); - let a = _mm_set_epi32( - 17, 18, 19, 20, - ); + let a = _mm_set_epi32(17, 18, 19, 20); let r = _mm512_mask_broadcast_i32x4(src, 0, a); assert_eq_m512i(r, src); let r = _mm512_mask_broadcast_i32x4(src, 0b11111111_11111111, a); - let e = _mm512_set_epi32(17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20); + let e = _mm512_set_epi32( + 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, + ); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcast_i32x4() { - let a = _mm_set_epi32( - 17, 18, 19, 20, - ); + let a = _mm_set_epi32(17, 18, 19, 20); let r = _mm512_maskz_broadcast_i32x4(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_broadcast_i32x4(0b00000000_11111111, a); @@ -19393,36 +19652,36 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcast_f32x4() { - let a = _mm_set_ps( - 17., 18., 19., 20., - ); + let a = _mm_set_ps(17., 18., 19., 20.); let r = _mm512_broadcast_f32x4(a); - let e = _mm512_set_ps(17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.); + let e = _mm512_set_ps( + 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., + ); assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcast_f32x4() { let src = _mm512_set1_ps(20.); - let a = _mm_set_ps( - 17., 18., 19., 20., - ); + let a = _mm_set_ps(17., 18., 19., 20.); let r = _mm512_mask_broadcast_f32x4(src, 0, a); assert_eq_m512(r, src); let r = _mm512_mask_broadcast_f32x4(src, 0b11111111_11111111, a); - let e = _mm512_set_ps(17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.); + let e = _mm512_set_ps( + 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., + ); assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcast_f32x4() { - let a = _mm_set_ps( - 17., 18., 19., 20., - ); + let a = _mm_set_ps(17., 18., 19., 20.); let r = _mm512_maskz_broadcast_f32x4(0, a); assert_eq_m512(r, _mm512_setzero_ps()); let r = _mm512_maskz_broadcast_f32x4(0b00000000_11111111, a); - let e = _mm512_set_ps(0., 0., 0., 0., 0., 0., 0., 0., 17., 18., 19., 20., 17., 18., 19., 20.); + let e = _mm512_set_ps( + 
0., 0., 0., 0., 0., 0., 0., 0., 17., 18., 19., 20., 17., 18., 19., 20., + ); assert_eq_m512(r, e); } @@ -19440,7 +19699,95 @@ mod tests { let a = _mm512_set1_ps(1.); let b = _mm512_set1_ps(2.); let r = _mm512_mask_blend_ps(0b11111111_00000000, a, b); - let e = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.); + let e = _mm512_set_ps( + 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_unpackhi_epi32(a, b); + let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_mask_unpackhi_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpackhi_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_unpackhi_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpackhi_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 9, 26, 10, 29, 13, 30, 14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_unpackhi_ps(a, b); + let e = _mm512_set_ps( + 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_mask_unpackhi_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_unpackhi_ps(a, 0b11111111_11111111, a, b); + let e = _mm512_set_ps( + 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_maskz_unpackhi_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_unpackhi_ps(0b00000000_11111111, a, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 25., 9., 26., 10., 29., 13., 30., 14., + ); assert_eq_m512(r, e); } @@ 
-20117,7 +20464,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_andnot_epi32() { let a = _mm512_set1_epi32(0); - let b = _mm512_set1_epi32(1<<3 | 1<< 4); + let b = _mm512_set1_epi32(1 << 3 | 1 << 4); let r = _mm512_andnot_epi32(a, b); let e = _mm512_set1_epi32(1 << 3 | 1 << 4); assert_eq_m512i(r, e); @@ -20125,8 +20472,8 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_andnot_epi32() { - let a = _mm512_set1_epi32(1<<1 | 1<< 2); - let b = _mm512_set1_epi32(1<<3 | 1<< 4); + let a = _mm512_set1_epi32(1 << 1 | 1 << 2); + let b = _mm512_set1_epi32(1 << 3 | 1 << 4); let r = _mm512_mask_andnot_epi32(a, 0, a, b); assert_eq_m512i(r, a); @@ -20137,13 +20484,30 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_andnot_epi32() { - let a = _mm512_set1_epi32(1<<1 | 1<< 2); - let b = _mm512_set1_epi32(1<<3 | 1<< 4); + let a = _mm512_set1_epi32(1 << 1 | 1 << 2); + let b = _mm512_set1_epi32(1 << 3 | 1 << 4); let r = _mm512_maskz_andnot_epi32(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_andnot_epi32(0b00000000_11111111, a, b); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4); + let e = _mm512_set_epi32( + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + ); assert_eq_m512i(r, e); } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 796077f330..b005136ebe 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4280,147 +4280,95 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_inserti64x4() { - let a = _mm512_setr_epi64( - 1, 2, 3, 4, 5, 6, 7, 8, - ); - let b = _mm256_setr_epi64x( - 17, 18, 19, 20, - ); + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_setr_epi64x(17, 18, 19, 20); let r = _mm512_inserti64x4(a, b, 1); - let e = _mm512_setr_epi64( - 1, 2, 3, 4, 17, 18, 19, 20, - ); + let e = _mm512_setr_epi64(1, 2, 3, 4, 17, 18, 19, 20); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_inserti64x4() { - let a = _mm512_setr_epi64( - 1, 2, 3, 4, 5, 6, 7, 8, - ); - let b = _mm256_setr_epi64x( - 17, 18, 19, 20, - ); + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_setr_epi64x(17, 18, 19, 20); let r = _mm512_mask_inserti64x4(a, 0, a, b, 1); assert_eq_m512i(r, a); let r = _mm512_mask_inserti64x4(a, 0b11111111, a, b, 1); - let e = _mm512_setr_epi64( - 1, 2, 3, 4, 17, 18, 19, 20, - ); + let e = _mm512_setr_epi64(1, 2, 3, 4, 17, 18, 19, 20); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_inserti64x4() { - let a = _mm512_setr_epi64( - 1, 2, 3, 4, 5, 6, 7, 8, - ); - let b = _mm256_setr_epi64x( - 17, 18, 19, 20, - ); + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_setr_epi64x(17, 18, 19, 20); let r = _mm512_maskz_inserti64x4(0, a, b, 1); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_inserti64x4(0b00001111, a, b, 1); - let e = _mm512_setr_epi64( - 1, 2, 3, 4, 0, 0, 0, 0, - ); + let e = _mm512_setr_epi64(1, 2, 3, 4, 0, 0, 0, 0); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_insertf64x4() { - let a = _mm512_setr_pd( - 1., 2., 3., 4., 5., 6., 7., 8., - ); - 
let b = _mm256_setr_pd( - 17., 18., 19., 20., - ); + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_pd(17., 18., 19., 20.); let r = _mm512_insertf64x4(a, b, 1); - let e = _mm512_setr_pd( - 1., 2., 3., 4., 17., 18., 19., 20., - ); + let e = _mm512_setr_pd(1., 2., 3., 4., 17., 18., 19., 20.); assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_insertf64x4() { - let a = _mm512_setr_pd( - 1., 2., 3., 4., 5., 6., 7., 8., - ); - let b = _mm256_setr_pd( - 17., 18., 19., 20., - ); + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_pd(17., 18., 19., 20.); let r = _mm512_mask_insertf64x4(a, 0, a, b, 1); assert_eq_m512d(r, a); let r = _mm512_mask_insertf64x4(a, 0b11111111, a, b, 1); - let e = _mm512_setr_pd( - 1., 2., 3., 4., 17., 18., 19., 20., - ); + let e = _mm512_setr_pd(1., 2., 3., 4., 17., 18., 19., 20.); assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_insertf64x4() { - let a = _mm512_setr_pd( - 1., 2., 3., 4., 5., 6., 7., 8., - ); - let b = _mm256_setr_pd( - 17., 18., 19., 20., - ); + let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_pd(17., 18., 19., 20.); let r = _mm512_maskz_insertf64x4(0, a, b, 1); assert_eq_m512d(r, _mm512_setzero_pd()); let r = _mm512_maskz_insertf64x4(0b00001111, a, b, 1); - let e = _mm512_setr_pd( - 1., 2., 3., 4., 0., 0., 0., 0., - ); + let e = _mm512_setr_pd(1., 2., 3., 4., 0., 0., 0., 0.); assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castpd128_pd512() { - let a = _mm_setr_pd( - 17., 18., - ); + let a = _mm_setr_pd(17., 18.); let r = _mm512_castpd128_pd512(a); - let e = _mm512_setr_pd( - 17., 18., -1., -1., -1., -1., -1., -1. - ); + let e = _mm512_setr_pd(17., 18., -1., -1., -1., -1., -1., -1.); assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castpd256_pd512() { - let a = _mm256_setr_pd( - 17., 18., 19., 20., - ); + let a = _mm256_setr_pd(17., 18., 19., 20.); let r = _mm512_castpd256_pd512(a); - let e = _mm512_setr_pd( - 17., 18., 19., 20., -1., -1., -1., -1., - ); + let e = _mm512_setr_pd(17., 18., 19., 20., -1., -1., -1., -1.); assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castpd512_pd128() { - let a = _mm512_setr_pd( - 17., 18., -1., -1., -1., -1., -1., -1., - ); + let a = _mm512_setr_pd(17., 18., -1., -1., -1., -1., -1., -1.); let r = _mm512_castpd512_pd128(a); - let e = _mm_setr_pd( - 17., 18., - ); + let e = _mm_setr_pd(17., 18.); assert_eq_m128d(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castpd512_pd256() { - let a = _mm512_setr_pd( - 17., 18., 19., 20., -1., -1., -1., -1., - ); + let a = _mm512_setr_pd(17., 18., 19., 20., -1., -1., -1., -1.); let r = _mm512_castpd512_pd256(a); - let e = _mm256_setr_pd( - 17., 18., 19., 20., - ); + let e = _mm256_setr_pd(17., 18., 19., 20.); assert_eq_m256d(r, e); } @@ -4428,7 +4376,10 @@ mod tests { unsafe fn test_mm512_castpd_ps() { let a = _mm512_set1_pd(1.); let r = _mm512_castpd_ps(a); - let e = _mm512_set_ps(1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0); + let e = _mm512_set_ps( + 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, 1.875, 0.0, + 1.875, 0.0, + ); assert_eq_m512(r, e); } @@ -4436,69 +4387,58 @@ mod tests { unsafe fn test_mm512_castpd_si512() { let a = _mm512_set1_pd(1.); let r = _mm512_castpd_si512(a); - let e = _mm512_set_epi32(1072693248, 0, 1072693248, 0, 
1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0); + let e = _mm512_set_epi32( + 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, 0, 1072693248, + 0, 1072693248, 0, 1072693248, 0, + ); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castsi128_si512() { - let a = _mm_setr_epi64x( - 17, 18, - ); + let a = _mm_setr_epi64x(17, 18); let r = _mm512_castsi128_si512(a); - let e = _mm512_setr_epi64( - 17, 18, -1, -1, -1, -1, -1, -1, - ); + let e = _mm512_setr_epi64(17, 18, -1, -1, -1, -1, -1, -1); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castsi256_si512() { - let a = _mm256_setr_epi64x( - 17, 18, 19, 20, - ); + let a = _mm256_setr_epi64x(17, 18, 19, 20); let r = _mm512_castsi256_si512(a); - let e = _mm512_setr_epi64( - 17, 18, 19, 20, -1, -1, -1, -1, - ); + let e = _mm512_setr_epi64(17, 18, 19, 20, -1, -1, -1, -1); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castsi512_si128() { - let a = _mm512_setr_epi64( - 17, 18, -1, -1, -1, -1, -1, -1, - ); + let a = _mm512_setr_epi64(17, 18, -1, -1, -1, -1, -1, -1); let r = _mm512_castsi512_si128(a); - let e = _mm_setr_epi64x( - 17, 18, - ); + let e = _mm_setr_epi64x(17, 18); assert_eq_m128i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castsi512_si256() { - let a = _mm512_setr_epi64( - 17, 18, 19, 20, -1, -1, -1, -1, - ); + let a = _mm512_setr_epi64(17, 18, 19, 20, -1, -1, -1, -1); let r = _mm512_castsi512_si256(a); - let e = _mm256_setr_epi64x( - 17, 18, 19, 20, - ); + let e = _mm256_setr_epi64x(17, 18, 19, 20); assert_eq_m256i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castsi512_ps() { - let a = _mm512_set1_epi64(1<<62); + let a = _mm512_set1_epi64(1 << 62); let r = _mm512_castsi512_ps(a); - let e = _mm512_set_ps(2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0.); + let e = _mm512_set_ps( + 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 0., + ); assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_castsi512_pd() { - let a = _mm512_set1_epi64(1<<62); + let a = _mm512_set1_epi64(1 << 62); let r = _mm512_castsi512_pd(a); let e = _mm512_set_pd(2., 2., 2., 2., 2., 2., 2., 2.); assert_eq_m512d(r, e); @@ -4506,9 +4446,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcastq_epi64() { - let a = _mm_setr_epi64x( - 17, 18, - ); + let a = _mm_setr_epi64x(17, 18); let r = _mm512_broadcastq_epi64(a); let e = _mm512_set1_epi64(18); assert_eq_m512i(r, e); @@ -4517,9 +4455,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcastq_epi64() { let src = _mm512_set1_epi64(18); - let a = _mm_setr_epi64x( - 17, 18, - ); + let a = _mm_setr_epi64x(17, 18); let r = _mm512_mask_broadcastq_epi64(src, 0, a); assert_eq_m512i(r, src); let r = _mm512_mask_broadcastq_epi64(src, 0b01111111, a); @@ -4529,9 +4465,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcastq_epi64() { - let a = _mm_setr_epi64x( - 17, 18, - ); + let a = _mm_setr_epi64x(17, 18); let r = _mm512_maskz_broadcastq_epi64(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_broadcastq_epi64(0b00001111, a); @@ -4541,9 +4475,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcastsd_pd() { - let a = _mm_setr_pd( - 17., 18., - ); + let a = _mm_setr_pd(17., 18.); let r = _mm512_broadcastsd_pd(a); let e = _mm512_set1_pd(18.); assert_eq_m512d(r, e); @@ 
-4552,9 +4484,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcastsd_pd() { let src = _mm512_set1_pd(18.); - let a = _mm_setr_pd( - 17., 18., - ); + let a = _mm_setr_pd(17., 18.); let r = _mm512_mask_broadcastsd_pd(src, 0, a); assert_eq_m512d(r, src); let r = _mm512_mask_broadcastsd_pd(src, 0b01111111, a); @@ -4564,9 +4494,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcastsd_pd() { - let a = _mm_setr_pd( - 17., 18., - ); + let a = _mm_setr_pd(17., 18.); let r = _mm512_maskz_broadcastsd_pd(0, a); assert_eq_m512d(r, _mm512_setzero_pd()); let r = _mm512_maskz_broadcastsd_pd(0b00001111, a); @@ -4576,9 +4504,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcast_i64x4() { - let a = _mm256_set_epi64x( - 17, 18, 19, 20 - ); + let a = _mm256_set_epi64x(17, 18, 19, 20); let r = _mm512_broadcast_i64x4(a); let e = _mm512_set_epi64(17, 18, 19, 20, 17, 18, 19, 20); assert_eq_m512i(r, e); @@ -4587,9 +4513,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcast_i64x4() { let src = _mm512_set1_epi64(18); - let a = _mm256_set_epi64x( - 17, 18, 19, 20 - ); + let a = _mm256_set_epi64x(17, 18, 19, 20); let r = _mm512_mask_broadcast_i64x4(src, 0, a); assert_eq_m512i(r, src); let r = _mm512_mask_broadcast_i64x4(src, 0b11111111, a); @@ -4599,9 +4523,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcast_i64x4() { - let a = _mm256_set_epi64x( - 17, 18, 19, 20 - ); + let a = _mm256_set_epi64x(17, 18, 19, 20); let r = _mm512_maskz_broadcast_i64x4(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_broadcast_i64x4(0b00001111, a); @@ -4611,9 +4533,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_broadcast_f64x4() { - let a = _mm256_set_pd( - 17., 18., 19., 20. - ); + let a = _mm256_set_pd(17., 18., 19., 20.); let r = _mm512_broadcast_f64x4(a); let e = _mm512_set_pd(17., 18., 19., 20., 17., 18., 19., 20.); assert_eq_m512d(r, e); @@ -4622,9 +4542,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_broadcast_f64x4() { let src = _mm512_set1_pd(18.); - let a = _mm256_set_pd( - 17., 18., 19., 20. - ); + let a = _mm256_set_pd(17., 18., 19., 20.); let r = _mm512_mask_broadcast_f64x4(src, 0, a); assert_eq_m512d(r, src); let r = _mm512_mask_broadcast_f64x4(src, 0b11111111, a); @@ -4634,9 +4552,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_broadcast_f64x4() { - let a = _mm256_set_pd( - 17., 18., 19., 20. 
- ); + let a = _mm256_set_pd(17., 18., 19., 20.); let r = _mm512_maskz_broadcast_f64x4(0, a); assert_eq_m512d(r, _mm512_setzero_pd()); let r = _mm512_maskz_broadcast_f64x4(0b00001111, a); @@ -4662,6 +4578,68 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_unpackhi_epi64(a, b); + let e = _mm512_set_epi64(17, 1, 18, 2, 21, 5, 22, 6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_mask_unpackhi_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpackhi_epi64(a, 0b11111111, a, b); + let e = _mm512_set_epi64(17, 1, 18, 2, 21, 5, 22, 6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_maskz_unpackhi_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpackhi_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 21, 5, 22, 6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpackhi_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_unpackhi_pd(a, b); + let e = _mm512_set_pd(17., 1., 18., 2., 21., 5., 22., 6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpackhi_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_mask_unpackhi_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_unpackhi_pd(a, 0b11111111, a, b); + let e = _mm512_set_pd(17., 1., 18., 2., 21., 5., 22., 6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpackhi_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_maskz_unpackhi_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_unpackhi_pd(0b00001111, a, b); + let e = _mm512_set_pd(0., 0., 0., 0., 21., 5., 22., 6.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); @@ -4824,7 +4802,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_andnot_epi64() { let a = _mm512_set1_epi64(0); - let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let b = _mm512_set1_epi64(1 << 3 | 1 << 4); let r = _mm512_andnot_epi64(a, b); let e = _mm512_set1_epi64(1 << 3 | 1 << 4); assert_eq_m512i(r, e); @@ -4832,8 +4810,8 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_andnot_epi64() { - let a = _mm512_set1_epi64(1<<1 | 1<< 2); - let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let a = _mm512_set1_epi64(1 << 1 | 1 << 2); + let b = _mm512_set1_epi64(1 << 3 | 1 << 4); let r = _mm512_mask_andnot_epi64(a, 0, a, b); assert_eq_m512i(r, a); @@ -4844,20 +4822,29 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_andnot_epi64() { - let a = 
_mm512_set1_epi64(1<<1 | 1<< 2); - let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let a = _mm512_set1_epi64(1 << 1 | 1 << 2); + let b = _mm512_set1_epi64(1 << 3 | 1 << 4); let r = _mm512_maskz_andnot_epi64(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_andnot_epi64(0b00001111, a, b); - let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4, 1 << 3 | 1 << 4); + let e = _mm512_set_epi64( + 0, + 0, + 0, + 0, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + 1 << 3 | 1 << 4, + ); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_andnot_si512() { let a = _mm512_set1_epi64(0); - let b = _mm512_set1_epi64(1<<3 | 1<< 4); + let b = _mm512_set1_epi64(1 << 3 | 1 << 4); let r = _mm512_andnot_si512(a, b); let e = _mm512_set1_epi64(1 << 3 | 1 << 4); assert_eq_m512i(r, e); From 6876994931e6251e1f33ce90e55ca2b3df8c865c Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 24 Sep 2020 00:21:22 +0000 Subject: [PATCH 13/20] fix assert_instr v1 --- crates/core_arch/src/x86/avx512f.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 9963de1240..a5da0c42bc 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -11035,7 +11035,7 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsert))] //should be vpbroadcasti32x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { let a = _mm512_castsi128_si512(a).as_i32x16(); let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); @@ -11047,7 +11047,7 @@ pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i32x4&expand=511) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi32x4))] //should be vbroadcasti32x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) @@ -11058,7 +11058,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i32x4&expand=512) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi32x4))] //should be vbroadcasti32x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); let zero = _mm512_setzero_si512().as_i32x16(); @@ -11070,7 +11070,7 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf64x4))] //should 
be vbroadcasti64x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11080,7 +11080,7 @@ pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i64x4&expand=523) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinserti64x4))] //should be vbroadcasti64x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) @@ -11091,7 +11091,7 @@ pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i64x4&expand=524) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinserti64x4))] //should be vbroadcasti64x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); @@ -11103,7 +11103,7 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f32x4&expand=483) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf128))] //should be vbroadcastf32x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11113,7 +11113,7 @@ pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f32x4&expand=484) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf128))] //should be vbroadcastf32x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) @@ -11124,7 +11124,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f32x4&expand=485) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf128))] //should be vbroadcastf32x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -11136,7 +11136,7 @@ pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, 
assert_instr(vinsertf64x4))] //should be vbroadcastf64x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11146,7 +11146,7 @@ pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f64x4&expand=496) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf64x4))] //should be vbroadcastf64x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) @@ -11157,7 +11157,7 @@ pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f64x4&expand=497) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vinsertf64x4))] //should be vbroadcastf64x4 +#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); From 0629fe3ecc0e26a353ed5f263f427059b4a0c9a5 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 24 Sep 2020 13:55:40 +0000 Subject: [PATCH 14/20] comment out assert_instr for broadcast_X4 to test --- crates/core_arch/src/x86/avx512f.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a5da0c42bc..a4ee76e833 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -11035,7 +11035,7 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { let a = _mm512_castsi128_si512(a).as_i32x16(); let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); @@ -11047,7 +11047,7 @@ pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i32x4&expand=511) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) @@ -11058,7 +11058,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i32x4&expand=512) #[inline] #[target_feature(enable = "avx512f")] 
-#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); let zero = _mm512_setzero_si512().as_i32x16(); @@ -11070,7 +11070,7 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11080,7 +11080,7 @@ pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i64x4&expand=523) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) @@ -11091,7 +11091,7 @@ pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i64x4&expand=524) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); @@ -11103,7 +11103,7 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f32x4&expand=483) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11113,7 +11113,7 @@ pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f32x4&expand=484) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) @@ -11124,7 +11124,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f32x4&expand=485) #[inline] 
#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -11136,7 +11136,7 @@ pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11146,7 +11146,7 @@ pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f64x4&expand=496) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) @@ -11157,7 +11157,7 @@ pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f64x4&expand=497) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); From 79b5fb23192fb55d83f18a3c56164b8b43a2d7dc Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 24 Sep 2020 14:02:43 +0000 Subject: [PATCH 15/20] comment out assert_instr for broadcast_X4 to test v2 --- crates/core_arch/src/x86/avx512f.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a4ee76e833..21f10780c2 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -11035,7 +11035,7 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti32x4 pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { let a = _mm512_castsi128_si512(a).as_i32x16(); let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); @@ -11047,7 +11047,7 @@ pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i32x4&expand=511) #[inline] #[target_feature(enable = 
"avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti32x4 pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) @@ -11058,7 +11058,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i32x4&expand=512) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti32x4 pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); let zero = _mm512_setzero_si512().as_i32x16(); @@ -11070,7 +11070,7 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti64x4 pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11080,7 +11080,7 @@ pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i64x4&expand=523) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti64x4 pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) @@ -11091,7 +11091,7 @@ pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i64x4&expand=524) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcasti64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti64x4 pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); @@ -11103,7 +11103,7 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f32x4&expand=483) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf32x4 pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11113,7 +11113,7 @@ pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f32x4&expand=484) #[inline] 
#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf32x4 pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) @@ -11124,7 +11124,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f32x4&expand=485) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf32x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf32x4 pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -11136,7 +11136,7 @@ pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf64x4 pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11146,7 +11146,7 @@ pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f64x4&expand=496) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf64x4 pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) @@ -11157,7 +11157,7 @@ pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f64x4&expand=497) #[inline] #[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] //should be vbroadcastf64x4 +//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf64x4 pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); From 9dfe42f70fbf7cbba5824fcb849be317639d4aa7 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 24 Sep 2020 17:12:16 +0000 Subject: [PATCH 16/20] add const_fn_transmute --- crates/core_arch/src/lib.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index cca9f7f323..5dc390753b 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -36,7 +36,8 @@ f16c_target_feature, external_doc, allow_internal_unstable, - decl_macro + decl_macro, + const_fn_transmute )] #![cfg_attr(test, feature(test, abi_vectorcall, untagged_unions))] #![cfg_attr(all(test, target_arch = "wasm32"), feature(wasm_simd))] From 11a64fb15d12badbf2466aec674a855dccec35b0 Mon Sep 17 00:00:00 2001 From: 
jironglin Date: Thu, 24 Sep 2020 17:22:43 +0000 Subject: [PATCH 17/20] fix assert_instr for unpackhi --- crates/core_arch/src/x86/avx512f.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 21f10780c2..ba711f4766 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10601,7 +10601,7 @@ pub unsafe fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_epi64&expand=6030) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermt2q))] //should be vpunpckhqdq +#[cfg_attr(test, assert_instr(vperm))] //should be vpunpckhqdq pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { simd_shuffle8(a, b, [2, 10, 3, 11, 2 + 4, 10 + 4, 3 + 4, 11 + 4]) } @@ -10611,7 +10611,7 @@ pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_epi64&expand=6028) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermi2q))] //should be vpunpckhqdq +#[cfg_attr(test, assert_instr(vperm))] //should be vpunpckhqdq pub unsafe fn _mm512_mask_unpackhi_epi64( src: __m512i, k: __mmask8, @@ -10627,7 +10627,7 @@ pub unsafe fn _mm512_mask_unpackhi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_epi64&expand=6029) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermt2q))] //should be vpunpckhqdq +#[cfg_attr(test, assert_instr(vperm))] //should be vpunpckhqdq pub unsafe fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); @@ -10693,7 +10693,7 @@ pub unsafe fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_pd&expand=6048) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermt2pd))] //should be vunpackhpd +#[cfg_attr(test, assert_instr(vperm))] //should be vunpackhpd pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { simd_shuffle8(a, b, [2, 10, 3, 11, 2 + 4, 10 + 4, 3 + 4, 11 + 4]) } @@ -10703,7 +10703,7 @@ pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_pd&expand=6046) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermi2pd))] //should be vunpackhpd +#[cfg_attr(test, assert_instr(vperm))] //should be vunpackhpd pub unsafe fn _mm512_mask_unpackhi_pd( src: __m512d, k: __mmask8, @@ -10719,7 +10719,7 @@ pub unsafe fn _mm512_mask_unpackhi_pd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_pd&expand=6047) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermt2pd))] //should be vunpackhpd +#[cfg_attr(test, assert_instr(vperm))] //should be vunpackhpd pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); let zero = 
_mm512_setzero_pd().as_f64x8(); From 90f2ae6bdac2e47f099ba5789f6ac3c4dc63de8b Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 24 Sep 2020 21:00:35 +0000 Subject: [PATCH 18/20] Don't fix const check. Fix mistakes and remove comments --- crates/core_arch/src/x86/avx512f.rs | 54 ++++++++++---------------- crates/core_arch/src/x86_64/avx512f.rs | 20 +++++----- 2 files changed, 31 insertions(+), 43 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index ba711f4766..055767e051 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10601,9 +10601,9 @@ pub unsafe fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_epi64&expand=6030) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpunpckhqdq +#[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { - simd_shuffle8(a, b, [2, 10, 3, 11, 2 + 4, 10 + 4, 3 + 4, 11 + 4]) + simd_shuffle8(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) } /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10611,7 +10611,7 @@ pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_epi64&expand=6028) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpunpckhqdq +#[cfg_attr(test, assert_instr(vpunpckhqdq))] pub unsafe fn _mm512_mask_unpackhi_epi64( src: __m512i, k: __mmask8, a: __m512i, b: __m512i, ) -> __m512i { let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); transmute(simd_select_bitmask(k, unpackhi, src.as_i64x8())) @@ -10627,7 +10627,7 @@ pub unsafe fn _mm512_mask_unpackhi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_epi64&expand=6029) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vperm))] //should be vpunpckhqdq +#[cfg_attr(test, assert_instr(vpunpckhqdq))] pub unsafe fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); @@ -10693,9 +10693,9 @@ pub unsafe fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_pd&expand=6048) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vperm))] //should be vunpackhpd +#[cfg_attr(test, assert_instr(vunpckhpd))] pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { - simd_shuffle8(a, b, [2, 10, 3, 11, 2 + 4, 10 + 4, 3 + 4, 11 + 4]) + simd_shuffle8(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) } /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
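
For intuition, a quick scalar sketch of the corrected shuffle (illustrative only, not part of the patch; unpackhi_epi64_model is a hypothetical stand-in that treats the vectors as plain arrays). With two 64-bit elements per 128-bit lane, the high element of lane l is index 2*l + 1, and b's elements follow a's at offset 8 in simd_shuffle8's combined index space, giving [1, 9, 3, 11, 5, 13, 7, 15]; the old [2, 10, 3, 11, ...] pattern instead took both elements from the high half of each 256-bit chunk, which the updated x86_64 tests below now catch.

fn unpackhi_epi64_model(a: [i64; 8], b: [i64; 8]) -> [i64; 8] {
    // Index 0 is the lowest element, matching simd_shuffle8's view of a and b.
    let mut dst = [0i64; 8];
    for lane in 0..4 {
        dst[2 * lane] = a[2 * lane + 1]; // high element of a's 128-bit lane
        dst[2 * lane + 1] = b[2 * lane + 1]; // high element of b's 128-bit lane
    }
    dst
}

fn main() {
    // _mm512_set_epi64 lists elements high-to-low, so set_epi64(1, ..., 8)
    // stores 8 at index 0; these arrays are written in index order.
    let a = [8, 7, 6, 5, 4, 3, 2, 1];
    let b = [24, 23, 22, 21, 20, 19, 18, 17];
    // Index-order form of _mm512_set_epi64(17, 1, 19, 3, 21, 5, 23, 7),
    // the expectation in the updated tests.
    assert_eq!(unpackhi_epi64_model(a, b), [7, 23, 5, 21, 3, 19, 1, 17]);
}
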
@@ -10703,7 +10703,7 @@ pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_pd&expand=6046) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vperm))] //should be vunpackhpd +#[cfg_attr(test, assert_instr(vunpckhpd))] pub unsafe fn _mm512_mask_unpackhi_pd( src: __m512d, k: __mmask8, @@ -10719,7 +10719,7 @@ pub unsafe fn _mm512_mask_unpackhi_pd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_pd&expand=6047) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vperm))] //should be vunpackhpd +#[cfg_attr(test, assert_instr(vunpckhpd))] pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); @@ -10938,7 +10938,7 @@ pub unsafe fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vbroadcas))] //should be vpbroadcastq pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i { - simd_shuffle8(a, a, [1, 1, 1, 1, 1, 1, 1, 1]) + simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) } /// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11034,8 +11034,7 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti32x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { let a = _mm512_castsi128_si512(a).as_i32x16(); let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); @@ -11046,8 +11045,7 @@ pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i32x4&expand=511) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti32x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) @@ -11057,8 +11055,7 @@ pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i32x4&expand=512) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti32x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); let zero = _mm512_setzero_si512().as_i32x16(); @@ -11069,8 +11066,7 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i /// /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti64x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11079,8 +11075,7 @@ pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i64x4&expand=523) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti64x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) @@ -11090,8 +11085,7 @@ pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i64x4&expand=524) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcasti64x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); let zero = _mm512_setzero_si512().as_i64x8(); @@ -11102,8 +11096,7 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f32x4&expand=483) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf32x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11112,8 +11105,7 @@ pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f32x4&expand=484) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf32x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) @@ -11123,8 +11115,7 @@ pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f32x4&expand=485) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf32x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); let zero = _mm512_setzero_ps().as_f32x16(); @@ -11135,8 +11126,7 @@ pub unsafe fn 
_mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf64x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } @@ -11145,8 +11135,7 @@ pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f64x4&expand=496) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf64x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vper pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) @@ -11156,8 +11145,7 @@ pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f64x4&expand=497) #[inline] -#[target_feature(enable = "avx512f")] -//#[cfg_attr(test, assert_instr(vbroadcast))] should be vbroadcastf64x4 +#[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vper pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); let zero = _mm512_setzero_pd().as_f64x8(); diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index b005136ebe..f160011122 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4448,7 +4448,7 @@ mod tests { unsafe fn test_mm512_broadcastq_epi64() { let a = _mm_setr_epi64x(17, 18); let r = _mm512_broadcastq_epi64(a); - let e = _mm512_set1_epi64(18); + let e = _mm512_set1_epi64(17); assert_eq_m512i(r, e); } @@ -4458,8 +4458,8 @@ mod tests { let a = _mm_setr_epi64x(17, 18); let r = _mm512_mask_broadcastq_epi64(src, 0, a); assert_eq_m512i(r, src); - let r = _mm512_mask_broadcastq_epi64(src, 0b01111111, a); - let e = _mm512_set1_epi64(18); + let r = _mm512_mask_broadcastq_epi64(src, 0b11111111, a); + let e = _mm512_set1_epi64(17); assert_eq_m512i(r, e); } @@ -4469,7 +4469,7 @@ mod tests { let r = _mm512_maskz_broadcastq_epi64(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_broadcastq_epi64(0b00001111, a); - let e = _mm512_set_epi64(0, 0, 0, 0, 18, 18, 18, 18); + let e = _mm512_set_epi64(0, 0, 0, 0, 17, 17, 17, 17); assert_eq_m512i(r, e); } @@ -4583,7 +4583,7 @@ mod tests { let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); let r = _mm512_unpackhi_epi64(a, b); - let e = _mm512_set_epi64(17, 1, 18, 2, 21, 5, 22, 6); + let e = _mm512_set_epi64(17, 1, 19, 3, 21, 5, 23, 7); assert_eq_m512i(r, e); } @@ -4594,7 +4594,7 @@ mod tests { let r = _mm512_mask_unpackhi_epi64(a, 0, a, b); assert_eq_m512i(r, a); let r = _mm512_mask_unpackhi_epi64(a, 0b11111111, a, b); - let e = _mm512_set_epi64(17, 1, 18, 2, 21, 5, 22, 6); + let e = _mm512_set_epi64(17, 1, 19, 3, 21, 5, 23, 7); assert_eq_m512i(r, e); } @@ -4605,7 +4605,7 @@ mod tests { let r = 
_mm512_maskz_unpackhi_epi64(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); let r = _mm512_maskz_unpackhi_epi64(0b00001111, a, b); - let e = _mm512_set_epi64(0, 0, 0, 0, 21, 5, 22, 6); + let e = _mm512_set_epi64(0, 0, 0, 0, 21, 5, 23, 7); assert_eq_m512i(r, e); } @@ -4614,7 +4614,7 @@ mod tests { let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); let r = _mm512_unpackhi_pd(a, b); - let e = _mm512_set_pd(17., 1., 18., 2., 21., 5., 22., 6.); + let e = _mm512_set_pd(17., 1., 19., 3., 21., 5., 23., 7.); assert_eq_m512d(r, e); } @@ -4625,7 +4625,7 @@ mod tests { let r = _mm512_mask_unpackhi_pd(a, 0, a, b); assert_eq_m512d(r, a); let r = _mm512_mask_unpackhi_pd(a, 0b11111111, a, b); - let e = _mm512_set_pd(17., 1., 18., 2., 21., 5., 22., 6.); + let e = _mm512_set_pd(17., 1., 19., 3., 21., 5., 23., 7.); assert_eq_m512d(r, e); } @@ -4636,7 +4636,7 @@ mod tests { let r = _mm512_maskz_unpackhi_pd(0, a, b); assert_eq_m512d(r, _mm512_setzero_pd()); let r = _mm512_maskz_unpackhi_pd(0b00001111, a, b); - let e = _mm512_set_pd(0., 0., 0., 0., 21., 5., 22., 6.); + let e = _mm512_set_pd(0., 0., 0., 0., 21., 5., 23., 7.); assert_eq_m512d(r, e); } From 6a9d0c9bf84c69f9ebf1f72fad2d42705e25ce88 Mon Sep 17 00:00:00 2001 From: jironglin Date: Fri, 25 Sep 2020 17:44:08 +0000 Subject: [PATCH 19/20] unpacklo: epi32,epi64,ps,pd --- crates/core_arch/avx512f.md | 24 +-- crates/core_arch/src/x86/avx512f.rs | 278 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 62 ++++++ 3 files changed, 352 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index f7c19dfc43..f8612abba3 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -670,10 +670,10 @@ * [x] [`_mm512_mask_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=5236) * [x] [`_mm512_mask_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=5236) * [x] [`_mm512_mask_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_ps&expand=5236) - * [ ] [`_mm512_mask_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi32&expand=5236) - * [ ] [`_mm512_mask_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi64&expand=5236) - * [ ] [`_mm512_mask_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_pd&expand=5236) - * [ ] [`_mm512_mask_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_ps&expand=5236) + * [x] [`_mm512_mask_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi32&expand=5236) + * [x] [`_mm512_mask_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi64&expand=5236) + * [x] [`_mm512_mask_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_pd&expand=5236) + * [x] [`_mm512_mask_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_ps&expand=5236) * [x] [`_mm512_mask_xor_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_xor_epi32&expand=5236) * [x] 
[`_mm512_mask_xor_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_xor_epi64&expand=5236) * [x] [`_mm512_maskz_abs_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_abs_epi32&expand=5236) @@ -930,10 +930,10 @@ * [x] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236) * [x] [`_mm512_maskz_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=5236) * [x] [`_mm512_maskz_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_ps&expand=5236) - * [ ] [`_mm512_maskz_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi32&expand=5236) - * [ ] [`_mm512_maskz_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi64&expand=5236) - * [ ] [`_mm512_maskz_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_pd&expand=5236) - * [ ] [`_mm512_maskz_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_ps&expand=5236) + * [x] [`_mm512_maskz_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi32&expand=5236) + * [x] [`_mm512_maskz_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi64&expand=5236) + * [x] [`_mm512_maskz_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_pd&expand=5236) + * [x] [`_mm512_maskz_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_ps&expand=5236) * [x] [`_mm512_maskz_xor_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_xor_epi32&expand=5236) * [x] [`_mm512_maskz_xor_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_xor_epi64&expand=5236) * [x] [`_mm512_max_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epi32&expand=5236) @@ -1116,10 +1116,10 @@ * [x] [`_mm512_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi64&expand=5236) * [x] [`_mm512_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_pd&expand=5236) * [x] [`_mm512_unpackhi_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_ps&expand=5236) - * [ ] [`_mm512_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi32&expand=5236) - * [ ] [`_mm512_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi64&expand=5236) - * [ ] [`_mm512_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_pd&expand=5236) - * [ ] [`_mm512_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_ps&expand=5236) + * [x] [`_mm512_unpacklo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi32&expand=5236) + * [x] [`_mm512_unpacklo_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi64&expand=5236) + * [x] 
[`_mm512_unpacklo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_pd&expand=5236) + * [x] [`_mm512_unpacklo_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_ps&expand=5236) * [x] [`_mm512_xor_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_xor_epi32&expand=5236) * [x] [`_mm512_xor_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_xor_epi64&expand=5236) * [x] [`_mm512_xor_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_xor_si512&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 055767e051..e38ac8c85b 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -10726,6 +10726,198 @@ pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> _ transmute(simd_select_bitmask(k, unpackhi, zero)) } +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_epi32&expand=6078) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vunpcklps))] //should be vpunpckldq +pub unsafe fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i { + let a = a.as_i32x16(); + let b = b.as_i32x16(); + let r: i32x16 = simd_shuffle16( + a, + b, + [ + 0, + 16, + 1, + 17, + 0 + 4, + 16 + 4, + 1 + 4, + 17 + 4, + 0 + 8, + 16 + 8, + 1 + 8, + 17 + 8, + 0 + 12, + 16 + 12, + 1 + 12, + 17 + 12, + ], + ); + transmute(r) +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_epi32&expand=6076) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub unsafe fn _mm512_mask_unpacklo_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m512i, +) -> __m512i { + let unpackhi = _mm512_unpacklo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x16())) +} + +/// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_epi32&expand=6077) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpunpckldq))] +pub unsafe fn _mm512_maskz_unpacklo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + let unpackhi = _mm512_unpacklo_epi32(a, b).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, unpackhi, zero)) +} + +/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_epi64&expand=6087)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklpd))] //should be vpunpcklqdq
+pub unsafe fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i {
+    simd_shuffle8(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
+}
+
+/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_epi64&expand=6085)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpcklqdq))]
+pub unsafe fn _mm512_mask_unpacklo_epi64(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8();
+    transmute(simd_select_bitmask(k, unpacklo, src.as_i64x8()))
+}
+
+/// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_epi64&expand=6086)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpunpcklqdq))]
+pub unsafe fn _mm512_maskz_unpacklo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8();
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_ps&expand=6117)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 {
+    simd_shuffle16(
+        a,
+        b,
+        [
+            0,
+            16,
+            1,
+            17,
+            0 + 4,
+            16 + 4,
+            1 + 4,
+            17 + 4,
+            0 + 8,
+            16 + 8,
+            1 + 8,
+            17 + 8,
+            0 + 12,
+            16 + 12,
+            1 + 12,
+            17 + 12,
+        ],
+    )
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_ps&expand=6115)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm512_mask_unpacklo_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16();
+    transmute(simd_select_bitmask(k, unpacklo, src.as_f32x16()))
+}
+
+/// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_ps&expand=6116)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklps))]
+pub unsafe fn _mm512_maskz_unpacklo_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16();
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_pd&expand=6105)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d {
+    simd_shuffle8(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_pd&expand=6103)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm512_mask_unpacklo_pd(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8();
+    transmute(simd_select_bitmask(k, unpacklo, src.as_f64x8()))
+}
+
+/// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_pd&expand=6104)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vunpcklpd))]
+pub unsafe fn _mm512_maskz_unpacklo_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8();
+    let zero = _mm512_setzero_pd().as_f64x8();
+    transmute(simd_select_bitmask(k, unpacklo, zero))
+}
+
 /// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps128_ps512&expand=621) @@ -19779,6 +19971,92 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_unpacklo_epi32(a, b); + let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_mask_unpacklo_epi32(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpacklo_epi32(a, 0b11111111_11111111, a, b); + let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_epi32() { + let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b = _mm512_set_epi32( + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + ); + let r = _mm512_maskz_unpacklo_epi32(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpacklo_epi32(0b00000000_11111111, a, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 27, 11, 28, 12, 31, 15, 32, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_unpacklo_ps(a, b); + let e = _mm512_set_ps( + 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_mask_unpacklo_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_unpacklo_ps(a, 0b11111111_11111111, a, b); + let e = _mm512_set_ps( + 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_ps() { + let a = _mm512_set_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + let b = _mm512_set_ps( + 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., + ); + let r = _mm512_maskz_unpacklo_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_unpacklo_ps(0b00000000_11111111, a, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 27., 11., 28., 12., 31., 15., 32., 16., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi32() { let a = _mm512_set_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index f160011122..54291877a0 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs 
+++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -4640,6 +4640,68 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_unpacklo_epi64(a, b); + let e = _mm512_set_epi64(18, 2, 20, 4, 22, 6, 24, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_mask_unpacklo_epi64(a, 0, a, b); + assert_eq_m512i(r, a); + let r = _mm512_mask_unpacklo_epi64(a, 0b11111111, a, b); + let e = _mm512_set_epi64(18, 2, 20, 4, 22, 6, 24, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_epi64() { + let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm512_set_epi64(17, 18, 19, 20, 21, 22, 23, 24); + let r = _mm512_maskz_unpacklo_epi64(0, a, b); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_unpacklo_epi64(0b00001111, a, b); + let e = _mm512_set_epi64(0, 0, 0, 0, 22, 6, 24, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_unpacklo_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_unpacklo_pd(a, b); + let e = _mm512_set_pd(18., 2., 20., 4., 22., 6., 24., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_unpacklo_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_mask_unpacklo_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_unpacklo_pd(a, 0b11111111, a, b); + let e = _mm512_set_pd(18., 2., 20., 4., 22., 6., 24., 8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_unpacklo_pd() { + let a = _mm512_set_pd(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm512_set_pd(17., 18., 19., 20., 21., 22., 23., 24.); + let r = _mm512_maskz_unpacklo_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_unpacklo_pd(0b00001111, a, b); + let e = _mm512_set_pd(0., 0., 0., 0., 22., 6., 24., 8.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3); From 34723b1c90a76bdb7e4a04e3fabd5a5ef3f408dd Mon Sep 17 00:00:00 2001 From: jironglin Date: Fri, 25 Sep 2020 18:46:29 +0000 Subject: [PATCH 20/20] use assert to check imm8 boundary --- crates/core_arch/src/x86/avx512f.rs | 34 ++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index e38ac8c85b..32724bb292 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -8755,6 +8755,7 @@ pub unsafe fn _mm512_maskz_shuffle_epi32(k: __mmask16, a: __m512i, imm8: _MM_PER #[cfg_attr(test, assert_instr(vshufps, imm8 = 0))] #[rustc_args_required_const(2)] pub unsafe fn _mm512_shuffle_ps(a: __m512, b: __m512, imm8: i32) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! 
shuffle4 { ( @@ -8836,6 +8837,7 @@ pub unsafe fn _mm512_mask_shuffle_ps( b: __m512, imm8: i32, ) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -8913,6 +8915,7 @@ pub unsafe fn _mm512_mask_shuffle_ps( #[cfg_attr(test, assert_instr(vshufps, imm8 = 0))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_shuffle_ps(k: __mmask16, a: __m512, b: __m512, imm8: i32) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -8991,6 +8994,7 @@ pub unsafe fn _mm512_maskz_shuffle_ps(k: __mmask16, a: __m512, b: __m512, imm8: #[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))] #[rustc_args_required_const(2)] pub unsafe fn _mm512_shuffle_pd(a: __m512d, b: __m512d, imm8: i32) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle8 { ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => { @@ -9073,6 +9077,7 @@ pub unsafe fn _mm512_mask_shuffle_pd( b: __m512d, imm8: i32, ) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle8 { ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => { @@ -9151,6 +9156,7 @@ pub unsafe fn _mm512_mask_shuffle_pd( #[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_shuffle_pd(k: __mmask8, a: __m512d, b: __m512d, imm8: i32) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle8 { ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => { @@ -9230,8 +9236,8 @@ pub unsafe fn _mm512_maskz_shuffle_pd(k: __mmask8, a: __m512d, b: __m512d, imm8: #[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] //should be vshufi32x4, but generate vshufi64x2 #[rustc_args_required_const(2)] pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; - let a = a.as_i32x16(); let b = b.as_i32x16(); macro_rules! shuffle4 { @@ -9316,8 +9322,8 @@ pub unsafe fn _mm512_mask_shuffle_i32x4( b: __m512i, imm8: i32, ) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; - let a = a.as_i32x16(); let b = b.as_i32x16(); macro_rules! shuffle4 { @@ -9401,8 +9407,8 @@ pub unsafe fn _mm512_maskz_shuffle_i32x4( b: __m512i, imm8: i32, ) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; - let a = a.as_i32x16(); let b = b.as_i32x16(); macro_rules! shuffle4 { @@ -9482,6 +9488,7 @@ pub unsafe fn _mm512_maskz_shuffle_i32x4( #[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] #[rustc_args_required_const(2)] pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -9549,6 +9556,7 @@ pub unsafe fn _mm512_mask_shuffle_i64x2( b: __m512i, imm8: i32, ) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -9617,6 +9625,7 @@ pub unsafe fn _mm512_maskz_shuffle_i64x2( b: __m512i, imm8: i32, ) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! 
shuffle4 { ( @@ -9681,6 +9690,7 @@ pub unsafe fn _mm512_maskz_shuffle_i64x2( #[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] //should be vshuff32x4, but generate vshuff64x2 #[rustc_args_required_const(2)] pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512, imm8: i32) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -9762,6 +9772,7 @@ pub unsafe fn _mm512_mask_shuffle_f32x4( b: __m512, imm8: i32, ) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -9839,6 +9850,7 @@ pub unsafe fn _mm512_mask_shuffle_f32x4( #[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512, imm8: i32) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -9917,6 +9929,7 @@ pub unsafe fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512, imm #[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] #[rustc_args_required_const(2)] pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d, imm8: i32) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -9984,6 +9997,7 @@ pub unsafe fn _mm512_mask_shuffle_f64x2( b: __m512d, imm8: i32, ) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -10052,6 +10066,7 @@ pub unsafe fn _mm512_maskz_shuffle_f64x2( b: __m512d, imm8: i32, ) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 255); let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { ( @@ -10119,6 +10134,7 @@ pub unsafe fn _mm512_maskz_shuffle_f64x2( )] #[rustc_args_required_const(1)] pub unsafe fn _mm512_extractf32x4_ps(a: __m512, imm8: i32) -> __m128 { + assert!(imm8 >= 0 && imm8 <= 3); match imm8 & 0x3 { 0 => simd_shuffle4(a, _mm512_undefined_ps(), [0, 1, 2, 3]), 1 => simd_shuffle4(a, _mm512_undefined_ps(), [4, 5, 6, 7]), @@ -10237,6 +10253,7 @@ pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] //should be vinserti32x4 #[rustc_args_required_const(2)] pub unsafe fn _mm512_inserti32x4(a: __m512i, b: __m128i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 3); let a = a.as_i32x16(); let b = _mm512_castsi128_si512(b).as_i32x16(); let ret: i32x16 = match imm8 & 0b11 { @@ -10274,6 +10291,7 @@ pub unsafe fn _mm512_mask_inserti32x4( b: __m128i, imm8: i32, ) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 3); let a = a.as_i32x16(); let b = _mm512_castsi128_si512(b).as_i32x16(); let insert: i32x16 = match imm8 & 0b11 { @@ -10305,6 +10323,7 @@ pub unsafe fn _mm512_mask_inserti32x4( #[cfg_attr(test, assert_instr(vinserti32x4, imm8 = 2))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 3); let a = a.as_i32x16(); let b = _mm512_castsi128_si512(b).as_i32x16(); let insert = match imm8 & 0b11 { @@ -10337,6 +10356,7 @@ pub unsafe fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i, imm #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] //should be vinserti64x4 #[rustc_args_required_const(2)] pub unsafe fn _mm512_inserti64x4(a: __m512i, b: __m256i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 1); let b = _mm512_castsi256_si512(b); match imm8 & 0b1 { 0 => simd_shuffle8(a, b, [8, 9, 
10, 11, 4, 5, 6, 7]), @@ -10358,6 +10378,7 @@ pub unsafe fn _mm512_mask_inserti64x4( b: __m256i, imm8: i32, ) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 1); let b = _mm512_castsi256_si512(b); let insert = match imm8 & 0b1 { 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), @@ -10374,6 +10395,7 @@ pub unsafe fn _mm512_mask_inserti64x4( #[cfg_attr(test, assert_instr(vinserti64x4, imm8 = 1))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i, imm8: i32) -> __m512i { + assert!(imm8 >= 0 && imm8 <= 1); let b = _mm512_castsi256_si512(b); let insert = match imm8 & 0b1 { 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), @@ -10391,6 +10413,7 @@ pub unsafe fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i, imm8 #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] #[rustc_args_required_const(2)] pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128, imm8: i32) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 3); let b = _mm512_castps128_ps512(b); match imm8 & 0b11 { 0 => simd_shuffle16( @@ -10426,6 +10449,7 @@ pub unsafe fn _mm512_mask_insertf32x4( b: __m128, imm8: i32, ) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 3); let b = _mm512_castps128_ps512(b); let insert = match imm8 & 0b11 { 0 => simd_shuffle16( @@ -10456,6 +10480,7 @@ pub unsafe fn _mm512_mask_insertf32x4( #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128, imm8: i32) -> __m512 { + assert!(imm8 >= 0 && imm8 <= 3); let b = _mm512_castps128_ps512(b); let insert = match imm8 & 0b11 { 0 => simd_shuffle16( @@ -10487,6 +10512,7 @@ pub unsafe fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128, imm8: #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] #[rustc_args_required_const(2)] pub unsafe fn _mm512_insertf64x4(a: __m512d, b: __m256d, imm8: i32) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 1); let b = _mm512_castpd256_pd512(b); match imm8 & 0b1 { 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), @@ -10508,6 +10534,7 @@ pub unsafe fn _mm512_mask_insertf64x4( b: __m256d, imm8: i32, ) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 1); let b = _mm512_castpd256_pd512(b); let insert = match imm8 & 0b1 { 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), @@ -10524,6 +10551,7 @@ pub unsafe fn _mm512_mask_insertf64x4( #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] #[rustc_args_required_const(3)] pub unsafe fn _mm512_maskz_insertf64x4(k: __mmask8, a: __m512d, b: __m256d, imm8: i32) -> __m512d { + assert!(imm8 >= 0 && imm8 <= 1); let b = _mm512_castpd256_pd512(b); let insert = match imm8 & 0b1 { 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
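
Reviewer's note (not part of the patches): the `simd_shuffle16` index pattern used by `_mm512_unpacklo_epi32` and `_mm512_unpacklo_ps` above is easy to misread, so here is a minimal scalar sketch of what it computes. The function `unpacklo_epi32_model` is a hypothetical name introduced purely for illustration.

```rust
// Scalar model of the per-128-bit-lane unpacklo pattern implemented above.
// A 512-bit vector is treated as 16 i32 elements = four 128-bit lanes.
fn unpacklo_epi32_model(a: [i32; 16], b: [i32; 16]) -> [i32; 16] {
    let mut dst = [0i32; 16];
    for lane in 0..4 {
        let base = lane * 4;
        // The low half of each lane is elements `base` and `base + 1`;
        // interleave them as a, b, a, b — matching the shuffle indices
        // [0, 16, 1, 17, 0 + 4, 16 + 4, 1 + 4, 17 + 4, ...].
        dst[base] = a[base];
        dst[base + 1] = b[base];
        dst[base + 2] = a[base + 1];
        dst[base + 3] = b[base + 1];
    }
    dst
}

fn main() {
    let a: [i32; 16] = core::array::from_fn(|i| i as i32 + 1); // 1..=16
    let b: [i32; 16] = core::array::from_fn(|i| i as i32 + 17); // 17..=32
    let dst = unpacklo_epi32_model(a, b);
    // The first lane interleaves a[0..2] with b[0..2].
    assert_eq!(dst[..4], [1, 17, 2, 18]);
}
```

The unit tests above build their vectors with `_mm512_set_epi32`, which takes arguments from the highest element down, so their expected values read in the reverse order of this model.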
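A note on PATCH 20: every `assert!` it adds follows the same guard-then-mask idiom, shown below in isolation. This is an illustrative sketch, not code from the patch; `checked_imm8` is a hypothetical helper, and each intrinsic actually inlines the check directly.

```rust
// The guard-then-mask idiom PATCH 20 applies to the shuffle, insert, and
// extract intrinsics: reject values the instruction cannot encode up front,
// instead of letting the mask truncate them silently.
fn checked_imm8(imm8: i32, max: i32) -> u8 {
    assert!(imm8 >= 0 && imm8 <= max);
    (imm8 & 0xFF) as u8
}

fn main() {
    // An 8-bit control byte (e.g. _mm512_shuffle_ps) allows 0..=255.
    assert_eq!(checked_imm8(0b1011_1111, 255), 0b1011_1111);
    // A 2-bit lane selector (e.g. _mm512_insertf32x4) allows only 0..=3;
    // checked_imm8(4, 3) would panic here, whereas the intrinsic's
    // `match imm8 & 0b11` alone would silently select lane 0.
    assert_eq!(checked_imm8(3, 3), 3);
}
```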