Commit f4c5507

Implement avx512 compressstore intrinsics (rust-lang#1273)

jhorstmann authored Jan 24, 2022
1 parent af80aef · commit f4c5507

Showing 2 changed files with 292 additions and 12 deletions.
24 changes: 12 additions & 12 deletions crates/core_arch/avx512f.md
@@ -1629,18 +1629,18 @@
* [x] [`_mm_maskz_compress_pd`]
* [x] [`_mm256_mask_compress_pd`]
* [x] [`_mm256_maskz_compress_pd`]
-* [ ] [`_mm512_mask_compressstoreu_epi32`] //need i1
-* [_] [`_mm_mask_compressstoreu_epi32`] //need i1
-* [_] [`_mm256_mask_compressstoreu_epi32`] //need i1
-* [ ] [`_mm512_mask_compressstoreu_epi64`] //need i1
-* [_] [`_mm_mask_compressstoreu_epi64`] //need i1
-* [_] [`_mm256_mask_compressstoreu_epi64`] //need i1
-* [ ] [`_mm512_mask_compressstoreu_ps`] //need i1
-* [_] [`_mm_mask_compressstoreu_ps`] //need i1
-* [_] [`_mm256_mask_compressstoreu_ps`] //need i1
-* [ ] [`_mm512_mask_compressstoreu_pd`] //need i1
-* [_] [`_mm_mask_compressstoreu_pd`] //need i1
-* [_] [`_mm256_mask_compressstoreu_pd`] //need i1
+* [x] [`_mm512_mask_compressstoreu_epi32`] //need i1
+* [x] [`_mm_mask_compressstoreu_epi32`] //need i1
+* [x] [`_mm256_mask_compressstoreu_epi32`] //need i1
+* [x] [`_mm512_mask_compressstoreu_epi64`] //need i1
+* [x] [`_mm_mask_compressstoreu_epi64`] //need i1
+* [x] [`_mm256_mask_compressstoreu_epi64`] //need i1
+* [x] [`_mm512_mask_compressstoreu_ps`] //need i1
+* [x] [`_mm_mask_compressstoreu_ps`] //need i1
+* [x] [`_mm256_mask_compressstoreu_ps`] //need i1
+* [x] [`_mm512_mask_compressstoreu_pd`] //need i1
+* [x] [`_mm_mask_compressstoreu_pd`] //need i1
+* [x] [`_mm256_mask_compressstoreu_pd`] //need i1
* [x] [`_mm512_mask_expand_epi32`]
* [x] [`_mm512_maskz_expand_epi32`]
* [x] [`_mm_mask_expand_epi32`]
280 changes: 280 additions & 0 deletions crates/core_arch/src/x86/avx512f.rs
@@ -16156,6 +16156,126 @@ pub unsafe fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d {
transmute(vcompresspd128(a.as_f64x2(), _mm_setzero_pd().as_f64x2(), k))
}

/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask16, a: __m512i) {
vcompressstored(base_addr as *mut _, a.as_i32x16(), k)
}

/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m256i) {
vcompressstored256(base_addr as *mut _, a.as_i32x8(), k)
}

/// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m128i) {
vcompressstored128(base_addr as *mut _, a.as_i32x4(), k)
}

/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m512i) {
vcompressstoreq(base_addr as *mut _, a.as_i64x8(), k)
}

/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m256i) {
vcompressstoreq256(base_addr as *mut _, a.as_i64x4(), k)
}

/// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m128i) {
vcompressstoreq128(base_addr as *mut _, a.as_i64x2(), k)
}

/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask16, a: __m512) {
vcompressstoreps(base_addr as *mut _, a.as_f32x16(), k)
}

/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m256) {
vcompressstoreps256(base_addr as *mut _, a.as_f32x8(), k)
}

/// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m128) {
vcompressstoreps128(base_addr as *mut _, a.as_f32x4(), k)
}

/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m512d) {
vcompressstorepd(base_addr as *mut _, a.as_f64x8(), k)
}

/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m256d) {
vcompressstorepd256(base_addr as *mut _, a.as_f64x4(), k)
}

/// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m128d) {
vcompressstorepd128(base_addr as *mut _, a.as_f64x2(), k)
}

/// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi32&expand=2316)
@@ -38007,6 +38127,34 @@ extern "C" {
#[link_name = "llvm.x86.avx512.mask.compress.pd.128"]
fn vcompresspd128(a: f64x2, src: f64x2, mask: u8) -> f64x2;

#[link_name = "llvm.x86.avx512.mask.compress.store.d.512"]
fn vcompressstored(mem: *mut i8, data: i32x16, mask: u16);
#[link_name = "llvm.x86.avx512.mask.compress.store.d.256"]
fn vcompressstored256(mem: *mut i8, data: i32x8, mask: u8);
#[link_name = "llvm.x86.avx512.mask.compress.store.d.128"]
fn vcompressstored128(mem: *mut i8, data: i32x4, mask: u8);

#[link_name = "llvm.x86.avx512.mask.compress.store.q.512"]
fn vcompressstoreq(mem: *mut i8, data: i64x8, mask: u8);
#[link_name = "llvm.x86.avx512.mask.compress.store.q.256"]
fn vcompressstoreq256(mem: *mut i8, data: i64x4, mask: u8);
#[link_name = "llvm.x86.avx512.mask.compress.store.q.128"]
fn vcompressstoreq128(mem: *mut i8, data: i64x2, mask: u8);

#[link_name = "llvm.x86.avx512.mask.compress.store.ps.512"]
fn vcompressstoreps(mem: *mut i8, data: f32x16, mask: u16);
#[link_name = "llvm.x86.avx512.mask.compress.store.ps.256"]
fn vcompressstoreps256(mem: *mut i8, data: f32x8, mask: u8);
#[link_name = "llvm.x86.avx512.mask.compress.store.ps.128"]
fn vcompressstoreps128(mem: *mut i8, data: f32x4, mask: u8);

#[link_name = "llvm.x86.avx512.mask.compress.store.pd.512"]
fn vcompressstorepd(mem: *mut i8, data: f64x8, mask: u8);
#[link_name = "llvm.x86.avx512.mask.compress.store.pd.256"]
fn vcompressstorepd256(mem: *mut i8, data: f64x4, mask: u8);
#[link_name = "llvm.x86.avx512.mask.compress.store.pd.128"]
fn vcompressstorepd128(mem: *mut i8, data: f64x2, mask: u8);

#[link_name = "llvm.x86.avx512.mask.expand.d.512"]
fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16;
#[link_name = "llvm.x86.avx512.mask.expand.d.256"]
@@ -51357,6 +51505,138 @@ mod tests {
assert_eq_m128(r, e);
}

#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_mask_compressstoreu_epi32() {
let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
let mut r = [0_i32; 16];
_mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_i32; 16]);
_mm512_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1111000011001010, a);
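// Mask 0b1111000011001010 selects lanes 1, 3, 6, 7, 12, 13, 14 and 15 (bit 0 = lane 0),
// so the values 2, 4, 7, 8, 13, 14, 15, 16 are packed to the front of `r`.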
assert_eq!(&r, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm256_mask_compressstoreu_epi32() {
let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
let mut r = [0_i32; 8];
_mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_i32; 8]);
_mm256_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b11001010, a);
assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm_mask_compressstoreu_epi32() {
let a = _mm_setr_epi32(1, 2, 3, 4);
let mut r = [0_i32; 4];
_mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_i32; 4]);
_mm_mask_compressstoreu_epi32(r.as_mut_ptr() as *mut _, 0b1011, a);
assert_eq!(&r, &[1, 2, 4, 0]);
}

#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_mask_compressstoreu_epi64() {
let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
let mut r = [0_i64; 8];
_mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_i64; 8]);
_mm512_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b11001010, a);
assert_eq!(&r, &[2, 4, 7, 8, 0, 0, 0, 0]);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm256_mask_compressstoreu_epi64() {
let a = _mm256_setr_epi64x(1, 2, 3, 4);
let mut r = [0_i64; 4];
_mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_i64; 4]);
_mm256_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b1011, a);
assert_eq!(&r, &[1, 2, 4, 0]);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm_mask_compressstoreu_epi64() {
let a = _mm_setr_epi64x(1, 2);
let mut r = [0_i64; 2];
_mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_i64; 2]);
_mm_mask_compressstoreu_epi64(r.as_mut_ptr() as *mut _, 0b10, a);
assert_eq!(&r, &[2, 0]);
}

#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_mask_compressstoreu_ps() {
let a = _mm512_setr_ps(
1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32, 9_f32, 10_f32, 11_f32, 12_f32,
13_f32, 14_f32, 15_f32, 16_f32,
);
let mut r = [0_f32; 16];
_mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_f32; 16]);
_mm512_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1111000011001010, a);
assert_eq!(
&r,
&[
2_f32, 4_f32, 7_f32, 8_f32, 13_f32, 14_f32, 15_f32, 16_f32, 0_f32, 0_f32, 0_f32,
0_f32, 0_f32, 0_f32, 0_f32, 0_f32
]
);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm256_mask_compressstoreu_ps() {
let a = _mm256_setr_ps(1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32);
let mut r = [0_f32; 8];
_mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0_f32; 8]);
_mm256_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b11001010, a);
assert_eq!(
&r,
&[2_f32, 4_f32, 7_f32, 8_f32, 0_f32, 0_f32, 0_f32, 0_f32]
);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm_mask_compressstoreu_ps() {
let a = _mm_setr_ps(1_f32, 2_f32, 3_f32, 4_f32);
let mut r = [0.; 4];
_mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0.; 4]);
_mm_mask_compressstoreu_ps(r.as_mut_ptr() as *mut _, 0b1011, a);
assert_eq!(&r, &[1_f32, 2_f32, 4_f32, 0_f32]);
}

#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_mask_compressstoreu_pd() {
let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
let mut r = [0.; 8];
_mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0.; 8]);
_mm512_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b11001010, a);
assert_eq!(&r, &[2., 4., 7., 8., 0., 0., 0., 0.]);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm256_mask_compressstoreu_pd() {
let a = _mm256_setr_pd(1., 2., 3., 4.);
let mut r = [0.; 4];
_mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0.; 4]);
_mm256_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b1011, a);
assert_eq!(&r, &[1., 2., 4., 0.]);
}

#[simd_test(enable = "avx512f,avx512vl")]
unsafe fn test_mm_mask_compressstoreu_pd() {
let a = _mm_setr_pd(1., 2.);
let mut r = [0.; 2];
_mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0, a);
assert_eq!(&r, &[0.; 2]);
_mm_mask_compressstoreu_pd(r.as_mut_ptr() as *mut _, 0b10, a);
assert_eq!(&r, &[2., 0.]);
}

#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_mask_expand_epi32() {
let src = _mm512_set1_epi32(200);
