From 3849d63fee2b0350e7270c9fdbbdf1ed0d31c4f6 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 19 Jan 2018 10:32:16 -0600 Subject: [PATCH] Migrate the `i586::avx2` module to vendor types (#287) --- coresimd/src/x86/i586/avx2.rs | 3026 +++++++++++++++++---------------- 1 file changed, 1595 insertions(+), 1431 deletions(-) diff --git a/coresimd/src/x86/i586/avx2.rs b/coresimd/src/x86/i586/avx2.rs index 80d757d50674f..c965375a2fe22 100644 --- a/coresimd/src/x86/i586/avx2.rs +++ b/coresimd/src/x86/i586/avx2.rs @@ -20,12 +20,12 @@ use core::mem; -use simd_llvm::simd_cast; -use simd_llvm::{simd_shuffle2, simd_shuffle4, simd_shuffle8}; -use simd_llvm::{simd_shuffle16, simd_shuffle32}; - +use simd_llvm::*; use v256::*; use v128::*; +use v64::*; +use v32::*; +use x86::*; #[cfg(test)] use stdsimd_test::assert_instr; @@ -34,88 +34,88 @@ use stdsimd_test::assert_instr; #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsd))] -pub unsafe fn _mm256_abs_epi32(a: i32x8) -> u32x8 { - pabsd(a) +pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i { + mem::transmute(pabsd(a.as_i32x8())) } /// Computes the absolute values of packed 16-bit integers in `a`. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsw))] -pub unsafe fn _mm256_abs_epi16(a: i16x16) -> u16x16 { - pabsw(a) +pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i { + mem::transmute(pabsw(a.as_i16x16())) } /// Computes the absolute values of packed 8-bit integers in `a`. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsb))] -pub unsafe fn _mm256_abs_epi8(a: i8x32) -> u8x32 { - pabsb(a) +pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i { + mem::transmute(pabsb(a.as_i8x32())) } /// Add packed 64-bit integers in `a` and `b`. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddq))] -pub unsafe fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 { - a + b +pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(simd_add(a.as_i64x4(), b.as_i64x4())) } /// Add packed 32-bit integers in `a` and `b`. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddd))] -pub unsafe fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 { - a + b +pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(simd_add(a.as_i32x8(), b.as_i32x8())) } /// Add packed 16-bit integers in `a` and `b`. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddw))] -pub unsafe fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 { - a + b +pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(simd_add(a.as_i16x16(), b.as_i16x16())) } /// Add packed 8-bit integers in `a` and `b`. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddb))] -pub unsafe fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { - a + b +pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(simd_add(a.as_i8x32(), b.as_i8x32())) } /// Add packed 8-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddsb))] -pub unsafe fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 { - paddsb(a, b) +pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(paddsb(a.as_i8x32(), b.as_i8x32())) } /// Add packed 16-bit integers in `a` and `b` using saturation. 
#[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddsw))] -pub unsafe fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 { - paddsw(a, b) +pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(paddsw(a.as_i16x16(), b.as_i16x16())) } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddusb))] -pub unsafe fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 { - paddusb(a, b) +pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(paddusb(a.as_u8x32(), b.as_u8x32())) } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddusw))] -pub unsafe fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { - paddusw(a, b) +pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(paddusw(a.as_u16x16(), b.as_u16x16())) } /// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary @@ -123,21 +123,24 @@ pub unsafe fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpalignr, n = 15))] -pub unsafe fn _mm256_alignr_epi8(a: i8x32, b: i8x32, n: i32) -> i8x32 { +pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i { let n = n as u32; // If palignr is shifting the pair of vectors more than the size of two // lanes, emit zero. if n > 32 { - return i8x32::splat(0); + return _mm256_set1_epi8(0); } // If palignr is shifting the pair of input vectors more than one lane, // but less than two lanes, convert to shifting in zeroes. let (a, b, n) = if n > 16 { - (i8x32::splat(0), a, n - 16) + (_mm256_set1_epi8(0), a, n - 16) } else { (a, b, n) }; + let a = a.as_i8x32(); + let b = b.as_i8x32(); + macro_rules! shuffle { ($shift:expr) => { simd_shuffle32(b, a, [ @@ -160,7 +163,7 @@ pub unsafe fn _mm256_alignr_epi8(a: i8x32, b: i8x32, n: i32) -> i8x32 { ]) } } - match n { + let r: i8x32 = match n { 0 => shuffle!(0), 1 => shuffle!(1), 2 => shuffle!(2), @@ -178,7 +181,8 @@ pub unsafe fn _mm256_alignr_epi8(a: i8x32, b: i8x32, n: i32) -> i8x32 { 14 => shuffle!(14), 15 => shuffle!(15), _ => shuffle!(16), - } + }; + mem::transmute(r) } /// Compute the bitwise AND of 256 bits (representing integer data) @@ -187,7 +191,7 @@ pub unsafe fn _mm256_alignr_epi8(a: i8x32, b: i8x32, n: i32) -> i8x32 { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vandps))] pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { - __m256i::from(i8x32::from(a) & i8x32::from(b)) + mem::transmute(simd_and(a.as_i64x4(), b.as_i64x4())) } /// Compute the bitwise NOT of 256 bits (representing integer data) @@ -196,31 +200,34 @@ pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vandnps))] pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { - __m256i::from((!i8x32::from(a)) & i8x32::from(b)) + let all_ones = _mm256_set1_epi8(-1); + mem::transmute(simd_and(simd_xor(a.as_i64x4(), all_ones.as_i64x4()), b.as_i64x4())) } /// Average packed unsigned 16-bit integers in `a` and `b`. 
#[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpavgw))] -pub unsafe fn _mm256_avg_epu16(a: u16x16, b: u16x16) -> u16x16 { - pavgw(a, b) +pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pavgw(a.as_u16x16(), b.as_u16x16())) } /// Average packed unsigned 8-bit integers in `a` and `b`. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpavgb))] -pub unsafe fn _mm256_avg_epu8(a: u8x32, b: u8x32) -> u8x32 { - pavgb(a, b) +pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pavgb(a.as_u8x32(), b.as_u8x32())) } /// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendd, imm8 = 9))] -pub unsafe fn _mm_blend_epi32(a: i32x4, b: i32x4, imm8: i32) -> i32x4 { +pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { let imm8 = (imm8 & 0xFF) as u8; + let a = a.as_i32x4(); + let b = b.as_i32x4(); macro_rules! blend2 { ($a:expr, $b:expr, $c:expr, $d:expr) => { simd_shuffle4(a, b, [$a, $b, $c, $d]); @@ -236,20 +243,23 @@ pub unsafe fn _mm_blend_epi32(a: i32x4, b: i32x4, imm8: i32) -> i32x4 { } } } - match imm8 & 0b11 { + let r: i32x4 = match imm8 & 0b11 { 0b00 => blend1!(0, 1), 0b01 => blend1!(4, 1), 0b10 => blend1!(0, 5), _ => blend1!(4, 5), - } + }; + mem::transmute(r) } /// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendd, imm8 = 9))] -pub unsafe fn _mm256_blend_epi32(a: i32x8, b: i32x8, imm8: i32) -> i32x8 { +pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { let imm8 = (imm8 & 0xFF) as u8; + let a = a.as_i32x8(); + let b = b.as_i32x8(); macro_rules! blend4 { ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => { simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]); @@ -285,20 +295,23 @@ pub unsafe fn _mm256_blend_epi32(a: i32x8, b: i32x8, imm8: i32) -> i32x8 { } } } - match imm8 & 0b11 { + let r: i32x8 = match imm8 & 0b11 { 0b00 => blend1!(0, 1), 0b01 => blend1!(8, 1), 0b10 => blend1!(0, 9), _ => blend1!(8, 9), - } + }; + mem::transmute(r) } /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))] -pub unsafe fn _mm256_blend_epi16(a: i16x16, b: i16x16, imm8: i32) -> i16x16 { +pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { let imm8 = (imm8 & 0xFF) as u8; + let a = a.as_i16x16(); + let b = b.as_i16x16(); macro_rules! blend4 { ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr, $i:expr, $j:expr, $k:expr, $l:expr, $m:expr, $n:expr, $o:expr, $p:expr) => { @@ -336,20 +349,21 @@ pub unsafe fn _mm256_blend_epi16(a: i16x16, b: i16x16, imm8: i32) -> i16x16 { } } } - match imm8 & 0b11 { + let r: i16x16 = match imm8 & 0b11 { 0b00 => blend1!(0, 1, 8, 9), 0b01 => blend1!(16, 1, 24, 9), 0b10 => blend1!(0, 17, 8, 25), _ => blend1!(16, 17, 24, 25), - } + }; + mem::transmute(r) } /// Blend packed 8-bit integers from `a` and `b` using `mask`. 
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpblendvb))]
-pub unsafe fn _mm256_blendv_epi8(a: i8x32, b: i8x32, mask: __m256i) -> i8x32 {
-    pblendvb(a, b, i8x32::from(mask))
+pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
+    mem::transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
 }
 
 /// Broadcast the low packed 8-bit integer from `a` to all elements of
@@ -357,8 +371,10 @@ pub unsafe fn _mm256_blendv_epi8(a: i8x32, b: i8x32, mask: __m256i) -> i8x32 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpbroadcastb))]
-pub unsafe fn _mm_broadcastb_epi8(a: i8x16) -> i8x16 {
-    simd_shuffle16(a, i8x16::splat(0_i8), [0_u32; 16])
+pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
+    let zero = _mm_setzero_si128();
+    let ret = simd_shuffle16(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
+    mem::transmute::<i8x16, _>(ret)
 }
 
 /// Broadcast the low packed 8-bit integer from `a` to all elements of
@@ -366,8 +382,10 @@ pub unsafe fn _mm_broadcastb_epi8(a: i8x16) -> i8x16 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpbroadcastb))]
-pub unsafe fn _mm256_broadcastb_epi8(a: i8x16) -> i8x32 {
-    simd_shuffle32(a, i8x16::splat(0_i8), [0_u32; 32])
+pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
+    let zero = _mm_setzero_si128();
+    let ret = simd_shuffle32(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
+    mem::transmute::<i8x32, _>(ret)
 }
 
 // NB: simd_shuffle4 with integer data types for `a` and `b` is
@@ -377,8 +395,10 @@ pub unsafe fn _mm256_broadcastb_epi8(a: i8x16) -> i8x32 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
-pub unsafe fn _mm_broadcastd_epi32(a: i32x4) -> i32x4 {
-    simd_shuffle4(a, i32x4::splat(0_i32), [0_u32; 4])
+pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
+    let zero = _mm_setzero_si128();
+    let ret = simd_shuffle4(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
+    mem::transmute::<i32x4, _>(ret)
 }
 
 // NB: simd_shuffle4 with integer data types for `a` and `b` is
@@ -388,8 +408,10 @@ pub unsafe fn _mm_broadcastd_epi32(a: i32x4) -> i32x4 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
-pub unsafe fn _mm256_broadcastd_epi32(a: i32x4) -> i32x8 {
-    simd_shuffle8(a, i32x4::splat(0_i32), [0_u32; 8])
+pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
+    let zero = _mm_setzero_si128();
+    let ret = simd_shuffle8(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
+    mem::transmute::<i32x8, _>(ret)
 }
 
 /// Broadcast the low packed 64-bit integer from `a` to all elements of
@@ -397,8 +419,10 @@ pub unsafe fn _mm256_broadcastd_epi32(a: i32x4) -> i32x8 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpbroadcastq))]
-pub unsafe fn _mm_broadcastq_epi64(a: i64x2) -> i64x2 {
-    simd_shuffle2(a, i64x2::splat(0_i64), [0_u32; 2])
+pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
+    let zero = _mm_setzero_si128().as_i64x2();
+    let ret = simd_shuffle2(a.as_i64x2(), zero, [0_u32; 2]);
+    mem::transmute::<i64x2, _>(ret)
 }
 
 // NB: simd_shuffle4 with integer data types for `a` and `b` is
@@ -408,8 +432,10 @@ pub unsafe fn _mm_broadcastq_epi64(a: i64x2) -> i64x2 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vbroadcastsd))]
-pub unsafe fn _mm256_broadcastq_epi64(a: i64x2) -> i64x4 {
-    simd_shuffle4(a, i64x2::splat(0_i64), [0_u32; 4])
+pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
+    let zero = _mm_setzero_si128();
+    let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0_u32; 4]);
+    mem::transmute::<i64x4, _>(ret)
 }
 
 /// Broadcast the low double-precision (64-bit) floating-point element
@@ -417,8 +443,8 @@ pub unsafe fn _mm256_broadcastq_epi64(a: i64x2) -> i64x4 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vmovddup))]
-pub unsafe fn _mm_broadcastsd_pd(a: f64x2) -> f64x2 {
-    simd_shuffle2(a, f64x2::splat(0_f64), [0_u32; 2])
+pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
+    simd_shuffle2(a, _mm_setzero_pd(), [0_u32; 2])
 }
 
 /// Broadcast the low double-precision (64-bit) floating-point element
@@ -426,8 +452,8 @@ pub unsafe fn _mm_broadcastsd_pd(a: f64x2) -> f64x2 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vbroadcastsd))]
-pub unsafe fn _mm256_broadcastsd_pd(a: f64x2) -> f64x4 {
-    simd_shuffle4(a, f64x2::splat(0_f64), [0_u32; 4])
+pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
+    simd_shuffle4(a, _mm_setzero_pd(), [0_u32; 4])
 }
 
 // NB: broadcastsi128_si256 is often compiled to vinsertf128 or
@@ -436,8 +462,10 @@ pub unsafe fn _mm256_broadcastsd_pd(a: f64x2) -> f64x4 {
 /// the 256-bit returned value.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
-pub unsafe fn _mm256_broadcastsi128_si256(a: i64x2) -> i64x4 {
-    simd_shuffle4(a, i64x2::splat(0_i64), [0, 1, 0, 1])
+pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
+    let zero = _mm_setzero_si128();
+    let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
+    mem::transmute::<i64x4, _>(ret)
 }
 
 /// Broadcast the low single-precision (32-bit) floating-point element
@@ -445,8 +473,8 @@ pub unsafe fn _mm256_broadcastsi128_si256(a: i64x2) -> i64x4 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
-pub unsafe fn _mm_broadcastss_ps(a: f32x4) -> f32x4 {
-    simd_shuffle4(a, f32x4::splat(0_f32), [0_u32; 4])
+pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
+    simd_shuffle4(a, _mm_setzero_ps(), [0_u32; 4])
 }
 
 /// Broadcast the low single-precision (32-bit) floating-point element
@@ -454,8 +482,8 @@ pub unsafe fn _mm_broadcastss_ps(a: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
-pub unsafe fn _mm256_broadcastss_ps(a: f32x4) -> f32x8 {
-    simd_shuffle8(a, f32x4::splat(0_f32), [0_u32; 8])
+pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
+    simd_shuffle8(a, _mm_setzero_ps(), [0_u32; 8])
 }
 
 /// Broadcast the low packed 16-bit integer from a to all elements of
@@ -463,8 +491,10 @@ pub unsafe fn _mm256_broadcastss_ps(a: f32x4) -> f32x8 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpbroadcastw))]
-pub unsafe fn _mm_broadcastw_epi16(a: i16x8) -> i16x8 {
-    simd_shuffle8(a, i16x8::splat(0_i16), [0_u32; 8])
+pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
+    let zero = _mm_setzero_si128();
+    let ret = simd_shuffle8(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
+    mem::transmute::<i16x8, _>(ret)
 }
 
 /// Broadcast the low packed 16-bit integer from a to all elements of
@@ -472,129 +502,137 @@ pub unsafe fn _mm256_broadcastw_epi16(a: i16x8) -> i16x16 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpbroadcastw))]
-pub unsafe fn _mm256_broadcastw_epi16(a: i16x8) -> i16x16 {
-    simd_shuffle16(a, i16x8::splat(0_i16), [0_u32; 16])
+pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
+    let zero = _mm_setzero_si128();
+    let ret = simd_shuffle16(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
+    mem::transmute::<i16x16, _>(ret)
 }
 
 /// Compare packed 64-bit integers in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpeqq))]
-pub unsafe fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 {
-    a.eq(b)
+pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
 }
 
 /// Compare packed 32-bit integers in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpeqd))]
-pub unsafe fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 {
-    a.eq(b)
+pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
 }
 
 /// Compare packed 16-bit integers in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpeqw))]
-pub unsafe fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    a.eq(b)
+pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
 }
 
 /// Compare packed 8-bit integers in `a` and `b` for equality.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpeqb))]
-pub unsafe fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 {
-    a.eq(b)
+pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
 }
 
 /// Compare packed 64-bit integers in `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpgtq))]
-pub unsafe fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 {
-    a.gt(b)
+pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
 }
 
 /// Compare packed 32-bit integers in `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpgtd))]
-pub unsafe fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 {
-    a.gt(b)
+pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
    mem::transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
 }
 
 /// Compare packed 16-bit integers in `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpgtw))]
-pub unsafe fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    a.gt(b)
+pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
+    mem::transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
 }
 
 /// Compare packed 8-bit integers in `a` and `b` for greater-than.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpcmpgtb))]
-pub unsafe fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 {
-    a.gt(b)
+pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
    mem::transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
 }
 
 /// Sign-extend 16-bit integers to 32-bit integers.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovsxwd))]
-pub unsafe fn _mm256_cvtepi16_epi32(a: i16x8) -> i32x8 {
-    simd_cast(a)
+pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
+    mem::transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
 }
 
 /// Sign-extend 16-bit integers to 64-bit integers.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovsxwq))]
-pub unsafe fn _mm256_cvtepi16_epi64(a: i16x8) -> i64x4 {
-    simd_cast::<::v64::i16x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
+pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
+    let a = a.as_i16x8();
+    let v64: i16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
+    mem::transmute::<i64x4, _>(simd_cast(v64))
 }
 
 /// Sign-extend 32-bit integers to 64-bit integers.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovsxdq))]
-pub unsafe fn _mm256_cvtepi32_epi64(a: i32x4) -> i64x4 {
-    simd_cast(a)
+pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
    mem::transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
 }
 
 /// Sign-extend 8-bit integers to 16-bit integers.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovsxbw))]
-pub unsafe fn _mm256_cvtepi8_epi16(a: i8x16) -> i16x16 {
-    simd_cast(a)
+pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
+    mem::transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
 }
 
 /// Sign-extend 8-bit integers to 32-bit integers.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovsxbd))]
-pub unsafe fn _mm256_cvtepi8_epi32(a: i8x16) -> i32x8 {
-    simd_cast::<::v64::i8x8, _>(simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]))
+pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
+    let a = a.as_i8x16();
+    let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    mem::transmute::<i32x8, _>(simd_cast(v64))
 }
 
 /// Sign-extend 8-bit integers to 64-bit integers.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovsxbq))]
-pub unsafe fn _mm256_cvtepi8_epi64(a: i8x16) -> i64x4 {
-    simd_cast::<::v32::i8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
+pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
+    let a = a.as_i8x16();
+    let v32: i8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
+    mem::transmute::<i64x4, _>(simd_cast(v32))
 }
 
-/// Zero-extend the lower four unsigned 16-bit integers in `a` to 32-bit
-/// integers. The upper four elements of `a` are unused.
+/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit
+/// integers, and store the results in dst.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovzxwd))]
-pub unsafe fn _mm256_cvtepu16_epi32(a: u16x8) -> i32x8 {
-    simd_cast(a)
+pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
+    mem::transmute::<i32x8, _>(simd_cast(a.as_u16x8()))
 }
 
 /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
@@ -602,24 +640,26 @@ pub unsafe fn _mm256_cvtepu16_epi32(a: u16x8) -> i32x8 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovzxwq))]
-pub unsafe fn _mm256_cvtepu16_epi64(a: u16x8) -> i64x4 {
-    simd_cast::<::v64::u16x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
+pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
+    let a = a.as_u16x8();
+    let v64: u16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
+    mem::transmute::<i64x4, _>(simd_cast(v64))
 }
 
 /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovzxdq))]
-pub unsafe fn _mm256_cvtepu32_epi64(a: u32x4) -> i64x4 {
-    simd_cast(a)
+pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
    mem::transmute::<i64x4, _>(simd_cast(a.as_u32x4()))
 }
 
 /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovzxbw))]
-pub unsafe fn _mm256_cvtepu8_epi16(a: u8x16) -> i16x16 {
-    simd_cast(a)
+pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
    mem::transmute::<i16x16, _>(simd_cast(a.as_u8x16()))
 }
 
 /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
@@ -627,8 +667,10 @@
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovzxbd))]
-pub unsafe fn _mm256_cvtepu8_epi32(a: u8x16) -> i32x8 {
-    simd_cast::<::v64::u8x8, _>(simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]))
+pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
+    let a = a.as_u8x16();
+    let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    mem::transmute::<i32x8, _>(simd_cast(v64))
 }
 
 /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
@@ -636,8 +678,10 @@
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmovzxbq))]
-pub unsafe fn _mm256_cvtepu8_epi64(a: u8x16) -> i64x4 {
-    simd_cast::<::v32::u8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
+pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
+    let a = a.as_u8x16();
+    let v32: u8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
+    mem::transmute::<i64x4, _>(simd_cast(v32))
 }
 
 /// Extract 128 bits (of integer data) from `a` selected with `imm8`.
@@ -645,30 +689,29 @@
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))]
 pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
-    use x86::i586::avx::_mm256_undefined_si256;
-    let imm8 = (imm8 & 0xFF) as u8;
-    let b = i64x4::from(_mm256_undefined_si256());
+    let a = a.as_i64x4();
+    let b = _mm256_undefined_si256().as_i64x4();
     let dst: i64x2 = match imm8 & 0b01 {
-        0 => simd_shuffle2(i64x4::from(a), b, [0, 1]),
-        _ => simd_shuffle2(i64x4::from(a), b, [2, 3]),
+        0 => simd_shuffle2(a, b, [0, 1]),
+        _ => simd_shuffle2(a, b, [2, 3]),
     };
-    __m128i::from(dst)
+    mem::transmute(dst)
 }
 
 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vphaddw))]
-pub unsafe fn _mm256_hadd_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    phaddw(a, b)
+pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
    mem::transmute(phaddw(a.as_i16x16(), b.as_i16x16()))
 }
 
 /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`.
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vphaddd))]
-pub unsafe fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 {
-    phaddd(a, b)
+pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
    mem::transmute(phaddd(a.as_i32x8(), b.as_i32x8()))
 }
 
 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`
@@ -676,24 +719,24 @@ pub unsafe fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 {
 #[inline(always)]
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vphaddsw))]
-pub unsafe fn _mm256_hadds_epi16(a: i16x16, b: i16x16) -> i16x16 {
-    phaddsw(a, b)
+pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
    mem::transmute(phaddsw(a.as_i16x16(), b.as_i16x16()))
 }
 
 /// Horizontally substract adjacent pairs of 16-bit integers in `a` and `b`.
#[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubw))] -pub unsafe fn _mm256_hsub_epi16(a: i16x16, b: i16x16) -> i16x16 { - phsubw(a, b) +pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(phsubw(a.as_i16x16(), b.as_i16x16())) } /// Horizontally substract adjacent pairs of 32-bit integers in `a` and `b`. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubd))] -pub unsafe fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 { - phsubd(a, b) +pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(phsubd(a.as_i32x8(), b.as_i32x8())) } /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` @@ -701,8 +744,8 @@ pub unsafe fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubsw))] -pub unsafe fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { - phsubsw(a, b) +pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -712,12 +755,17 @@ pub unsafe fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm_i32gather_epi32( - slice: *const i32, offsets: i32x4, scale: i32 -) -> i32x4 { + slice: *const i32, offsets: __m128i, scale: i32 +) -> __m128i { + let zero = _mm_setzero_si128().as_i32x4(); + let neg_one = _mm_set1_epi32(-1).as_i32x4(); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherdd(i32x4::splat(0), slice as *const i8, offsets, i32x4::splat(-1), $imm8)) + ($imm8:expr) => (pgatherdd(zero, slice, offsets, neg_one, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -728,12 +776,17 @@ pub unsafe fn _mm_i32gather_epi32( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm_mask_i32gather_epi32( - src: i32x4, slice: *const i32, offsets: i32x4, mask: i32x4, scale: i32 -) -> i32x4 { + src: __m128i, slice: *const i32, offsets: __m128i, mask: __m128i, scale: i32 +) -> __m128i { + let src = src.as_i32x4(); + let mask = mask.as_i32x4(); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherdd(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (pgatherdd(src, slice, offsets, mask, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -743,12 +796,17 @@ pub unsafe fn _mm_mask_i32gather_epi32( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm256_i32gather_epi32( - slice: *const i32, offsets: i32x8, scale: i32 -) -> i32x8 { + slice: *const i32, offsets: __m256i, scale: i32 +) -> __m256i { + let zero = _mm256_setzero_si256().as_i32x8(); + let neg_one = _mm256_set1_epi32(-1).as_i32x8(); + let offsets = offsets.as_i32x8(); + let slice = slice as *const i8; macro_rules! 
call { - ($imm8:expr) => (vpgatherdd(i32x8::splat(0), slice as *const i8, offsets, i32x8::splat(-1), $imm8)) + ($imm8:expr) => (vpgatherdd(zero, slice, offsets, neg_one, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -759,12 +817,17 @@ pub unsafe fn _mm256_i32gather_epi32( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm256_mask_i32gather_epi32( - src: i32x8, slice: *const i32, offsets: i32x8, mask: i32x8, scale: i32 -) -> i32x8 { + src: __m256i, slice: *const i32, offsets: __m256i, mask: __m256i, scale: i32 +) -> __m256i { + let src = src.as_i32x8(); + let mask = mask.as_i32x8(); + let offsets = offsets.as_i32x8(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (vpgatherdd(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (vpgatherdd(src, slice, offsets, mask, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -774,10 +837,14 @@ pub unsafe fn _mm256_mask_i32gather_epi32( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm_i32gather_ps( - slice: *const f32, offsets: i32x4, scale: i32 -) -> f32x4 { + slice: *const f32, offsets: __m128i, scale: i32 +) -> __m128 { + let zero = _mm_setzero_ps(); + let neg_one = _mm_set1_ps(-1.0); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherdps(f32x4::splat(0.0), slice as *const i8, offsets, f32x4::splat(-1.0), $imm8)) + ($imm8:expr) => (pgatherdps(zero, slice, offsets, neg_one, $imm8)) } constify_imm8!(scale, call) } @@ -790,10 +857,12 @@ pub unsafe fn _mm_i32gather_ps( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm_mask_i32gather_ps( - src: f32x4, slice: *const f32, offsets: i32x4, mask: f32x4, scale: i32 -) -> f32x4 { + src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32 +) -> __m128 { + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherdps(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (pgatherdps(src, slice, offsets, mask, $imm8)) } constify_imm8!(scale, call) } @@ -805,10 +874,14 @@ pub unsafe fn _mm_mask_i32gather_ps( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm256_i32gather_ps( - slice: *const f32, offsets: i32x8, scale: i32 -) -> f32x8 { + slice: *const f32, offsets: __m256i, scale: i32 +) -> __m256 { + let zero = _mm256_setzero_ps(); + let neg_one = _mm256_set1_ps(-1.0); + let offsets = offsets.as_i32x8(); + let slice = slice as *const i8; macro_rules! 
call { - ($imm8:expr) => (vpgatherdps(f32x8::splat(0.0), slice as *const i8, offsets, f32x8::splat(-1.0), $imm8)) + ($imm8:expr) => (vpgatherdps(zero, slice, offsets, neg_one, $imm8)) } constify_imm8!(scale, call) } @@ -821,10 +894,12 @@ pub unsafe fn _mm256_i32gather_ps( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm256_mask_i32gather_ps( - src: f32x8, slice: *const f32, offsets: i32x8, mask: f32x8, scale: i32 -) -> f32x8 { + src: __m256, slice: *const f32, offsets: __m256i, mask: __m256, scale: i32 +) -> __m256 { + let offsets = offsets.as_i32x8(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (vpgatherdps(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (vpgatherdps(src, slice, offsets, mask, $imm8)) } constify_imm8!(scale, call) } @@ -836,12 +911,17 @@ pub unsafe fn _mm256_mask_i32gather_ps( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm_i32gather_epi64( - slice: *const i64, offsets: i32x4, scale: i32 -) -> i64x2 { + slice: *const i64, offsets: __m128i, scale: i32 +) -> __m128i { + let zero = _mm_setzero_si128().as_i64x2(); + let neg_one = _mm_set1_epi64x(-1).as_i64x2(); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherdq(i64x2::splat(0), slice as *const i8, offsets, i64x2::splat(-1), $imm8)) + ($imm8:expr) => (pgatherdq(zero, slice, offsets, neg_one, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -852,12 +932,17 @@ pub unsafe fn _mm_i32gather_epi64( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm_mask_i32gather_epi64( - src: i64x2, slice: *const i64, offsets: i32x4, mask: i64x2, scale: i32 -) -> i64x2 { + src: __m128i, slice: *const i64, offsets: __m128i, mask: __m128i, scale: i32 +) -> __m128i { + let src = src.as_i64x2(); + let mask = mask.as_i64x2(); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherdq(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (pgatherdq(src, slice, offsets, mask, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -867,12 +952,17 @@ pub unsafe fn _mm_mask_i32gather_epi64( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm256_i32gather_epi64( - slice: *const i64, offsets: i32x4, scale: i32 -) -> i64x4 { + slice: *const i64, offsets: __m128i, scale: i32 +) -> __m256i { + let zero = _mm256_setzero_si256().as_i64x4(); + let neg_one = _mm256_set1_epi64x(-1).as_i64x4(); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; macro_rules! 
call { - ($imm8:expr) => (vpgatherdq(i64x4::splat(0), slice as *const i8, offsets, i64x4::splat(-1), $imm8)) + ($imm8:expr) => (vpgatherdq(zero, slice, offsets, neg_one, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -883,12 +973,17 @@ pub unsafe fn _mm256_i32gather_epi64( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm256_mask_i32gather_epi64( - src: i64x4, slice: *const i64, offsets: i32x4, mask: i64x4, scale: i32 -) -> i64x4 { + src: __m256i, slice: *const i64, offsets: __m128i, mask: __m256i, scale: i32 +) -> __m256i { + let src = src.as_i64x4(); + let mask = mask.as_i64x4(); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (vpgatherdq(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (vpgatherdq(src, slice, offsets, mask, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -898,10 +993,14 @@ pub unsafe fn _mm256_mask_i32gather_epi64( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm_i32gather_pd( - slice: *const f64, offsets: i32x4, scale: i32 -) -> f64x2 { + slice: *const f64, offsets: __m128i, scale: i32 +) -> __m128d { + let zero = _mm_setzero_pd(); + let neg_one = _mm_set1_pd(-1.0); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherdpd(f64x2::splat(0.0), slice as *const i8, offsets, f64x2::splat(-1.0), $imm8)) + ($imm8:expr) => (pgatherdpd(zero, slice, offsets, neg_one, $imm8)) } constify_imm8!(scale, call) } @@ -914,10 +1013,12 @@ pub unsafe fn _mm_i32gather_pd( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm_mask_i32gather_pd( - src: f64x2, slice: *const f64, offsets: i32x4, mask: f64x2, scale: i32 -) -> f64x2 { + src: __m128d, slice: *const f64, offsets: __m128i, mask: __m128d, scale: i32 +) -> __m128d { + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherdpd(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (pgatherdpd(src, slice, offsets, mask, $imm8)) } constify_imm8!(scale, call) } @@ -929,10 +1030,14 @@ pub unsafe fn _mm_mask_i32gather_pd( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm256_i32gather_pd( - slice: *const f64, offsets: i32x4, scale: i32 -) -> f64x4 { + slice: *const f64, offsets: __m128i, scale: i32 +) -> __m256d { + let zero = _mm256_setzero_pd(); + let neg_one = _mm256_set1_pd(-1.0); + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; macro_rules! 
call { - ($imm8:expr) => (vpgatherdpd(f64x4::splat(0.0), slice as *const i8, offsets, f64x4::splat(-1.0), $imm8)) + ($imm8:expr) => (vpgatherdpd(zero, slice, offsets, neg_one, $imm8)) } constify_imm8!(scale, call) } @@ -945,10 +1050,12 @@ pub unsafe fn _mm256_i32gather_pd( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm256_mask_i32gather_pd( - src: f64x4, slice: *const f64, offsets: i32x4, mask: f64x4, scale: i32 -) -> f64x4 { + src: __m256d, slice: *const f64, offsets: __m128i, mask: __m256d, scale: i32 +) -> __m256d { + let offsets = offsets.as_i32x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (vpgatherdpd(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (vpgatherdpd(src, slice, offsets, mask, $imm8)) } constify_imm8!(scale, call) } @@ -960,12 +1067,17 @@ pub unsafe fn _mm256_mask_i32gather_pd( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm_i64gather_epi32( - slice: *const i32, offsets: i64x2, scale: i32 -) -> i32x4 { + slice: *const i32, offsets: __m128i, scale: i32 +) -> __m128i { + let zero = _mm_setzero_si128().as_i32x4(); + let neg_one = _mm_set1_epi64x(-1).as_i32x4(); + let offsets = offsets.as_i64x2(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherqd(i32x4::splat(0), slice as *const i8, offsets, i32x4::splat(-1), $imm8)) + ($imm8:expr) => (pgatherqd(zero, slice, offsets, neg_one, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -976,12 +1088,17 @@ pub unsafe fn _mm_i64gather_epi32( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm_mask_i64gather_epi32( - src: i32x4, slice: *const i32, offsets: i64x2, mask: i32x4, scale: i32 -) -> i32x4 { + src: __m128i, slice: *const i32, offsets: __m128i, mask: __m128i, scale: i32 +) -> __m128i { + let src = src.as_i32x4(); + let mask = mask.as_i32x4(); + let offsets = offsets.as_i64x2(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherqd(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (pgatherqd(src, slice, offsets, mask, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -991,12 +1108,17 @@ pub unsafe fn _mm_mask_i64gather_epi32( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm256_i64gather_epi32( - slice: *const i32, offsets: i64x4, scale: i32 -) -> i32x4 { + slice: *const i32, offsets: __m256i, scale: i32 +) -> __m128i { + let zero = _mm_setzero_si128().as_i32x4(); + let neg_one = _mm_set1_epi64x(-1).as_i32x4(); + let offsets = offsets.as_i64x4(); + let slice = slice as *const i8; macro_rules! 
call { - ($imm8:expr) => (vpgatherqd(i32x4::splat(0), slice as *const i8, offsets, i32x4::splat(-1), $imm8)) + ($imm8:expr) => (vpgatherqd(zero, slice, offsets, neg_one, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -1007,12 +1129,17 @@ pub unsafe fn _mm256_i64gather_epi32( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm256_mask_i64gather_epi32( - src: i32x4, slice: *const i32, offsets: i64x4, mask: i32x4, scale: i32 -) -> i32x4 { + src: __m128i, slice: *const i32, offsets: __m256i, mask: __m128i, scale: i32 +) -> __m128i { + let src = src.as_i32x4(); + let mask = mask.as_i32x4(); + let offsets = offsets.as_i64x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (vpgatherqd(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (vpgatherqd(src, slice, offsets, mask, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -1022,10 +1149,14 @@ pub unsafe fn _mm256_mask_i64gather_epi32( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm_i64gather_ps( - slice: *const f32, offsets: i64x2, scale: i32 -) -> f32x4 { + slice: *const f32, offsets: __m128i, scale: i32 +) -> __m128 { + let zero = _mm_setzero_ps(); + let neg_one = _mm_set1_ps(-1.0); + let offsets = offsets.as_i64x2(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherqps(f32x4::splat(0.0), slice as *const i8, offsets, f32x4::splat(-1.0), $imm8)) + ($imm8:expr) => (pgatherqps(zero, slice, offsets, neg_one, $imm8)) } constify_imm8!(scale, call) } @@ -1038,10 +1169,12 @@ pub unsafe fn _mm_i64gather_ps( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm_mask_i64gather_ps( - src: f32x4, slice: *const f32, offsets: i64x2, mask: f32x4, scale: i32 -) -> f32x4 { + src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32 +) -> __m128 { + let offsets = offsets.as_i64x2(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherqps(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (pgatherqps(src, slice, offsets, mask, $imm8)) } constify_imm8!(scale, call) } @@ -1053,10 +1186,14 @@ pub unsafe fn _mm_mask_i64gather_ps( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm256_i64gather_ps( - slice: *const f32, offsets: i64x4, scale: i32 -) -> f32x4 { + slice: *const f32, offsets: __m256i, scale: i32 +) -> __m128 { + let zero = _mm_setzero_ps(); + let neg_one = _mm_set1_ps(-1.0); + let offsets = offsets.as_i64x4(); + let slice = slice as *const i8; macro_rules! 
call { - ($imm8:expr) => (vpgatherqps(f32x4::splat(0.0), slice as *const i8, offsets, f32x4::splat(-1.0), $imm8)) + ($imm8:expr) => (vpgatherqps(zero, slice, offsets, neg_one, $imm8)) } constify_imm8!(scale, call) } @@ -1069,10 +1206,12 @@ pub unsafe fn _mm256_i64gather_ps( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm256_mask_i64gather_ps( - src: f32x4, slice: *const f32, offsets: i64x4, mask: f32x4, scale: i32 -) -> f32x4 { + src: __m128, slice: *const f32, offsets: __m256i, mask: __m128, scale: i32 +) -> __m128 { + let offsets = offsets.as_i64x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (vpgatherqps(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (vpgatherqps(src, slice, offsets, mask, $imm8)) } constify_imm8!(scale, call) } @@ -1084,12 +1223,17 @@ pub unsafe fn _mm256_mask_i64gather_ps( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm_i64gather_epi64( - slice: *const i64, offsets: i64x2, scale: i32 -) -> i64x2 { + slice: *const i64, offsets: __m128i, scale: i32 +) -> __m128i { + let zero = _mm_setzero_si128().as_i64x2(); + let neg_one = _mm_set1_epi64x(-1).as_i64x2(); + let slice = slice as *const i8; + let offsets = offsets.as_i64x2(); macro_rules! call { - ($imm8:expr) => (pgatherqq(i64x2::splat(0), slice as *const i8, offsets, i64x2::splat(-1), $imm8)) + ($imm8:expr) => (pgatherqq(zero, slice, offsets, neg_one, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -1100,12 +1244,17 @@ pub unsafe fn _mm_i64gather_epi64( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm_mask_i64gather_epi64( - src: i64x2, slice: *const i64, offsets: i64x2, mask: i64x2, scale: i32 -) -> i64x2 { + src: __m128i, slice: *const i64, offsets: __m128i, mask: __m128i, scale: i32 +) -> __m128i { + let src = src.as_i64x2(); + let mask = mask.as_i64x2(); + let offsets = offsets.as_i64x2(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (pgatherqq(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (pgatherqq(src, slice, offsets, mask, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -1115,12 +1264,17 @@ pub unsafe fn _mm_mask_i64gather_epi64( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm256_i64gather_epi64( - slice: *const i64, offsets: i64x4, scale: i32 -) -> i64x4 { + slice: *const i64, offsets: __m256i, scale: i32 +) -> __m256i { + let zero = _mm256_setzero_si256().as_i64x4(); + let neg_one = _mm256_set1_epi64x(-1).as_i64x4(); + let slice = slice as *const i8; + let offsets = offsets.as_i64x4(); macro_rules! 
call { - ($imm8:expr) => (vpgatherqq(i64x4::splat(0), slice as *const i8, offsets, i64x4::splat(-1), $imm8)) + ($imm8:expr) => (vpgatherqq(zero, slice, offsets, neg_one, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -1131,12 +1285,17 @@ pub unsafe fn _mm256_i64gather_epi64( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm256_mask_i64gather_epi64( - src: i64x4, slice: *const i64, offsets: i64x4, mask: i64x4, scale: i32 -) -> i64x4 { + src: __m256i, slice: *const i64, offsets: __m256i, mask: __m256i, scale: i32 +) -> __m256i { + let src = src.as_i64x4(); + let mask = mask.as_i64x4(); + let offsets = offsets.as_i64x4(); + let slice = slice as *const i8; macro_rules! call { - ($imm8:expr) => (vpgatherqq(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (vpgatherqq(src, slice, offsets, mask, $imm8)) } - constify_imm8!(scale, call) + let r = constify_imm8!(scale, call); + mem::transmute(r) } /// Return values from `slice` at offsets determined by `offsets * scale`, @@ -1146,10 +1305,14 @@ pub unsafe fn _mm256_mask_i64gather_epi64( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm_i64gather_pd( - slice: *const f64, offsets: i64x2, scale: i32 -) -> f64x2 { + slice: *const f64, offsets: __m128i, scale: i32 +) -> __m128d { + let zero = _mm_setzero_pd(); + let neg_one = _mm_set1_pd(-1.0); + let slice = slice as *const i8; + let offsets = offsets.as_i64x2(); macro_rules! call { - ($imm8:expr) => (pgatherqpd(f64x2::splat(0.0), slice as *const i8, offsets, f64x2::splat(-1.0), $imm8)) + ($imm8:expr) => (pgatherqpd(zero, slice, offsets, neg_one, $imm8)) } constify_imm8!(scale, call) } @@ -1162,10 +1325,12 @@ pub unsafe fn _mm_i64gather_pd( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm_mask_i64gather_pd( - src: f64x2, slice: *const f64, offsets: i64x2, mask: f64x2, scale: i32 -) -> f64x2 { + src: __m128d, slice: *const f64, offsets: __m128i, mask: __m128d, scale: i32 +) -> __m128d { + let slice = slice as *const i8; + let offsets = offsets.as_i64x2(); macro_rules! call { - ($imm8:expr) => (pgatherqpd(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (pgatherqpd(src, slice, offsets, mask, $imm8)) } constify_imm8!(scale, call) } @@ -1177,10 +1342,14 @@ pub unsafe fn _mm_mask_i64gather_pd( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm256_i64gather_pd( - slice: *const f64, offsets: i64x4, scale: i32 -) -> f64x4 { + slice: *const f64, offsets: __m256i, scale: i32 +) -> __m256d { + let zero = _mm256_setzero_pd(); + let neg_one = _mm256_set1_pd(-1.0); + let slice = slice as *const i8; + let offsets = offsets.as_i64x4(); macro_rules! 
call { - ($imm8:expr) => (vpgatherqpd(f64x4::splat(0.0), slice as *const i8, offsets, f64x4::splat(-1.0), $imm8)) + ($imm8:expr) => (vpgatherqpd(zero, slice, offsets, neg_one, $imm8)) } constify_imm8!(scale, call) } @@ -1193,10 +1362,12 @@ pub unsafe fn _mm256_i64gather_pd( #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm256_mask_i64gather_pd( - src: f64x4, slice: *const f64, offsets: i64x4, mask: f64x4, scale: i32 -) -> f64x4 { + src: __m256d, slice: *const f64, offsets: __m256i, mask: __m256d, scale: i32 +) -> __m256d { + let slice = slice as *const i8; + let offsets = offsets.as_i64x4(); macro_rules! call { - ($imm8:expr) => (vpgatherqpd(src, slice as *const i8, offsets, mask, $imm8)) + ($imm8:expr) => (vpgatherqpd(src, slice, offsets, mask, $imm8)) } constify_imm8!(scale, call) } @@ -1209,14 +1380,13 @@ pub unsafe fn _mm256_mask_i64gather_pd( pub unsafe fn _mm256_inserti128_si256( a: __m256i, b: __m128i, imm8: i32 ) -> __m256i { - use x86::i586::avx::_mm256_castsi128_si256; - let imm8 = (imm8 & 0b01) as u8; - let b = i64x4::from(_mm256_castsi128_si256(b)); + let a = a.as_i64x4(); + let b = _mm256_castsi128_si256(b).as_i64x4(); let dst: i64x4 = match imm8 & 0b01 { - 0 => simd_shuffle4(i64x4::from(a), b, [4, 5, 2, 3]), - _ => simd_shuffle4(i64x4::from(a), b, [0, 1, 4, 5]), + 0 => simd_shuffle4(a, b, [4, 5, 2, 3]), + _ => simd_shuffle4(a, b, [0, 1, 4, 5]), }; - __m256i::from(dst) + mem::transmute(dst) } /// Multiply packed signed 16-bit integers in `a` and `b`, producing @@ -1225,8 +1395,8 @@ pub unsafe fn _mm256_inserti128_si256( #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaddwd))] -pub unsafe fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 { - pmaddwd(a, b) +pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) } /// Vertically multiply each unsigned 8-bit integer from `a` with the @@ -1236,8 +1406,8 @@ pub unsafe fn _mm256_madd_epi16(a: i16x16, b: i16x16) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaddubsw))] -pub unsafe fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 { - pmaddubsw(a, b) +pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) } /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask` @@ -1246,8 +1416,8 @@ pub unsafe fn _mm256_maddubs_epi16(a: u8x32, b: u8x32) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] -pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: i32x4) -> i32x4 { - maskloadd(mem_addr as *const i8, mask) +pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i { + mem::transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4())) } /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask` @@ -1257,9 +1427,9 @@ pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: i32x4) -> i32x4 { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm256_maskload_epi32( - mem_addr: *const i32, mask: i32x8 -) -> i32x8 { - maskloadd256(mem_addr as *const i8, mask) + mem_addr: *const i32, mask: __m256i +) -> __m256i { + mem::transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8())) } /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask` @@ -1268,8 +1438,8 @@ pub 
unsafe fn _mm256_maskload_epi32( #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] -pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: i64x2) -> i64x2 { - maskloadq(mem_addr as *const i8, mask) +pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i { + mem::transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2())) } /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask` @@ -1279,9 +1449,9 @@ pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: i64x2) -> i64x2 { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm256_maskload_epi64( - mem_addr: *const i64, mask: i64x4 -) -> i64x4 { - maskloadq256(mem_addr as *const i8, mask) + mem_addr: *const i64, mask: __m256i +) -> __m256i { + mem::transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4())) } /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr` @@ -1290,8 +1460,8 @@ pub unsafe fn _mm256_maskload_epi64( #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] -pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: i32x4, a: i32x4) { - maskstored(mem_addr as *mut i8, mask, a) +pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) { + maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4()) } /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr` @@ -1301,9 +1471,9 @@ pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: i32x4, a: i32x4) { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm256_maskstore_epi32( - mem_addr: *mut i32, mask: i32x8, a: i32x8 + mem_addr: *mut i32, mask: __m256i, a: __m256i ) { - maskstored256(mem_addr as *mut i8, mask, a) + maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8()) } /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr` @@ -1312,8 +1482,8 @@ pub unsafe fn _mm256_maskstore_epi32( #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] -pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: i64x2, a: i64x2) { - maskstoreq(mem_addr as *mut i8, mask, a) +pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) { + maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2()) } /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr` @@ -1323,9 +1493,9 @@ pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: i64x2, a: i64x2) { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm256_maskstore_epi64( - mem_addr: *mut i64, mask: i64x4, a: i64x4 + mem_addr: *mut i64, mask: __m256i, a: __m256i ) { - maskstoreq256(mem_addr as *mut i8, mask, a) + maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4()) } /// Compare packed 16-bit integers in `a` and `b`, and return the packed @@ -1333,8 +1503,8 @@ pub unsafe fn _mm256_maskstore_epi64( #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsw))] -pub unsafe fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 { - pmaxsw(a, b) +pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmaxsw(a.as_i16x16(), b.as_i16x16())) } /// Compare packed 32-bit integers in `a` and `b`, and return the packed @@ -1342,8 +1512,8 @@ pub unsafe fn _mm256_max_epi16(a: i16x16, b: i16x16) -> i16x16 { 
#[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsd))] -pub unsafe fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 { - pmaxsd(a, b) +pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmaxsd(a.as_i32x8(), b.as_i32x8())) } /// Compare packed 8-bit integers in `a` and `b`, and return the packed @@ -1351,8 +1521,8 @@ pub unsafe fn _mm256_max_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsb))] -pub unsafe fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 { - pmaxsb(a, b) +pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmaxsb(a.as_i8x32(), b.as_i8x32())) } /// Compare packed unsigned 16-bit integers in `a` and `b`, and return @@ -1360,8 +1530,8 @@ pub unsafe fn _mm256_max_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxuw))] -pub unsafe fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 { - pmaxuw(a, b) +pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmaxuw(a.as_u16x16(), b.as_u16x16())) } /// Compare packed unsigned 32-bit integers in `a` and `b`, and return @@ -1369,8 +1539,8 @@ pub unsafe fn _mm256_max_epu16(a: u16x16, b: u16x16) -> u16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxud))] -pub unsafe fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 { - pmaxud(a, b) +pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmaxud(a.as_u32x8(), b.as_u32x8())) } /// Compare packed unsigned 8-bit integers in `a` and `b`, and return @@ -1378,8 +1548,8 @@ pub unsafe fn _mm256_max_epu32(a: u32x8, b: u32x8) -> u32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxub))] -pub unsafe fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 { - pmaxub(a, b) +pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmaxub(a.as_u8x32(), b.as_u8x32())) } /// Compare packed 16-bit integers in `a` and `b`, and return the packed @@ -1387,8 +1557,8 @@ pub unsafe fn _mm256_max_epu8(a: u8x32, b: u8x32) -> u8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsw))] -pub unsafe fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 { - pminsw(a, b) +pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pminsw(a.as_i16x16(), b.as_i16x16())) } /// Compare packed 32-bit integers in `a` and `b`, and return the packed @@ -1396,8 +1566,8 @@ pub unsafe fn _mm256_min_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsd))] -pub unsafe fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 { - pminsd(a, b) +pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pminsd(a.as_i32x8(), b.as_i32x8())) } /// Compare packed 8-bit integers in `a` and `b`, and return the packed @@ -1405,8 +1575,8 @@ pub unsafe fn _mm256_min_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsb))] -pub unsafe fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 { - pminsb(a, b) +pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pminsb(a.as_i8x32(), b.as_i8x32())) } /// Compare packed unsigned 16-bit integers in `a` and `b`, and return @@ -1414,8 
+1584,8 @@ pub unsafe fn _mm256_min_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminuw))] -pub unsafe fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 { - pminuw(a, b) +pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pminuw(a.as_u16x16(), b.as_u16x16())) } /// Compare packed unsigned 32-bit integers in `a` and `b`, and return @@ -1423,8 +1593,8 @@ pub unsafe fn _mm256_min_epu16(a: u16x16, b: u16x16) -> u16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminud))] -pub unsafe fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 { - pminud(a, b) +pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pminud(a.as_u32x8(), b.as_u32x8())) } /// Compare packed unsigned 8-bit integers in `a` and `b`, and return @@ -1432,8 +1602,8 @@ pub unsafe fn _mm256_min_epu32(a: u32x8, b: u32x8) -> u32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminub))] -pub unsafe fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 { - pminub(a, b) +pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pminub(a.as_u8x32(), b.as_u8x32())) } /// Create mask from the most significant bit of each 8-bit element in `a`, @@ -1441,8 +1611,8 @@ pub unsafe fn _mm256_min_epu8(a: u8x32, b: u8x32) -> u8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovmskb))] -pub unsafe fn _mm256_movemask_epi8(a: i8x32) -> i32 { - pmovmskb(a) +pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 { + pmovmskb(a.as_i8x32()) } /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned @@ -1455,11 +1625,14 @@ pub unsafe fn _mm256_movemask_epi8(a: i8x32) -> i32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))] -pub unsafe fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { +pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i { + let a = a.as_u8x32(); + let b = b.as_u8x32(); macro_rules! 
call { ($imm8:expr) => (mpsadbw(a, b, $imm8)) } - constify_imm8!(imm8, call) + let r = constify_imm8!(imm8, call); + mem::transmute(r) } /// Multiply the low 32-bit integers from each packed 64-bit element in @@ -1469,8 +1642,8 @@ pub unsafe fn _mm256_mpsadbw_epu8(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmuldq))] -pub unsafe fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 { - pmuldq(a, b) +pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmuldq(a.as_i32x8(), b.as_i32x8())) } /// Multiply the low unsigned 32-bit integers from each packed 64-bit @@ -1480,8 +1653,8 @@ pub unsafe fn _mm256_mul_epi32(a: i32x8, b: i32x8) -> i64x4 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmuludq))] -pub unsafe fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 { - pmuludq(a, b) +pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmuludq(a.as_u32x8(), b.as_u32x8())) } /// Multiply the packed 16-bit integers in `a` and `b`, producing @@ -1490,8 +1663,8 @@ pub unsafe fn _mm256_mul_epu32(a: u32x8, b: u32x8) -> u64x4 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhw))] -pub unsafe fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 { - pmulhw(a, b) +pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmulhw(a.as_i16x16(), b.as_i16x16())) } /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing @@ -1500,8 +1673,8 @@ pub unsafe fn _mm256_mulhi_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhuw))] -pub unsafe fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 { - pmulhuw(a, b) +pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmulhuw(a.as_u16x16(), b.as_u16x16())) } /// Multiply the packed 16-bit integers in `a` and `b`, producing @@ -1510,8 +1683,8 @@ pub unsafe fn _mm256_mulhi_epu16(a: u16x16, b: u16x16) -> u16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmullw))] -pub unsafe fn _mm256_mullo_epi16(a: i16x16, b: i16x16) -> i16x16 { - a * b +pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) } /// Multiply the packed 32-bit integers in `a` and `b`, producing @@ -1520,8 +1693,8 @@ pub unsafe fn _mm256_mullo_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulld))] -pub unsafe fn _mm256_mullo_epi32(a: i32x8, b: i32x8) -> i32x8 { - a * b +pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) } /// Multiply packed 16-bit integers in `a` and `b`, producing @@ -1531,8 +1704,8 @@ pub unsafe fn _mm256_mullo_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhrsw))] -pub unsafe fn _mm256_mulhrs_epi16(a: i16x16, b: i16x16) -> i16x16 { - pmulhrsw(a, b) +pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) } /// Compute the bitwise OR of 256 bits (representing integer data) in `a` @@ -1541,7 +1714,7 @@ pub unsafe fn _mm256_mulhrs_epi16(a: i16x16, b: i16x16) -> i16x16 { #[target_feature(enable = "avx2")] 
#[cfg_attr(test, assert_instr(vorps))] pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { - __m256i::from(i8x32::from(a) | i8x32::from(b)) + mem::transmute(simd_or(a.as_i32x8(), b.as_i32x8())) } /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers @@ -1549,8 +1722,8 @@ pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpacksswb))] -pub unsafe fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 { - packsswb(a, b) +pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(packsswb(a.as_i16x16(), b.as_i16x16())) } /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -1558,8 +1731,8 @@ pub unsafe fn _mm256_packs_epi16(a: i16x16, b: i16x16) -> i8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackssdw))] -pub unsafe fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 { - packssdw(a, b) +pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(packssdw(a.as_i32x8(), b.as_i32x8())) } /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers @@ -1567,8 +1740,8 @@ pub unsafe fn _mm256_packs_epi32(a: i32x8, b: i32x8) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackuswb))] -pub unsafe fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 { - packuswb(a, b) +pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(packuswb(a.as_i16x16(), b.as_i16x16())) } /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers @@ -1576,8 +1749,8 @@ pub unsafe fn _mm256_packus_epi16(a: i16x16, b: i16x16) -> u8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackusdw))] -pub unsafe fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 { - packusdw(a, b) +pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(packusdw(a.as_i32x8(), b.as_i32x8())) } /// Permutes packed 32-bit integers from `a` according to the content of `b`. @@ -1587,19 +1760,21 @@ pub unsafe fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermd))] -pub unsafe fn _mm256_permutevar8x32_epi32(a: u32x8, b: u32x8) -> u32x8 { - permd(a, b) +pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(permd(a.as_u32x8(), b.as_u32x8())) } /// Permutes 64-bit integers from `a` using control mask `imm8`. #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermq, imm8 = 9))] -pub unsafe fn _mm256_permute4x64_epi64(a: i64x4, imm8: i32) -> i64x4 { +pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i { let imm8 = (imm8 & 0xFF) as u8; + let zero = _mm256_setzero_si256().as_i64x4(); + let a = a.as_i64x4(); macro_rules! permute4 { ($a:expr, $b:expr, $c:expr, $d:expr) => { - simd_shuffle4(a, i64x4::splat(0), [$a, $b, $c, $d]); + simd_shuffle4(a, zero, [$a, $b, $c, $d]); } } macro_rules! 
permute3 { @@ -1632,12 +1807,13 @@ pub unsafe fn _mm256_permute4x64_epi64(a: i64x4, imm8: i32) -> i64x4 { } } } - match imm8 & 0b11 { + let r: i64x4 = match imm8 & 0b11 { 0b00 => permute1!(0), 0b01 => permute1!(1), 0b10 => permute1!(2), _ => permute1!(3), - } + }; + mem::transmute(r) } /// Shuffle 128-bits of integer data selected by `imm8` from `a` and `b`. @@ -1647,12 +1823,14 @@ pub unsafe fn _mm256_permute4x64_epi64(a: i64x4, imm8: i32) -> i64x4 { pub unsafe fn _mm256_permute2x128_si256( a: __m256i, b: __m256i, imm8: i32 ) -> __m256i { + let a = a.as_i64x4(); + let b = b.as_i64x4(); macro_rules! call { ($imm8:expr) => { - __m256i::from(vperm2i128(i64x4::from(a), i64x4::from(b), $imm8)) + vperm2i128(a, b, $imm8) } } - constify_imm8!(imm8, call) + mem::transmute(constify_imm8!(imm8, call)) } /// Shuffle 64-bit floating-point elements in `a` across lanes using the @@ -1660,10 +1838,10 @@ pub unsafe fn _mm256_permute2x128_si256( #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))] -pub unsafe fn _mm256_permute4x64_pd(a: f64x4, imm8: i32) -> f64x4 { +pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d { use x86::i586::avx::_mm256_undefined_pd; let imm8 = (imm8 & 0xFF) as u8; - let undef: f64x4 = mem::transmute(_mm256_undefined_pd()); + let undef = _mm256_undefined_pd(); macro_rules! shuffle_done { ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { simd_shuffle4(a, undef, [$x01, $x23, $x45, $x67]) @@ -1712,8 +1890,8 @@ pub unsafe fn _mm256_permute4x64_pd(a: f64x4, imm8: i32) -> f64x4 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermps))] -pub unsafe fn _mm256_permutevar8x32_ps(a: f32x8, idx: i32x8) -> f32x8 { - permps(a, idx) +pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { + permps(a, idx.as_i32x8()) } /// Compute the absolute differences of packed unsigned 8-bit integers in `a` @@ -1723,8 +1901,8 @@ pub unsafe fn _mm256_permutevar8x32_ps(a: f32x8, idx: i32x8) -> f32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsadbw))] -pub unsafe fn _mm256_sad_epu8(a: u8x32, b: u8x32) -> u64x4 { - psadbw(a, b) +pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(psadbw(a.as_u8x32(), b.as_u8x32())) } /// Shuffle bytes from `a` according to the content of `b`. 
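As a quick illustration of how the migrated permute signatures above read at a call site, here is a minimal usage sketch in the style of the crate's own doc examples; the wrapper function, its name, and the constants are illustrative assumptions, not part of this patch.

// Only call this after AVX2 has been detected at runtime, e.g. via
// `cfg_feature_enabled!("avx2")`, as the surrounding doc examples do.
#[target_feature(enable = "avx2")]
unsafe fn permute_demo() {
    use stdsimd::vendor::*;

    // vpermq selector 0b00_01_10_11 picks a[3], a[2], a[1], a[0] for
    // result lanes 0..3, i.e. it reverses the four 64-bit elements.
    let a = _mm256_setr_epi64x(10, 20, 30, 40);
    let r = _mm256_permute4x64_epi64(a, 0b00_01_10_11);
    assert_eq!(r, _mm256_setr_epi64x(40, 30, 20, 10));
}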
@@ -1759,8 +1937,8 @@ pub unsafe fn _mm256_sad_epu8(a: u8x32, b: u8x32) -> u64x4 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufb))] -pub unsafe fn _mm256_shuffle_epi8(a: u8x32, b: u8x32) -> u8x32 { - pshufb(a, b) +pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(pshufb(a.as_u8x32(), b.as_u8x32())) } /// Shuffle 32-bit integers in 128-bit lanes of `a` using the control in @@ -1776,22 +1954,18 @@ pub unsafe fn _mm256_shuffle_epi8(a: u8x32, b: u8x32) -> u8x32 { /// # if cfg_feature_enabled!("avx2") { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { -/// use stdsimd::simd::i32x8; -/// use stdsimd::vendor::_mm256_shuffle_epi32; +/// use stdsimd::vendor::*; /// -/// let a = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7); +/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); /// /// let shuffle1 = 0b00_11_10_01; /// let shuffle2 = 0b01_00_10_11; /// -/// let c1: i32x8; let c2: i32x8; -/// unsafe { -/// c1 = _mm256_shuffle_epi32(a, shuffle1); -/// c2 = _mm256_shuffle_epi32(a, shuffle2); -/// } +/// let c1 = _mm256_shuffle_epi32(a, shuffle1); +/// let c2 = _mm256_shuffle_epi32(a, shuffle2); /// -/// let expected1 = i32x8::new(1, 2, 3, 0, 5, 6, 7, 4); -/// let expected2 = i32x8::new(3, 2, 0, 1, 7, 6, 4, 5); +/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4); +/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5); /// /// assert_eq!(c1, expected1); /// assert_eq!(c2, expected2); @@ -1803,7 +1977,7 @@ pub unsafe fn _mm256_shuffle_epi8(a: u8x32, b: u8x32) -> u8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -pub unsafe fn _mm256_shuffle_epi32(a: i32x8, imm8: i32) -> i32x8 { +pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i { // simd_shuffleX requires that its selector parameter be made up of // constant values, but we can't enforce that here. In spirit, we need // to write a `match` on all possible values of a byte, and for each value, @@ -1813,6 +1987,7 @@ pub unsafe fn _mm256_shuffle_epi32(a: i32x8, imm8: i32) -> i32x8 { // Of course, that's... awful. So we try to use macros to do it for us. let imm8 = (imm8 & 0xFF) as u8; + let a = a.as_i32x8(); macro_rules! shuffle_done { ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { simd_shuffle8(a, a, [$x01, $x23, $x45, $x67, 4+$x01, 4+$x23, 4+$x45, 4+$x67]) @@ -1848,12 +2023,13 @@ pub unsafe fn _mm256_shuffle_epi32(a: i32x8, imm8: i32) -> i32x8 { } } } - match imm8 & 0b11 { + let r: i32x8 = match imm8 & 0b11 { 0b00 => shuffle_x23!(0), 0b01 => shuffle_x23!(1), 0b10 => shuffle_x23!(2), _ => shuffle_x23!(3), - } + }; + mem::transmute(r) } /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using @@ -1862,8 +2038,9 @@ pub unsafe fn _mm256_shuffle_epi32(a: i32x8, imm8: i32) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))] -pub unsafe fn _mm256_shufflehi_epi16(a: i16x16, imm8: i32) -> i16x16 { +pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i { let imm8 = (imm8 & 0xFF) as u8; + let a = a.as_i16x16(); macro_rules! 
shuffle_done { ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { #[cfg_attr(rustfmt, rustfmt_skip)] @@ -1903,12 +2080,13 @@ pub unsafe fn _mm256_shufflehi_epi16(a: i16x16, imm8: i32) -> i16x16 { } } } - match imm8 & 0b11 { + let r: i16x16 = match imm8 & 0b11 { 0b00 => shuffle_x23!(0), 0b01 => shuffle_x23!(1), 0b10 => shuffle_x23!(2), _ => shuffle_x23!(3), - } + }; + mem::transmute(r) } /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using @@ -1917,8 +2095,9 @@ pub unsafe fn _mm256_shufflehi_epi16(a: i16x16, imm8: i32) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))] -pub unsafe fn _mm256_shufflelo_epi16(a: i16x16, imm8: i32) -> i16x16 { +pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i { let imm8 = (imm8 & 0xFF) as u8; + let a = a.as_i16x16(); macro_rules! shuffle_done { ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { #[cfg_attr(rustfmt, rustfmt_skip)] @@ -1958,12 +2137,13 @@ pub unsafe fn _mm256_shufflelo_epi16(a: i16x16, imm8: i32) -> i16x16 { } } } - match imm8 & 0b11 { + let r: i16x16 = match imm8 & 0b11 { 0b00 => shuffle_x23!(0), 0b01 => shuffle_x23!(1), 0b10 => shuffle_x23!(2), _ => shuffle_x23!(3), - } + }; + mem::transmute(r) } /// Negate packed 16-bit integers in `a` when the corresponding signed @@ -1972,8 +2152,8 @@ pub unsafe fn _mm256_shufflelo_epi16(a: i16x16, imm8: i32) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignw))] -pub unsafe fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 { - psignw(a, b) +pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(psignw(a.as_i16x16(), b.as_i16x16())) } /// Negate packed 32-bit integers in `a` when the corresponding signed @@ -1982,8 +2162,8 @@ pub unsafe fn _mm256_sign_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignd))] -pub unsafe fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 { - psignd(a, b) +pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(psignd(a.as_i32x8(), b.as_i32x8())) } /// Negate packed 8-bit integers in `a` when the corresponding signed @@ -1992,8 +2172,8 @@ pub unsafe fn _mm256_sign_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignb))] -pub unsafe fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 { - psignb(a, b) +pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(psignb(a.as_i8x32(), b.as_i8x32())) } /// Shift packed 16-bit integers in `a` left by `count` while @@ -2001,8 +2181,8 @@ pub unsafe fn _mm256_sign_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllw))] -pub unsafe fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 { - psllw(a, count) +pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { + mem::transmute(psllw(a.as_i16x16(), count.as_i16x8())) } /// Shift packed 32-bit integers in `a` left by `count` while @@ -2010,8 +2190,8 @@ pub unsafe fn _mm256_sll_epi16(a: i16x16, count: i16x8) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslld))] -pub unsafe fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 { - pslld(a, count) +pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { + mem::transmute(pslld(a.as_i32x8(), 
count.as_i32x4())) } /// Shift packed 64-bit integers in `a` left by `count` while @@ -2019,8 +2199,8 @@ pub unsafe fn _mm256_sll_epi32(a: i32x8, count: i32x4) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllq))] -pub unsafe fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 { - psllq(a, count) +pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { + mem::transmute(psllq(a.as_i64x4(), count.as_i64x2())) } /// Shift packed 16-bit integers in `a` left by `imm8` while @@ -2028,8 +2208,8 @@ pub unsafe fn _mm256_sll_epi64(a: i64x4, count: i64x2) -> i64x4 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllw))] -pub unsafe fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 { - pslliw(a, imm8) +pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i { + mem::transmute(pslliw(a.as_i16x16(), imm8)) } /// Shift packed 32-bit integers in `a` left by `imm8` while @@ -2037,8 +2217,8 @@ pub unsafe fn _mm256_slli_epi16(a: i16x16, imm8: i32) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslld))] -pub unsafe fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 { - psllid(a, imm8) +pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i { + mem::transmute(psllid(a.as_i32x8(), imm8)) } /// Shift packed 64-bit integers in `a` left by `imm8` while @@ -2046,8 +2226,8 @@ pub unsafe fn _mm256_slli_epi32(a: i32x8, imm8: i32) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllq))] -pub unsafe fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 { - pslliq(a, imm8) +pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i { + mem::transmute(pslliq(a.as_i64x4(), imm8)) } /// Shift 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. @@ -2055,12 +2235,13 @@ pub unsafe fn _mm256_slli_epi64(a: i64x4, imm8: i32) -> i64x4 { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))] pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i { + let a = a.as_i64x4(); macro_rules! call { ($imm8:expr) => { vpslldq(a, $imm8) } } - constify_imm8!(imm8 * 8, call) + mem::transmute(constify_imm8!(imm8 * 8, call)) } /// Shift 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
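For the shift family above, the migrated signatures take and return `__m256i` throughout, with the shift count coming either from an immediate or from the low 64 bits of an `__m128i`. A minimal sketch, again in the style of the doc examples; the wrapper function and the values are illustrative assumptions, not part of this patch.

#[target_feature(enable = "avx2")]
unsafe fn shift_demo() {
    use stdsimd::vendor::*;

    // _mm256_slli_epi32 shifts every 32-bit element left by the immediate,
    // shifting in zeros: each element here is multiplied by 2^4 = 16.
    let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
    let r = _mm256_slli_epi32(a, 4);
    assert_eq!(r, _mm256_setr_epi32(16, 32, 48, 64, 80, 96, 112, 128));

    // _mm256_sll_epi16 reads its count from the low 64 bits of an __m128i;
    // a count of 2 shifts every 16-bit element left by two bits.
    let count = _mm_setr_epi32(2, 0, 0, 0);
    let b = _mm256_set1_epi16(3);
    let s = _mm256_sll_epi16(b, count);
    assert_eq!(s, _mm256_set1_epi16(12));
}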
@@ -2077,8 +2258,8 @@ pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvd))] -pub unsafe fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 { - psllvd(a, count) +pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { + mem::transmute(psllvd(a.as_i32x4(), count.as_i32x4())) } /// Shift packed 32-bit integers in `a` left by the amount @@ -2087,8 +2268,8 @@ pub unsafe fn _mm_sllv_epi32(a: i32x4, count: i32x4) -> i32x4 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvd))] -pub unsafe fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 { - psllvd256(a, count) +pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { + mem::transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) } /// Shift packed 64-bit integers in `a` left by the amount @@ -2097,8 +2278,8 @@ pub unsafe fn _mm256_sllv_epi32(a: i32x8, count: i32x8) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvq))] -pub unsafe fn _mm_sllv_epi64(a: i64x2, count: i64x2) -> i64x2 { - psllvq(a, count) +pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { + mem::transmute(psllvq(a.as_i64x2(), count.as_i64x2())) } /// Shift packed 64-bit integers in `a` left by the amount @@ -2107,8 +2288,8 @@ pub unsafe fn _mm_sllv_epi64(a: i64x2, count: i64x2) -> i64x2 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvq))] -pub unsafe fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 { - psllvq256(a, count) +pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { + mem::transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) } /// Shift packed 16-bit integers in `a` right by `count` while @@ -2116,8 +2297,8 @@ pub unsafe fn _mm256_sllv_epi64(a: i64x4, count: i64x4) -> i64x4 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsraw))] -pub unsafe fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 { - psraw(a, count) +pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { + mem::transmute(psraw(a.as_i16x16(), count.as_i16x8())) } /// Shift packed 32-bit integers in `a` right by `count` while @@ -2125,8 +2306,8 @@ pub unsafe fn _mm256_sra_epi16(a: i16x16, count: i16x8) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrad))] -pub unsafe fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 { - psrad(a, count) +pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { + mem::transmute(psrad(a.as_i32x8(), count.as_i32x4())) } /// Shift packed 16-bit integers in `a` right by `imm8` while @@ -2134,8 +2315,8 @@ pub unsafe fn _mm256_sra_epi32(a: i32x8, count: i32x4) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsraw))] -pub unsafe fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 { - psraiw(a, imm8) +pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i { + mem::transmute(psraiw(a.as_i16x16(), imm8)) } /// Shift packed 32-bit integers in `a` right by `imm8` while @@ -2143,8 +2324,8 @@ pub unsafe fn _mm256_srai_epi16(a: i16x16, imm8: i32) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrad))] -pub unsafe fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 { - psraid(a, imm8) +pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i { + 
mem::transmute(psraid(a.as_i32x8(), imm8)) } /// Shift packed 32-bit integers in `a` right by the amount specified by the @@ -2152,8 +2333,8 @@ pub unsafe fn _mm256_srai_epi32(a: i32x8, imm8: i32) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsravd))] -pub unsafe fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 { - psravd(a, count) +pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { + mem::transmute(psravd(a.as_i32x4(), count.as_i32x4())) } /// Shift packed 32-bit integers in `a` right by the amount specified by the @@ -2161,8 +2342,8 @@ pub unsafe fn _mm_srav_epi32(a: i32x4, count: i32x4) -> i32x4 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsravd))] -pub unsafe fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 { - psravd256(a, count) +pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { + mem::transmute(psravd256(a.as_i32x8(), count.as_i32x8())) } /// Shift 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. @@ -2170,12 +2351,13 @@ pub unsafe fn _mm256_srav_epi32(a: i32x8, count: i32x8) -> i32x8 { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))] pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i { + let a = a.as_i64x4(); macro_rules! call { ($imm8:expr) => { vpsrldq(a, $imm8) } } - constify_imm8!(imm8 * 8, call) + mem::transmute(constify_imm8!(imm8 * 8, call)) } /// Shift 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. @@ -2191,8 +2373,8 @@ pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlw))] -pub unsafe fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 { - psrlw(a, count) +pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { + mem::transmute(psrlw(a.as_i16x16(), count.as_i16x8())) } /// Shift packed 32-bit integers in `a` right by `count` while shifting in @@ -2200,8 +2382,8 @@ pub unsafe fn _mm256_srl_epi16(a: i16x16, count: i16x8) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrld))] -pub unsafe fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 { - psrld(a, count) +pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { + mem::transmute(psrld(a.as_i32x8(), count.as_i32x4())) } /// Shift packed 64-bit integers in `a` right by `count` while shifting in @@ -2209,8 +2391,8 @@ pub unsafe fn _mm256_srl_epi32(a: i32x8, count: i32x4) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlq))] -pub unsafe fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 { - psrlq(a, count) +pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { + mem::transmute(psrlq(a.as_i64x4(), count.as_i64x2())) } /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in @@ -2218,8 +2400,8 @@ pub unsafe fn _mm256_srl_epi64(a: i64x4, count: i64x2) -> i64x4 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlw))] -pub unsafe fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 { - psrliw(a, imm8) +pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i { + mem::transmute(psrliw(a.as_i16x16(), imm8)) } /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in @@ -2227,8 +2409,8 @@ pub unsafe fn _mm256_srli_epi16(a: i16x16, imm8: i32) -> i16x16 { #[inline(always)] 
#[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrld))] -pub unsafe fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 { - psrlid(a, imm8) +pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i { + mem::transmute(psrlid(a.as_i32x8(), imm8)) } /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in @@ -2236,8 +2418,8 @@ pub unsafe fn _mm256_srli_epi32(a: i32x8, imm8: i32) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlq))] -pub unsafe fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 { - psrliq(a, imm8) +pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i { + mem::transmute(psrliq(a.as_i64x4(), imm8)) } /// Shift packed 32-bit integers in `a` right by the amount specified by @@ -2245,8 +2427,8 @@ pub unsafe fn _mm256_srli_epi64(a: i64x4, imm8: i32) -> i64x4 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvd))] -pub unsafe fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 { - psrlvd(a, count) +pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { + mem::transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) } /// Shift packed 32-bit integers in `a` right by the amount specified by @@ -2254,8 +2436,8 @@ pub unsafe fn _mm_srlv_epi32(a: i32x4, count: i32x4) -> i32x4 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvd))] -pub unsafe fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 { - psrlvd256(a, count) +pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { + mem::transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) } /// Shift packed 64-bit integers in `a` right by the amount specified by @@ -2263,8 +2445,8 @@ pub unsafe fn _mm256_srlv_epi32(a: i32x8, count: i32x8) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvq))] -pub unsafe fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 { - psrlvq(a, count) +pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { + mem::transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) } /// Shift packed 64-bit integers in `a` right by the amount specified by @@ -2272,8 +2454,8 @@ pub unsafe fn _mm_srlv_epi64(a: i64x2, count: i64x2) -> i64x2 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvq))] -pub unsafe fn _mm256_srlv_epi64(a: i64x4, count: i64x4) -> i64x4 { - psrlvq256(a, count) +pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { + mem::transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) } // TODO _mm256_stream_load_si256 (__m256i const* mem_addr) @@ -2282,32 +2464,32 @@ pub unsafe fn _mm256_srlv_epi64(a: i64x4, count: i64x4) -> i64x4 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubw))] -pub unsafe fn _mm256_sub_epi16(a: i16x16, b: i16x16) -> i16x16 { - a - b +pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) } /// Subtract packed 32-bit integers in `b` from packed 16-bit integers in `a` #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubd))] -pub unsafe fn _mm256_sub_epi32(a: i32x8, b: i32x8) -> i32x8 { - a - b +pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) } /// Subtract packed 64-bit integers in `b` from packed 16-bit integers in `a` #[inline(always)] #[target_feature(enable 
= "avx2")] #[cfg_attr(test, assert_instr(vpsubq))] -pub unsafe fn _mm256_sub_epi64(a: i64x4, b: i64x4) -> i64x4 { - a - b +pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) } /// Subtract packed 8-bit integers in `b` from packed 16-bit integers in `a` #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubb))] -pub unsafe fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 { - a - b +pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in @@ -2315,8 +2497,8 @@ pub unsafe fn _mm256_sub_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubsw))] -pub unsafe fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 { - psubsw(a, b) +pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(psubsw(a.as_i16x16(), b.as_i16x16())) } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in @@ -2324,8 +2506,8 @@ pub unsafe fn _mm256_subs_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubsb))] -pub unsafe fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 { - psubsb(a, b) +pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(psubsb(a.as_i8x32(), b.as_i8x32())) } /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit @@ -2333,8 +2515,8 @@ pub unsafe fn _mm256_subs_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubusw))] -pub unsafe fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 { - psubusw(a, b) +pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(psubusw(a.as_u16x16(), b.as_u16x16())) } /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit @@ -2342,8 +2524,8 @@ pub unsafe fn _mm256_subs_epu16(a: u16x16, b: u16x16) -> u16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubusb))] -pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 { - psubusb(a, b) +pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { + mem::transmute(psubusb(a.as_u8x32(), b.as_u8x32())) } /// Unpack and interleave 8-bit integers from the high half of each @@ -2359,20 +2541,16 @@ pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 { /// # if cfg_feature_enabled!("avx2") { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { -/// use stdsimd::simd::i8x32; -/// use stdsimd::vendor::_mm256_unpackhi_epi8; +/// use stdsimd::vendor::*; /// -/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +/// let a = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /// 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); -/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15, +/// let b = _mm256_setr_epi8(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15, /// -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31); /// -/// let c: i8x32; -/// unsafe { -/// c = _mm256_unpackhi_epi8(a, b); -/// } +/// let c = _mm256_unpackhi_epi8(a, b); /// -/// let expected = i8x32::new(8,-8, 9,-9, 10,-10, 11,-11, 12,-12, 13,-13, +/// let expected = _mm256_setr_epi8(8,-8, 9,-9, 10,-10, 11,-11, 
12,-12, 13,-13, /// 14,-14, 15,-15, 24,-24, 25,-25, 26,-26, 27,-27, 28,-28, 29,-29, 30,-30, /// 31,-31); /// assert_eq!(c, expected); @@ -2385,14 +2563,15 @@ pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhbw))] -pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 { +pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(rustfmt, rustfmt_skip)] - simd_shuffle32(a, b, [ + let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, - ]) + ]); + mem::transmute(r) } /// Unpack and interleave 8-bit integers from the low half of each @@ -2408,20 +2587,16 @@ pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 { /// # if cfg_feature_enabled!("avx2") { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { -/// use stdsimd::simd::i8x32; -/// use stdsimd::vendor::_mm256_unpacklo_epi8; +/// use stdsimd::vendor::*; /// -/// let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +/// let a = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /// 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); -/// let b = i8x32::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15, +/// let b = _mm256_setr_epi8(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15, /// -16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31); /// -/// let c: i8x32; -/// unsafe { -/// c = _mm256_unpacklo_epi8(a, b); -/// } +/// let c = _mm256_unpacklo_epi8(a, b); /// -/// let expected = i8x32::new(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7, +/// let expected = _mm256_setr_epi8(0, 0, 1,-1, 2,-2, 3,-3, 4,-4, 5,-5, 6,-6, 7,-7, /// 16,-16, 17,-17, 18,-18, 19,-19, 20,-20, 21,-21, 22,-22, 23,-23); /// assert_eq!(c, expected); /// @@ -2433,14 +2608,15 @@ pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklbw))] -pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 { +pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(rustfmt, rustfmt_skip)] - simd_shuffle32(a, b, [ + let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, - ]) + ]); + mem::transmute(r) } /// Unpack and interleave 16-bit integers from the high half of each @@ -2456,18 +2632,14 @@ pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 { /// # if cfg_feature_enabled!("avx2") { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { -/// use stdsimd::simd::i16x16; -/// use stdsimd::vendor::_mm256_unpackhi_epi16; +/// use stdsimd::vendor::*; /// -/// let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -/// let b = i16x16::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15); +/// let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +/// let b = _mm256_setr_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15); /// -/// let c: i16x16; -/// unsafe { -/// c = _mm256_unpackhi_epi16(a, b); -/// } +/// let c = _mm256_unpackhi_epi16(a, b); /// -/// let expected = i16x16::new(4,-4, 5,-5, 6,-6, 7,-7, 12,-12, 13,-13, 14,-14, +/// let expected = _mm256_setr_epi16(4,-4, 5,-5, 6,-6, 
7,-7, 12,-12, 13,-13, 14,-14, /// 15,-15); /// assert_eq!(c, expected); /// @@ -2479,12 +2651,13 @@ pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhwd))] -pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 { - simd_shuffle16( - a, - b, +pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { + let r: i16x16 = simd_shuffle16( + a.as_i16x16(), + b.as_i16x16(), [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], - ) + ); + mem::transmute(r) } /// Unpack and interleave 16-bit integers from the low half of each @@ -2500,18 +2673,14 @@ pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 { /// # if cfg_feature_enabled!("avx2") { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { -/// use stdsimd::simd::i16x16; -/// use stdsimd::vendor::_mm256_unpacklo_epi16; +/// use stdsimd::vendor::*; /// -/// let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -/// let b = i16x16::new(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15); +/// let a = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +/// let b = _mm256_setr_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15); /// -/// let c: i16x16; -/// unsafe { -/// c = _mm256_unpacklo_epi16(a, b); -/// } +/// let c = _mm256_unpacklo_epi16(a, b); /// -/// let expected = i16x16::new(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10, +/// let expected = _mm256_setr_epi16(0, 0, 1,-1, 2,-2, 3,-3, 8,-8, 9,-9, 10,-10, /// 11,-11); /// assert_eq!(c, expected); /// @@ -2523,12 +2692,13 @@ pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklwd))] -pub unsafe fn _mm256_unpacklo_epi16(a: i16x16, b: i16x16) -> i16x16 { - simd_shuffle16( - a, - b, +pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { + let r: i16x16 = simd_shuffle16( + a.as_i16x16(), + b.as_i16x16(), [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], - ) + ); + mem::transmute(r) } /// Unpack and interleave 32-bit integers from the high half of each @@ -2544,18 +2714,14 @@ pub unsafe fn _mm256_unpacklo_epi16(a: i16x16, b: i16x16) -> i16x16 { /// # if cfg_feature_enabled!("avx2") { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { -/// use stdsimd::simd::i32x8; -/// use stdsimd::vendor::_mm256_unpackhi_epi32; +/// use stdsimd::vendor::*; /// -/// let a = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7); -/// let b = i32x8::new(0,-1,-2,-3,-4,-5,-6,-7); +/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); +/// let b = _mm256_setr_epi32(0,-1,-2,-3,-4,-5,-6,-7); /// -/// let c: i32x8; -/// unsafe { -/// c = _mm256_unpackhi_epi32(a, b); -/// } +/// let c = _mm256_unpackhi_epi32(a, b); /// -/// let expected = i32x8::new(2,-2, 3,-3, 6,-6, 7,-7); +/// let expected = _mm256_setr_epi32(2,-2, 3,-3, 6,-6, 7,-7); /// assert_eq!(c, expected); /// /// # } @@ -2566,8 +2732,9 @@ pub unsafe fn _mm256_unpacklo_epi16(a: i16x16, b: i16x16) -> i16x16 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhdq))] -pub unsafe fn _mm256_unpackhi_epi32(a: i32x8, b: i32x8) -> i32x8 { - simd_shuffle8(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) +pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { + let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]); + mem::transmute(r) } /// 
Unpack and interleave 32-bit integers from the low half of each @@ -2583,18 +2750,14 @@ pub unsafe fn _mm256_unpackhi_epi32(a: i32x8, b: i32x8) -> i32x8 { /// # if cfg_feature_enabled!("avx2") { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { -/// use stdsimd::simd::i32x8; -/// use stdsimd::vendor::_mm256_unpacklo_epi32; +/// use stdsimd::vendor::*; /// -/// let a = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7); -/// let b = i32x8::new(0,-1,-2,-3,-4,-5,-6,-7); +/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); +/// let b = _mm256_setr_epi32(0,-1,-2,-3,-4,-5,-6,-7); /// -/// let c: i32x8; -/// unsafe { -/// c = _mm256_unpacklo_epi32(a, b); -/// } +/// let c = _mm256_unpacklo_epi32(a, b); /// -/// let expected = i32x8::new(0, 0, 1,-1, 4,-4, 5,-5); +/// let expected = _mm256_setr_epi32(0, 0, 1,-1, 4,-4, 5,-5); /// assert_eq!(c, expected); /// /// # } @@ -2605,8 +2768,9 @@ pub unsafe fn _mm256_unpackhi_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckldq))] -pub unsafe fn _mm256_unpacklo_epi32(a: i32x8, b: i32x8) -> i32x8 { - simd_shuffle8(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) +pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { + let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); + mem::transmute(r) } /// Unpack and interleave 64-bit integers from the high half of each @@ -2622,18 +2786,14 @@ pub unsafe fn _mm256_unpacklo_epi32(a: i32x8, b: i32x8) -> i32x8 { /// # if cfg_feature_enabled!("avx2") { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { -/// use stdsimd::simd::i64x4; -/// use stdsimd::vendor::_mm256_unpackhi_epi64; +/// use stdsimd::vendor::*; /// -/// let a = i64x4::new(0, 1, 2, 3); -/// let b = i64x4::new(0,-1,-2,-3); +/// let a = _mm256_setr_epi64x(0, 1, 2, 3); +/// let b = _mm256_setr_epi64x(0,-1,-2,-3); /// -/// let c: i64x4; -/// unsafe { -/// c = _mm256_unpackhi_epi64(a, b); -/// } +/// let c = _mm256_unpackhi_epi64(a, b); /// -/// let expected = i64x4::new(1,-1, 3,-3); +/// let expected = _mm256_setr_epi64x(1,-1, 3,-3); /// assert_eq!(c, expected); /// /// # } @@ -2644,8 +2804,9 @@ pub unsafe fn _mm256_unpacklo_epi32(a: i32x8, b: i32x8) -> i32x8 { #[inline(always)] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhqdq))] -pub unsafe fn _mm256_unpackhi_epi64(a: i64x4, b: i64x4) -> i64x4 { - simd_shuffle4(a, b, [1, 5, 3, 7]) +pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { + let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); + mem::transmute(r) } /// Unpack and interleave 64-bit integers from the low half of each @@ -2661,18 +2822,14 @@ pub unsafe fn _mm256_unpackhi_epi64(a: i64x4, b: i64x4) -> i64x4 { /// # if cfg_feature_enabled!("avx2") { /// # #[target_feature(enable = "avx2")] /// # unsafe fn worker() { -/// use stdsimd::simd::i64x4; -/// use stdsimd::vendor::_mm256_unpacklo_epi64; +/// use stdsimd::vendor::*; /// -/// let a = i64x4::new(0, 1, 2, 3); -/// let b = i64x4::new(0,-1,-2,-3); +/// let a = _mm256_setr_epi64x(0, 1, 2, 3); +/// let b = _mm256_setr_epi64x(0,-1,-2,-3); /// -/// let c: i64x4; -/// unsafe { -/// c = _mm256_unpacklo_epi64(a, b); -/// } +/// let c = _mm256_unpacklo_epi64(a, b); /// -/// let expected = i64x4::new(0, 0, 2,-2); +/// let expected = _mm256_setr_epi64x(0, 0, 2,-2); /// assert_eq!(c, expected); /// /// # } @@ -2683,8 +2840,9 @@ pub unsafe fn _mm256_unpackhi_epi64(a: i64x4, b: i64x4) -> i64x4 { #[inline(always)] 
#[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub unsafe fn _mm256_unpacklo_epi64(a: i64x4, b: i64x4) -> i64x4 { - simd_shuffle4(a, b, [0, 4, 2, 6]) +pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { + let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); + mem::transmute(r) } /// Compute the bitwise XOR of 256 bits (representing integer data) @@ -2693,7 +2851,7 @@ pub unsafe fn _mm256_unpacklo_epi64(a: i64x4, b: i64x4) -> i64x4 { #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { - __m256i::from(i8x32::from(a) ^ i8x32::from(b)) + mem::transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) } /// Extract an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit @@ -2703,9 +2861,9 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { #[inline(always)] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. -pub unsafe fn _mm256_extract_epi8(a: i8x32, imm8: i32) -> i8 { +pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 { let imm8 = (imm8 & 31) as u32; - a.extract_unchecked(imm8) + simd_extract(a.as_i8x32(), imm8) } /// Extract a 16-bit integer from `a`, selected with `imm8`. Returns a 32-bit @@ -2715,43 +2873,43 @@ pub unsafe fn _mm256_extract_epi8(a: i8x32, imm8: i32) -> i8 { #[inline(always)] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. -pub unsafe fn _mm256_extract_epi16(a: i16x16, imm8: i32) -> i16 { +pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 { let imm8 = (imm8 & 15) as u32; - a.extract_unchecked(imm8) + simd_extract(a.as_i16x16(), imm8) } /// Extract a 32-bit integer from `a`, selected with `imm8`. #[inline(always)] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. -pub unsafe fn _mm256_extract_epi32(a: i32x8, imm8: i32) -> i32 { +pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 { let imm8 = (imm8 & 7) as u32; - a.extract_unchecked(imm8) + simd_extract(a.as_i32x8(), imm8) } /// Extract a 64-bit integer from `a`, selected with `imm8`. #[inline(always)] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. -pub unsafe fn _mm256_extract_epi64(a: i64x4, imm8: i32) -> i64 { +pub unsafe fn _mm256_extract_epi64(a: __m256i, imm8: i32) -> i64 { let imm8 = (imm8 & 3) as u32; - a.extract_unchecked(imm8) + simd_extract(a.as_i64x4(), imm8) } /// Returns the first element of the input vector of [4 x double]. #[inline(always)] #[target_feature(enable = "avx2")] //#[cfg_attr(test, assert_instr(movsd))] FIXME -pub unsafe fn _mm256_cvtsd_f64(a: f64x4) -> f64 { - a.extract(0) +pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 { + simd_extract(a, 0) } /// Returns the first element of the input vector of [8 x i32]. 
#[inline(always)] #[target_feature(enable = "avx2")] //#[cfg_attr(test, assert_instr(movd))] FIXME -pub unsafe fn _mm256_cvtsi256_si32(a: i32x8) -> i32 { - a.extract(0) +pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { + simd_extract(a.as_i32x8(), 0) } #[allow(improper_ctypes)] @@ -2927,7 +3085,7 @@ extern "C" { #[link_name = "llvm.x86.avx2.permd"] fn permd(a: u32x8, b: u32x8) -> u32x8; #[link_name = "llvm.x86.avx2.permps"] - fn permps(a: f32x8, b: i32x8) -> f32x8; + fn permps(a: __m256, b: i32x8) -> __m256; #[link_name = "llvm.x86.avx2.vperm2i128"] fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4; #[link_name = "llvm.x86.avx2.gather.d.d"] @@ -2964,40 +3122,40 @@ extern "C" { ) -> i64x4; #[link_name = "llvm.x86.avx2.gather.d.pd"] fn pgatherdpd( - src: f64x2, slice: *const i8, offsets: i32x4, mask: f64x2, scale: i8 - ) -> f64x2; + src: __m128d, slice: *const i8, offsets: i32x4, mask: __m128d, scale: i8 + ) -> __m128d; #[link_name = "llvm.x86.avx2.gather.d.pd.256"] fn vpgatherdpd( - src: f64x4, slice: *const i8, offsets: i32x4, mask: f64x4, scale: i8 - ) -> f64x4; + src: __m256d, slice: *const i8, offsets: i32x4, mask: __m256d, scale: i8 + ) -> __m256d; #[link_name = "llvm.x86.avx2.gather.q.pd"] fn pgatherqpd( - src: f64x2, slice: *const i8, offsets: i64x2, mask: f64x2, scale: i8 - ) -> f64x2; + src: __m128d, slice: *const i8, offsets: i64x2, mask: __m128d, scale: i8 + ) -> __m128d; #[link_name = "llvm.x86.avx2.gather.q.pd.256"] fn vpgatherqpd( - src: f64x4, slice: *const i8, offsets: i64x4, mask: f64x4, scale: i8 - ) -> f64x4; + src: __m256d, slice: *const i8, offsets: i64x4, mask: __m256d, scale: i8 + ) -> __m256d; #[link_name = "llvm.x86.avx2.gather.d.ps"] fn pgatherdps( - src: f32x4, slice: *const i8, offsets: i32x4, mask: f32x4, scale: i8 - ) -> f32x4; + src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8 + ) -> __m128; #[link_name = "llvm.x86.avx2.gather.d.ps.256"] fn vpgatherdps( - src: f32x8, slice: *const i8, offsets: i32x8, mask: f32x8, scale: i8 - ) -> f32x8; + src: __m256, slice: *const i8, offsets: i32x8, mask: __m256, scale: i8 + ) -> __m256; #[link_name = "llvm.x86.avx2.gather.q.ps"] fn pgatherqps( - src: f32x4, slice: *const i8, offsets: i64x2, mask: f32x4, scale: i8 - ) -> f32x4; + src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8 + ) -> __m128; #[link_name = "llvm.x86.avx2.gather.q.ps.256"] fn vpgatherqps( - src: f32x4, slice: *const i8, offsets: i64x4, mask: f32x4, scale: i8 - ) -> f32x4; + src: __m128, slice: *const i8, offsets: i64x4, mask: __m128, scale: i8 + ) -> __m128; #[link_name = "llvm.x86.avx2.psll.dq"] - fn vpslldq(a: __m256i, b: i32) -> __m256i; + fn vpslldq(a: i64x4, b: i32) -> i64x4; #[link_name = "llvm.x86.avx2.psrl.dq"] - fn vpsrldq(a: __m256i, b: i32) -> __m256i; + fn vpsrldq(a: i64x4, b: i32) -> i64x4; } #[cfg(test)] @@ -3006,88 +3164,88 @@ mod tests { use v256::*; use v128::*; - use x86::i586::avx2; + use x86::*; use std; #[simd_test = "avx2"] - unsafe fn _mm256_abs_epi32() { + unsafe fn test_mm256_abs_epi32() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i32x8::new( + let a = _mm256_setr_epi32( 0, 1, -1, std::i32::MAX, std::i32::MIN, 100, -100, -32, ); - let r = avx2::_mm256_abs_epi32(a); + let r = _mm256_abs_epi32(a); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = u32x8::new( - 0, 1, 1, std::i32::MAX as u32, - std::i32::MAX as u32 + 1, 100, 100, 32, + let e = _mm256_setr_epi32( + 0, 1, 1, std::i32::MAX, + std::i32::MAX.wrapping_add(1), 100, 100, 32, ); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe 
fn _mm256_abs_epi16() { + unsafe fn test_mm256_abs_epi16() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i16x16::new( + let a = _mm256_setr_epi16( 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, std::i16::MAX, std::i16::MIN, 100, -100, -32, ); - let r = avx2::_mm256_abs_epi16(a); + let r = _mm256_abs_epi16(a); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = u16x16::new( + let e = _mm256_setr_epi16( 0, 1, 1, 2, 2, 3, 3, 4, - 4, 5, 5, std::i16::MAX as u16, std::i16::MAX as u16 + 1, 100, 100, 32, + 4, 5, 5, std::i16::MAX, std::i16::MAX.wrapping_add(1), 100, 100, 32, ); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_abs_epi8() { + unsafe fn test_mm256_abs_epi8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i8x32::new( + let a = _mm256_setr_epi8( 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, std::i8::MAX, std::i8::MIN, 100, -100, -32, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, std::i8::MAX, std::i8::MIN, 100, -100, -32, ); - let r = avx2::_mm256_abs_epi8(a); + let r = _mm256_abs_epi8(a); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = u8x32::new( + let e = _mm256_setr_epi8( 0, 1, 1, 2, 2, 3, 3, 4, - 4, 5, 5, std::i8::MAX as u8, std::i8::MAX as u8 + 1, 100, 100, 32, + 4, 5, 5, std::i8::MAX, std::i8::MAX.wrapping_add(1), 100, 100, 32, 0, 1, 1, 2, 2, 3, 3, 4, - 4, 5, 5, std::i8::MAX as u8, std::i8::MAX as u8 + 1, 100, 100, 32, + 4, 5, 5, std::i8::MAX, std::i8::MAX.wrapping_add(1), 100, 100, 32, ); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_add_epi64() { - let a = i64x4::new(-10, 0, 100, 1_000_000_000); - let b = i64x4::new(-1, 0, 1, 2); - let r = avx2::_mm256_add_epi64(a, b); - let e = i64x4::new(-11, 0, 101, 1_000_000_002); + unsafe fn test_mm256_add_epi64() { + let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000); + let b = _mm256_setr_epi64x(-1, 0, 1, 2); + let r = _mm256_add_epi64(a, b); + let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_add_epi32() { - let a = i32x8::new(-1, 0, 1, 2, 3, 4, 5, 6); - let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = avx2::_mm256_add_epi32(a, b); - let e = i32x8::new(0, 2, 4, 6, 8, 10, 12, 14); + unsafe fn test_mm256_add_epi32() { + let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6); + let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_add_epi32(a, b); + let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_add_epi16() { + unsafe fn test_mm256_add_epi16() { let a = - i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = - i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = avx2::_mm256_add_epi16(a, b); + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_add_epi16(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = i16x16::new( + let e = _mm256_setr_epi16( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, ); @@ -3095,24 +3253,24 @@ mod tests { } #[simd_test = "avx2"] - unsafe fn _mm256_add_epi8() { + unsafe fn test_mm256_add_epi8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i8x32::new( + let a = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ); #[cfg_attr(rustfmt, rustfmt_skip)] - let b = i8x32::new( + let b = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ); - 
let r = avx2::_mm256_add_epi8(a, b); + let r = _mm256_add_epi8(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = i8x32::new( + let e = _mm256_setr_epi8( 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, @@ -3122,24 +3280,24 @@ mod tests { } #[simd_test = "avx2"] - unsafe fn _mm256_adds_epi8() { + unsafe fn test_mm256_adds_epi8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i8x32::new( + let a = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ); #[cfg_attr(rustfmt, rustfmt_skip)] - let b = i8x32::new( + let b = _mm256_setr_epi8( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, ); - let r = avx2::_mm256_adds_epi8(a, b); + let r = _mm256_adds_epi8(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = i8x32::new( + let e = _mm256_setr_epi8( 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, @@ -3149,33 +3307,33 @@ mod tests { } #[simd_test = "avx2"] - unsafe fn _mm256_adds_epi8_saturate_positive() { - let a = i8x32::splat(0x7F); - let b = i8x32::splat(1); - let r = avx2::_mm256_adds_epi8(a, b); + unsafe fn test_mm256_adds_epi8_saturate_positive() { + let a = _mm256_set1_epi8(0x7F); + let b = _mm256_set1_epi8(1); + let r = _mm256_adds_epi8(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_adds_epi8_saturate_negative() { - let a = i8x32::splat(-0x80); - let b = i8x32::splat(-1); - let r = avx2::_mm256_adds_epi8(a, b); + unsafe fn test_mm256_adds_epi8_saturate_negative() { + let a = _mm256_set1_epi8(-0x80); + let b = _mm256_set1_epi8(-1); + let r = _mm256_adds_epi8(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_adds_epi16() { + unsafe fn test_mm256_adds_epi16() { let a = - i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); #[cfg_attr(rustfmt, rustfmt_skip)] - let b = i16x16::new( + let b = _mm256_setr_epi16( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, ); - let r = avx2::_mm256_adds_epi16(a, b); + let r = _mm256_adds_epi16(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = i16x16::new( + let e = _mm256_setr_epi16( 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, ); @@ -3184,40 +3342,40 @@ mod tests { } #[simd_test = "avx2"] - unsafe fn _mm256_adds_epi16_saturate_positive() { - let a = i16x16::splat(0x7FFF); - let b = i16x16::splat(1); - let r = avx2::_mm256_adds_epi16(a, b); + unsafe fn test_mm256_adds_epi16_saturate_positive() { + let a = _mm256_set1_epi16(0x7FFF); + let b = _mm256_set1_epi16(1); + let r = _mm256_adds_epi16(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_adds_epi16_saturate_negative() { - let a = i16x16::splat(-0x8000); - let b = i16x16::splat(-1); - let r = avx2::_mm256_adds_epi16(a, b); + unsafe fn test_mm256_adds_epi16_saturate_negative() { + let a = _mm256_set1_epi16(-0x8000); + let b = _mm256_set1_epi16(-1); + let r = _mm256_adds_epi16(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_adds_epu8() { + unsafe fn test_mm256_adds_epu8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = u8x32::new( + let a = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ); #[cfg_attr(rustfmt, rustfmt_skip)] - let b = u8x32::new( + let b = _mm256_setr_epi8( 32, 
33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, ); - let r = avx2::_mm256_adds_epu8(a, b); + let r = _mm256_adds_epu8(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = u8x32::new( + let e = _mm256_setr_epi8( 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, @@ -3227,25 +3385,25 @@ mod tests { } #[simd_test = "avx2"] - unsafe fn _mm256_adds_epu8_saturate() { - let a = u8x32::splat(0xFF); - let b = u8x32::splat(1); - let r = avx2::_mm256_adds_epu8(a, b); + unsafe fn test_mm256_adds_epu8_saturate() { + let a = _mm256_set1_epi8(!0); + let b = _mm256_set1_epi8(1); + let r = _mm256_adds_epu8(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_adds_epu16() { + unsafe fn test_mm256_adds_epu16() { let a = - u16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); #[cfg_attr(rustfmt, rustfmt_skip)] - let b = u16x16::new( + let b = _mm256_setr_epi16( 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, ); - let r = avx2::_mm256_adds_epu16(a, b); + let r = _mm256_adds_epu16(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = u16x16::new( + let e = _mm256_setr_epi16( 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, ); @@ -3254,152 +3412,152 @@ mod tests { } #[simd_test = "avx2"] - unsafe fn _mm256_adds_epu16_saturate() { - let a = u16x16::splat(0xFFFF); - let b = u16x16::splat(1); - let r = avx2::_mm256_adds_epu16(a, b); + unsafe fn test_mm256_adds_epu16_saturate() { + let a = _mm256_set1_epi16(!0); + let b = _mm256_set1_epi16(1); + let r = _mm256_adds_epu16(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_and_si256() { - let a = i8x32::splat(5); - let b = i8x32::splat(3); - let got = avx2::_mm256_and_si256(__m256i::from(a), __m256i::from(b)); - assert_eq!(got, __m256i::from(i8x32::splat(1))); + unsafe fn test_mm256_and_si256() { + let a = _mm256_set1_epi8(5); + let b = _mm256_set1_epi8(3); + let got = _mm256_and_si256(__m256i::from(a), __m256i::from(b)); + assert_eq!(got, __m256i::from(_mm256_set1_epi8(1))); } #[simd_test = "avx2"] - unsafe fn _mm256_andnot_si256() { - let a = i8x32::splat(5); - let b = i8x32::splat(3); + unsafe fn test_mm256_andnot_si256() { + let a = _mm256_set1_epi8(5); + let b = _mm256_set1_epi8(3); let got = - avx2::_mm256_andnot_si256(__m256i::from(a), __m256i::from(b)); - assert_eq!(got, __m256i::from(i8x32::splat(2))); + _mm256_andnot_si256(__m256i::from(a), __m256i::from(b)); + assert_eq!(got, __m256i::from(_mm256_set1_epi8(2))); } #[simd_test = "avx2"] - unsafe fn _mm256_avg_epu8() { - let (a, b) = (u8x32::splat(3), u8x32::splat(9)); - let r = avx2::_mm256_avg_epu8(a, b); - assert_eq!(r, u8x32::splat(6)); + unsafe fn test_mm256_avg_epu8() { + let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9)); + let r = _mm256_avg_epu8(a, b); + assert_eq!(r, _mm256_set1_epi8(6)); } #[simd_test = "avx2"] - unsafe fn _mm256_avg_epu16() { - let (a, b) = (u16x16::splat(3), u16x16::splat(9)); - let r = avx2::_mm256_avg_epu16(a, b); - assert_eq!(r, u16x16::splat(6)); + unsafe fn test_mm256_avg_epu16() { + let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9)); + let r = _mm256_avg_epu16(a, b); + assert_eq!(r, _mm256_set1_epi16(6)); } #[simd_test = "avx2"] - unsafe fn _mm_blend_epi32() { - let (a, b) = (i32x4::splat(3), i32x4::splat(9)); - let e = i32x4::splat(3).replace(0, 9); - let r = avx2::_mm_blend_epi32(a, b, 0x01 as 
i32); + unsafe fn test_mm_blend_epi32() { + let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9)); + let e = _mm_setr_epi32(9, 3, 3, 3); + let r = _mm_blend_epi32(a, b, 0x01 as i32); assert_eq!(r, e); - let r = avx2::_mm_blend_epi32(b, a, 0x0E as i32); + let r = _mm_blend_epi32(b, a, 0x0E as i32); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_blend_epi32() { - let (a, b) = (i32x8::splat(3), i32x8::splat(9)); - let e = i32x8::splat(3).replace(0, 9); - let r = avx2::_mm256_blend_epi32(a, b, 0x01 as i32); + unsafe fn test_mm256_blend_epi32() { + let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9)); + let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3); + let r = _mm256_blend_epi32(a, b, 0x01 as i32); assert_eq!(r, e); - let e = i32x8::splat(3).replace(1, 9).replace(7, 9); - let r = avx2::_mm256_blend_epi32(a, b, 0x82 as i32); + let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9); + let r = _mm256_blend_epi32(a, b, 0x82 as i32); assert_eq!(r, e); - let e = i32x8::splat(9).replace(0, 3).replace(1, 3).replace(7, 3); - let r = avx2::_mm256_blend_epi32(a, b, 0x7C as i32); + let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3); + let r = _mm256_blend_epi32(a, b, 0x7C as i32); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_blend_epi16() { - let (a, b) = (i16x16::splat(3), i16x16::splat(9)); - let e = i16x16::splat(3).replace(0, 9).replace(8, 9); - let r = avx2::_mm256_blend_epi16(a, b, 0x01 as i32); + unsafe fn test_mm256_blend_epi16() { + let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9)); + let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3,3, 3, 3, 3, 3); + let r = _mm256_blend_epi16(a, b, 0x01 as i32); assert_eq!(r, e); - let r = avx2::_mm256_blend_epi16(b, a, 0xFE as i32); + let r = _mm256_blend_epi16(b, a, 0xFE as i32); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_blendv_epi8() { - let (a, b) = (i8x32::splat(4), i8x32::splat(2)); - let mask = __m256i::from(i8x32::splat(0).replace(2, -1)); - let e = i8x32::splat(4).replace(2, 2); - let r = avx2::_mm256_blendv_epi8(a, b, mask); + unsafe fn test_mm256_blendv_epi8() { + let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2)); + let mask = _mm256_insert_epi8(_mm256_set1_epi8(0), -1, 2); + let e = _mm256_insert_epi8(_mm256_set1_epi8(4), 2, 2); + let r = _mm256_blendv_epi8(a, b, mask); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm_broadcastb_epi8() { - let a = i8x16::splat(0x00).replace(0, 0x2a); - let res = avx2::_mm_broadcastb_epi8(a); - assert_eq!(res, i8x16::splat(0x2a)); + unsafe fn test_mm_broadcastb_epi8() { + let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0); + let res = _mm_broadcastb_epi8(a); + assert_eq!(res, _mm_set1_epi8(0x2a)); } #[simd_test = "avx2"] - unsafe fn _mm256_broadcastb_epi8() { - let a = i8x16::splat(0x00).replace(0, 0x2a); - let res = avx2::_mm256_broadcastb_epi8(a); - assert_eq!(res, i8x32::splat(0x2a)); + unsafe fn test_mm256_broadcastb_epi8() { + let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0); + let res = _mm256_broadcastb_epi8(a); + assert_eq!(res, _mm256_set1_epi8(0x2a)); } #[simd_test = "avx2"] - unsafe fn _mm_broadcastd_epi32() { - let a = i32x4::splat(0x00).replace(0, 0x2a).replace(1, 0x8000000); - let res = avx2::_mm_broadcastd_epi32(a); - assert_eq!(res, i32x4::splat(0x2a)); + unsafe fn test_mm_broadcastd_epi32() { + let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0); + let res = _mm_broadcastd_epi32(a); + assert_eq!(res, _mm_set1_epi32(0x2a)); } #[simd_test = "avx2"] - unsafe fn _mm256_broadcastd_epi32() { - let a = 
i32x4::splat(0x00).replace(0, 0x2a).replace(1, 0x8000000); - let res = avx2::_mm256_broadcastd_epi32(a); - assert_eq!(res, i32x8::splat(0x2a)); + unsafe fn test_mm256_broadcastd_epi32() { + let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0); + let res = _mm256_broadcastd_epi32(a); + assert_eq!(res, _mm256_set1_epi32(0x2a)); } #[simd_test = "avx2"] - unsafe fn _mm_broadcastq_epi64() { - let a = i64x2::splat(0x00).replace(0, 0x1ffffffff); - let res = avx2::_mm_broadcastq_epi64(a); - assert_eq!(res, i64x2::splat(0x1ffffffff)); + unsafe fn test_mm_broadcastq_epi64() { + let a = _mm_setr_epi64x(0x1ffffffff, 0); + let res = _mm_broadcastq_epi64(a); + assert_eq!(res, _mm_set1_epi64x(0x1ffffffff)); } #[simd_test = "avx2"] - unsafe fn _mm256_broadcastq_epi64() { - let a = i64x2::splat(0x00).replace(0, 0x1ffffffff); - let res = avx2::_mm256_broadcastq_epi64(a); - assert_eq!(res, i64x4::splat(0x1ffffffff)); + unsafe fn test_mm256_broadcastq_epi64() { + let a = _mm_setr_epi64x(0x1ffffffff, 0); + let res = _mm256_broadcastq_epi64(a); + assert_eq!(res, _mm256_set1_epi64x(0x1ffffffff)); } #[simd_test = "avx2"] - unsafe fn _mm_broadcastsd_pd() { - let a = f64x2::splat(3.14f64).replace(0, 6.28f64); - let res = avx2::_mm_broadcastsd_pd(a); - assert_eq!(res, f64x2::splat(6.28f64)); + unsafe fn test_mm_broadcastsd_pd() { + let a = _mm_setr_pd(6.28, 3.14); + let res = _mm_broadcastsd_pd(a); + assert_eq_m128d(res, _mm_set1_pd(6.28f64)); } #[simd_test = "avx2"] - unsafe fn _mm256_broadcastsd_pd() { - let a = f64x2::splat(3.14f64).replace(0, 6.28f64); - let res = avx2::_mm256_broadcastsd_pd(a); - assert_eq!(res, f64x4::splat(6.28f64)); + unsafe fn test_mm256_broadcastsd_pd() { + let a = _mm_setr_pd(6.28, 3.14); + let res = _mm256_broadcastsd_pd(a); + assert_eq_m256d(res, _mm256_set1_pd(6.28f64)); } #[simd_test = "avx2"] - unsafe fn _mm256_broadcastsi128_si256() { - let a = i64x2::new(0x0987654321012334, 0x5678909876543210); - let res = avx2::_mm256_broadcastsi128_si256(a); - let retval = i64x4::new( + unsafe fn test_mm256_broadcastsi128_si256() { + let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210); + let res = _mm256_broadcastsi128_si256(a); + let retval = _mm256_setr_epi64x( 0x0987654321012334, 0x5678909876543210, 0x0987654321012334, @@ -3409,570 +3567,576 @@ mod tests { } #[simd_test = "avx2"] - unsafe fn _mm_broadcastss_ps() { - let a = f32x4::splat(3.14f32).replace(0, 6.28f32); - let res = avx2::_mm_broadcastss_ps(a); - assert_eq!(res, f32x4::splat(6.28f32)); + unsafe fn test_mm_broadcastss_ps() { + let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0); + let res = _mm_broadcastss_ps(a); + assert_eq_m128(res, _mm_set1_ps(6.28f32)); } #[simd_test = "avx2"] - unsafe fn _mm256_broadcastss_ps() { - let a = f32x4::splat(3.14f32).replace(0, 6.28f32); - let res = avx2::_mm256_broadcastss_ps(a); - assert_eq!(res, f32x8::splat(6.28f32)); + unsafe fn test_mm256_broadcastss_ps() { + let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0); + let res = _mm256_broadcastss_ps(a); + assert_eq_m256(res, _mm256_set1_ps(6.28f32)); } #[simd_test = "avx2"] - unsafe fn _mm_broadcastw_epi16() { - let a = i16x8::splat(0x2a).replace(0, 0x22b); - let res = avx2::_mm_broadcastw_epi16(a); - assert_eq!(res, i16x8::splat(0x22b)); + unsafe fn test_mm_broadcastw_epi16() { + let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0); + let res = _mm_broadcastw_epi16(a); + assert_eq!(res, _mm_set1_epi16(0x22b)); } #[simd_test = "avx2"] - unsafe fn _mm256_broadcastw_epi16() { - let a = i16x8::splat(0x2a).replace(0, 0x22b); - let res = 
avx2::_mm256_broadcastw_epi16(a); - assert_eq!(res, i16x16::splat(0x22b)); + unsafe fn test_mm256_broadcastw_epi16() { + let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0); + let res = _mm256_broadcastw_epi16(a); + assert_eq!(res, _mm256_set1_epi16(0x22b)); } #[simd_test = "avx2"] - unsafe fn _mm256_cmpeq_epi8() { + unsafe fn test_mm256_cmpeq_epi8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i8x32::new( + let a = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ); #[cfg_attr(rustfmt, rustfmt_skip)] - let b = i8x32::new( + let b = _mm256_setr_epi8( 31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, ); - let r = avx2::_mm256_cmpeq_epi8(a, b); - assert_eq!(r, i8x32::splat(0).replace(2, 0xFFu8 as i8)); + let r = _mm256_cmpeq_epi8(a, b); + assert_eq!(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2)); } #[simd_test = "avx2"] - unsafe fn _mm256_cmpeq_epi16() { + unsafe fn test_mm256_cmpeq_epi16() { let a = - i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = - i16x16::new(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let r = avx2::_mm256_cmpeq_epi16(a, b); - assert_eq!(r, i16x16::splat(0).replace(2, 0xFFFFu16 as i16)); + _mm256_setr_epi16(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = _mm256_cmpeq_epi16(a, b); + assert_eq!(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2)); } #[simd_test = "avx2"] - unsafe fn _mm256_cmpeq_epi32() { - let a = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let b = i32x8::new(7, 6, 2, 4, 3, 2, 1, 0); - let r = avx2::_mm256_cmpeq_epi32(a, b); - assert_eq!(r, i32x8::splat(0).replace(2, 0xFFFFFFFFu32 as i32)); + unsafe fn test_mm256_cmpeq_epi32() { + let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0); + let r = _mm256_cmpeq_epi32(a, b); + let e = _mm256_set1_epi32(0); + let e = _mm256_insert_epi32(e, !0, 2); + assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_cmpeq_epi64() { - let a = i64x4::new(0, 1, 2, 3); - let b = i64x4::new(3, 2, 2, 0); - let r = avx2::_mm256_cmpeq_epi64(a, b); + unsafe fn test_mm256_cmpeq_epi64() { + let a = _mm256_setr_epi64x(0, 1, 2, 3); + let b = _mm256_setr_epi64x(3, 2, 2, 0); + let r = _mm256_cmpeq_epi64(a, b); assert_eq!( r, - i64x4::splat(0).replace(2, 0xFFFFFFFFFFFFFFFFu64 as i64) + _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2), ); } #[simd_test = "avx2"] - unsafe fn _mm256_cmpgt_epi8() { - let a = i8x32::splat(0).replace(0, 5); - let b = i8x32::splat(0); - let r = avx2::_mm256_cmpgt_epi8(a, b); - assert_eq!(r, i8x32::splat(0).replace(0, 0xFFu8 as i8)); + unsafe fn test_mm256_cmpgt_epi8() { + let a = _mm256_insert_epi8(_mm256_set1_epi8(0), 5, 0); + let b = _mm256_set1_epi8(0); + let r = _mm256_cmpgt_epi8(a, b); + assert_eq!(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0)); } #[simd_test = "avx2"] - unsafe fn _mm256_cmpgt_epi16() { - let a = i16x16::splat(0).replace(0, 5); - let b = i16x16::splat(0); - let r = avx2::_mm256_cmpgt_epi16(a, b); - assert_eq!(r, i16x16::splat(0).replace(0, 0xFFFFu16 as i16)); + unsafe fn test_mm256_cmpgt_epi16() { + let a = _mm256_insert_epi16(_mm256_set1_epi16(0), 5, 0); + let b = _mm256_set1_epi16(0); + let r = _mm256_cmpgt_epi16(a, b); + assert_eq!(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0)); } #[simd_test = "avx2"] - unsafe fn _mm256_cmpgt_epi32() { - 
let a = i32x8::splat(0).replace(0, 5); - let b = i32x8::splat(0); - let r = avx2::_mm256_cmpgt_epi32(a, b); - assert_eq!(r, i32x8::splat(0).replace(0, 0xFFFFFFFFu32 as i32)); + unsafe fn test_mm256_cmpgt_epi32() { + let a = _mm256_insert_epi32(_mm256_set1_epi32(0), 5, 0); + let b = _mm256_set1_epi32(0); + let r = _mm256_cmpgt_epi32(a, b); + assert_eq!(r, _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0)); } #[simd_test = "avx2"] - unsafe fn _mm256_cmpgt_epi64() { - let a = i64x4::splat(0).replace(0, 5); - let b = i64x4::splat(0); - let r = avx2::_mm256_cmpgt_epi64(a, b); + unsafe fn test_mm256_cmpgt_epi64() { + let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0); + let b = _mm256_set1_epi64x(0); + let r = _mm256_cmpgt_epi64(a, b); assert_eq!( r, - i64x4::splat(0).replace(0, 0xFFFFFFFFFFFFFFFFu64 as i64) + _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0), ); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtepi8_epi16() { + unsafe fn test_mm256_cvtepi8_epi16() { let a = - i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); + _mm_setr_epi8(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); let r = - i16x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); - assert_eq!(r, avx2::_mm256_cvtepi8_epi16(a)); + _mm256_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); + assert_eq!(r, _mm256_cvtepi8_epi16(a)); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtepi8_epi32() { + unsafe fn test_mm256_cvtepi8_epi32() { let a = - i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); - let r = i32x8::new(0, 0, -1, 1, -2, 2, -3, 3); - assert_eq!(r, avx2::_mm256_cvtepi8_epi32(a)); + _mm_setr_epi8(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); + let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3); + assert_eq!(r, _mm256_cvtepi8_epi32(a)); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtepi8_epi64() { + unsafe fn test_mm256_cvtepi8_epi64() { let a = - i8x16::new(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); - let r = i64x4::new(0, 0, -1, 1); - assert_eq!(r, avx2::_mm256_cvtepi8_epi64(a)); + _mm_setr_epi8(0, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7); + let r = _mm256_setr_epi64x(0, 0, -1, 1); + assert_eq!(r, _mm256_cvtepi8_epi64(a)); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtepi16_epi32() { - let a = i16x8::new(0, 0, -1, 1, -2, 2, -3, 3); - let r = i32x8::new(0, 0, -1, 1, -2, 2, -3, 3); - assert_eq!(r, avx2::_mm256_cvtepi16_epi32(a)); + unsafe fn test_mm256_cvtepi16_epi32() { + let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3); + let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3); + assert_eq!(r, _mm256_cvtepi16_epi32(a)); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtepi16_epi64() { - let a = i16x8::new(0, 0, -1, 1, -2, 2, -3, 3); - let r = i64x4::new(0, 0, -1, 1); - assert_eq!(r, avx2::_mm256_cvtepi16_epi64(a)); + unsafe fn test_mm256_cvtepi16_epi64() { + let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3); + let r = _mm256_setr_epi64x(0, 0, -1, 1); + assert_eq!(r, _mm256_cvtepi16_epi64(a)); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtepi32_epi64() { - let a = i32x4::new(0, 0, -1, 1); - let r = i64x4::new(0, 0, -1, 1); - assert_eq!(r, avx2::_mm256_cvtepi32_epi64(a)); + unsafe fn test_mm256_cvtepi32_epi64() { + let a = _mm_setr_epi32(0, 0, -1, 1); + let r = _mm256_setr_epi64x(0, 0, -1, 1); + assert_eq!(r, _mm256_cvtepi32_epi64(a)); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtepu16_epi32() { - let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq!(r, 
avx2::_mm256_cvtepu16_epi32(a)); + unsafe fn test_mm256_cvtepu16_epi32() { + let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq!(r, _mm256_cvtepu16_epi32(a)); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtepu16_epi64() { - let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r = i64x4::new(0, 1, 2, 3); - assert_eq!(r, avx2::_mm256_cvtepu16_epi64(a)); + unsafe fn test_mm256_cvtepu16_epi64() { + let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm256_setr_epi64x(0, 1, 2, 3); + assert_eq!(r, _mm256_cvtepu16_epi64(a)); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtepu32_epi64() { - let a = u32x4::new(0, 1, 2, 3); - let r = i64x4::new(0, 1, 2, 3); - assert_eq!(r, avx2::_mm256_cvtepu32_epi64(a)); + unsafe fn test_mm256_cvtepu32_epi64() { + let a = _mm_setr_epi32(0, 1, 2, 3); + let r = _mm256_setr_epi64x(0, 1, 2, 3); + assert_eq!(r, _mm256_cvtepu32_epi64(a)); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtepu8_epi16() { + unsafe fn test_mm256_cvtepu8_epi16() { let a = - u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let r = - i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - assert_eq!(r, avx2::_mm256_cvtepu8_epi16(a)); + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq!(r, _mm256_cvtepu8_epi16(a)); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtepu8_epi32() { + unsafe fn test_mm256_cvtepu8_epi32() { let a = - u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7); - assert_eq!(r, avx2::_mm256_cvtepu8_epi32(a)); + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq!(r, _mm256_cvtepu8_epi32(a)); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtepu8_epi64() { + unsafe fn test_mm256_cvtepu8_epi64() { let a = - u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = i64x4::new(0, 1, 2, 3); - assert_eq!(r, avx2::_mm256_cvtepu8_epi64(a)); + _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm256_setr_epi64x(0, 1, 2, 3); + assert_eq!(r, _mm256_cvtepu8_epi64(a)); } #[simd_test = "avx2"] - unsafe fn _mm256_extracti128_si256() { - let a = __m256i::from(i64x4::new(1, 2, 3, 4)); - let r = avx2::_mm256_extracti128_si256(a, 0b01); - let e = __m128i::from(i64x2::new(3, 4)); + unsafe fn test_mm256_extracti128_si256() { + let a = __m256i::from(_mm256_setr_epi64x(1, 2, 3, 4)); + let r = _mm256_extracti128_si256(a, 0b01); + let e = __m128i::from(_mm_setr_epi64x(3, 4)); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_hadd_epi16() { - let a = i16x16::splat(2); - let b = i16x16::splat(4); - let r = avx2::_mm256_hadd_epi16(a, b); - let e = i16x16::new(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + unsafe fn test_mm256_hadd_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_hadd_epi16(a, b); + let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_hadd_epi32() { - let a = i32x8::splat(2); - let b = i32x8::splat(4); - let r = avx2::_mm256_hadd_epi32(a, b); - let e = i32x8::new(4, 4, 8, 8, 4, 4, 8, 8); + unsafe fn test_mm256_hadd_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_hadd_epi32(a, b); + let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8); 
assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_hadds_epi16() { - let a = i16x16::splat(2).replace(0, 0x7FFF).replace(1, 1); - let b = i16x16::splat(4); - let r = avx2::_mm256_hadds_epi16(a, b); + unsafe fn test_mm256_hadds_epi16() { + let a = _mm256_set1_epi16(2); + let a = _mm256_insert_epi16(a, 0x7fff, 0); + let a = _mm256_insert_epi16(a, 1, 1); + let b = _mm256_set1_epi16(4); + let r = _mm256_hadds_epi16(a, b); let e = - i16x16::new(0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + _mm256_setr_epi16(0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_hsub_epi16() { - let a = i16x16::splat(2); - let b = i16x16::splat(4); - let r = avx2::_mm256_hsub_epi16(a, b); - let e = i16x16::splat(0); + unsafe fn test_mm256_hsub_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_hsub_epi16(a, b); + let e = _mm256_set1_epi16(0); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_hsub_epi32() { - let a = i32x8::splat(2); - let b = i32x8::splat(4); - let r = avx2::_mm256_hsub_epi32(a, b); - let e = i32x8::splat(0); + unsafe fn test_mm256_hsub_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_hsub_epi32(a, b); + let e = _mm256_set1_epi32(0); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_hsubs_epi16() { - let a = i16x16::splat(2).replace(0, 0x7FFF).replace(1, -1); - let b = i16x16::splat(4); - let r = avx2::_mm256_hsubs_epi16(a, b); - let e = i16x16::splat(0).replace(0, 0x7FFF); + unsafe fn test_mm256_hsubs_epi16() { + let a = _mm256_set1_epi16(2); + let a = _mm256_insert_epi16(a, 0x7fff, 0); + let a = _mm256_insert_epi16(a, -1, 1); + let b = _mm256_set1_epi16(4); + let r = _mm256_hsubs_epi16(a, b); + let e = _mm256_insert_epi16(_mm256_set1_epi16(0), 0x7FFF, 0); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_madd_epi16() { - let a = i16x16::splat(2); - let b = i16x16::splat(4); - let r = avx2::_mm256_madd_epi16(a, b); - let e = i32x8::splat(16); + unsafe fn test_mm256_madd_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_madd_epi16(a, b); + let e = _mm256_set1_epi32(16); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_inserti128_si256() { - let a = __m256i::from(i64x4::new(1, 2, 3, 4)); - let b = __m128i::from(i64x2::new(7, 8)); - let r = avx2::_mm256_inserti128_si256(a, b, 0b01); - let e = i64x4::new(1, 2, 7, 8); + unsafe fn test_mm256_inserti128_si256() { + let a = __m256i::from(_mm256_setr_epi64x(1, 2, 3, 4)); + let b = __m128i::from(_mm_setr_epi64x(7, 8)); + let r = _mm256_inserti128_si256(a, b, 0b01); + let e = _mm256_setr_epi64x(1, 2, 7, 8); assert_eq!(r, __m256i::from(e)); } #[simd_test = "avx2"] - unsafe fn _mm256_maddubs_epi16() { - let a = u8x32::splat(2); - let b = u8x32::splat(4); - let r = avx2::_mm256_maddubs_epi16(a, b); - let e = i16x16::splat(16); + unsafe fn test_mm256_maddubs_epi16() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_maddubs_epi16(a, b); + let e = _mm256_set1_epi16(16); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm_maskload_epi32() { + unsafe fn test_mm_maskload_epi32() { let nums = [1, 2, 3, 4]; let a = &nums as *const i32; - let mask = i32x4::new(-1, 0, 0, -1); - let r = avx2::_mm_maskload_epi32(a, mask); - let e = i32x4::new(1, 0, 0, 4); + let mask = _mm_setr_epi32(-1, 0, 0, -1); + let r = _mm_maskload_epi32(a, mask); + let e = _mm_setr_epi32(1, 0, 0, 4); assert_eq!(r, e); 
} #[simd_test = "avx2"] - unsafe fn _mm256_maskload_epi32() { + unsafe fn test_mm256_maskload_epi32() { let nums = [1, 2, 3, 4, 5, 6, 7, 8]; let a = &nums as *const i32; - let mask = i32x8::new(-1, 0, 0, -1, 0, -1, -1, 0); - let r = avx2::_mm256_maskload_epi32(a, mask); - let e = i32x8::new(1, 0, 0, 4, 0, 6, 7, 0); + let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0); + let r = _mm256_maskload_epi32(a, mask); + let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm_maskload_epi64() { + unsafe fn test_mm_maskload_epi64() { let nums = [1_i64, 2_i64]; let a = &nums as *const i64; - let mask = i64x2::new(0, -1); - let r = avx2::_mm_maskload_epi64(a, mask); - let e = i64x2::new(0, 2); + let mask = _mm_setr_epi64x(0, -1); + let r = _mm_maskload_epi64(a, mask); + let e = _mm_setr_epi64x(0, 2); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_maskload_epi64() { + unsafe fn test_mm256_maskload_epi64() { let nums = [1_i64, 2_i64, 3_i64, 4_i64]; let a = &nums as *const i64; - let mask = i64x4::new(0, -1, -1, 0); - let r = avx2::_mm256_maskload_epi64(a, mask); - let e = i64x4::new(0, 2, 3, 0); + let mask = _mm256_setr_epi64x(0, -1, -1, 0); + let r = _mm256_maskload_epi64(a, mask); + let e = _mm256_setr_epi64x(0, 2, 3, 0); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm_maskstore_epi32() { - let a = i32x4::new(1, 2, 3, 4); + unsafe fn test_mm_maskstore_epi32() { + let a = _mm_setr_epi32(1, 2, 3, 4); let mut arr = [-1, -1, -1, -1]; - let mask = i32x4::new(-1, 0, 0, -1); - avx2::_mm_maskstore_epi32(arr.as_mut_ptr(), mask, a); + let mask = _mm_setr_epi32(-1, 0, 0, -1); + _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a); let e = [1, -1, -1, 4]; assert_eq!(arr, e); } #[simd_test = "avx2"] - unsafe fn _mm256_maskstore_epi32() { - let a = i32x8::new(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8); + unsafe fn test_mm256_maskstore_epi32() { + let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8); let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1]; - let mask = i32x8::new(-1, 0, 0, -1, 0, -1, -1, 0); - avx2::_mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a); + let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0); + _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a); let e = [1, -1, -1, 42, -1, 6, 7, -1]; assert_eq!(arr, e); } #[simd_test = "avx2"] - unsafe fn _mm_maskstore_epi64() { - let a = i64x2::new(1_i64, 2_i64); + unsafe fn test_mm_maskstore_epi64() { + let a = _mm_setr_epi64x(1_i64, 2_i64); let mut arr = [-1_i64, -1_i64]; - let mask = i64x2::new(0, -1); - avx2::_mm_maskstore_epi64(arr.as_mut_ptr(), mask, a); + let mask = _mm_setr_epi64x(0, -1); + _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a); let e = [-1, 2]; assert_eq!(arr, e); } #[simd_test = "avx2"] - unsafe fn _mm256_maskstore_epi64() { - let a = i64x4::new(1_i64, 2_i64, 3_i64, 4_i64); + unsafe fn test_mm256_maskstore_epi64() { + let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64); let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64]; - let mask = i64x4::new(0, -1, -1, 0); - avx2::_mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a); + let mask = _mm256_setr_epi64x(0, -1, -1, 0); + _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a); let e = [-1, 2, 3, -1]; assert_eq!(arr, e); } #[simd_test = "avx2"] - unsafe fn _mm256_max_epi16() { - let a = i16x16::splat(2); - let b = i16x16::splat(4); - let r = avx2::_mm256_max_epi16(a, b); + unsafe fn test_mm256_max_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_max_epi16(a, b); 
assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_max_epi32() { - let a = i32x8::splat(2); - let b = i32x8::splat(4); - let r = avx2::_mm256_max_epi32(a, b); + unsafe fn test_mm256_max_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_max_epi32(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_max_epi8() { - let a = i8x32::splat(2); - let b = i8x32::splat(4); - let r = avx2::_mm256_max_epi8(a, b); + unsafe fn test_mm256_max_epi8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_max_epi8(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_max_epu16() { - let a = u16x16::splat(2); - let b = u16x16::splat(4); - let r = avx2::_mm256_max_epu16(a, b); + unsafe fn test_mm256_max_epu16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_max_epu16(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_max_epu32() { - let a = u32x8::splat(2); - let b = u32x8::splat(4); - let r = avx2::_mm256_max_epu32(a, b); + unsafe fn test_mm256_max_epu32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_max_epu32(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_max_epu8() { - let a = u8x32::splat(2); - let b = u8x32::splat(4); - let r = avx2::_mm256_max_epu8(a, b); + unsafe fn test_mm256_max_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_max_epu8(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_min_epi16() { - let a = i16x16::splat(2); - let b = i16x16::splat(4); - let r = avx2::_mm256_min_epi16(a, b); + unsafe fn test_mm256_min_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_min_epi16(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_min_epi32() { - let a = i32x8::splat(2); - let b = i32x8::splat(4); - let r = avx2::_mm256_min_epi32(a, b); + unsafe fn test_mm256_min_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_min_epi32(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_min_epi8() { - let a = i8x32::splat(2); - let b = i8x32::splat(4); - let r = avx2::_mm256_min_epi8(a, b); + unsafe fn test_mm256_min_epi8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_min_epi8(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_min_epu16() { - let a = u16x16::splat(2); - let b = u16x16::splat(4); - let r = avx2::_mm256_min_epu16(a, b); + unsafe fn test_mm256_min_epu16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_min_epu16(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_min_epu32() { - let a = u32x8::splat(2); - let b = u32x8::splat(4); - let r = avx2::_mm256_min_epu32(a, b); + unsafe fn test_mm256_min_epu32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_min_epu32(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_min_epu8() { - let a = u8x32::splat(2); - let b = u8x32::splat(4); - let r = avx2::_mm256_min_epu8(a, b); + unsafe fn test_mm256_min_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_min_epu8(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_movemask_epi8() { - let a = i8x32::splat(-1); - let r = avx2::_mm256_movemask_epi8(a); + unsafe fn test_mm256_movemask_epi8() { + let a = _mm256_set1_epi8(-1); + let r 
= _mm256_movemask_epi8(a); let e = -1; assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_mpsadbw_epu8() { - let a = u8x32::splat(2); - let b = u8x32::splat(4); - let r = avx2::_mm256_mpsadbw_epu8(a, b, 0); - let e = u16x16::splat(8); + unsafe fn test_mm256_mpsadbw_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_mpsadbw_epu8(a, b, 0); + let e = _mm256_set1_epi16(8); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_mul_epi32() { - let a = i32x8::new(0, 0, 0, 0, 2, 2, 2, 2); - let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = avx2::_mm256_mul_epi32(a, b); - let e = i64x4::new(0, 0, 10, 14); + unsafe fn test_mm256_mul_epi32() { + let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2); + let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_mul_epi32(a, b); + let e = _mm256_setr_epi64x(0, 0, 10, 14); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_mul_epu32() { - let a = u32x8::new(0, 0, 0, 0, 2, 2, 2, 2); - let b = u32x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = avx2::_mm256_mul_epu32(a, b); - let e = u64x4::new(0, 0, 10, 14); + unsafe fn test_mm256_mul_epu32() { + let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2); + let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_mul_epu32(a, b); + let e = _mm256_setr_epi64x(0, 0, 10, 14); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_mulhi_epi16() { - let a = i16x16::splat(6535); - let b = i16x16::splat(6535); - let r = avx2::_mm256_mulhi_epi16(a, b); - let e = i16x16::splat(651); + unsafe fn test_mm256_mulhi_epi16() { + let a = _mm256_set1_epi16(6535); + let b = _mm256_set1_epi16(6535); + let r = _mm256_mulhi_epi16(a, b); + let e = _mm256_set1_epi16(651); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_mulhi_epu16() { - let a = u16x16::splat(6535); - let b = u16x16::splat(6535); - let r = avx2::_mm256_mulhi_epu16(a, b); - let e = u16x16::splat(651); + unsafe fn test_mm256_mulhi_epu16() { + let a = _mm256_set1_epi16(6535); + let b = _mm256_set1_epi16(6535); + let r = _mm256_mulhi_epu16(a, b); + let e = _mm256_set1_epi16(651); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_mullo_epi16() { - let a = i16x16::splat(2); - let b = i16x16::splat(4); - let r = avx2::_mm256_mullo_epi16(a, b); - let e = i16x16::splat(8); + unsafe fn test_mm256_mullo_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_mullo_epi16(a, b); + let e = _mm256_set1_epi16(8); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_mullo_epi32() { - let a = i32x8::splat(2); - let b = i32x8::splat(4); - let r = avx2::_mm256_mullo_epi32(a, b); - let e = i32x8::splat(8); + unsafe fn test_mm256_mullo_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_mullo_epi32(a, b); + let e = _mm256_set1_epi32(8); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_mulhrs_epi16() { - let a = i16x16::splat(2); - let b = i16x16::splat(4); - let r = avx2::_mm256_mullo_epi16(a, b); - let e = i16x16::splat(8); + unsafe fn test_mm256_mulhrs_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_mulhrs_epi16(a, b); + let e = _mm256_set1_epi16(0); /* ((2 * 4 >> 14) + 1) >> 1 == 0 */ assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_or_si256() { - let a = __m256i::from(i8x32::splat(-1)); - let b = __m256i::from(i8x32::splat(0)); - let r = avx2::_mm256_or_si256(a, b); + unsafe fn test_mm256_or_si256() { + let a = __m256i::from(_mm256_set1_epi8(-1)); + let b = 
__m256i::from(_mm256_set1_epi8(0)); + let r = _mm256_or_si256(a, b); assert_eq!(r, a); } #[simd_test = "avx2"] - unsafe fn _mm256_packs_epi16() { - let a = i16x16::splat(2); - let b = i16x16::splat(4); - let r = avx2::_mm256_packs_epi16(a, b); + unsafe fn test_mm256_packs_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_packs_epi16(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = i8x32::new( + let e = _mm256_setr_epi8( 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, @@ -3983,22 +4147,22 @@ mod tests { } #[simd_test = "avx2"] - unsafe fn _mm256_packs_epi32() { - let a = i32x8::splat(2); - let b = i32x8::splat(4); - let r = avx2::_mm256_packs_epi32(a, b); - let e = i16x16::new(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); + unsafe fn test_mm256_packs_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_packs_epi32(a, b); + let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_packus_epi16() { - let a = i16x16::splat(2); - let b = i16x16::splat(4); - let r = avx2::_mm256_packus_epi16(a, b); + unsafe fn test_mm256_packus_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_packus_epi16(a, b); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = u8x32::new( + let e = _mm256_setr_epi8( 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, @@ -4009,236 +4173,236 @@ mod tests { } #[simd_test = "avx2"] - unsafe fn _mm256_packus_epi32() { - let a = i32x8::splat(2); - let b = i32x8::splat(4); - let r = avx2::_mm256_packus_epi32(a, b); - let e = u16x16::new(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); + unsafe fn test_mm256_packus_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_packus_epi32(a, b); + let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_sad_epu8() { - let a = u8x32::splat(2); - let b = u8x32::splat(4); - let r = avx2::_mm256_sad_epu8(a, b); - let e = u64x4::splat(16); + unsafe fn test_mm256_sad_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_sad_epu8(a, b); + let e = _mm256_set1_epi64x(16); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_shufflehi_epi16() { + unsafe fn test_mm256_shufflehi_epi16() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i16x16::new( + let a = _mm256_setr_epi16( 0, 1, 2, 3, 11, 22, 33, 44, 4, 5, 6, 7, 55, 66, 77, 88, ); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = i16x16::new( + let e = _mm256_setr_epi16( 0, 1, 2, 3, 44, 22, 22, 11, 4, 5, 6, 7, 88, 66, 66, 55, ); - let r = avx2::_mm256_shufflehi_epi16(a, 0b00_01_01_11); + let r = _mm256_shufflehi_epi16(a, 0b00_01_01_11); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_shufflelo_epi16() { + unsafe fn test_mm256_shufflelo_epi16() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i16x16::new( + let a = _mm256_setr_epi16( 11, 22, 33, 44, 0, 1, 2, 3, 55, 66, 77, 88, 4, 5, 6, 7, ); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = i16x16::new( + let e = _mm256_setr_epi16( 44, 22, 22, 11, 0, 1, 2, 3, 88, 66, 66, 55, 4, 5, 6, 7, ); - let r = avx2::_mm256_shufflelo_epi16(a, 0b00_01_01_11); + let r = _mm256_shufflelo_epi16(a, 0b00_01_01_11); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_sign_epi16() { - let a = i16x16::splat(2); - let b = i16x16::splat(-1); - let r = 
avx2::_mm256_sign_epi16(a, b); - let e = i16x16::splat(-2); + unsafe fn test_mm256_sign_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(-1); + let r = _mm256_sign_epi16(a, b); + let e = _mm256_set1_epi16(-2); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_sign_epi32() { - let a = i32x8::splat(2); - let b = i32x8::splat(-1); - let r = avx2::_mm256_sign_epi32(a, b); - let e = i32x8::splat(-2); + unsafe fn test_mm256_sign_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(-1); + let r = _mm256_sign_epi32(a, b); + let e = _mm256_set1_epi32(-2); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_sign_epi8() { - let a = i8x32::splat(2); - let b = i8x32::splat(-1); - let r = avx2::_mm256_sign_epi8(a, b); - let e = i8x32::splat(-2); + unsafe fn test_mm256_sign_epi8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(-1); + let r = _mm256_sign_epi8(a, b); + let e = _mm256_set1_epi8(-2); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_sll_epi16() { - let a = i16x16::splat(0xFF); - let b = i16x8::splat(0).replace(0, 4); - let r = avx2::_mm256_sll_epi16(a, b); - assert_eq!(r, i16x16::splat(0xFF0)); + unsafe fn test_mm256_sll_epi16() { + let a = _mm256_set1_epi16(0xFF); + let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0); + let r = _mm256_sll_epi16(a, b); + assert_eq!(r, _mm256_set1_epi16(0xFF0)); } #[simd_test = "avx2"] - unsafe fn _mm256_sll_epi32() { - let a = i32x8::splat(0xFFFF); - let b = i32x4::splat(0).replace(0, 4); - let r = avx2::_mm256_sll_epi32(a, b); - assert_eq!(r, i32x8::splat(0xFFFF0)); + unsafe fn test_mm256_sll_epi32() { + let a = _mm256_set1_epi32(0xFFFF); + let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0); + let r = _mm256_sll_epi32(a, b); + assert_eq!(r, _mm256_set1_epi32(0xFFFF0)); } #[simd_test = "avx2"] - unsafe fn _mm256_sll_epi64() { - let a = i64x4::splat(0xFFFFFFFF); - let b = i64x2::splat(0).replace(0, 4); - let r = avx2::_mm256_sll_epi64(a, b); - assert_eq!(r, i64x4::splat(0xFFFFFFFF0)); + unsafe fn test_mm256_sll_epi64() { + let a = _mm256_set1_epi64x(0xFFFFFFFF); + let b = _mm_insert_epi64(_mm_set1_epi64x(0), 4, 0); + let r = _mm256_sll_epi64(a, b); + assert_eq!(r, _mm256_set1_epi64x(0xFFFFFFFF0)); } #[simd_test = "avx2"] - unsafe fn _mm256_slli_epi16() { + unsafe fn test_mm256_slli_epi16() { assert_eq!( - avx2::_mm256_slli_epi16(i16x16::splat(0xFF), 4), - i16x16::splat(0xFF0) + _mm256_slli_epi16(_mm256_set1_epi16(0xFF), 4), + _mm256_set1_epi16(0xFF0) ); } #[simd_test = "avx2"] - unsafe fn _mm256_slli_epi32() { + unsafe fn test_mm256_slli_epi32() { assert_eq!( - avx2::_mm256_slli_epi32(i32x8::splat(0xFFFF), 4), - i32x8::splat(0xFFFF0) + _mm256_slli_epi32(_mm256_set1_epi32(0xFFFF), 4), + _mm256_set1_epi32(0xFFFF0) ); } #[simd_test = "avx2"] - unsafe fn _mm256_slli_epi64() { + unsafe fn test_mm256_slli_epi64() { assert_eq!( - avx2::_mm256_slli_epi64(i64x4::splat(0xFFFFFFFF), 4), - i64x4::splat(0xFFFFFFFF0) + _mm256_slli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4), + _mm256_set1_epi64x(0xFFFFFFFF0) ); } #[simd_test = "avx2"] - unsafe fn _mm256_slli_si256() { - let a = i64x4::splat(0xFFFFFFFF); - let r = avx2::_mm256_slli_si256(__m256i::from(a), 3); - assert_eq!(r, __m256i::from(i64x4::splat(0xFFFFFFFF000000))); + unsafe fn test_mm256_slli_si256() { + let a = _mm256_set1_epi64x(0xFFFFFFFF); + let r = _mm256_slli_si256(__m256i::from(a), 3); + assert_eq!(r, __m256i::from(_mm256_set1_epi64x(0xFFFFFFFF000000))); } #[simd_test = "avx2"] - unsafe fn _mm_sllv_epi32() { - let a = i32x4::splat(2); 
- let b = i32x4::splat(1); - let r = avx2::_mm_sllv_epi32(a, b); - let e = i32x4::splat(4); + unsafe fn test_mm_sllv_epi32() { + let a = _mm_set1_epi32(2); + let b = _mm_set1_epi32(1); + let r = _mm_sllv_epi32(a, b); + let e = _mm_set1_epi32(4); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_sllv_epi32() { - let a = i32x8::splat(2); - let b = i32x8::splat(1); - let r = avx2::_mm256_sllv_epi32(a, b); - let e = i32x8::splat(4); + unsafe fn test_mm256_sllv_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(1); + let r = _mm256_sllv_epi32(a, b); + let e = _mm256_set1_epi32(4); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm_sllv_epi64() { - let a = i64x2::splat(2); - let b = i64x2::splat(1); - let r = avx2::_mm_sllv_epi64(a, b); - let e = i64x2::splat(4); + unsafe fn test_mm_sllv_epi64() { + let a = _mm_set1_epi64x(2); + let b = _mm_set1_epi64x(1); + let r = _mm_sllv_epi64(a, b); + let e = _mm_set1_epi64x(4); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_sllv_epi64() { - let a = i64x4::splat(2); - let b = i64x4::splat(1); - let r = avx2::_mm256_sllv_epi64(a, b); - let e = i64x4::splat(4); + unsafe fn test_mm256_sllv_epi64() { + let a = _mm256_set1_epi64x(2); + let b = _mm256_set1_epi64x(1); + let r = _mm256_sllv_epi64(a, b); + let e = _mm256_set1_epi64x(4); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_sra_epi16() { - let a = i16x16::splat(-1); - let b = i16x8::new(1, 0, 0, 0, 0, 0, 0, 0); - let r = avx2::_mm256_sra_epi16(a, b); - assert_eq!(r, i16x16::splat(-1)); + unsafe fn test_mm256_sra_epi16() { + let a = _mm256_set1_epi16(-1); + let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_sra_epi16(a, b); + assert_eq!(r, _mm256_set1_epi16(-1)); } #[simd_test = "avx2"] - unsafe fn _mm256_sra_epi32() { - let a = i32x8::splat(-1); - let b = i32x4::splat(0).replace(0, 1); - let r = avx2::_mm256_sra_epi32(a, b); - assert_eq!(r, i32x8::splat(-1)); + unsafe fn test_mm256_sra_epi32() { + let a = _mm256_set1_epi32(-1); + let b = _mm_insert_epi32(_mm_set1_epi32(0), 1, 0); + let r = _mm256_sra_epi32(a, b); + assert_eq!(r, _mm256_set1_epi32(-1)); } #[simd_test = "avx2"] - unsafe fn _mm256_srai_epi16() { + unsafe fn test_mm256_srai_epi16() { assert_eq!( - avx2::_mm256_srai_epi16(i16x16::splat(-1), 1), - i16x16::splat(-1) + _mm256_srai_epi16(_mm256_set1_epi16(-1), 1), + _mm256_set1_epi16(-1) ); } #[simd_test = "avx2"] - unsafe fn _mm256_srai_epi32() { + unsafe fn test_mm256_srai_epi32() { assert_eq!( - avx2::_mm256_srai_epi32(i32x8::splat(-1), 1), - i32x8::splat(-1) + _mm256_srai_epi32(_mm256_set1_epi32(-1), 1), + _mm256_set1_epi32(-1) ); } #[simd_test = "avx2"] - unsafe fn _mm_srav_epi32() { - let a = i32x4::splat(4); - let count = i32x4::splat(1); - let r = avx2::_mm_srav_epi32(a, count); - let e = i32x4::splat(2); + unsafe fn test_mm_srav_epi32() { + let a = _mm_set1_epi32(4); + let count = _mm_set1_epi32(1); + let r = _mm_srav_epi32(a, count); + let e = _mm_set1_epi32(2); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_srav_epi32() { - let a = i32x8::splat(4); - let count = i32x8::splat(1); - let r = avx2::_mm256_srav_epi32(a, count); - let e = i32x8::splat(2); + unsafe fn test_mm256_srav_epi32() { + let a = _mm256_set1_epi32(4); + let count = _mm256_set1_epi32(1); + let r = _mm256_srav_epi32(a, count); + let e = _mm256_set1_epi32(2); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_srli_si256() { + unsafe fn test_mm256_srli_si256() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i8x32::new( + let 
a = _mm256_setr_epi8( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ); - let r = avx2::_mm256_srli_si256(__m256i::from(a), 3); + let r = _mm256_srli_si256(__m256i::from(a), 3); #[cfg_attr(rustfmt, rustfmt_skip)] - let e = i8x32::new( + let e = _mm256_setr_epi8( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, @@ -4248,183 +4412,183 @@ mod tests { } #[simd_test = "avx2"] - unsafe fn _mm256_srl_epi16() { - let a = i16x16::splat(0xFF); - let b = i16x8::splat(0).replace(0, 4); - let r = avx2::_mm256_srl_epi16(a, b); - assert_eq!(r, i16x16::splat(0xF)); + unsafe fn test_mm256_srl_epi16() { + let a = _mm256_set1_epi16(0xFF); + let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0); + let r = _mm256_srl_epi16(a, b); + assert_eq!(r, _mm256_set1_epi16(0xF)); } #[simd_test = "avx2"] - unsafe fn _mm256_srl_epi32() { - let a = i32x8::splat(0xFFFF); - let b = i32x4::splat(0).replace(0, 4); - let r = avx2::_mm256_srl_epi32(a, b); - assert_eq!(r, i32x8::splat(0xFFF)); + unsafe fn test_mm256_srl_epi32() { + let a = _mm256_set1_epi32(0xFFFF); + let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0); + let r = _mm256_srl_epi32(a, b); + assert_eq!(r, _mm256_set1_epi32(0xFFF)); } #[simd_test = "avx2"] - unsafe fn _mm256_srl_epi64() { - let a = i64x4::splat(0xFFFFFFFF); - let b = i64x2::splat(0).replace(0, 4); - let r = avx2::_mm256_srl_epi64(a, b); - assert_eq!(r, i64x4::splat(0xFFFFFFF)); + unsafe fn test_mm256_srl_epi64() { + let a = _mm256_set1_epi64x(0xFFFFFFFF); + let b = _mm_setr_epi64x(4, 0); + let r = _mm256_srl_epi64(a, b); + assert_eq!(r, _mm256_set1_epi64x(0xFFFFFFF)); } #[simd_test = "avx2"] - unsafe fn _mm256_srli_epi16() { + unsafe fn test_mm256_srli_epi16() { assert_eq!( - avx2::_mm256_srli_epi16(i16x16::splat(0xFF), 4), - i16x16::splat(0xF) + _mm256_srli_epi16(_mm256_set1_epi16(0xFF), 4), + _mm256_set1_epi16(0xF) ); } #[simd_test = "avx2"] - unsafe fn _mm256_srli_epi32() { + unsafe fn test_mm256_srli_epi32() { assert_eq!( - avx2::_mm256_srli_epi32(i32x8::splat(0xFFFF), 4), - i32x8::splat(0xFFF) + _mm256_srli_epi32(_mm256_set1_epi32(0xFFFF), 4), + _mm256_set1_epi32(0xFFF) ); } #[simd_test = "avx2"] - unsafe fn _mm256_srli_epi64() { + unsafe fn test_mm256_srli_epi64() { assert_eq!( - avx2::_mm256_srli_epi64(i64x4::splat(0xFFFFFFFF), 4), - i64x4::splat(0xFFFFFFF) + _mm256_srli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4), + _mm256_set1_epi64x(0xFFFFFFF) ); } #[simd_test = "avx2"] - unsafe fn _mm_srlv_epi32() { - let a = i32x4::splat(2); - let count = i32x4::splat(1); - let r = avx2::_mm_srlv_epi32(a, count); - let e = i32x4::splat(1); + unsafe fn test_mm_srlv_epi32() { + let a = _mm_set1_epi32(2); + let count = _mm_set1_epi32(1); + let r = _mm_srlv_epi32(a, count); + let e = _mm_set1_epi32(1); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_srlv_epi32() { - let a = i32x8::splat(2); - let count = i32x8::splat(1); - let r = avx2::_mm256_srlv_epi32(a, count); - let e = i32x8::splat(1); + unsafe fn test_mm256_srlv_epi32() { + let a = _mm256_set1_epi32(2); + let count = _mm256_set1_epi32(1); + let r = _mm256_srlv_epi32(a, count); + let e = _mm256_set1_epi32(1); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm_srlv_epi64() { - let a = i64x2::splat(2); - let count = i64x2::splat(1); - let r = avx2::_mm_srlv_epi64(a, count); - let e = i64x2::splat(1); + unsafe fn test_mm_srlv_epi64() { + let a = _mm_set1_epi64x(2); + let count = _mm_set1_epi64x(1); + let r = _mm_srlv_epi64(a, count); + 
let e = _mm_set1_epi64x(1); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_srlv_epi64() { - let a = i64x4::splat(2); - let count = i64x4::splat(1); - let r = avx2::_mm256_srlv_epi64(a, count); - let e = i64x4::splat(1); + unsafe fn test_mm256_srlv_epi64() { + let a = _mm256_set1_epi64x(2); + let count = _mm256_set1_epi64x(1); + let r = _mm256_srlv_epi64(a, count); + let e = _mm256_set1_epi64x(1); assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_sub_epi16() { - let a = i16x16::splat(4); - let b = i16x16::splat(2); - let r = avx2::_mm256_sub_epi16(a, b); + unsafe fn test_mm256_sub_epi16() { + let a = _mm256_set1_epi16(4); + let b = _mm256_set1_epi16(2); + let r = _mm256_sub_epi16(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_sub_epi32() { - let a = i32x8::splat(4); - let b = i32x8::splat(2); - let r = avx2::_mm256_sub_epi32(a, b); + unsafe fn test_mm256_sub_epi32() { + let a = _mm256_set1_epi32(4); + let b = _mm256_set1_epi32(2); + let r = _mm256_sub_epi32(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_sub_epi64() { - let a = i64x4::splat(4); - let b = i64x4::splat(2); - let r = avx2::_mm256_sub_epi64(a, b); + unsafe fn test_mm256_sub_epi64() { + let a = _mm256_set1_epi64x(4); + let b = _mm256_set1_epi64x(2); + let r = _mm256_sub_epi64(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_sub_epi8() { - let a = i8x32::splat(4); - let b = i8x32::splat(2); - let r = avx2::_mm256_sub_epi8(a, b); + unsafe fn test_mm256_sub_epi8() { + let a = _mm256_set1_epi8(4); + let b = _mm256_set1_epi8(2); + let r = _mm256_sub_epi8(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_subs_epi16() { - let a = i16x16::splat(4); - let b = i16x16::splat(2); - let r = avx2::_mm256_subs_epi16(a, b); + unsafe fn test_mm256_subs_epi16() { + let a = _mm256_set1_epi16(4); + let b = _mm256_set1_epi16(2); + let r = _mm256_subs_epi16(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_subs_epi8() { - let a = i8x32::splat(4); - let b = i8x32::splat(2); - let r = avx2::_mm256_subs_epi8(a, b); + unsafe fn test_mm256_subs_epi8() { + let a = _mm256_set1_epi8(4); + let b = _mm256_set1_epi8(2); + let r = _mm256_subs_epi8(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_subs_epu16() { - let a = u16x16::splat(4); - let b = u16x16::splat(2); - let r = avx2::_mm256_subs_epu16(a, b); + unsafe fn test_mm256_subs_epu16() { + let a = _mm256_set1_epi16(4); + let b = _mm256_set1_epi16(2); + let r = _mm256_subs_epu16(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_subs_epu8() { - let a = u8x32::splat(4); - let b = u8x32::splat(2); - let r = avx2::_mm256_subs_epu8(a, b); + unsafe fn test_mm256_subs_epu8() { + let a = _mm256_set1_epi8(4); + let b = _mm256_set1_epi8(2); + let r = _mm256_subs_epu8(a, b); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_xor_si256() { - let a = __m256i::from(i8x32::splat(5)); - let b = __m256i::from(i8x32::splat(3)); - let r = avx2::_mm256_xor_si256(a, b); - assert_eq!(r, __m256i::from(i8x32::splat(6))); + unsafe fn test_mm256_xor_si256() { + let a = __m256i::from(_mm256_set1_epi8(5)); + let b = __m256i::from(_mm256_set1_epi8(3)); + let r = _mm256_xor_si256(a, b); + assert_eq!(r, __m256i::from(_mm256_set1_epi8(6))); } #[simd_test = "avx2"] - unsafe fn _mm256_alignr_epi8() { + unsafe fn test_mm256_alignr_epi8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i8x32::new( + let a = _mm256_setr_epi8( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ); #[cfg_attr(rustfmt, rustfmt_skip)] - let b = i8x32::new( + let b = _mm256_setr_epi8( -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, ); - let r = avx2::_mm256_alignr_epi8(a, b, 33); - assert_eq!(r, i8x32::splat(0)); + let r = _mm256_alignr_epi8(a, b, 33); + assert_eq!(r, _mm256_set1_epi8(0)); - let r = avx2::_mm256_alignr_epi8(a, b, 17); + let r = _mm256_alignr_epi8(a, b, 17); #[cfg_attr(rustfmt, rustfmt_skip)] - let expected = i8x32::new( + let expected = _mm256_setr_epi8( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, @@ -4433,18 +4597,18 @@ mod tests { assert_eq!(r, expected); #[cfg_attr(rustfmt, rustfmt_skip)] - let expected = i8x32::new( + let expected = _mm256_setr_epi8( -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ); - let r = avx2::_mm256_alignr_epi8(a, b, 16); + let r = _mm256_alignr_epi8(a, b, 16); assert_eq!(r, expected); - let r = avx2::_mm256_alignr_epi8(a, b, 15); + let r = _mm256_alignr_epi8(a, b, 15); #[cfg_attr(rustfmt, rustfmt_skip)] - let expected = i8x32::new( + let expected = _mm256_setr_epi8( -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, 1, 2, 3, 4, 5, 6, 7, @@ -4452,146 +4616,146 @@ mod tests { ); assert_eq!(r, expected); - let r = avx2::_mm256_alignr_epi8(a, b, 0); + let r = _mm256_alignr_epi8(a, b, 0); assert_eq!(r, b); } #[simd_test = "avx2"] - unsafe fn _mm256_shuffle_epi8() { + unsafe fn test_mm256_shuffle_epi8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = u8x32::new( + let a = _mm256_setr_epi8( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ); #[cfg_attr(rustfmt, rustfmt_skip)] - let b = u8x32::new( - 4, 128, 4, 3, 24, 12, 6, 19, + let b = _mm256_setr_epi8( + 4, 128u8 as i8, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0, - 4, 128, 4, 3, 24, 12, 6, 19, + 4, 128u8 as i8, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0, ); #[cfg_attr(rustfmt, rustfmt_skip)] - let expected = u8x32::new( + let expected = _mm256_setr_epi8( 5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1, 21, 0, 21, 20, 25, 29, 23, 20, 29, 22, 22, 27, 21, 18, 25, 17, ); - let r = avx2::_mm256_shuffle_epi8(a, b); + let r = _mm256_shuffle_epi8(a, b); assert_eq!(r, expected); } #[simd_test = "avx2"] - unsafe fn _mm256_permutevar8x32_epi32() { - let a = u32x8::new(100, 200, 300, 400, 500, 600, 700, 800); - let b = u32x8::new(5, 0, 5, 1, 7, 6, 3, 4); - let expected = u32x8::new(600, 100, 600, 200, 800, 700, 400, 500); - let r = avx2::_mm256_permutevar8x32_epi32(a, b); + unsafe fn test_mm256_permutevar8x32_epi32() { + let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800); + let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4); + let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500); + let r = _mm256_permutevar8x32_epi32(a, b); assert_eq!(r, expected); } #[simd_test = "avx2"] - unsafe fn _mm256_permute4x64_epi64() { - let a = i64x4::new(100, 200, 300, 400); - let expected = i64x4::new(400, 100, 200, 100); - let r = avx2::_mm256_permute4x64_epi64(a, 0b00010011); + unsafe fn test_mm256_permute4x64_epi64() { + let a = _mm256_setr_epi64x(100, 200, 300, 400); + let expected = _mm256_setr_epi64x(400, 100, 200, 100); + let r = _mm256_permute4x64_epi64(a, 0b00010011); assert_eq!(r, 
expected); } #[simd_test = "avx2"] - unsafe fn _mm256_permute2x128_si256() { - let a = __m256i::from(i64x4::new(100, 200, 500, 600)); - let b = __m256i::from(i64x4::new(300, 400, 700, 800)); - let r = avx2::_mm256_permute2x128_si256(a, b, 0b00_01_00_11); - let e = i64x4::new(700, 800, 500, 600); - assert_eq!(i64x4::from(r), e); + unsafe fn test_mm256_permute2x128_si256() { + let a = _mm256_setr_epi64x(100, 200, 500, 600); + let b = _mm256_setr_epi64x(300, 400, 700, 800); + let r = _mm256_permute2x128_si256(a, b, 0b00_01_00_11); + let e = _mm256_setr_epi64x(700, 800, 500, 600); + assert_eq!(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_permute4x64_pd() { - let a = f64x4::new(1., 2., 3., 4.); - let r = avx2::_mm256_permute4x64_pd(a, 0b00_01_00_11); - let e = f64x4::new(4., 1., 2., 1.); - assert_eq!(r, e); + unsafe fn test_mm256_permute4x64_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let r = _mm256_permute4x64_pd(a, 0b00_01_00_11); + let e = _mm256_setr_pd(4., 1., 2., 1.); + assert_eq_m256d(r, e); } #[simd_test = "avx2"] - unsafe fn _mm256_permutevar8x32_ps() { - let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.); - let b = i32x8::new(5, 0, 5, 1, 7, 6, 3, 4); - let r = avx2::_mm256_permutevar8x32_ps(a, b); - let e = f32x8::new(6., 1., 6., 2., 8., 7., 4., 5.); - assert_eq!(r, e); + unsafe fn test_mm256_permutevar8x32_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4); + let r = _mm256_permutevar8x32_ps(a, b); + let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.); + assert_eq_m256(r, e); } #[simd_test = "avx2"] - unsafe fn _mm_i32gather_epi32() { + unsafe fn test_mm_i32gather_epi32() { let mut arr = [0i32; 128]; for i in 0..128i32 { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm_i32gather_epi32( + let r = _mm_i32gather_epi32( arr.as_ptr(), - i32x4::new(0, 16, 32, 48), + _mm_setr_epi32(0, 16, 32, 48), 4, ); - assert_eq!(r, i32x4::new(0, 16, 32, 48)); + assert_eq!(r, _mm_setr_epi32(0, 16, 32, 48)); } #[simd_test = "avx2"] - unsafe fn _mm_mask_i32gather_epi32() { + unsafe fn test_mm_mask_i32gather_epi32() { let mut arr = [0i32; 128]; for i in 0..128i32 { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm_mask_i32gather_epi32( - i32x4::splat(256), + let r = _mm_mask_i32gather_epi32( + _mm_set1_epi32(256), arr.as_ptr(), - i32x4::new(0, 16, 64, 96), - i32x4::new(-1, -1, -1, 0), + _mm_setr_epi32(0, 16, 64, 96), + _mm_setr_epi32(-1, -1, -1, 0), 4, ); - assert_eq!(r, i32x4::new(0, 16, 64, 256)); + assert_eq!(r, _mm_setr_epi32(0, 16, 64, 256)); } #[simd_test = "avx2"] - unsafe fn _mm256_i32gather_epi32() { + unsafe fn test_mm256_i32gather_epi32() { let mut arr = [0i32; 128]; for i in 0..128i32 { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm256_i32gather_epi32( + let r = _mm256_i32gather_epi32( arr.as_ptr(), - i32x8::new(0, 16, 32, 48, 1, 2, 3, 4), + _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4), 4, ); - assert_eq!(r, i32x8::new(0, 16, 32, 48, 1, 2, 3, 4)); + assert_eq!(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4)); } #[simd_test = "avx2"] - unsafe fn _mm256_mask_i32gather_epi32() { + unsafe fn test_mm256_mask_i32gather_epi32() { let mut arr = [0i32; 128]; for i in 0..128i32 { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm256_mask_i32gather_epi32( - i32x8::splat(256), + let r = _mm256_mask_i32gather_epi32( + _mm256_set1_epi32(256), arr.as_ptr(), - i32x8::new(0, 16, 64, 96, 0, 0, 0, 0), 
- i32x8::new(-1, -1, -1, 0, 0, 0, 0, 0), + _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0), + _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0), 4, ); - assert_eq!(r, i32x8::new(0, 16, 64, 256, 256, 256, 256, 256)); + assert_eq!(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256)); } #[simd_test = "avx2"] - unsafe fn _mm_i32gather_ps() { + unsafe fn test_mm_i32gather_ps() { let mut arr = [0.0f32; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4600,12 +4764,12 @@ mod tests { } // A multiplier of 4 is word-addressing for f32s let r = - avx2::_mm_i32gather_ps(arr.as_ptr(), i32x4::new(0, 16, 32, 48), 4); - assert_eq!(r, f32x4::new(0.0, 16.0, 32.0, 48.0)); + _mm_i32gather_ps(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0)); } #[simd_test = "avx2"] - unsafe fn _mm_mask_i32gather_ps() { + unsafe fn test_mm_mask_i32gather_ps() { let mut arr = [0.0f32; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4613,18 +4777,18 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm_mask_i32gather_ps( - f32x4::splat(256.0), + let r = _mm_mask_i32gather_ps( + _mm_set1_ps(256.0), arr.as_ptr(), - i32x4::new(0, 16, 64, 96), - f32x4::new(-1.0, -1.0, -1.0, 0.0), + _mm_setr_epi32(0, 16, 64, 96), + _mm_setr_ps(-1.0, -1.0, -1.0, 0.0), 4, ); - assert_eq!(r, f32x4::new(0.0, 16.0, 64.0, 256.0)); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0)); } #[simd_test = "avx2"] - unsafe fn _mm256_i32gather_ps() { + unsafe fn test_mm256_i32gather_ps() { let mut arr = [0.0f32; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4632,16 +4796,16 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm256_i32gather_ps( + let r = _mm256_i32gather_ps( arr.as_ptr(), - i32x8::new(0, 16, 32, 48, 1, 2, 3, 4), + _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4), 4, ); - assert_eq!(r, f32x8::new(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0)); + assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0)); } #[simd_test = "avx2"] - unsafe fn _mm256_mask_i32gather_ps() { + unsafe fn test_mm256_mask_i32gather_ps() { let mut arr = [0.0f32; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4649,85 +4813,85 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm256_mask_i32gather_ps( - f32x8::splat(256.0), + let r = _mm256_mask_i32gather_ps( + _mm256_set1_ps(256.0), arr.as_ptr(), - i32x8::new(0, 16, 64, 96, 0, 0, 0, 0), - f32x8::new(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0), + _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0), + _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0), 4, ); - assert_eq!( + assert_eq_m256( r, - f32x8::new(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0) + _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0) ); } #[simd_test = "avx2"] - unsafe fn _mm_i32gather_epi64() { + unsafe fn test_mm_i32gather_epi64() { let mut arr = [0i64; 128]; for i in 0..128i64 { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm_i32gather_epi64( + let r = _mm_i32gather_epi64( arr.as_ptr(), - i32x4::new(0, 16, 0, 0), + _mm_setr_epi32(0, 16, 0, 0), 8, ); - assert_eq!(r, i64x2::new(0, 16)); + assert_eq!(r, _mm_setr_epi64x(0, 16)); } #[simd_test = "avx2"] - unsafe fn _mm_mask_i32gather_epi64() { + unsafe fn test_mm_mask_i32gather_epi64() { let mut arr = [0i64; 128]; for i in 0..128i64 { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm_mask_i32gather_epi64( - 
i64x2::splat(256), + let r = _mm_mask_i32gather_epi64( + _mm_set1_epi64x(256), arr.as_ptr(), - i32x4::new(16, 16, 16, 16), - i64x2::new(-1, 0), + _mm_setr_epi32(16, 16, 16, 16), + _mm_setr_epi64x(-1, 0), 8, ); - assert_eq!(r, i64x2::new(16, 256)); + assert_eq!(r, _mm_setr_epi64x(16, 256)); } #[simd_test = "avx2"] - unsafe fn _mm256_i32gather_epi64() { + unsafe fn test_mm256_i32gather_epi64() { let mut arr = [0i64; 128]; for i in 0..128i64 { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm256_i32gather_epi64( + let r = _mm256_i32gather_epi64( arr.as_ptr(), - i32x4::new(0, 16, 32, 48), + _mm_setr_epi32(0, 16, 32, 48), 8, ); - assert_eq!(r, i64x4::new(0, 16, 32, 48)); + assert_eq!(r, _mm256_setr_epi64x(0, 16, 32, 48)); } #[simd_test = "avx2"] - unsafe fn _mm256_mask_i32gather_epi64() { + unsafe fn test_mm256_mask_i32gather_epi64() { let mut arr = [0i64; 128]; for i in 0..128i64 { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm256_mask_i32gather_epi64( - i64x4::splat(256), + let r = _mm256_mask_i32gather_epi64( + _mm256_set1_epi64x(256), arr.as_ptr(), - i32x4::new(0, 16, 64, 96), - i64x4::new(-1, -1, -1, 0), + _mm_setr_epi32(0, 16, 64, 96), + _mm256_setr_epi64x(-1, -1, -1, 0), 8, ); - assert_eq!(r, i64x4::new(0, 16, 64, 256)); + assert_eq!(r, _mm256_setr_epi64x(0, 16, 64, 256)); } #[simd_test = "avx2"] - unsafe fn _mm_i32gather_pd() { + unsafe fn test_mm_i32gather_pd() { let mut arr = [0.0f64; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4736,12 +4900,12 @@ mod tests { } // A multiplier of 8 is word-addressing for f64s let r = - avx2::_mm_i32gather_pd(arr.as_ptr(), i32x4::new(0, 16, 0, 0), 8); - assert_eq!(r, f64x2::new(0.0, 16.0)); + _mm_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8); + assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0)); } #[simd_test = "avx2"] - unsafe fn _mm_mask_i32gather_pd() { + unsafe fn test_mm_mask_i32gather_pd() { let mut arr = [0.0f64; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4749,18 +4913,18 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm_mask_i32gather_pd( - f64x2::splat(256.0), + let r = _mm_mask_i32gather_pd( + _mm_set1_pd(256.0), arr.as_ptr(), - i32x4::new(16, 16, 16, 16), - f64x2::new(-1.0, 0.0), + _mm_setr_epi32(16, 16, 16, 16), + _mm_setr_pd(-1.0, 0.0), 8, ); - assert_eq!(r, f64x2::new(16.0, 256.0)); + assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0)); } #[simd_test = "avx2"] - unsafe fn _mm256_i32gather_pd() { + unsafe fn test_mm256_i32gather_pd() { let mut arr = [0.0f64; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4768,16 +4932,16 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm256_i32gather_pd( + let r = _mm256_i32gather_pd( arr.as_ptr(), - i32x4::new(0, 16, 32, 48), + _mm_setr_epi32(0, 16, 32, 48), 8, ); - assert_eq!(r, f64x4::new(0.0, 16.0, 32.0, 48.0)); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0)); } #[simd_test = "avx2"] - unsafe fn _mm256_mask_i32gather_pd() { + unsafe fn test_mm256_mask_i32gather_pd() { let mut arr = [0.0f64; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4785,78 +4949,78 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm256_mask_i32gather_pd( - f64x4::splat(256.0), + let r = _mm256_mask_i32gather_pd( + _mm256_set1_pd(256.0), arr.as_ptr(), - i32x4::new(0, 16, 64, 96), - f64x4::new(-1.0, -1.0, -1.0, 0.0), + _mm_setr_epi32(0, 16, 64, 96), + _mm256_setr_pd(-1.0, -1.0, -1.0, 
0.0), 8, ); - assert_eq!(r, f64x4::new(0.0, 16.0, 64.0, 256.0)); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0)); } #[simd_test = "avx2"] - unsafe fn _mm_i64gather_epi32() { + unsafe fn test_mm_i64gather_epi32() { let mut arr = [0i32; 128]; for i in 0..128i32 { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm_i64gather_epi32(arr.as_ptr(), i64x2::new(0, 16), 4); - assert_eq!(r, i32x4::new(0, 16, 0, 0)); + let r = _mm_i64gather_epi32(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4); + assert_eq!(r, _mm_setr_epi32(0, 16, 0, 0)); } #[simd_test = "avx2"] - unsafe fn _mm_mask_i64gather_epi32() { + unsafe fn test_mm_mask_i64gather_epi32() { let mut arr = [0i32; 128]; for i in 0..128i32 { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm_mask_i64gather_epi32( - i32x4::splat(256), + let r = _mm_mask_i64gather_epi32( + _mm_set1_epi32(256), arr.as_ptr(), - i64x2::new(0, 16), - i32x4::new(-1, 0, -1, 0), + _mm_setr_epi64x(0, 16), + _mm_setr_epi32(-1, 0, -1, 0), 4, ); - assert_eq!(r, i32x4::new(0, 256, 0, 0)); + assert_eq!(r, _mm_setr_epi32(0, 256, 0, 0)); } #[simd_test = "avx2"] - unsafe fn _mm256_i64gather_epi32() { + unsafe fn test_mm256_i64gather_epi32() { let mut arr = [0i32; 128]; for i in 0..128i32 { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm256_i64gather_epi32( + let r = _mm256_i64gather_epi32( arr.as_ptr(), - i64x4::new(0, 16, 32, 48), + _mm256_setr_epi64x(0, 16, 32, 48), 4, ); - assert_eq!(r, i32x4::new(0, 16, 32, 48)); + assert_eq!(r, _mm_setr_epi32(0, 16, 32, 48)); } #[simd_test = "avx2"] - unsafe fn _mm256_mask_i64gather_epi32() { + unsafe fn test_mm256_mask_i64gather_epi32() { let mut arr = [0i32; 128]; for i in 0..128i32 { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm256_mask_i64gather_epi32( - i32x4::splat(256), + let r = _mm256_mask_i64gather_epi32( + _mm_set1_epi32(256), arr.as_ptr(), - i64x4::new(0, 16, 64, 96), - i32x4::new(-1, -1, -1, 0), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm_setr_epi32(-1, -1, -1, 0), 4, ); - assert_eq!(r, i32x4::new(0, 16, 64, 256)); + assert_eq!(r, _mm_setr_epi32(0, 16, 64, 256)); } #[simd_test = "avx2"] - unsafe fn _mm_i64gather_ps() { + unsafe fn test_mm_i64gather_ps() { let mut arr = [0.0f32; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4864,12 +5028,12 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm_i64gather_ps(arr.as_ptr(), i64x2::new(0, 16), 4); - assert_eq!(r, f32x4::new(0.0, 16.0, 0.0, 0.0)); + let r = _mm_i64gather_ps(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0)); } #[simd_test = "avx2"] - unsafe fn _mm_mask_i64gather_ps() { + unsafe fn test_mm_mask_i64gather_ps() { let mut arr = [0.0f32; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4877,18 +5041,18 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm_mask_i64gather_ps( - f32x4::splat(256.0), + let r = _mm_mask_i64gather_ps( + _mm_set1_ps(256.0), arr.as_ptr(), - i64x2::new(0, 16), - f32x4::new(-1.0, 0.0, -1.0, 0.0), + _mm_setr_epi64x(0, 16), + _mm_setr_ps(-1.0, 0.0, -1.0, 0.0), 4, ); - assert_eq!(r, f32x4::new(0.0, 256.0, 0.0, 0.0)); + assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0)); } #[simd_test = "avx2"] - unsafe fn _mm256_i64gather_ps() { + unsafe fn test_mm256_i64gather_ps() { let mut arr = [0.0f32; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4896,16 +5060,16 @@ mod tests { j += 1.0; } // 
A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm256_i64gather_ps( + let r = _mm256_i64gather_ps( arr.as_ptr(), - i64x4::new(0, 16, 32, 48), + _mm256_setr_epi64x(0, 16, 32, 48), 4, ); - assert_eq!(r, f32x4::new(0.0, 16.0, 32.0, 48.0)); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0)); } #[simd_test = "avx2"] - unsafe fn _mm256_mask_i64gather_ps() { + unsafe fn test_mm256_mask_i64gather_ps() { let mut arr = [0.0f32; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4913,78 +5077,78 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm256_mask_i64gather_ps( - f32x4::splat(256.0), + let r = _mm256_mask_i64gather_ps( + _mm_set1_ps(256.0), arr.as_ptr(), - i64x4::new(0, 16, 64, 96), - f32x4::new(-1.0, -1.0, -1.0, 0.0), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm_setr_ps(-1.0, -1.0, -1.0, 0.0), 4, ); - assert_eq!(r, f32x4::new(0.0, 16.0, 64.0, 256.0)); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0)); } #[simd_test = "avx2"] - unsafe fn _mm_i64gather_epi64() { + unsafe fn test_mm_i64gather_epi64() { let mut arr = [0i64; 128]; for i in 0..128i64 { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm_i64gather_epi64(arr.as_ptr(), i64x2::new(0, 16), 8); - assert_eq!(r, i64x2::new(0, 16)); + let r = _mm_i64gather_epi64(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8); + assert_eq!(r, _mm_setr_epi64x(0, 16)); } #[simd_test = "avx2"] - unsafe fn _mm_mask_i64gather_epi64() { + unsafe fn test_mm_mask_i64gather_epi64() { let mut arr = [0i64; 128]; for i in 0..128i64 { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm_mask_i64gather_epi64( - i64x2::splat(256), + let r = _mm_mask_i64gather_epi64( + _mm_set1_epi64x(256), arr.as_ptr(), - i64x2::new(16, 16), - i64x2::new(-1, 0), + _mm_setr_epi64x(16, 16), + _mm_setr_epi64x(-1, 0), 8, ); - assert_eq!(r, i64x2::new(16, 256)); + assert_eq!(r, _mm_setr_epi64x(16, 256)); } #[simd_test = "avx2"] - unsafe fn _mm256_i64gather_epi64() { + unsafe fn test_mm256_i64gather_epi64() { let mut arr = [0i64; 128]; for i in 0..128i64 { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm256_i64gather_epi64( + let r = _mm256_i64gather_epi64( arr.as_ptr(), - i64x4::new(0, 16, 32, 48), + _mm256_setr_epi64x(0, 16, 32, 48), 8, ); - assert_eq!(r, i64x4::new(0, 16, 32, 48)); + assert_eq!(r, _mm256_setr_epi64x(0, 16, 32, 48)); } #[simd_test = "avx2"] - unsafe fn _mm256_mask_i64gather_epi64() { + unsafe fn test_mm256_mask_i64gather_epi64() { let mut arr = [0i64; 128]; for i in 0..128i64 { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm256_mask_i64gather_epi64( - i64x4::splat(256), + let r = _mm256_mask_i64gather_epi64( + _mm256_set1_epi64x(256), arr.as_ptr(), - i64x4::new(0, 16, 64, 96), - i64x4::new(-1, -1, -1, 0), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm256_setr_epi64x(-1, -1, -1, 0), 8, ); - assert_eq!(r, i64x4::new(0, 16, 64, 256)); + assert_eq!(r, _mm256_setr_epi64x(0, 16, 64, 256)); } #[simd_test = "avx2"] - unsafe fn _mm_i64gather_pd() { + unsafe fn test_mm_i64gather_pd() { let mut arr = [0.0f64; 128]; let mut j = 0.0; for i in 0..128usize { @@ -4992,12 +5156,12 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm_i64gather_pd(arr.as_ptr(), i64x2::new(0, 16), 8); - assert_eq!(r, f64x2::new(0.0, 16.0)); + let r = _mm_i64gather_pd(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8); + assert_eq_m128d(r, _mm_setr_pd(0.0, 
16.0)); } #[simd_test = "avx2"] - unsafe fn _mm_mask_i64gather_pd() { + unsafe fn test_mm_mask_i64gather_pd() { let mut arr = [0.0f64; 128]; let mut j = 0.0; for i in 0..128usize { @@ -5005,18 +5169,18 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm_mask_i64gather_pd( - f64x2::splat(256.0), + let r = _mm_mask_i64gather_pd( + _mm_set1_pd(256.0), arr.as_ptr(), - i64x2::new(16, 16), - f64x2::new(-1.0, 0.0), + _mm_setr_epi64x(16, 16), + _mm_setr_pd(-1.0, 0.0), 8, ); - assert_eq!(r, f64x2::new(16.0, 256.0)); + assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0)); } #[simd_test = "avx2"] - unsafe fn _mm256_i64gather_pd() { + unsafe fn test_mm256_i64gather_pd() { let mut arr = [0.0f64; 128]; let mut j = 0.0; for i in 0..128usize { @@ -5024,16 +5188,16 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm256_i64gather_pd( + let r = _mm256_i64gather_pd( arr.as_ptr(), - i64x4::new(0, 16, 32, 48), + _mm256_setr_epi64x(0, 16, 32, 48), 8, ); - assert_eq!(r, f64x4::new(0.0, 16.0, 32.0, 48.0)); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0)); } #[simd_test = "avx2"] - unsafe fn _mm256_mask_i64gather_pd() { + unsafe fn test_mm256_mask_i64gather_pd() { let mut arr = [0.0f64; 128]; let mut j = 0.0; for i in 0..128usize { @@ -5041,68 +5205,68 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm256_mask_i64gather_pd( - f64x4::splat(256.0), + let r = _mm256_mask_i64gather_pd( + _mm256_set1_pd(256.0), arr.as_ptr(), - i64x4::new(0, 16, 64, 96), - f64x4::new(-1.0, -1.0, -1.0, 0.0), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0), 8, ); - assert_eq!(r, f64x4::new(0.0, 16.0, 64.0, 256.0)); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0)); } #[simd_test = "avx"] - unsafe fn _mm256_extract_epi8() { + unsafe fn test_mm256_extract_epi8() { #[cfg_attr(rustfmt, rustfmt_skip)] - let a = i8x32::new( + let a = _mm256_setr_epi8( -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ); - let r1 = avx2::_mm256_extract_epi8(a, 0); - let r2 = avx2::_mm256_extract_epi8(a, 35); + let r1 = _mm256_extract_epi8(a, 0); + let r2 = _mm256_extract_epi8(a, 35); assert_eq!(r1, -1); assert_eq!(r2, 3); } #[simd_test = "avx2"] - unsafe fn _mm256_extract_epi16() { + unsafe fn test_mm256_extract_epi16() { let a = - i16x16::new(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r1 = avx2::_mm256_extract_epi16(a, 0); - let r2 = avx2::_mm256_extract_epi16(a, 19); + _mm256_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r1 = _mm256_extract_epi16(a, 0); + let r2 = _mm256_extract_epi16(a, 19); assert_eq!(r1, -1); assert_eq!(r2, 3); } #[simd_test = "avx2"] - unsafe fn _mm256_extract_epi32() { - let a = i32x8::new(-1, 1, 2, 3, 4, 5, 6, 7); - let r1 = avx2::_mm256_extract_epi32(a, 0); - let r2 = avx2::_mm256_extract_epi32(a, 11); + unsafe fn test_mm256_extract_epi32() { + let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7); + let r1 = _mm256_extract_epi32(a, 0); + let r2 = _mm256_extract_epi32(a, 11); assert_eq!(r1, -1); assert_eq!(r2, 3); } #[simd_test = "avx2"] - unsafe fn _mm256_extract_epi64() { - let a = i64x4::new(0, 1, 2, 3); - let r = avx2::_mm256_extract_epi64(a, 3); + unsafe fn test_mm256_extract_epi64() { + let a = _mm256_setr_epi64x(0, 1, 2, 3); + let r = _mm256_extract_epi64(a, 3); assert_eq!(r, 3); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtsd_f64() { - let a = 
f64x4::new(1., 2., 3., 4.); - let r = avx2::_mm256_cvtsd_f64(a); + unsafe fn test_mm256_cvtsd_f64() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let r = _mm256_cvtsd_f64(a); assert_eq!(r, 1.); } #[simd_test = "avx2"] - unsafe fn _mm256_cvtsi256_si32() { - let a = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let r = avx2::_mm256_cvtsi256_si32(a); + unsafe fn test_mm256_cvtsi256_si32() { + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_cvtsi256_si32(a); assert_eq!(r, 1); } }
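
The migrated floating-point tests above compare their results with `assert_eq_m128`, `assert_eq_m128d`, `assert_eq_m256` and `assert_eq_m256d`, helpers whose definitions fall outside the hunks shown in this patch. As a point of reference only, here is a minimal sketch of what such helpers could look like, assuming the vendor types (`__m128`, `__m128d`, `__m256`, `__m256d`) are in scope (e.g. via `use x86::*;` as in the module itself) and that bit-exact lane comparison is acceptable; the tests only shuffle and gather existing values, so no rounding is involved:

use core::mem;

// Assumed helpers: the tests in this patch call these but do not define them
// here. Each one reinterprets a vendor vector type as a plain array of the
// same size and compares it lane by lane.
unsafe fn assert_eq_m128(a: __m128, b: __m128) {
    let a: [f32; 4] = mem::transmute(a);
    let b: [f32; 4] = mem::transmute(b);
    assert_eq!(a, b);
}

unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
    let a: [f64; 2] = mem::transmute(a);
    let b: [f64; 2] = mem::transmute(b);
    assert_eq!(a, b);
}

unsafe fn assert_eq_m256(a: __m256, b: __m256) {
    let a: [f32; 8] = mem::transmute(a);
    let b: [f32; 8] = mem::transmute(b);
    assert_eq!(a, b);
}

unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) {
    let a: [f64; 4] = mem::transmute(a);
    let b: [f64; 4] = mem::transmute(b);
    assert_eq!(a, b);
}

The crate's actual helpers may instead use comparison intrinsics such as `_mm256_cmp_ps` followed by `_mm256_movemask_ps`; either approach suffices for these tests, which expect bit-exact results.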