From c9a400e488ed250f45fbe67c7d647d661d3021e2 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Fri, 26 May 2017 18:08:55 -0500 Subject: [PATCH 01/12] start adding avx2 --- src/x86/avx2.rs | 164 +++++++++++++++++++++++++++++++++++++++++++++++- src/x86/mod.rs | 2 + 2 files changed, 163 insertions(+), 3 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 0a4588b178..2a29b48e17 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -1,3 +1,161 @@ -use simd::*; -use v128::*; -use v64::*; +use v256::*; + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_abs_epi32(a: i32x8) -> i32x8 { + unsafe { pabsd(a) } +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_abs_epi16(a: i16x16) -> i16x16 { + unsafe { pabsw(a) } +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_abs_epi8(a: i8x32) -> i8x32 { + unsafe { pabsb(a) } +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 { + a + b +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 { + a + b +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 { + a + b +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { + a + b +} + + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx2.pabs.b"] + fn pabsb(a: i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.pabs.w"] + fn pabsw(a: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.pabs.d"] + fn pabsd(a: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.padds_b"] + fn paddsb(a:i8x32,b:i8x32) -> i8x32; + + +} + + +#[cfg(test)] +mod tests { + use v256::*; + use x86::avx2; + use std; + + #[test] + #[target_feature = "+avx2"] + fn _mm_256_abs_epi32() { + let a = i32x8::new(0, 1, -1, std::i32::MAX, + std::i32::MIN + 1, 100, -100, -32); + let r = avx2::_mm256_abs_epi32(a); + let e = i32x8::new(0, 1, 1, 
std::i32::MAX, + (std::i32::MIN + 1).abs(), 100, 100, 32); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm_256_abs_epi16() { + let a = i16x16::new(0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i16::MAX, + std::i16::MIN + 1, 100, -100, -32); + let r = avx2::_mm256_abs_epi16(a); + let e = i16x16::new(0, 1, 1, 2, + 2, 3, 3, 4, + 4, 5, 5, std::i16::MAX, + (std::i16::MIN + 1).abs(), 100, 100, 32); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm_256_abs_epi8() { + let a = i8x32::new(0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i8::MAX, + std::i8::MIN + 1, 100, -100, -32, + 0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i8::MAX, + std::i8::MIN + 1, 100, -100, -32); + let r = avx2::_mm256_abs_epi8(a); + let e = i8x32::new(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn __mm_256_add_eip64() { + let a = i64x4::new(-10, 0, 100, 1_000_000_000); + let b = i64x4::new(-1, 0, 1, 2); + let r = avx2::_mm256_add_epi64(a, b); + let e = i64x4::new(-11, 0, 101, 1_000_000_002); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn __mm_256_add_eip32() { + let a = i32x8::new(-1, 0, 1, 2, 3, 4, 5, 6); + let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r = avx2::_mm256_add_epi32(a, b); + let e = i32x8::new(0, 2, 4, 6, 8, 10, 12, 14); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn __mm_256_add_eip16() { + let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + let b = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + let r = avx2::_mm256_add_epi16(a, b); + let e = i16x16::new(0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn __mm_256_add_eip8() { + let a = i8x32::new(0, 1, 2, 
3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); + let b = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); + let r = avx2::_mm256_add_epi8(a, b); + let e = i8x32::new(0, 2, 4, 6, 8, 10, 12, 14, 16, + 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, + 50, 52, 54, 56, 58, 60,62); + assert_eq!(r, e); + } + +} diff --git a/src/x86/mod.rs b/src/x86/mod.rs index 610bf657d0..45ba6e158d 100644 --- a/src/x86/mod.rs +++ b/src/x86/mod.rs @@ -2,6 +2,7 @@ pub use self::sse::*; pub use self::sse2::*; pub use self::ssse3::*; pub use self::sse42::*; +pub use self::avx2::*; #[allow(non_camel_case_types)] pub type __m128i = ::v128::i8x16; @@ -10,3 +11,4 @@ mod sse; mod sse2; mod ssse3; mod sse42; +mod avx2; \ No newline at end of file From b052dc5a5522dd73902e3d01506b67dbc671a7b5 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Fri, 26 May 2017 19:14:06 -0500 Subject: [PATCH 02/12] test for adds_epi8 --- src/x86/avx2.rs | 104 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 87 insertions(+), 17 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 2a29b48e17..91b4236d47 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -42,6 +42,31 @@ pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { a + b } +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 { + unsafe { paddsb(a,b) } +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { paddsw(a,b) } +} + + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 { + unsafe { paddusb(a,b) } +} + +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_adds_epu6(a: u16x16, b: u16x16) -> u16x16 { + unsafe { paddusw(a,b) } +} + #[allow(improper_ctypes)] extern "C" { @@ -51,8 
+76,14 @@ extern "C" { fn pabsw(a: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pabs.d"] fn pabsd(a: i32x8) -> i32x8; - #[link_name = "llvm.x86.avx2.padds_b"] + #[link_name = "llvm.x86.avx2.padds.b"] fn paddsb(a:i8x32,b:i8x32) -> i8x32; + #[link_name = "llvm.x86.avx2.padds.w"] + fn paddsw(a:i16x16,b:i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.paddus.b"] + fn paddusb(a:u8x32,b:u8x32) -> u8x32; + #[link_name = "llvm.x86.avx2.paddus.w"] + fn paddusw(a:u16x16,b:u16x16) -> u16x16; } @@ -108,7 +139,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn __mm_256_add_eip64() { + fn _mm_256_add_epi64() { let a = i64x4::new(-10, 0, 100, 1_000_000_000); let b = i64x4::new(-1, 0, 1, 2); let r = avx2::_mm256_add_epi64(a, b); @@ -118,7 +149,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn __mm_256_add_eip32() { + fn _mm_256_add_epi32() { let a = i32x8::new(-1, 0, 1, 2, 3, 4, 5, 6); let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); let r = avx2::_mm256_add_epi32(a, b); @@ -128,7 +159,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn __mm_256_add_eip16() { + fn _mm_256_add_epi16() { let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, @@ -141,21 +172,60 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn __mm_256_add_eip8() { - let a = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31); - let b = i8x32::new(0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31); + fn _mm_256_add_epi8() { + let a = i8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); + let b = i8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); let r = avx2::_mm256_add_epi8(a, b); - let e = i8x32::new(0, 
2, 4, 6, 8, 10, 12, 14, 16, - 18, 20, 22, 24, 26, 28, 30, 32, - 34, 36, 38, 40, 42, 44, 46, 48, - 50, 52, 54, 56, 58, 60,62); + let e = i8x32::new( + 0, 2, 4, 6, 8, 10, 12, 14, 16, + 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, + 50, 52, 54, 56, 58, 60,62); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm_256_adds_epi8() { + let a = i8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); + let b = i8x32::new( + 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47, + 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63); + let r = avx2::_mm256_adds_epi8(a, b); + let e = i8x32::new( + 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62, + 64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94); assert_eq!(r, e); } + #[test] + #[target_feature = "+avx2"] + fn _mm_adds_epi8_saturate_positive() { + let a = i8x32::splat(0x7F); + let b = i8x32::splat(1); + let r = avx2::_mm256_adds_epi8(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm_adds_epi8_saturate_negative() { + let a = i8x32::splat(-0x80); + let b = i8x32::splat(-1); + let r = avx2::_mm256_adds_epi8(a, b); + assert_eq!(r, a); + } + + + } From e280aa4dcd2a8b3469b0070266fcdff4453a0f26 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Fri, 26 May 2017 19:21:19 -0500 Subject: [PATCH 03/12] all adds tests --- src/x86/avx2.rs | 105 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 94 insertions(+), 11 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 91b4236d47..c7caab83a1 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -63,7 +63,7 @@ pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 { #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_adds_epu6(a: u16x16, b: u16x16) -> u16x16 { +pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { unsafe { paddusw(a,b) } } @@ -97,7 +97,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_abs_epi32() { 
+ fn _mm256_abs_epi32() { let a = i32x8::new(0, 1, -1, std::i32::MAX, std::i32::MIN + 1, 100, -100, -32); let r = avx2::_mm256_abs_epi32(a); @@ -108,7 +108,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_abs_epi16() { + fn _mm256_abs_epi16() { let a = i16x16::new(0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, std::i16::MAX, @@ -123,7 +123,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_abs_epi8() { + fn _mm256_abs_epi8() { let a = i8x32::new(0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, std::i8::MAX, @@ -139,7 +139,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_add_epi64() { + fn _mm256_add_epi64() { let a = i64x4::new(-10, 0, 100, 1_000_000_000); let b = i64x4::new(-1, 0, 1, 2); let r = avx2::_mm256_add_epi64(a, b); @@ -149,7 +149,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_add_epi32() { + fn _mm256_add_epi32() { let a = i32x8::new(-1, 0, 1, 2, 3, 4, 5, 6); let b = i32x8::new(1, 2, 3, 4, 5, 6, 7, 8); let r = avx2::_mm256_add_epi32(a, b); @@ -159,7 +159,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_add_epi16() { + fn _mm256_add_epi16() { let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, @@ -172,7 +172,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_add_epi8() { + fn _mm256_add_epi8() { let a = i8x32::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -194,7 +194,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_256_adds_epi8() { + fn _mm256_adds_epi8() { let a = i8x32::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); @@ -210,7 +210,7 @@ mod tests { #[test] #[target_feature = "+avx2"] - fn _mm_adds_epi8_saturate_positive() { + fn _mm256_adds_epi8_saturate_positive() { let a = i8x32::splat(0x7F); let b = i8x32::splat(1); let r = avx2::_mm256_adds_epi8(a, b); @@ -219,13 +219,96 @@ mod tests { #[test] #[target_feature = 
"+avx2"] - fn _mm_adds_epi8_saturate_negative() { + fn _mm256_adds_epi8_saturate_negative() { let a = i8x32::splat(-0x80); let b = i8x32::splat(-1); let r = avx2::_mm256_adds_epi8(a, b); assert_eq!(r, a); } + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epi16() { + let a = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = i16x16::new( + 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47); + let r = avx2::_mm256_adds_epi16(a, b); + let e = i16x16::new( + 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62); + + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epi16_saturate_positive() { + let a = i16x16::splat(0x7FFF); + let b = i16x16::splat(1); + let r = avx2::_mm256_adds_epi16(a, b); + assert_eq!(r, a); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epi16_saturate_negative() { + let a = i16x16::splat(-0x8000); + let b = i16x16::splat(-1); + let r = avx2::_mm256_adds_epi16(a, b); + assert_eq!(r, a); + } + + //-------------- + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epu8() { + let a = u8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); + let b = u8x32::new( + 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47, + 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63); + let r = avx2::_mm256_adds_epu8(a, b); + let e = u8x32::new( + 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62, + 64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epu8_saturate() { + let a = u8x32::splat(0xFF); + let b = u8x32::splat(1); + let r = avx2::_mm256_adds_epu8(a, b); + assert_eq!(r, a); + } + + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epu16() { + let a = u16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = u16x16::new( + 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47); + let r = avx2::_mm256_adds_epu16(a, b); + 
let e = u16x16::new( + 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62); + + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_adds_epu16_saturate() { + let a = u16x16::splat(0xFFFF); + let b = u16x16::splat(1); + let r = avx2::_mm256_adds_epu16(a, b); + assert_eq!(r, a); + } + + } From 0d653b50eae3f7b7b9e8e03e63a7077ebf89a1fe Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sat, 27 May 2017 10:47:18 -0500 Subject: [PATCH 04/12] doc comments --- src/x86/avx2.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index c7caab83a1..372850bdd7 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -1,66 +1,76 @@ use v256::*; +/// Computes the absolute values of packed 32-bit integers in `a`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_abs_epi32(a: i32x8) -> i32x8 { unsafe { pabsd(a) } } +/// Computes the absolute values of packed 16-bit integers in `a`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_abs_epi16(a: i16x16) -> i16x16 { unsafe { pabsw(a) } } +/// Computes the absolute values of packed 8-bit integers in `a`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_abs_epi8(a: i8x32) -> i8x32 { unsafe { pabsb(a) } } +/// Add packed 64-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_add_epi64(a: i64x4, b: i64x4) -> i64x4 { a + b } +/// Add packed 32-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_add_epi32(a: i32x8, b: i32x8) -> i32x8 { a + b } +/// Add packed 16-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_add_epi16(a: i16x16, b: i16x16) -> i16x16 { a + b } +/// Add packed 8-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { a + b } +/// Add packed 8-bit integers in `a` and `b` using saturation. 
#[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 { unsafe { paddsb(a,b) } } +/// Add packed 16-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 { unsafe { paddsw(a,b) } } - +/// Add packed unsigned 8-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 { unsafe { paddusb(a,b) } } +/// Add packed unsigned 16-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { @@ -257,8 +267,7 @@ mod tests { let r = avx2::_mm256_adds_epi16(a, b); assert_eq!(r, a); } - - //-------------- + #[test] #[target_feature = "+avx2"] fn _mm256_adds_epu8() { From fef7664eb953676d53aeb983910aff8e43949509 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sat, 27 May 2017 11:01:12 -0500 Subject: [PATCH 05/12] and and andnot --- src/x86/avx2.rs | 31 ++++++++++++++++++++++++++++++- src/x86/mod.rs | 2 ++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 372850bdd7..73baf0e330 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -1,4 +1,5 @@ use v256::*; +use x86::__m256i; /// Computes the absolute values of packed 32-bit integers in `a`. #[inline(always)] @@ -77,6 +78,22 @@ pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { unsafe { paddusw(a,b) } } +/// Compute the bitwise AND of 256 bits (representing integer data) +/// in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_and_si256(a: __m256i, b:__m256i) -> __m256i { + a & b +} + +/// Compute the bitwise NOT of 256 bits (representing integer data) +/// in `a` and then AND with `b`. 
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_andnot_si256(a: __m256i, b:__m256i) -> __m256i { + (!a) & b +} + #[allow(improper_ctypes)] extern "C" { @@ -103,6 +120,7 @@ extern "C" { mod tests { use v256::*; use x86::avx2; + use x86::__m256i; use std; #[test] @@ -316,8 +334,19 @@ mod tests { let r = avx2::_mm256_adds_epu16(a, b); assert_eq!(r, a); } - + #[test] + fn _mm_and_si256() { + assert_eq!( + avx2::_mm256_and_si256(__m256i::splat(5), __m256i::splat(3)), + __m256i::splat(1)); + } + #[test] + fn _mm_andnot_si256() { + assert_eq!( + avx2::_mm256_andnot_si256(__m256i::splat(5), __m256i::splat(3)), + __m256i::splat(2)); + } } diff --git a/src/x86/mod.rs b/src/x86/mod.rs index 45ba6e158d..d36fa4444d 100644 --- a/src/x86/mod.rs +++ b/src/x86/mod.rs @@ -6,6 +6,8 @@ pub use self::avx2::*; #[allow(non_camel_case_types)] pub type __m128i = ::v128::i8x16; +#[allow(non_camel_case_types)] +pub type __m256i = ::v256::i8x32; mod sse; mod sse2; From ef8252caa5dcc92d40fe5403ea3e6bab4821bf9a Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sat, 27 May 2017 11:12:57 -0500 Subject: [PATCH 06/12] avg --- src/x86/avx2.rs | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 73baf0e330..93da86a6f1 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -94,6 +94,19 @@ pub fn _mm256_andnot_si256(a: __m256i, b:__m256i) -> __m256i { (!a) & b } +/// Average packed unsigned 16-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_avg_epu16 (a:u16x16,b:u16x16) -> u16x16 { + unsafe { pavgw(a,b) } +} + +/// Average packed unsigned 8-bit integers in `a` and `b`. 
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_avg_epu8 (a:u8x32,b:u8x32) -> u8x32 { + unsafe { pavgb(a,b) } +} #[allow(improper_ctypes)] extern "C" { @@ -111,8 +124,10 @@ extern "C" { fn paddusb(a:u8x32,b:u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.paddus.w"] fn paddusw(a:u16x16,b:u16x16) -> u16x16; - - + #[link_name = "llvm.x86.avx2.pavg.b"] + fn pavgb(a:u8x32,b:u8x32) -> u8x32; + #[link_name = "llvm.x86.avx2.pavg.w"] + fn pavgw(a:u16x16,b:u16x16) -> u16x16; } @@ -336,17 +351,35 @@ mod tests { } #[test] - fn _mm_and_si256() { + #[target_feature = "+avx2"] + fn _mm256_and_si256() { assert_eq!( avx2::_mm256_and_si256(__m256i::splat(5), __m256i::splat(3)), __m256i::splat(1)); } #[test] - fn _mm_andnot_si256() { + #[target_feature = "+avx2"] + fn _mm256_andnot_si256() { assert_eq!( avx2::_mm256_andnot_si256(__m256i::splat(5), __m256i::splat(3)), __m256i::splat(2)); } + #[test] + #[target_feature = "+avx2"] + fn _mm256_avg_epu8() { + let (a, b) = (u8x32::splat(3), u8x32::splat(9)); + let r = avx2::_mm256_avg_epu8(a, b); + assert_eq!(r, u8x32::splat(6)); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_avg_epu16() { + let (a, b) = (u16x16::splat(3), u16x16::splat(9)); + let r = avx2::_mm256_avg_epu16(a, b); + assert_eq!(r, u16x16::splat(6)); + } + } From 9d393ba3df40035792a178b6cc3712b1eefd7f46 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sat, 27 May 2017 12:23:17 -0500 Subject: [PATCH 07/12] blendv --- src/x86/avx2.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 93da86a6f1..1b4d5c9315 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -108,6 +108,13 @@ pub fn _mm256_avg_epu8 (a:u8x32,b:u8x32) -> u8x32 { unsafe { pavgb(a,b) } } +/// Blend packed 8-bit integers from `a` and `b` using `mask`. 
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { + unsafe { pblendvb(a,b,mask) } +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -128,6 +135,8 @@ extern "C" { fn pavgb(a:u8x32,b:u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.pavg.w"] fn pavgw(a:u16x16,b:u16x16) -> u16x16; + #[link_name = "llvm.x86.avx2.pblendvb"] + fn pblendvb(a:i8x32,b:i8x32,mask:__m256i) -> i8x32; } @@ -382,4 +391,14 @@ mod tests { assert_eq!(r, u16x16::splat(6)); } + #[test] + #[target_feature = "+avx2"] + fn _mm256_blendv_epi8() { + let (a,b) = (i8x32::splat(4),i8x32::splat(2)); + let mask = i8x32::splat(0).replace(2,-1); + let e = i8x32::splat(4).replace(2,2); + let r= avx2::_mm256_blendv_epi8(a,b,mask); + assert_eq!(r,e); + } + } From 77c2560d99f53922a338b25c98edf411ce383ff9 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 07:08:16 -0500 Subject: [PATCH 08/12] cmpeq and cmpgt --- src/x86/avx2.rs | 127 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 1b4d5c9315..2fc803809c 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -115,6 +115,64 @@ pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { unsafe { pblendvb(a,b,mask) } } +/// Compare packed 64-bit integers in `a` and `b` for equality. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpeq_epi64(a:i64x4,b:i64x4) -> i64x4 { + a.eq(b) +} + +/// Compare packed 32-bit integers in `a` and `b` for equality. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpeq_epi32(a:i32x8,b:i32x8) -> i32x8 { + a.eq(b) +} + +/// Compare packed 16-bit integers in `a` and `b` for equality. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpeq_epi16(a:i16x16,b:i16x16) -> i16x16 { + a.eq(b) +} + +/// Compare packed 8-bit integers in `a` and `b` for equality. 
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpeq_epi8(a:i8x32,b:i8x32) -> i8x32 { + a.eq(b) +} + +/// Compare packed 64-bit integers in `a` and `b` for greater-than. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpgt_epi64(a:i64x4,b:i64x4) -> i64x4 { + a.gt(b) +} + +/// Compare packed 32-bit integers in `a` and `b` for greater-than. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpgt_epi32(a:i32x8,b:i32x8) -> i32x8 { + a.gt(b) +} + +/// Compare packed 16-bit integers in `a` and `b` for greater-than. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpgt_epi16(a:i16x16,b:i16x16) -> i16x16 { + a.gt(b) +} + +/// Compare packed 8-bit integers in `a` and `b` for greater-than. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_cmpgt_epi8(a:i8x32,b:i8x32) -> i8x32 { + a.gt(b) +} + + + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -401,4 +459,73 @@ mod tests { assert_eq!(r,e); } + + #[test] + fn _mm256_cmpeq_epi8() { + let a = i8x32::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); + let b = i8x32::new( + 31,30,2,28,27,26,25,24,23,22,21,20,19,18,17,16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let r = avx2::_mm256_cmpeq_epi8(a, b); + assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8)); + } + + #[test] + fn _mm256_cmpeq_epi16() { + let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7,8,9,10,11,12,13,14,15); + let b = i16x16::new(15,14,2,12,11,10,9,8,7, 6, 5, 4, 3, 2, 1, 0); + let r = avx2::_mm256_cmpeq_epi16(a, b); + assert_eq!(r, i16x16::splat(0).replace(2, 0xFFFFu16 as i16)); + } + + #[test] + fn _mm256_cmpeq_epi32() { + let a = i32x8::new(0, 1, 2, 3,4,5,6,7); + let b = i32x8::new(7,6,2,4,3, 2, 1, 0); + let r = avx2::_mm256_cmpeq_epi32(a, b); + assert_eq!(r, i32x8::splat(0).replace(2, 0xFFFFFFFFu32 as i32)); + } + + #[test] + fn _mm256_cmpeq_epi64() { + let a = i64x4::new(0, 1, 2, 3); + 
let b = i64x4::new(3, 2, 2, 0); + let r = avx2::_mm256_cmpeq_epi64(a, b); + assert_eq!(r, i64x4::splat(0).replace(2, 0xFFFFFFFFFFFFFFFFu64 as i64)); + } + + #[test] + fn _mm256_cmpgt_epi8() { + let a = i8x32::splat(0).replace(0, 5); + let b = i8x32::splat(0); + let r = avx2::_mm256_cmpgt_epi8(a, b); + assert_eq!(r, i8x32::splat(0).replace(0, 0xFFu8 as i8)); + } + + #[test] + fn _mm256_cmpgt_epi16() { + let a = i16x16::splat(0).replace(0, 5); + let b = i16x16::splat(0); + let r = avx2::_mm256_cmpgt_epi16(a, b); + assert_eq!(r, i16x16::splat(0).replace(0, 0xFFFFu16 as i16)); + } + + #[test] + fn _mm256_cmpgt_epi32() { + let a = i32x8::splat(0).replace(0, 5); + let b = i32x8::splat(0); + let r = avx2::_mm256_cmpgt_epi32(a, b); + assert_eq!(r, i32x8::splat(0).replace(0, 0xFFFFFFFFu32 as i32)); + } + + #[test] + fn _mm256_cmpgt_epi64() { + let a = i64x4::splat(0).replace(0, 5); + let b = i64x4::splat(0); + let r = avx2::_mm256_cmpgt_epi64(a, b); + assert_eq!(r, i64x4::splat(0).replace(0, 0xFFFFFFFFFFFFFFFFu64 as i64)); + } + } From 3c3a2da6f2f3e17900d64b31540c64660aca43f5 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 07:09:46 -0500 Subject: [PATCH 09/12] todo comments --- src/x86/avx2.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 2fc803809c..df2b145b3a 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -108,6 +108,8 @@ pub fn _mm256_avg_epu8 (a:u8x32,b:u8x32) -> u8x32 { unsafe { pavgb(a,b) } } +// TODO alignr + /// Blend packed 8-bit integers from `a` and `b` using `mask`. #[inline(always)] #[target_feature = "+avx2"] @@ -115,6 +117,10 @@ pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { unsafe { pblendvb(a,b,mask) } } +// TODO rest of blend + +// TODO broadcast + /// Compare packed 64-bit integers in `a` and `b` for equality. 
#[inline(always)] #[target_feature = "+avx2"] From 2070eb504d93046d3d5b3129717877bd0d7d11d0 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 09:26:55 -0500 Subject: [PATCH 10/12] style fixes --- src/x86/avx2.rs | 76 +++++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index df2b145b3a..9d20890bf5 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -54,35 +54,35 @@ pub fn _mm256_add_epi8(a: i8x32, b: i8x32) -> i8x32 { #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epi8(a: i8x32, b: i8x32) -> i8x32 { - unsafe { paddsb(a,b) } + unsafe { paddsb(a, b) } } /// Add packed 16-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epi16(a: i16x16, b: i16x16) -> i16x16 { - unsafe { paddsw(a,b) } + unsafe { paddsw(a, b) } } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epu8(a: u8x32, b: u8x32) -> u8x32 { - unsafe { paddusb(a,b) } + unsafe { paddusb(a, b) } } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. #[inline(always)] #[target_feature = "+avx2"] pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { - unsafe { paddusw(a,b) } + unsafe { paddusw(a, b) } } /// Compute the bitwise AND of 256 bits (representing integer data) /// in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_and_si256(a: __m256i, b:__m256i) -> __m256i { +pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { a & b } @@ -90,25 +90,28 @@ pub fn _mm256_and_si256(a: __m256i, b:__m256i) -> __m256i { /// in `a` and then AND with `b`. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_andnot_si256(a: __m256i, b:__m256i) -> __m256i { +pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { (!a) & b } /// Average packed unsigned 16-bit integers in `a` and `b`. 
#[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_avg_epu16 (a:u16x16,b:u16x16) -> u16x16 { - unsafe { pavgw(a,b) } +pub fn _mm256_avg_epu16 (a: u16x16, b: u16x16) -> u16x16 { + unsafe { pavgw(a, b) } } /// Average packed unsigned 8-bit integers in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_avg_epu8 (a:u8x32,b:u8x32) -> u8x32 { - unsafe { pavgb(a,b) } +pub fn _mm256_avg_epu8 (a: u8x32, b: u8x32) -> u8x32 { + unsafe { pavgb(a, b) } } -// TODO alignr +// TODO _mm256_alignr_epi8 +// TODO _mm256_blend_epi16 +// TODO _mm_blend_epi32 +// TODO _mm256_blend_epi32 /// Blend packed 8-bit integers from `a` and `b` using `mask`. #[inline(always)] @@ -117,68 +120,80 @@ pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { unsafe { pblendvb(a,b,mask) } } -// TODO rest of blend +// TODO _mm_broadcastb_epi8 +// TODO _mm256_broadcastb_epi8 +// TODO _mm_broadcastd_epi32 +// TODO _mm256_broadcastd_epi32 +// TODO _mm_broadcastq_epi64 +// TODO _mm256_broadcastq_epi64 +// TODO _mm_broadcastsd_pd +// TODO _mm256_broadcastsd_pd +// TODO _mm_broadcastsi128_si256 +// TODO _mm256_broadcastsi128_si256 +// TODO _mm_broadcastss_ps +// TODO _mm256_broadcastss_ps +// TODO _mm_broadcastw_epi16 +// TODO _mm256_broadcastw_epi16 +// TODO _mm256_bslli_epi128 +// TODO _mm256_bsrli_epi128 -// TODO broadcast /// Compare packed 64-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpeq_epi64(a:i64x4,b:i64x4) -> i64x4 { +pub fn _mm256_cmpeq_epi64(a: i64x4, b: i64x4) -> i64x4 { a.eq(b) } /// Compare packed 32-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpeq_epi32(a:i32x8,b:i32x8) -> i32x8 { +pub fn _mm256_cmpeq_epi32(a: i32x8, b: i32x8) -> i32x8 { a.eq(b) } /// Compare packed 16-bit integers in `a` and `b` for equality. 
#[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpeq_epi16(a:i16x16,b:i16x16) -> i16x16 { +pub fn _mm256_cmpeq_epi16(a: i16x16, b: i16x16) -> i16x16 { a.eq(b) } /// Compare packed 8-bit integers in `a` and `b` for equality. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpeq_epi8(a:i8x32,b:i8x32) -> i8x32 { +pub fn _mm256_cmpeq_epi8(a: i8x32, b: i8x32) -> i8x32 { a.eq(b) } /// Compare packed 64-bit integers in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpgt_epi64(a:i64x4,b:i64x4) -> i64x4 { +pub fn _mm256_cmpgt_epi64(a: i64x4, b: i64x4) -> i64x4 { a.gt(b) } /// Compare packed 32-bit integers in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpgt_epi32(a:i32x8,b:i32x8) -> i32x8 { +pub fn _mm256_cmpgt_epi32(a: i32x8, b: i32x8) -> i32x8 { a.gt(b) } /// Compare packed 16-bit integers in `a` and `b` for greater-than. #[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpgt_epi16(a:i16x16,b:i16x16) -> i16x16 { +pub fn _mm256_cmpgt_epi16(a: i16x16, b: i16x16) -> i16x16 { a.gt(b) } /// Compare packed 8-bit integers in `a` and `b` for greater-than. 
#[inline(always)] #[target_feature = "+avx2"] -pub fn _mm256_cmpgt_epi8(a:i8x32,b:i8x32) -> i8x32 { +pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 { a.gt(b) } - - #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -188,19 +203,19 @@ extern "C" { #[link_name = "llvm.x86.avx2.pabs.d"] fn pabsd(a: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.padds.b"] - fn paddsb(a:i8x32,b:i8x32) -> i8x32; + fn paddsb(a: i8x32, b: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.padds.w"] - fn paddsw(a:i16x16,b:i16x16) -> i16x16; + fn paddsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.paddus.b"] - fn paddusb(a:u8x32,b:u8x32) -> u8x32; + fn paddusb(a: u8x32, b: u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.paddus.w"] - fn paddusw(a:u16x16,b:u16x16) -> u16x16; + fn paddusw(a: u16x16, b: u16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pavg.b"] - fn pavgb(a:u8x32,b:u8x32) -> u8x32; + fn pavgb(a: u8x32, b: u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.pavg.w"] - fn pavgw(a:u16x16,b:u16x16) -> u16x16; + fn pavgw(a: u16x16, b: u16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pblendvb"] - fn pblendvb(a:i8x32,b:i8x32,mask:__m256i) -> i8x32; + fn pblendvb(a: i8x32, b: i8x32, mask: __m256i) -> i8x32; } @@ -465,7 +480,6 @@ mod tests { assert_eq!(r,e); } - #[test] fn _mm256_cmpeq_epi8() { let a = i8x32::new( From 922dacd44b424ed537d8a11ca73b3f0d62ac731b Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 12:26:53 -0500 Subject: [PATCH 11/12] update TODO, fix styles, add hadd and hsub --- TODO.md | 389 ++++++++++++++++++++++++++++++++++++++++++++++++ src/x86/avx2.rs | 316 ++++++++++++++++++++++++++++----------- 2 files changed, 621 insertions(+), 84 deletions(-) diff --git a/TODO.md b/TODO.md index 42ef6e43bb..2764cd8d97 100644 --- a/TODO.md +++ b/TODO.md @@ -5,6 +5,7 @@ Intel intrinsics. Replace `SSE4.2` with the intended type. 
rg '^> TODO.md ``` +rg calls the ripgrep tool, which can be installed with `cargo install ripgrep` sse --- @@ -535,3 +536,391 @@ sse4.2 * [ ] `_mm_crc32_u16` * [ ] `_mm_crc32_u32` * [ ] `_mm_crc32_u64` + + +avx +--- +* [ ] `_mm256_add_pd` +* [ ] `_mm256_add_ps` +* [ ] `_mm256_addsub_pd` +* [ ] `_mm256_addsub_ps` +* [ ] `_mm256_and_pd` +* [ ] `_mm256_and_ps` +* [ ] `_mm256_andnot_pd` +* [ ] `_mm256_andnot_ps` +* [ ] `_mm256_blend_pd` +* [ ] `_mm256_blend_ps` +* [ ] `_mm256_blendv_pd` +* [ ] `_mm256_blendv_ps` +* [ ] `_mm256_div_pd` +* [ ] `_mm256_div_ps` +* [ ] `_mm256_dp_ps` +* [ ] `_mm256_hadd_pd` +* [ ] `_mm256_hadd_ps` +* [ ] `_mm256_hsub_pd` +* [ ] `_mm256_hsub_ps` +* [ ] `_mm256_max_pd` +* [ ] `_mm256_max_ps` +* [ ] `_mm256_min_pd` +* [ ] `_mm256_min_ps` +* [ ] `_mm256_mul_pd` +* [ ] `_mm256_mul_ps` +* [ ] `_mm256_or_pd` +* [ ] `_mm256_or_ps` +* [ ] `_mm256_shuffle_pd` +* [ ] `_mm256_shuffle_ps` +* [ ] `_mm256_sub_pd` +* [ ] `_mm256_sub_ps` +* [ ] `_mm256_xor_pd` +* [ ] `_mm256_xor_ps` +* [ ] `_mm_cmp_pd` +* [ ] `_mm256_cmp_pd` +* [ ] `_mm_cmp_ps` +* [ ] `_mm256_cmp_ps` +* [ ] `_mm_cmp_sd` +* [ ] `_mm_cmp_ss` +* [ ] `_mm256_cvtepi32_pd` +* [ ] `_mm256_cvtepi32_ps` +* [ ] `_mm256_cvtpd_ps` +* [ ] `_mm256_cvtps_epi32` +* [ ] `_mm256_cvtps_pd` +* [ ] `_mm256_cvttpd_epi32` +* [ ] `_mm256_cvtpd_epi32` +* [ ] `_mm256_cvttps_epi32` +* [ ] `_mm256_extractf128_ps` +* [ ] `_mm256_extractf128_pd` +* [ ] `_mm256_extractf128_si256` +* [ ] `_mm256_extract_epi8` +* [ ] `_mm256_extract_epi16` +* [ ] `_mm256_extract_epi32` +* [ ] `_mm256_extract_epi64` +* [ ] `_mm256_zeroall` +* [ ] `_mm256_zeroupper` +* [ ] `_mm256_permutevar_ps` +* [ ] `_mm_permutevar_ps` +* [ ] `_mm256_permute_ps` +* [ ] `_mm_permute_ps` +* [ ] `_mm256_permutevar_pd` +* [ ] `_mm_permutevar_pd` +* [ ] `_mm256_permute_pd` +* [ ] `_mm_permute_pd` +* [ ] `_mm256_permute2f128_ps` +* [ ] `_mm256_permute2f128_pd` +* [ ] `_mm256_permute2f128_si256` +* [ ] `_mm256_broadcast_ss` +* [ ] `_mm_broadcast_ss` +* [ ] 
`_mm256_broadcast_sd` +* [ ] `_mm256_broadcast_ps` +* [ ] `_mm256_broadcast_pd` +* [ ] `_mm256_insertf128_ps` +* [ ] `_mm256_insertf128_pd` +* [ ] `_mm256_insertf128_si256` +* [ ] `_mm256_insert_epi8` +* [ ] `_mm256_insert_epi16` +* [ ] `_mm256_insert_epi32` +* [ ] `_mm256_insert_epi64` +* [ ] `_mm256_load_pd` +* [ ] `_mm256_store_pd` +* [ ] `_mm256_load_ps` +* [ ] `_mm256_store_ps` +* [ ] `_mm256_loadu_pd` +* [ ] `_mm256_storeu_pd` +* [ ] `_mm256_loadu_ps` +* [ ] `_mm256_storeu_ps` +* [ ] `_mm256_load_si256` +* [ ] `_mm256_store_si256` +* [ ] `_mm256_loadu_si256` +* [ ] `_mm256_storeu_si256` +* [ ] `_mm256_maskload_pd` +* [ ] `_mm256_maskstore_pd` +* [ ] `_mm_maskload_pd` +* [ ] `_mm_maskstore_pd` +* [ ] `_mm256_maskload_ps` +* [ ] `_mm256_maskstore_ps` +* [ ] `_mm_maskload_ps` +* [ ] `_mm_maskstore_ps` +* [ ] `_mm256_movehdup_ps` +* [ ] `_mm256_moveldup_ps` +* [ ] `_mm256_movedup_pd` +* [ ] `_mm256_lddqu_si256` +* [ ] `_mm256_stream_si256` +* [ ] `_mm256_stream_pd` +* [ ] `_mm256_stream_ps` +* [ ] `_mm256_rcp_ps` +* [ ] `_mm256_rsqrt_ps` +* [ ] `_mm256_sqrt_pd` +* [ ] `_mm256_sqrt_ps` +* [ ] `_mm256_round_pd` +* [ ] `_mm256_round_ps` +* [ ] `_mm256_unpackhi_pd` +* [ ] `_mm256_unpackhi_ps` +* [ ] `_mm256_unpacklo_pd` +* [ ] `_mm256_unpacklo_ps` +* [ ] `_mm256_testz_si256` +* [ ] `_mm256_testc_si256` +* [ ] `_mm256_testnzc_si256` +* [ ] `_mm256_testz_pd` +* [ ] `_mm256_testc_pd` +* [ ] `_mm256_testnzc_pd` +* [ ] `_mm_testz_pd` +* [ ] `_mm_testc_pd` +* [ ] `_mm_testnzc_pd` +* [ ] `_mm256_testz_ps` +* [ ] `_mm256_testc_ps` +* [ ] `_mm256_testnzc_ps` +* [ ] `_mm_testz_ps` +* [ ] `_mm_testc_ps` +* [ ] `_mm_testnzc_ps` +* [ ] `_mm256_movemask_pd` +* [ ] `_mm256_movemask_ps` +* [ ] `_mm256_setzero_pd` +* [ ] `_mm256_setzero_ps` +* [ ] `_mm256_setzero_si256` +* [ ] `_mm256_set_pd` +* [ ] `_mm256_set_ps` +* [ ] `_mm256_set_epi8` +* [ ] `_mm256_set_epi16` +* [ ] `_mm256_set_epi32` +* [ ] `_mm256_set_epi64x` +* [ ] `_mm256_setr_pd` +* [ ] `_mm256_setr_ps` +* [ ] 
`_mm256_setr_epi8` +* [ ] `_mm256_setr_epi16` +* [ ] `_mm256_setr_epi32` +* [ ] `_mm256_setr_epi64x` +* [ ] `_mm256_set1_pd` +* [ ] `_mm256_set1_ps` +* [ ] `_mm256_set1_epi8` +* [ ] `_mm256_set1_epi16` +* [ ] `_mm256_set1_epi32` +* [ ] `_mm256_set1_epi64x` +* [ ] `_mm256_castpd_ps` +* [ ] `_mm256_castps_pd` +* [ ] `_mm256_castps_si256` +* [ ] `_mm256_castpd_si256` +* [ ] `_mm256_castsi256_ps` +* [ ] `_mm256_castsi256_pd` +* [ ] `_mm256_castps256_ps128` +* [ ] `_mm256_castpd256_pd128` +* [ ] `_mm256_castsi256_si128` +* [ ] `_mm256_castps128_ps256` +* [ ] `_mm256_castpd128_pd256` +* [ ] `_mm256_castsi128_si256` +* [ ] `_mm256_zextps128_ps256` +* [ ] `_mm256_zextpd128_pd256` +* [ ] `_mm256_zextsi128_si256` +* [ ] `_mm256_floor_ps` +* [ ] `_mm256_ceil_ps` +* [ ] `_mm256_floor_pd` +* [ ] `_mm256_ceil_pd` +* [ ] `_mm256_undefined_ps` +* [ ] `_mm256_undefined_pd` +* [ ] `_mm256_undefined_si256` +* [ ] `_mm256_set_m128` +* [ ] `_mm256_set_m128d` +* [ ] `_mm256_set_m128i` +* [ ] `_mm256_setr_m128` +* [ ] `_mm256_setr_m128d` +* [ ] `_mm256_setr_m128i` +* [ ] `_mm256_loadu2_m128` +* [ ] `_mm256_loadu2_m128d` +* [ ] `_mm256_loadu2_m128i` +* [ ] `_mm256_storeu2_m128` +* [ ] `_mm256_storeu2_m128d` +* [ ] `_mm256_storeu2_m128i` + + + +avx2 +---- +* [x] `_mm256_abs_epi8` +* [x] `_mm256_abs_epi16` +* [x] `_mm256_abs_epi32` +* [x] `_mm256_add_epi8` +* [x] `_mm256_add_epi16` +* [x] `_mm256_add_epi32` +* [x] `_mm256_add_epi64` +* [x] `_mm256_adds_epi8` +* [x] `_mm256_adds_epi16` +* [x] `_mm256_adds_epu8` +* [x] `_mm256_adds_epu16` +* [ ] `_mm256_alignr_epi8` +* [x] `_mm256_and_si256` +* [x] `_mm256_andnot_si256` +* [x] `_mm256_avg_epu8` +* [x] `_mm256_avg_epu16` +* [ ] `_mm256_blend_epi16` +* [ ] `_mm_blend_epi32` +* [ ] `_mm256_blend_epi32` +* [x] `_mm256_blendv_epi8` +* [ ] `_mm_broadcastb_epi8` +* [ ] `_mm256_broadcastb_epi8` +* [ ] `_mm_broadcastd_epi32` +* [ ] `_mm256_broadcastd_epi32` +* [ ] `_mm_broadcastq_epi64` +* [ ] `_mm256_broadcastq_epi64` +* [ ] `_mm_broadcastsd_pd` +* [ 
] `_mm256_broadcastsd_pd` +* [ ] `_mm_broadcastsi128_si256` +* [ ] `_mm256_broadcastsi128_si256` +* [ ] `_mm_broadcastss_ps` +* [ ] `_mm256_broadcastss_ps` +* [ ] `_mm_broadcastw_epi16` +* [ ] `_mm256_broadcastw_epi16` +* [x] `_mm256_cmpeq_epi8` +* [x] `_mm256_cmpeq_epi16` +* [x] `_mm256_cmpeq_epi32` +* [x] `_mm256_cmpeq_epi64` +* [x] `_mm256_cmpgt_epi8` +* [x] `_mm256_cmpgt_epi16` +* [x] `_mm256_cmpgt_epi32` +* [x] `_mm256_cmpgt_epi64` +* [ ] `_mm256_cvtepi16_epi32` +* [ ] `_mm256_cvtepi16_epi64` +* [ ] `_mm256_cvtepi32_epi64` +* [ ] `_mm256_cvtepi8_epi16` +* [ ] `_mm256_cvtepi8_epi32` +* [ ] `_mm256_cvtepi8_epi64` +* [ ] `_mm256_cvtepu16_epi32` +* [ ] `_mm256_cvtepu16_epi64` +* [ ] `_mm256_cvtepu32_epi64` +* [ ] `_mm256_cvtepu8_epi16` +* [ ] `_mm256_cvtepu8_epi32` +* [ ] `_mm256_cvtepu8_epi64` +* [ ] `_mm256_extracti128_si256` +* [x] `_mm256_hadd_epi16` +* [x] `_mm256_hadd_epi32` +* [x] `_mm256_hadds_epi16` +* [x] `_mm256_hsub_epi16` +* [x] `_mm256_hsub_epi32` +* [x] `_mm256_hsubs_epi16` +* [ ] `_mm_i32gather_pd` +* [ ] `_mm256_i32gather_pd` +* [ ] `_mm_i32gather_ps` +* [ ] `_mm256_i32gather_ps` +* [ ] `_mm_i32gather_epi32` +* [ ] `_mm256_i32gather_epi32` +* [ ] `_mm_i32gather_epi64` +* [ ] `_mm256_i32gather_epi64` +* [ ] `_mm_i64gather_pd` +* [ ] `_mm256_i64gather_pd` +* [ ] `_mm_i64gather_ps` +* [ ] `_mm256_i64gather_ps` +* [ ] `_mm_i64gather_epi32` +* [ ] `_mm256_i64gather_epi32` +* [ ] `_mm_i64gather_epi64` +* [ ] `_mm256_i64gather_epi64` +* [ ] `_mm256_inserti128_si256` +* [ ] `_mm256_madd_epi16` +* [ ] `_mm256_maddubs_epi16` +* [ ] `_mm_mask_i32gather_pd` +* [ ] `_mm256_mask_i32gather_pd` +* [ ] `_mm_mask_i32gather_ps` +* [ ] `_mm256_mask_i32gather_ps` +* [ ] `_mm_mask_i32gather_epi32` +* [ ] `_mm256_mask_i32gather_epi32` +* [ ] `_mm_mask_i32gather_epi64` +* [ ] `_mm256_mask_i32gather_epi64` +* [ ] `_mm_mask_i64gather_pd` +* [ ] `_mm256_mask_i64gather_pd` +* [ ] `_mm_mask_i64gather_ps` +* [ ] `_mm256_mask_i64gather_ps` +* [ ] `_mm_mask_i64gather_epi32` +* [ 
] `_mm256_mask_i64gather_epi32` +* [ ] `_mm_mask_i64gather_epi64` +* [ ] `_mm256_mask_i64gather_epi64` +* [ ] `_mm_maskload_epi32` +* [ ] `_mm256_maskload_epi32` +* [ ] `_mm_maskload_epi64` +* [ ] `_mm256_maskload_epi64` +* [ ] `_mm_maskstore_epi32` +* [ ] `_mm256_maskstore_epi32` +* [ ] `_mm_maskstore_epi64` +* [ ] `_mm256_maskstore_epi64` +* [ ] `_mm256_max_epi8` +* [ ] `_mm256_max_epi16` +* [ ] `_mm256_max_epi32` +* [ ] `_mm256_max_epu8` +* [ ] `_mm256_max_epu16` +* [ ] `_mm256_max_epu32` +* [ ] `_mm256_min_epi8` +* [ ] `_mm256_min_epi16` +* [ ] `_mm256_min_epi32` +* [ ] `_mm256_min_epu8` +* [ ] `_mm256_min_epu16` +* [ ] `_mm256_min_epu32` +* [ ] `_mm256_movemask_epi8` +* [ ] `_mm256_mpsadbw_epu8` +* [ ] `_mm256_mul_epi32` +* [ ] `_mm256_mul_epu32` +* [ ] `_mm256_mulhi_epi16` +* [ ] `_mm256_mulhi_epu16` +* [ ] `_mm256_mulhrs_epi16` +* [ ] `_mm256_mullo_epi16` +* [ ] `_mm256_mullo_epi32` +* [ ] `_mm256_or_si256` +* [ ] `_mm256_packs_epi16` +* [ ] `_mm256_packs_epi32` +* [ ] `_mm256_packus_epi16` +* [ ] `_mm256_packus_epi32` +* [ ] `_mm256_permute2x128_si256` +* [ ] `_mm256_permute4x64_epi64` +* [ ] `_mm256_permute4x64_pd` +* [ ] `_mm256_permutevar8x32_epi32` +* [ ] `_mm256_permutevar8x32_ps` +* [ ] `_mm256_sad_epu8` +* [ ] `_mm256_shuffle_epi32` +* [ ] `_mm256_shuffle_epi8` +* [ ] `_mm256_shufflehi_epi16` +* [ ] `_mm256_shufflelo_epi16` +* [ ] `_mm256_sign_epi8` +* [ ] `_mm256_sign_epi16` +* [ ] `_mm256_sign_epi32` +* [ ] `_mm256_slli_si256` +* [ ] `_mm256_bslli_epi128` +* [ ] `_mm256_sll_epi16` +* [ ] `_mm256_slli_epi16` +* [ ] `_mm256_sll_epi32` +* [ ] `_mm256_slli_epi32` +* [ ] `_mm256_sll_epi64` +* [ ] `_mm256_slli_epi64` +* [ ] `_mm_sllv_epi32` +* [ ] `_mm256_sllv_epi32` +* [ ] `_mm_sllv_epi64` +* [ ] `_mm256_sllv_epi64` +* [ ] `_mm256_sra_epi16` +* [ ] `_mm256_srai_epi16` +* [ ] `_mm256_sra_epi32` +* [ ] `_mm256_srai_epi32` +* [ ] `_mm_srav_epi32` +* [ ] `_mm256_srav_epi32` +* [ ] `_mm256_srli_si256` +* [ ] `_mm256_bsrli_epi128` +* [ ] `_mm256_srl_epi16` +* 
[ ] `_mm256_srli_epi16` +* [ ] `_mm256_srl_epi32` +* [ ] `_mm256_srli_epi32` +* [ ] `_mm256_srl_epi64` +* [ ] `_mm256_srli_epi64` +* [ ] `_mm_srlv_epi32` +* [ ] `_mm256_srlv_epi32` +* [ ] `_mm_srlv_epi64` +* [ ] `_mm256_srlv_epi64` +* [ ] `_mm256_stream_load_si256` +* [ ] `_mm256_sub_epi8` +* [ ] `_mm256_sub_epi16` +* [ ] `_mm256_sub_epi32` +* [ ] `_mm256_sub_epi64` +* [ ] `_mm256_subs_epi8` +* [ ] `_mm256_subs_epi16` +* [ ] `_mm256_subs_epu8` +* [ ] `_mm256_subs_epu16` +* [ ] `_mm256_xor_si256` +* [ ] `_mm256_unpackhi_epi8` +* [ ] `_mm256_unpackhi_epi16` +* [ ] `_mm256_unpackhi_epi32` +* [ ] `_mm256_unpackhi_epi64` +* [ ] `_mm256_unpacklo_epi8` +* [ ] `_mm256_unpacklo_epi16` +* [ ] `_mm256_unpacklo_epi32` +* [ ] `_mm256_unpacklo_epi64` diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 9d20890bf5..cbe14b1b80 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -78,7 +78,7 @@ pub fn _mm256_adds_epu16(a: u16x16, b: u16x16) -> u16x16 { unsafe { paddusw(a, b) } } -/// Compute the bitwise AND of 256 bits (representing integer data) +/// Compute the bitwise AND of 256 bits (representing integer data) /// in `a` and `b`. #[inline(always)] #[target_feature = "+avx2"] @@ -86,7 +86,7 @@ pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { a & b } -/// Compute the bitwise NOT of 256 bits (representing integer data) +/// Compute the bitwise NOT of 256 bits (representing integer data) /// in `a` and then AND with `b`. 
#[inline(always)] #[target_feature = "+avx2"] @@ -121,21 +121,21 @@ pub fn _mm256_blendv_epi8(a:i8x32,b:i8x32,mask:__m256i) -> i8x32 { } // TODO _mm_broadcastb_epi8 -// TODO _mm256_broadcastb_epi8 -// TODO _mm_broadcastd_epi32 -// TODO _mm256_broadcastd_epi32 -// TODO _mm_broadcastq_epi64 +// TODO _mm256_broadcastb_epi8 +// TODO _mm_broadcastd_epi32 +// TODO _mm256_broadcastd_epi32 +// TODO _mm_broadcastq_epi64 // TODO _mm256_broadcastq_epi64 -// TODO _mm_broadcastsd_pd -// TODO _mm256_broadcastsd_pd -// TODO _mm_broadcastsi128_si256 -// TODO _mm256_broadcastsi128_si256 -// TODO _mm_broadcastss_ps -// TODO _mm256_broadcastss_ps -// TODO _mm_broadcastw_epi16 -// TODO _mm256_broadcastw_epi16 -// TODO _mm256_bslli_epi128 -// TODO _mm256_bsrli_epi128 +// TODO _mm_broadcastsd_pd +// TODO _mm256_broadcastsd_pd +// TODO _mm_broadcastsi128_si256 +// TODO _mm256_broadcastsi128_si256 +// TODO _mm_broadcastss_ps +// TODO _mm256_broadcastss_ps +// TODO _mm_broadcastw_epi16 +// TODO _mm256_broadcastw_epi16 +// TODO _mm256_bslli_epi128 +// TODO _mm256_bsrli_epi128 /// Compare packed 64-bit integers in `a` and `b` for equality. @@ -194,6 +194,64 @@ pub fn _mm256_cmpgt_epi8(a: i8x32, b: i8x32) -> i8x32 { a.gt(b) } +// TODO _mm256_cvtepi16_epi32 +// TODO _mm256_cvtepi16_epi64 +// TODO _mm256_cvtepi32_epi64 +// TODO _mm256_cvtepi8_epi16 +// TODO _mm256_cvtepi8_epi32 +// TODO _mm256_cvtepi8_epi64 +// TODO _mm256_cvtepu16_epi32 +// TODO _mm256_cvtepu16_epi64 +// TODO _mm256_cvtepu32_epi64 +// TODO _mm256_cvtepu8_epi16 +// TODO _mm256_cvtepu8_epi32 +// TODO _mm256_cvtepu8_epi64 +// TODO _m128i _mm256_extracti128_si256 + +/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hadd_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { phaddw(a, b) } +} + +/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`. 
+#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hadd_epi32(a: i32x8, b: i32x8) -> i32x8 { + unsafe { phaddd(a, b) } +} + +/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hadds_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { phaddsw(a, b) } +} + +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hsub_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { phsubw(a, b) } +} + +/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hsub_epi32(a: i32x8, b: i32x8) -> i32x8 { + unsafe { phsubd(a, b) } +} + +/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` +/// using saturation. +#[inline(always)] +#[target_feature = "+avx2"] +pub fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { + unsafe { phsubsw(a, b) } +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx2.pabs.b"] @@ -201,7 +259,7 @@ extern "C" { #[link_name = "llvm.x86.avx2.pabs.w"] fn pabsw(a: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pabs.d"] - fn pabsd(a: i32x8) -> i32x8; + fn pabsd(a: i32x8) -> i32x8; #[link_name = "llvm.x86.avx2.padds.b"] fn paddsb(a: i8x32, b: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.padds.w"] fn paddsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.paddus.b"] fn paddusb(a: u8x32, b: u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.paddus.w"] fn paddusw(a: u16x16, b: u16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pavg.b"] fn pavgb(a: u8x32, b: u8x32) -> u8x32; #[link_name = "llvm.x86.avx2.pavg.w"] fn pavgw(a: u16x16, b: u16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pblendvb"] fn pblendvb(a: i8x32, b: i8x32, mask: __m256i) -> i8x32; + #[link_name = "llvm.x86.avx2.phadd.w"] + fn phaddw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.phadd.d"] + fn phaddd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.phadd.sw"] + fn phaddsw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.phsub.w"] + fn phsubw(a: i16x16, b: i16x16) -> i16x16; + #[link_name = "llvm.x86.avx2.phsub.d"] + 
fn phsubd(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.avx2.phsub.sw"] + fn phsubsw(a: i16x16, b: i16x16) -> i16x16; } @@ -229,42 +299,51 @@ mod tests { #[test] #[target_feature = "+avx2"] fn _mm256_abs_epi32() { - let a = i32x8::new(0, 1, -1, std::i32::MAX, - std::i32::MIN + 1, 100, -100, -32); + let a = i32x8::new( + 0, 1, -1, std::i32::MAX, + std::i32::MIN + 1, 100, -100, -32); let r = avx2::_mm256_abs_epi32(a); - let e = i32x8::new(0, 1, 1, std::i32::MAX, - (std::i32::MIN + 1).abs(), 100, 100, 32); + let e = i32x8::new( + 0, 1, 1, std::i32::MAX, + (std::i32::MIN + 1).abs(), 100, 100, 32); assert_eq!(r, e); } #[test] #[target_feature = "+avx2"] fn _mm256_abs_epi16() { - let a = i16x16::new(0, 1, -1, 2, - -2, 3, -3, 4, - -4, 5, -5, std::i16::MAX, - std::i16::MIN + 1, 100, -100, -32); + let a = i16x16::new( + 0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i16::MAX, + std::i16::MIN + 1, 100, -100, -32); let r = avx2::_mm256_abs_epi16(a); - let e = i16x16::new(0, 1, 1, 2, - 2, 3, 3, 4, - 4, 5, 5, std::i16::MAX, - (std::i16::MIN + 1).abs(), 100, 100, 32); + let e = i16x16::new( + 0, 1, 1, 2, + 2, 3, 3, 4, + 4, 5, 5, std::i16::MAX, + (std::i16::MIN + 1).abs(), 100, 100, 32); assert_eq!(r, e); } #[test] #[target_feature = "+avx2"] fn _mm256_abs_epi8() { - let a = i8x32::new(0, 1, -1, 2, - -2, 3, -3, 4, - -4, 5, -5, std::i8::MAX, - std::i8::MIN + 1, 100, -100, -32, - 0, 1, -1, 2, - -2, 3, -3, 4, - -4, 5, -5, std::i8::MAX, - std::i8::MIN + 1, 100, -100, -32); + let a = i8x32::new( + 0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i8::MAX, + std::i8::MIN + 1, 100, -100, -32, + 0, 1, -1, 2, + -2, 3, -3, 4, + -4, 5, -5, std::i8::MAX, + std::i8::MIN + 1, 100, -100, -32); let r = avx2::_mm256_abs_epi8(a); - let e = i8x32::new(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32); + let e = i8x32::new( + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, 
std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32, + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, std::i8::MAX, (std::i8::MIN + 1).abs(), 100, 100, 32); assert_eq!(r, e); } @@ -291,13 +370,16 @@ mod tests { #[test] #[target_feature = "+avx2"] fn _mm256_add_epi16() { - let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); - let b = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); + let a = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + let b = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); let r = avx2::_mm256_add_epi16(a, b); - let e = i16x16::new(0, 2, 4, 6, 8, 10, 12, 14, - 16, 18, 20, 22, 24, 26, 28, 30); + let e = i16x16::new( + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30); assert_eq!(r, e); } @@ -305,21 +387,21 @@ mod tests { #[target_feature = "+avx2"] fn _mm256_add_epi8() { let a = i8x32::new( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); let b = i8x32::new( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); let r = avx2::_mm256_add_epi8(a, b); let e = i8x32::new( - 0, 2, 4, 6, 8, 10, 12, 14, 16, - 18, 20, 22, 24, 26, 28, 30, 32, - 34, 36, 38, 40, 42, 44, 46, 48, - 50, 52, 54, 56, 58, 60,62); + 0, 2, 4, 6, 8, 10, 12, 14, 16, + 18, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, + 50, 52, 54, 56, 58, 60, 62); assert_eq!(r, e); } @@ -328,14 +410,14 @@ mod tests { fn _mm256_adds_epi8() { let a = i8x32::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); let b = i8x32::new( - 
32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47, - 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63); + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); let r = avx2::_mm256_adds_epi8(a, b); let e = i8x32::new( - 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62, - 64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94); + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94); assert_eq!(r, e); } @@ -361,13 +443,13 @@ mod tests { #[target_feature = "+avx2"] fn _mm256_adds_epi16() { let a = i16x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = i16x16::new( - 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47); - let r = avx2::_mm256_adds_epi16(a, b); + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47); + let r = avx2::_mm256_adds_epi16(a, b); let e = i16x16::new( - 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62); - + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62); + assert_eq!(r, e); } @@ -388,20 +470,20 @@ mod tests { let r = avx2::_mm256_adds_epi16(a, b); assert_eq!(r, a); } - + #[test] #[target_feature = "+avx2"] fn _mm256_adds_epu8() { let a = u8x32::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); let b = u8x32::new( - 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47, - 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63); + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); let r = avx2::_mm256_adds_epu8(a, b); let e = u8x32::new( - 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62, - 64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94); + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 
68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94); assert_eq!(r, e); } @@ -414,18 +496,18 @@ mod tests { assert_eq!(r, a); } - + #[test] #[target_feature = "+avx2"] fn _mm256_adds_epu16() { let a = u16x16::new( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let b = u16x16::new( - 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47); + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47); let r = avx2::_mm256_adds_epu16(a, b); let e = u16x16::new( - 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62); - + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62); + assert_eq!(r, e); } @@ -437,13 +519,13 @@ mod tests { let r = avx2::_mm256_adds_epu16(a, b); assert_eq!(r, a); } - + #[test] #[target_feature = "+avx2"] fn _mm256_and_si256() { assert_eq!( - avx2::_mm256_and_si256(__m256i::splat(5), __m256i::splat(3)), - __m256i::splat(1)); + avx2::_mm256_and_si256( + __m256i::splat(5), __m256i::splat(3)),__m256i::splat(1)); } #[test] @@ -484,9 +566,9 @@ mod tests { fn _mm256_cmpeq_epi8() { let a = i8x32::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31); + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); let b = i8x32::new( - 31,30,2,28,27,26,25,24,23,22,21,20,19,18,17,16, + 31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); let r = avx2::_mm256_cmpeq_epi8(a, b); assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8)); @@ -494,8 +576,10 @@ mod tests { #[test] fn _mm256_cmpeq_epi16() { - let a = i16x16::new(0, 1, 2, 3, 4, 5, 6, 7,8,9,10,11,12,13,14,15); - let b = i16x16::new(15,14,2,12,11,10,9,8,7, 6, 5, 4, 3, 2, 1, 0); + let a = i16x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = i16x16::new( + 15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); let r = avx2::_mm256_cmpeq_epi16(a, b); assert_eq!(r, 
i16x16::splat(0).replace(2, 0xFFFFu16 as i16)); } @@ -513,7 +597,8 @@ mod tests { let a = i64x4::new(0, 1, 2, 3); let b = i64x4::new(3, 2, 2, 0); let r = avx2::_mm256_cmpeq_epi64(a, b); - assert_eq!(r, i64x4::splat(0).replace(2, 0xFFFFFFFFFFFFFFFFu64 as i64)); + assert_eq!(r, i64x4::splat(0).replace( + 2, 0xFFFFFFFFFFFFFFFFu64 as i64)); } #[test] @@ -545,7 +630,70 @@ mod tests { let a = i64x4::splat(0).replace(0, 5); let b = i64x4::splat(0); let r = avx2::_mm256_cmpgt_epi64(a, b); - assert_eq!(r, i64x4::splat(0).replace(0, 0xFFFFFFFFFFFFFFFFu64 as i64)); + assert_eq!(r, i64x4::splat(0).replace( + 0, 0xFFFFFFFFFFFFFFFFu64 as i64)); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hadd_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_hadd_epi16(a, b); + let e = i16x16::new(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hadd_epi32() { + let a = i32x8::splat(2); + let b = i32x8::splat(4); + let r = avx2::_mm256_hadd_epi32(a, b); + let e = i32x8::new(4, 4, 8, 8, 4, 4, 8, 8); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hadds_epi16() { + let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,1); + let b = i16x16::splat(4); + let r = avx2::_mm256_hadds_epi16(a, b); + let e = i16x16::new( + 0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + assert_eq!(r,e); } + #[test] + #[target_feature ="+avx2"] + fn _mm256_hsub_epi16() { + let a = i16x16::splat(2); + let b = i16x16::splat(4); + let r = avx2::_mm256_hsub_epi16(a, b); + let e = i16x16::splat(0); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hsub_epi32() { + let a = i32x8::splat(2); + let b = i32x8::splat(4); + let r = avx2::_mm256_hsub_epi32(a, b); + let e = i32x8::splat(0); + assert_eq!(r,e); + } + + #[test] + #[target_feature = "+avx2"] + fn _mm256_hsubs_epi16() { + let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,-1); + let 
b = i16x16::splat(4); + let r = avx2::_mm256_hsubs_epi16(a, b); + let e = i16x16::splat(0).replace(0,0x7FFF); + assert_eq!(r,e); + } + + } From ff85180789db61824a2aac6334bfbc1fcf7e3a24 Mon Sep 17 00:00:00 2001 From: Jack Mott Date: Sun, 28 May 2017 12:34:18 -0500 Subject: [PATCH 12/12] spacing on tests --- src/x86/avx2.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index cbe14b1b80..7ec508231d 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -675,7 +675,7 @@ mod tests { assert_eq!(r,e); } - #[test] + #[test] #[target_feature = "+avx2"] fn _mm256_hsub_epi32() { let a = i32x8::splat(2); @@ -685,7 +685,7 @@ mod tests { assert_eq!(r,e); } - #[test] + #[test] #[target_feature = "+avx2"] fn _mm256_hsubs_epi16() { let a = i16x16::splat(2).replace(0,0x7FFF).replace(1,-1);