diff --git a/Cargo.toml b/Cargo.toml
index 19cf619a0fe..afc2a20d54c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,7 +41,7 @@ alloc = ["rand_core/alloc"]
 # Option: use getrandom package for seeding
 getrandom = ["rand_core/getrandom"]
 
-# Option (requires nightly): experimental SIMD support
+# Option (requires nightly Rust): experimental SIMD support
 simd_support = ["packed_simd"]
 
 # Option (enabled by default): enable StdRng
diff --git a/src/distributions/integer.rs b/src/distributions/integer.rs
index 19ce71599cb..fcd58e79264 100644
--- a/src/distributions/integer.rs
+++ b/src/distributions/integer.rs
@@ -10,12 +10,10 @@
 
 use crate::distributions::{Distribution, Standard};
 use crate::Rng;
-#[cfg(all(target_arch = "x86", feature = "simd_support"))]
-use core::arch::x86::{__m128i, __m256i};
-#[cfg(all(target_arch = "x86_64", feature = "simd_support"))]
-use core::arch::x86_64::{__m128i, __m256i};
-use core::num::{NonZeroU16, NonZeroU32, NonZeroU64, NonZeroU8, NonZeroUsize,
-    NonZeroU128};
+#[cfg(target_arch = "x86")] use core::arch::x86::{__m128i, __m256i};
+#[cfg(target_arch = "x86_64")] use core::arch::x86_64::{__m128i, __m256i};
+use core::mem;
+use core::num::{NonZeroU128, NonZeroU16, NonZeroU32, NonZeroU64, NonZeroU8, NonZeroUsize};
 #[cfg(feature = "simd_support")] use packed_simd::*;
 
 impl Distribution<u8> for Standard {
@@ -109,53 +107,60 @@ impl_nzint!(NonZeroU64, NonZeroU64::new);
 impl_nzint!(NonZeroU128, NonZeroU128::new);
 impl_nzint!(NonZeroUsize, NonZeroUsize::new);
 
+
 #[cfg(feature = "simd_support")]
-macro_rules! simd_impl {
-    ($(($intrinsic:ident, $vec:ty),)+) => {$(
-        impl Distribution<$intrinsic> for Standard {
-            #[inline]
-            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $intrinsic {
-                $intrinsic::from_bits(rng.gen::<$vec>())
+macro_rules! packed_simd_types_impl {
+    ($($ty:ty),+) => {
+        $(
+            impl Distribution<$ty> for Standard {
+                #[inline]
+                fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                    let mut vec: $ty = <$ty>::default();
+                    unsafe {
+                        let ptr = &mut vec;
+                        let b_ptr = &mut *(ptr as *mut $ty as *mut [u8; mem::size_of::<$ty>()]);
+                        rng.fill_bytes(b_ptr);
+                    }
+                    vec.to_le()
+                }
             }
-        }
-    )+};
+        )+
+    };
+}
 
-    ($bits:expr,) => {};
-    ($bits:expr, $ty:ty, $($ty_more:ty,)*) => {
-        simd_impl!($bits, $($ty_more,)*);
+#[cfg(feature = "simd_support")]
+#[rustfmt::skip]
+packed_simd_types_impl!(
+    u8x2, i8x2,
+    u8x4, i8x4, u16x2, i16x2,
+    u8x8, i8x8, u16x4, i16x4, u32x2, i32x2,
+    u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, u128x1, i128x1,
+    u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4, u128x2, i128x2,
+    u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8, u128x4, i128x4,
+    usizex2, usizex4, usizex8
+);
 
-        impl Distribution<$ty> for Standard {
-            #[inline]
-            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
-                let mut vec: $ty = Default::default();
-                unsafe {
-                    let ptr = &mut vec;
-                    let b_ptr = &mut *(ptr as *mut $ty as *mut [u8; $bits/8]);
-                    rng.fill_bytes(b_ptr);
+// x86/64 are already little endian so we don't need packed_simd's `to_le` and
+// therefore can provide this on stable Rust.
+macro_rules! intrinsic_native_le_impl {
+    ($($ty:ty),+) => {
+        $(
+            impl Distribution<$ty> for Standard {
+                #[inline]
+                fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                    // On proper hardware, this should compile to SIMD intrinsics
+                    // Verified on x86 Haswell with __m128i, __m256i
+                    let mut buf = [0_u8; mem::size_of::<$ty>()];
+                    rng.fill_bytes(&mut buf);
+                    unsafe { mem::transmute_copy(&buf) }
                 }
-                vec.to_le()
             }
-        }
+        )+
     };
 }
 
-#[cfg(feature = "simd_support")]
-simd_impl!(16, u8x2, i8x2,);
-#[cfg(feature = "simd_support")]
-simd_impl!(32, u8x4, i8x4, u16x2, i16x2,);
-#[cfg(feature = "simd_support")]
-simd_impl!(64, u8x8, i8x8, u16x4, i16x4, u32x2, i32x2,);
-#[cfg(feature = "simd_support")]
-simd_impl!(128, u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2,);
-#[cfg(feature = "simd_support")]
-simd_impl!(256, u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4,);
-#[cfg(feature = "simd_support")]
-simd_impl!(512, u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8,);
-#[cfg(all(
-    feature = "simd_support",
-    any(target_arch = "x86", target_arch = "x86_64")
-))]
-simd_impl!((__m128i, u8x16), (__m256i, u8x32),);
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+intrinsic_native_le_impl!(__m128i, __m256i);
 
 #[cfg(test)]
 mod tests {
diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs
index 11a791ef8d1..139420de1a4 100644
--- a/src/distributions/uniform.rs
+++ b/src/distributions/uniform.rs
@@ -712,6 +712,21 @@ uniform_simd_int_impl! {
     u8
 }
 
+#[cfg(feature = "simd_support")]
+uniform_simd_int_impl! {
+    (usizex2, isizex2),
+    (usizex4, isizex4),
+    (usizex8, isizex8),
+    usize
+}
+
+#[cfg(feature = "simd_support")]
+uniform_simd_int_impl! {
+    (u128x2, i128x2),
+    (u128x4, i128x4),
+    u128
+}
+
 impl SampleUniform for char {
     type Sampler = UniformChar;
 }
diff --git a/src/distributions/utils.rs b/src/distributions/utils.rs
index 89da5fd7aad..b93a4221759 100644
--- a/src/distributions/utils.rs
+++ b/src/distributions/utils.rs
@@ -202,6 +202,14 @@ mod simd_wmul {
     wmul_impl_large! { (u16x32,) u16, 8 }
     wmul_impl_large! { (u32x16,) u32, 16 }
     wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, 32 }
+    wmul_impl_large! { (u128x2, u128x4,) u128, 64 }
+
+    #[cfg(target_pointer_width = "64")]
+    wmul_impl_large! { (usizex2, usizex4, usizex8,) usize, 32 }
+    #[cfg(target_pointer_width = "32")]
+    wmul_impl! { (usizex2, u64x2), (usizex4, u64x4), (usizex8, u64x8),, 32 }
+    #[cfg(target_pointer_width = "16")]
+    wmul_impl! { (usizex2, u32x2), (usizex4, u32x4), (usizex8, u32x8),, 16 }
 }
 
 /// Helper trait when dealing with scalar and SIMD floating point types.
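
For reviewers of the unsafe code: the following is a small stand-alone sketch (not part of the patch) of the byte-fill technique that `intrinsic_native_le_impl` relies on. It fills a buffer of exactly the vector's size with random bytes and reinterprets it as the vector type; since x86/x86_64 is little endian, no byte-order fixup is needed. The `random_m128i` helper name is invented for this example, and it assumes the `rand` crate as a dependency.

// Sketch only: demonstrates fill_bytes + transmute_copy on a fixed-size buffer.
#[cfg(target_arch = "x86_64")]
fn random_m128i<R: rand::RngCore + ?Sized>(rng: &mut R) -> core::arch::x86_64::__m128i {
    use core::mem;
    // __m128i is 16 bytes wide; fill exactly that many random bytes.
    let mut buf = [0_u8; mem::size_of::<core::arch::x86_64::__m128i>()];
    rng.fill_bytes(&mut buf);
    // Safety: every 16-byte pattern is a valid __m128i value.
    unsafe { mem::transmute_copy(&buf) }
}

fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        let v = random_m128i(&mut rand::thread_rng());
        // Reinterpret as plain bytes just to print the result.
        let bytes: [u8; 16] = unsafe { core::mem::transmute(v) };
        println!("random __m128i bytes: {:02x?}", bytes);
    }
}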