diff --git a/rand_core/CHANGELOG.md b/rand_core/CHANGELOG.md
index e52cfcbecb8..ee18c40e09b 100644
--- a/rand_core/CHANGELOG.md
+++ b/rand_core/CHANGELOG.md
@@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add `Error` and `ErrorKind`. (#225)
 - Add `CryptoRng` marker trait. (#273)
 - Add `BlockRngCore` trait. (#281)
-- Add `BlockRng` wrapper to help implemtations. (#281)
+- Add `BlockRng` wrapper to help implementations. (#281)
 - Revise the `SeedableRng` trait. (#233)
 - Remove default implementations for `RngCore::next_u64` and `RngCore::fill_bytes`. (#288)
 - Add `RngCore::try_fill_bytes`. (#225)
diff --git a/rand_core/Cargo.toml b/rand_core/Cargo.toml
index e75d927fe89..f8457f4da15 100644
--- a/rand_core/Cargo.toml
+++ b/rand_core/Cargo.toml
@@ -23,6 +23,7 @@ appveyor = { repository = "alexcrichton/rand" }
 std = ["alloc"] # use std library; should be default but for above bug
 alloc = [] # enables Vec and Box support without std
 serde1 = ["serde", "serde_derive"] # enables serde for BlockRng wrapper
+simd_support = [] # enables SIMD support
 
 [dependencies]
 serde = { version = "1", optional = true }
diff --git a/rand_core/src/lib.rs b/rand_core/src/lib.rs
index 93da90208d4..003aab9c107 100644
--- a/rand_core/src/lib.rs
+++ b/rand_core/src/lib.rs
@@ -41,6 +41,7 @@
 
 #![cfg_attr(not(feature="std"), no_std)]
 #![cfg_attr(all(feature="alloc", not(feature="std")), feature(alloc))]
+#![cfg_attr(feature="simd_support", feature(stdsimd))]
 
 #[cfg(feature="std")] extern crate core;
 #[cfg(all(feature = "alloc", not(feature="std")))] extern crate alloc;
@@ -59,6 +60,8 @@ pub use error::{ErrorKind, Error};
 mod error;
 pub mod impls;
 pub mod le;
+#[cfg(feature="simd_support")]
+pub mod simd_impls;
 
 
 /// The core of a random number generator.
diff --git a/rand_core/src/simd_impls.rs b/rand_core/src/simd_impls.rs
new file mode 100644
index 00000000000..f5fd24bd733
--- /dev/null
+++ b/rand_core/src/simd_impls.rs
@@ -0,0 +1,188 @@
+use core::simd::*;
+use core::mem;
+
+/// Enables an RNG to use [`SimdRngImpls`].
+///
+/// # Example
+///
+/// A simple example, obviously not generating very *random* output:
+///
+/// ```rust
+/// #![feature(stdsimd)]
+/// use std::simd::u32x4;
+/// use rand_core::simd_impls::SimdRng;
+///
+/// struct CountingSimdRng(u32x4);
+///
+/// impl SimdRng<u32x4> for CountingSimdRng {
+///     fn generate(&mut self) -> u32x4 {
+///         self.0 += u32x4::splat(1);
+///         self.0
+///     }
+/// }
+/// ```
+///
+/// [`SimdRngImpls`]: trait.SimdRngImpls.html
+pub trait SimdRng<Vector> {
+    /// Return the next random vector.
+    #[inline(always)]
+    fn generate(&mut self) -> Vector;
+}
+
+/// Helper functions for implementing `RngCore` functions for SIMD RNGs which
+/// implement [`SimdRng`].
+///
+/// # Example
+///
+/// A simple example, using `CountingSimdRng` from the [`SimdRng`] example:
+///
+/// ```rust
+/// #![feature(stdsimd)]
+/// use std::simd::u32x4;
+/// use rand_core::{RngCore, Error};
+/// use rand_core::simd_impls::{SimdRng, SimdRngImpls};
+///
+/// struct CountingSimdRng(u32x4);
+///
+/// impl SimdRng<u32x4> for CountingSimdRng {
+///     fn generate(&mut self) -> u32x4 {
+///         self.0 += u32x4::splat(1);
+///         self.0
+///     }
+/// }
+///
+/// impl RngCore for CountingSimdRng {
+///     fn next_u32(&mut self) -> u32 {
+///         u32x4::next_u32_via_simd(self)
+///     }
+///
+///     fn next_u64(&mut self) -> u64 {
+///         u32x4::next_u64_via_simd(self)
+///     }
+///
+///     fn fill_bytes(&mut self, dest: &mut [u8]) {
+///         u32x4::fill_bytes_via_simd(self, dest)
+///     }
+///
+///     fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
+///         self.fill_bytes(dest);
+///         Ok(())
+///     }
+/// }
+/// ```
+///
+/// [`SimdRng`]: trait.SimdRng.html
+pub trait SimdRngImpls<Vector> {
+    /// Implement `next_u32` via a SIMD vector.
+    #[inline(always)]
+    fn next_u32_via_simd<R: SimdRng<Vector>>(rng: &mut R) -> u32;
+
+    /// Implement `next_u64` via a SIMD vector.
+    #[inline(always)]
+    fn next_u64_via_simd<R: SimdRng<Vector>>(rng: &mut R) -> u64;
+
+    /// Implement `fill_bytes` via SIMD vectors.
+    ///
+    /// This is useful for generating other vector types. If the code uses
+    /// it in a SIMD context, the result should stay in SIMD registers.
+    #[inline(always)]
+    fn fill_bytes_via_simd<R: SimdRng<Vector>>(rng: &mut R, dest: &mut [u8]);
+}
+
+macro_rules! impl_simd_rng {
+    ($vector:ty, $v8:ident, $v32:ident, $v64:ident) => (
+        impl SimdRngImpls<$vector> for $vector {
+            fn next_u32_via_simd<R: SimdRng<$vector>>(rng: &mut R) -> u32 {
+                $v32::from_bits(rng.generate()).extract(0)
+            }
+
+            fn next_u64_via_simd<R: SimdRng<$vector>>(rng: &mut R) -> u64 {
+                $v64::from_bits(rng.generate()).extract(0)
+            }
+
+            fn fill_bytes_via_simd<R: SimdRng<$vector>>(rng: &mut R, dest: &mut [u8]) {
+                // Forced inlining will keep the result in SIMD registers if
+                // the code using it also uses it in a SIMD context.
+                let chunk_size = mem::size_of::<$vector>();
+                let remainder = dest.len() % chunk_size;
+                let len = dest.len() - remainder;
+                let mut read_len = 0;
+                while read_len < len {
+                    // FIXME: on big-endian we should do byte swapping around
+                    // here.
+                    let results = $v8::from_bits(rng.generate());
+                    // `dest` has no alignment guarantee, so use an
+                    // unaligned store.
+                    results.store_unaligned(&mut dest[read_len..]);
+                    read_len += chunk_size;
+                }
+                if remainder > 0 {
+                    // Generate a fresh vector for the tail so no output
+                    // bytes are reused.
+                    // TODO: this part can probably be done better
+                    // - the compiler seems to intelligently store a smaller
+                    //   vector. i.e. for a u32x8 and `remainder == 7`, it
+                    //   will store the first u32x4 into `dest` then do some
+                    //   slower stuff for the remaining 3 lanes
+                    let results = $v8::from_bits(rng.generate());
+                    for i in 0..remainder {
+                        dest[read_len+i] = results.extract(i);
+                    }
+                }
+            }
+        }
+    )
+}
+
+impl_simd_rng!(u32x4, u8x16, u32x4, u64x2);
+impl_simd_rng!(u32x8, u8x32, u32x8, u64x4);
+impl_simd_rng!(u32x16, u8x64, u32x16, u64x8);
+
+impl_simd_rng!(u64x2, u8x16, u32x4, u64x2);
+impl_simd_rng!(u64x4, u8x32, u32x8, u64x4);
+impl_simd_rng!(u64x8, u8x64, u32x16, u64x8);
+
+impl SimdRngImpls<u32x2> for u32x2 {
+    fn next_u32_via_simd<R: SimdRng<u32x2>>(rng: &mut R) -> u32 {
+        rng.generate().extract(0)
+    }
+
+    fn next_u64_via_simd<R: SimdRng<u32x2>>(rng: &mut R) -> u64 {
+        // We cannot do `u64x1::from_bits(u32x2)` so we concatenate the bits
+        // manually.
+        // Use LE; we explicitly generate one value before the next.
+        let results = rng.generate();
+        let x = u64::from(results.extract(0));
+        let y = u64::from(results.extract(1));
+        (y << 32) | x
+    }
+
+    fn fill_bytes_via_simd<R: SimdRng<u32x2>>(rng: &mut R, dest: &mut [u8]) {
+        // Forced inlining will keep the result in SIMD registers if
+        // the code using it also uses it in a SIMD context.
+        let chunk_size = mem::size_of::<u32x2>();
+        let remainder = dest.len() % chunk_size;
+        let len = dest.len() - remainder;
+        let mut read_len = 0;
+        while read_len < len {
+            // FIXME: on big-endian we should do byte swapping around
+            // here.
+            let results = u8x8::from_bits(rng.generate());
+            // `dest` has no alignment guarantee, so use an unaligned store.
+            results.store_unaligned(&mut dest[read_len..]);
+            read_len += chunk_size;
+        }
+        if remainder > 0 {
+            // Generate a fresh vector for the tail so no output bytes are
+            // reused.
+            // TODO: this part can probably be done better
+            // - the compiler seems to intelligently store a smaller
+            //   vector. i.e. for a u32x8 and `remainder == 7`, it
+            //   will store the first u32x4 into `dest` then do some
+            //   slower stuff for the remaining 3 lanes
+            let results = u8x8::from_bits(rng.generate());
+            for i in 0..remainder {
+                dest[read_len+i] = results.extract(i);
+            }
+        }
+    }
+}
diff --git a/src/distributions/box_muller.rs b/src/distributions/box_muller.rs
index e229ddf1c19..80b753a7c62 100644
--- a/src/distributions/box_muller.rs
+++ b/src/distributions/box_muller.rs
@@ -2,6 +2,8 @@
 //!
 //!
 
+// TODO: look into more accurate math
+
 #[cfg(feature="simd_support")]
 use core::simd::*;
 #[cfg(feature="simd_support")]
@@ -1038,8 +1040,7 @@ mod tests {
     use SeedableRng;
     use prng::Sfc32x4Rng;
 
-    const BENCH_N: usize = 1 << 10;
-    const TEST_N: usize = 1 << 15;
+    const TEST_N: usize = 1 << 10;
 
     macro_rules! make_log_test {
         ($test_name:ident, $ty:ident) => (
diff --git a/src/prng/sfc32.rs b/src/prng/sfc32.rs
index 2aaec931290..c7ff3f51fb2 100644
--- a/src/prng/sfc32.rs
+++ b/src/prng/sfc32.rs
@@ -11,10 +11,13 @@
 //! SFC generators (32-bit).
 
 use core::{fmt, slice, mem};
-#[cfg(feature="simd_support")]
+
+
 use core::simd::*;
 
 use rand_core::{RngCore, SeedableRng, Error, impls, le};
+#[cfg(feature="simd_support")]
+use rand_core::simd_impls::{SimdRng, SimdRngImpls};
 
 /// A Small Fast Counting RNG designed by Chris Doty-Humphrey (32-bit version).
 ///
@@ -47,7 +50,7 @@ impl SeedableRng for Sfc32Rng {
     fn from_seed(seed: Self::Seed) -> Self {
         let mut seed_u32 = [0u32; 3];
         le::read_u32_into(&seed, &mut seed_u32);
-        let mut state = Self { a: seed_u32[0],
+        let state = Self { a: seed_u32[0],
                                b: seed_u32[1],
                                c: seed_u32[2],
                                counter: 1};
@@ -126,43 +129,17 @@ macro_rules! make_sfc_32_simd {
         impl RngCore for $rng_name {
             #[inline(always)]
             fn next_u32(&mut self) -> u32 {
-                let results = $vector32::from_bits(self.generate());
-                results.extract(0)
+                $vector::next_u32_via_simd(self)
             }
 
             #[inline(always)]
             fn next_u64(&mut self) -> u64 {
-                let results = $vector32::from_bits(self.generate());
-                let x = u64::from(results.extract(0));
-                let y = u64::from(results.extract(1));
-                (y << 32) | x
+                $vector::next_u64_via_simd(self)
             }
 
             #[inline(always)]
             fn fill_bytes(&mut self, dest: &mut [u8]) {
-                // Forced inlining will keep the result in SIMD registers if
-                // the code using it also uses it in a SIMD context.
-                let chunk_size = ::core::mem::size_of::<$vector>();
-                let remainder = dest.len() % chunk_size;
-                let len = dest.len() - remainder;
-                let mut read_len = 0;
-                let mut results;
-                loop {
-                    // FIXME: on big-endian we should do byte swapping around
-                    // here.
-                    results = $vector8::from_bits(self.generate());
-                    if read_len < len {
-                        results.store_unaligned(&mut dest[read_len..]);
-                        read_len += chunk_size;
-                    }
-                    if read_len == len { break; }
-                }
-                if remainder > 0 {
-                    // TODO: this part can probably be done better
-                    for i in 0..remainder {
-                        dest[read_len+i] = results.extract(i);
-                    }
-                }
+                $vector::fill_bytes_via_simd(self, dest)
             }
 
             fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
@@ -194,7 +171,9 @@ macro_rules! make_sfc_32_simd {
                 Self::from_vector(a, b, c)
             }
+        }
 
+        impl SimdRng<$vector> for $rng_name {
             #[inline(always)]
             fn generate(&mut self) -> $vector {
                 #[inline]
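For reference, a minimal usage sketch of the new `SimdRng`/`SimdRngImpls` helpers (not part of the diff). It assumes a nightly toolchain with `feature(stdsimd)` and `rand_core` built with the `simd_support` feature; `CountingSimdRng` is the toy generator from the doc examples above, so the output is a counter rather than anything random:

```rust
// Minimal sketch, assuming nightly `stdsimd` and rand_core's
// `simd_support` feature. `CountingSimdRng` is the toy generator from
// the module docs; it counts, so its output is NOT random.
#![feature(stdsimd)]
extern crate rand_core;

use std::simd::u32x4;
use rand_core::{RngCore, Error};
use rand_core::simd_impls::{SimdRng, SimdRngImpls};

struct CountingSimdRng(u32x4);

impl SimdRng<u32x4> for CountingSimdRng {
    fn generate(&mut self) -> u32x4 {
        self.0 += u32x4::splat(1);
        self.0
    }
}

impl RngCore for CountingSimdRng {
    fn next_u32(&mut self) -> u32 { u32x4::next_u32_via_simd(self) }
    fn next_u64(&mut self) -> u64 { u32x4::next_u64_via_simd(self) }
    fn fill_bytes(&mut self, dest: &mut [u8]) {
        u32x4::fill_bytes_via_simd(self, dest)
    }
    fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
        self.fill_bytes(dest);
        Ok(())
    }
}

fn main() {
    let mut rng = CountingSimdRng(u32x4::splat(0));
    // 19 bytes: one full 16-byte vector plus a 3-byte tail, so both the
    // bulk loop and the remainder path of `fill_bytes_via_simd` run.
    let mut buf = [0u8; 19];
    rng.fill_bytes(&mut buf);
    // On a little-endian target the first vector is splat(1), whose first
    // lane serializes as [1, 0, 0, 0].
    assert_eq!(&buf[0..4], &[1u8, 0, 0, 0][..]);
}
```

With the helpers in place, each macro-generated SFC generator reduces its `RngCore` methods to the one-line forwarding calls shown in the sfc32.rs hunks above.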