diff --git a/rand_core/CHANGELOG.md b/rand_core/CHANGELOG.md
index e52cfcbecb8..ee18c40e09b 100644
--- a/rand_core/CHANGELOG.md
+++ b/rand_core/CHANGELOG.md
@@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add `Error` and `ErrorKind`. (#225)
 - Add `CryptoRng` marker trait. (#273)
 - Add `BlockRngCore` trait. (#281)
-- Add `BlockRng` wrapper to help implemtations. (#281)
+- Add `BlockRng` wrapper to help implementations. (#281)
 - Revise the `SeedableRng` trait. (#233)
 - Remove default implementations for `RngCore::next_u64` and `RngCore::fill_bytes`. (#288)
 - Add `RngCore::try_fill_bytes`. (#225)
diff --git a/rand_core/Cargo.toml b/rand_core/Cargo.toml
index e75d927fe89..f8457f4da15 100644
--- a/rand_core/Cargo.toml
+++ b/rand_core/Cargo.toml
@@ -23,6 +23,7 @@ appveyor = { repository = "alexcrichton/rand" }
 std = ["alloc"] # use std library; should be default but for above bug
 alloc = [] # enables Vec and Box support without std
 serde1 = ["serde", "serde_derive"] # enables serde for BlockRng wrapper
+simd_support = [] # enables SIMD support
 
 [dependencies]
 serde = { version = "1", optional = true }
diff --git a/rand_core/src/lib.rs b/rand_core/src/lib.rs
index 93da90208d4..003aab9c107 100644
--- a/rand_core/src/lib.rs
+++ b/rand_core/src/lib.rs
@@ -41,6 +41,7 @@
 
 #![cfg_attr(not(feature="std"), no_std)]
 #![cfg_attr(all(feature="alloc", not(feature="std")), feature(alloc))]
+#![cfg_attr(feature="simd_support", feature(stdsimd))]
 
 #[cfg(feature="std")] extern crate core;
 #[cfg(all(feature = "alloc", not(feature="std")))] extern crate alloc;
@@ -59,6 +60,8 @@ pub use error::{ErrorKind, Error};
 mod error;
 pub mod impls;
 pub mod le;
+#[cfg(feature="simd_support")]
+pub mod simd_impls;
 
 
 /// The core of a random number generator.
diff --git a/rand_core/src/simd_impls.rs b/rand_core/src/simd_impls.rs
new file mode 100644
index 00000000000..f5fd24bd733
--- /dev/null
+++ b/rand_core/src/simd_impls.rs
@@ -0,0 +1,188 @@
+use core::simd::*;
+use core::mem;
+
+/// Enables an RNG to use [`SimdRngImpls`].
+///
+/// # Example
+///
+/// A simple example, obviously not generating very *random* output:
+///
+/// ```rust
+/// #![feature(stdsimd)]
+/// use std::simd::u32x4;
+/// use rand_core::simd_impls::SimdRng;
+///
+/// struct CountingSimdRng(u32x4);
+///
+/// impl SimdRng<u32x4> for CountingSimdRng {
+///     fn generate(&mut self) -> u32x4 {
+///         self.0 += u32x4::splat(1);
+///         self.0
+///     }
+/// }
+/// ```
+///
+/// [`SimdRngImpls`]: trait.SimdRngImpls.html
+pub trait SimdRng<Vector> {
+    /// Return the next random vector.
+    #[inline(always)]
+    fn generate(&mut self) -> Vector;
+}
+
+/// Helper functions for implementing `RngCore` functions for SIMD RNGs which
+/// implement [`SimdRng`].
+///
+/// # Example
+///
+/// A simple example, using `CountingSimdRng` from the [`SimdRng`] example:
+///
+/// ```rust
+/// #![feature(stdsimd)]
+/// use std::simd::u32x4;
+/// use rand_core::{RngCore, Error};
+/// use rand_core::simd_impls::{SimdRng, SimdRngImpls};
+///
+/// struct CountingSimdRng(u32x4);
+///
+/// impl SimdRng<u32x4> for CountingSimdRng {
+///     fn generate(&mut self) -> u32x4 {
+///         self.0 += u32x4::splat(1);
+///         self.0
+///     }
+/// }
+///
+/// impl RngCore for CountingSimdRng {
+///     fn next_u32(&mut self) -> u32 {
+///         u32x4::next_u32_via_simd(self)
+///     }
+///
+///     fn next_u64(&mut self) -> u64 {
+///         u32x4::next_u64_via_simd(self)
+///     }
+///
+///     fn fill_bytes(&mut self, dest: &mut [u8]) {
+///         u32x4::fill_bytes_via_simd(self, dest)
+///     }
+///
+///     fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
+///         self.fill_bytes(dest);
+///         Ok(())
+///     }
+/// }
+/// ```
+///
+/// [`SimdRng`]: trait.SimdRng.html
+pub trait SimdRngImpls<Vector> {
+    /// Implement `next_u32` via a SIMD vector.
+    #[inline(always)]
+    fn next_u32_via_simd<R: SimdRng<Vector>>(rng: &mut R) -> u32;
+
+    /// Implement `next_u64` via a SIMD vector.
+    #[inline(always)]
+    fn next_u64_via_simd<R: SimdRng<Vector>>(rng: &mut R) -> u64;
+
+    /// Implement `fill_bytes` via SIMD vectors.
+    ///
+    /// This is useful for generating other vector types. If the code uses
+    /// it in a SIMD context, the result should stay in SIMD registers.
+    #[inline(always)]
+    fn fill_bytes_via_simd<R: SimdRng<Vector>>(rng: &mut R, dest: &mut [u8]);
+}
+
+macro_rules! impl_simd_rng {
+    ($vector:ty, $v8:ident, $v32:ident, $v64:ident) => (
+        impl SimdRngImpls<$vector> for $vector {
+            fn next_u32_via_simd<R: SimdRng<$vector>>(rng: &mut R) -> u32 {
+                $v32::from_bits(rng.generate()).extract(0)
+            }
+
+            fn next_u64_via_simd<R: SimdRng<$vector>>(rng: &mut R) -> u64 {
+                $v64::from_bits(rng.generate()).extract(0)
+            }
+
+            fn fill_bytes_via_simd<R: SimdRng<$vector>>(rng: &mut R, dest: &mut [u8]) {
+                // Forced inlining will keep the result in SIMD registers if
+                // the code using it also uses it in a SIMD context.
+                let chunk_size = mem::size_of::<$vector>();
+                let remainder = dest.len() % chunk_size;
+                let len = dest.len() - remainder;
+                let mut read_len = 0;
+                while read_len < len {
+                    // FIXME: on big-endian we should do byte swapping around
+                    // here.
+                    let results = $v8::from_bits(rng.generate());
+                    // `dest` has no alignment guarantee, so use an
+                    // unaligned store.
+                    results.store_unaligned(&mut dest[read_len..]);
+                    read_len += chunk_size;
+                }
+                if remainder > 0 {
+                    // Generate a fresh vector for the tail so no output
+                    // bytes are reused.
+                    // TODO: this part can probably be done better
+                    // - the compiler seems to intelligently store a smaller
+                    //   vector. i.e. for a u32x8 and `remainder == 7`, it
+                    //   will store the first u32x4 into `dest` then do some
+                    //   slower stuff for the remaining 3 lanes
+                    let results = $v8::from_bits(rng.generate());
+                    for i in 0..remainder {
+                        dest[read_len+i] = results.extract(i);
+                    }
+                }
+            }
+        }
+    )
+}
+
+impl_simd_rng!(u32x4, u8x16, u32x4, u64x2);
+impl_simd_rng!(u32x8, u8x32, u32x8, u64x4);
+impl_simd_rng!(u32x16, u8x64, u32x16, u64x8);
+
+impl_simd_rng!(u64x2, u8x16, u32x4, u64x2);
+impl_simd_rng!(u64x4, u8x32, u32x8, u64x4);
+impl_simd_rng!(u64x8, u8x64, u32x16, u64x8);
+
+impl SimdRngImpls<u32x2> for u32x2 {
+    fn next_u32_via_simd<R: SimdRng<u32x2>>(rng: &mut R) -> u32 {
+        rng.generate().extract(0)
+    }
+
+    fn next_u64_via_simd<R: SimdRng<u32x2>>(rng: &mut R) -> u64 {
+        // We cannot do `u64x1::from_bits(u32x2)` so we concatenate the bits
+        // manually.
+        // Use LE; we explicitly generate one value before the next.
+        let results = rng.generate();
+        let x = u64::from(results.extract(0));
+        let y = u64::from(results.extract(1));
+        (y << 32) | x
+    }
+
+    fn fill_bytes_via_simd<R: SimdRng<u32x2>>(rng: &mut R, dest: &mut [u8]) {
+        // Forced inlining will keep the result in SIMD registers if
+        // the code using it also uses it in a SIMD context.
+        let chunk_size = mem::size_of::<u32x2>();
+        let remainder = dest.len() % chunk_size;
+        let len = dest.len() - remainder;
+        let mut read_len = 0;
+        while read_len < len {
+            // FIXME: on big-endian we should do byte swapping around
+            // here.
+            let results = u8x8::from_bits(rng.generate());
+            // `dest` has no alignment guarantee, so use an unaligned store.
+            results.store_unaligned(&mut dest[read_len..]);
+            read_len += chunk_size;
+        }
+        if remainder > 0 {
+            // Generate a fresh vector for the tail so no output bytes are
+            // reused.
+            // TODO: this part can probably be done better
+            // - the compiler seems to intelligently store a smaller
+            //   vector. i.e. for a u32x8 and `remainder == 7`, it
+            //   will store the first u32x4 into `dest` then do some
+            //   slower stuff for the remaining 3 lanes
+            let results = u8x8::from_bits(rng.generate());
+            for i in 0..remainder {
+                dest[read_len+i] = results.extract(i);
+            }
+        }
+    }
+}
diff --git a/src/distributions/box_muller.rs b/src/distributions/box_muller.rs
index e229ddf1c19..80b753a7c62 100644
--- a/src/distributions/box_muller.rs
+++ b/src/distributions/box_muller.rs
@@ -2,6 +2,8 @@
 //!
 //!
 
+// TODO: look into more accurate math
+
 #[cfg(feature="simd_support")]
 use core::simd::*;
 #[cfg(feature="simd_support")]
@@ -1038,8 +1040,7 @@ mod tests {
     use SeedableRng;
     use prng::Sfc32x4Rng;
 
-    const BENCH_N: usize = 1 << 10;
-    const TEST_N: usize = 1 << 15;
+    const TEST_N: usize = 1 << 10;
 
     macro_rules! make_log_test {
         ($test_name:ident, $ty:ident) => (
diff --git a/src/prng/sfc32.rs b/src/prng/sfc32.rs
index 2aaec931290..c7ff3f51fb2 100644
--- a/src/prng/sfc32.rs
+++ b/src/prng/sfc32.rs
@@ -11,10 +11,13 @@
 //! SFC generators (32-bit).
 
 use core::{fmt, slice, mem};
-#[cfg(feature="simd_support")]
+
+
 use core::simd::*;
 
 use rand_core::{RngCore, SeedableRng, Error, impls, le};
+#[cfg(feature="simd_support")]
+use rand_core::simd_impls::{SimdRng, SimdRngImpls};
 
 /// A Small Fast Counting RNG designed by Chris Doty-Humphrey (32-bit version).
 ///
@@ -47,7 +50,7 @@ impl SeedableRng for Sfc32Rng {
     fn from_seed(seed: Self::Seed) -> Self {
         let mut seed_u32 = [0u32; 3];
         le::read_u32_into(&seed, &mut seed_u32);
-        let mut state = Self { a: seed_u32[0],
+        let state = Self { a: seed_u32[0],
                                b: seed_u32[1],
                                c: seed_u32[2],
                                counter: 1};
@@ -126,43 +129,17 @@ macro_rules! make_sfc_32_simd {
         impl RngCore for $rng_name {
             #[inline(always)]
             fn next_u32(&mut self) -> u32 {
-                let results = $vector32::from_bits(self.generate());
-                results.extract(0)
+                $vector::next_u32_via_simd(self)
             }
 
             #[inline(always)]
             fn next_u64(&mut self) -> u64 {
-                let results = $vector32::from_bits(self.generate());
-                let x = u64::from(results.extract(0));
-                let y = u64::from(results.extract(1));
-                (y << 32) | x
+                $vector::next_u64_via_simd(self)
             }
 
             #[inline(always)]
             fn fill_bytes(&mut self, dest: &mut [u8]) {
-                // Forced inlining will keep the result in SIMD registers if
-                // the code using it also uses it in a SIMD context.
-                let chunk_size = ::core::mem::size_of::<$vector>();
-                let remainder = dest.len() % chunk_size;
-                let len = dest.len() - remainder;
-                let mut read_len = 0;
-                let mut results;
-                loop {
-                    // FIXME: on big-endian we should do byte swapping around
-                    // here.
-                    results = $vector8::from_bits(self.generate());
-                    if read_len < len {
-                        results.store_unaligned(&mut dest[read_len..]);
-                        read_len += chunk_size;
-                    }
-                    if read_len == len { break; }
-                }
-                if remainder > 0 {
-                    // TODO: this part can probably be done better
-                    for i in 0..remainder {
-                        dest[read_len+i] = results.extract(i);
-                    }
-                }
+                $vector::fill_bytes_via_simd(self, dest)
             }
 
             fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
@@ -194,7 +171,9 @@ macro_rules! make_sfc_32_simd {
                 Self::from_vector(a, b, c)
             }
+        }
 
+        impl SimdRng<$vector> for $rng_name {
             #[inline(always)]
             fn generate(&mut self) -> $vector {
                 #[inline]
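For reference, a minimal usage sketch of the new `SimdRng`/`SimdRngImpls` helpers (not part of the diff). It assumes a nightly toolchain with `feature(stdsimd)` and `rand_core` built with the `simd_support` feature; `CountingSimdRng` is the toy generator from the doc examples above, so the output is a counter rather than anything random:

```rust
// Minimal sketch, assuming nightly `stdsimd` and rand_core's
// `simd_support` feature. `CountingSimdRng` is the toy generator from
// the module docs; it counts, so its output is NOT random.
#![feature(stdsimd)]
extern crate rand_core;

use std::simd::u32x4;
use rand_core::{RngCore, Error};
use rand_core::simd_impls::{SimdRng, SimdRngImpls};

struct CountingSimdRng(u32x4);

impl SimdRng<u32x4> for CountingSimdRng {
    fn generate(&mut self) -> u32x4 {
        self.0 += u32x4::splat(1);
        self.0
    }
}

impl RngCore for CountingSimdRng {
    fn next_u32(&mut self) -> u32 { u32x4::next_u32_via_simd(self) }
    fn next_u64(&mut self) -> u64 { u32x4::next_u64_via_simd(self) }
    fn fill_bytes(&mut self, dest: &mut [u8]) {
        u32x4::fill_bytes_via_simd(self, dest)
    }
    fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
        self.fill_bytes(dest);
        Ok(())
    }
}

fn main() {
    let mut rng = CountingSimdRng(u32x4::splat(0));
    // 19 bytes: one full 16-byte vector plus a 3-byte tail, so both the
    // bulk loop and the remainder path of `fill_bytes_via_simd` run.
    let mut buf = [0u8; 19];
    rng.fill_bytes(&mut buf);
    // On a little-endian target the first vector is splat(1), whose first
    // lane serializes as [1, 0, 0, 0].
    assert_eq!(&buf[0..4], &[1u8, 0, 0, 0][..]);
}
```

With the helpers in place, each macro-generated SFC generator reduces its `RngCore` methods to the one-line forwarding calls shown in the sfc32.rs hunks above.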