diff --git a/Cargo.toml b/Cargo.toml
index de75047..ae8dca2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rust-crypto"
-version = "0.2.29"
+version = "0.2.30"
 authors = ["The Rust-Crypto Project Developers"]
 license = "MIT/Apache-2.0"
 homepage = "https://github.com/DaGenix/rust-crypto/"
@@ -8,11 +8,17 @@ repository = "https://github.com/DaGenix/rust-crypto/"
 description = "A (mostly) pure-Rust implementation of various common cryptographic algorithms."
 keywords = [ "Crypto", "MD5", "Sha1", "Sha2", "AES" ]
 readme = "README.md"
+build = "build.rs"
 
 [lib]
 name = "crypto"
 
+[build-dependencies]
+gcc = "*"
+
 [dependencies]
+libc = "*"
 time = "*"
 rand = "*"
 rustc-serialize = "*"
+
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..afe41b2
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,14 @@
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+extern crate gcc;
+
+fn main() {
+    gcc::compile_library(
+        "lib_rust_crypto_helpers.a",
+        &["src/util_helpers.c", "src/aesni_helpers.c"]);
+}
+
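A note on the pattern this patch introduces: build.rs compiles the two C helper files into a static library that Cargo links into the crate, and each Rust module then declares the exported symbols in an extern block and wraps them in safe functions. A minimal sketch of that pattern, using a symbol that appears later in this patch:

    extern {
        fn rust_crypto_util_supports_aesni() -> u32; // defined in src/util_helpers.c
    }

    pub fn supports_aesni() -> bool {
        unsafe { rust_crypto_util_supports_aesni() != 0 }
    }
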
diff --git a/src/aes.rs b/src/aes.rs
index 6a91f69..c4da949 100644
--- a/src/aes.rs
+++ b/src/aes.rs
@@ -700,7 +700,7 @@ mod test {
             let mut result: Vec<u8> = repeat(0).take(test.plain.len()).collect();
             aes_enc.process(&test.plain[..], &mut result[..]);
             let res: &[u8] = result.as_ref();
-            assert!(res == test.cipher);
+            assert!(res == &test.cipher[..]);
         }
     }
 }
diff --git a/src/aes_gcm.rs b/src/aes_gcm.rs
index 506cd17..4f9752c 100644
--- a/src/aes_gcm.rs
+++ b/src/aes_gcm.rs
@@ -6,7 +6,7 @@
 
 use aes::{ctr, KeySize};
 use aead::{AeadEncryptor,AeadDecryptor};
-use std::slice::bytes::copy_memory;
+use cryptoutil::copy_memory;
 use symmetriccipher::SynchronousStreamCipher;
 use ghash::{Ghash};
 use util::fixed_time_eq;
diff --git a/src/aesni.rs b/src/aesni.rs
index 67860e4..c02698d 100644
--- a/src/aesni.rs
+++ b/src/aesni.rs
@@ -78,82 +78,32 @@ enum KeyType {
 #[inline(always)]
 fn size(rounds: u8) -> usize { 16 * ((rounds as usize) + 1) }
 
-#[inline(always)]
-unsafe fn aesimc(round_keys: *mut u8) {
-    asm!(
-    "
-        movdqu ($0), %xmm1
-        aesimc %xmm1, %xmm1
-        movdqu %xmm1, ($0)
-    "
-    : // outputs
-    : "r" (round_keys) // inputs
-    : "xmm1", "memory" // clobbers
-    : "volatile"
-    )
+extern {
+    fn rust_crypto_aesni_aesimc(round_keys: *mut u8);
+    fn rust_crypto_aesni_setup_working_key_128(key: *const u8, round_key: *mut u8);
+    fn rust_crypto_aesni_setup_working_key_192(key: *const u8, round_key: *mut u8);
+    fn rust_crypto_aesni_setup_working_key_256(key: *const u8, round_key: *mut u8);
+    fn rust_crypto_aesni_encrypt_block(
+            rounds: u8,
+            input: *const u8,
+            round_keys: *const u8,
+            output: *mut u8);
+    fn rust_crypto_aesni_decrypt_block(
+            rounds: u8,
+            input: *const u8,
+            round_keys: *const u8,
+            output: *mut u8);
 }
 
-#[allow(unused_assignments)]
 fn setup_working_key_aesni_128(key: &[u8], key_type: KeyType, round_key: &mut [u8]) {
     unsafe {
-        let mut round_keysp: *mut u8 = round_key.get_unchecked_mut(0);
-        let keyp: *const u8 = key.get_unchecked(0);
-
-        asm!(
-        "
-            movdqu ($1), %xmm1
-            movdqu %xmm1, ($0)
-            add $$0x10, $0
-
-            aeskeygenassist $$0x01, %xmm1, %xmm2
-            call 1f
-            aeskeygenassist $$0x02, %xmm1, %xmm2
-            call 1f
-            aeskeygenassist $$0x04, %xmm1, %xmm2
-            call 1f
-            aeskeygenassist $$0x08, %xmm1, %xmm2
-            call 1f
-            aeskeygenassist $$0x10, %xmm1, %xmm2
-            call 1f
-            aeskeygenassist $$0x20, %xmm1, %xmm2
-            call 1f
-            aeskeygenassist $$0x40, %xmm1, %xmm2
-            call 1f
-            aeskeygenassist $$0x80, %xmm1, %xmm2
-            call 1f
-            aeskeygenassist $$0x1b, %xmm1, %xmm2
-            call 1f
-            aeskeygenassist $$0x36, %xmm1, %xmm2
-            call 1f
-
-            jmp 2f
-
-            1:
-            pshufd $$0xff, %xmm2, %xmm2
-            vpslldq $$0x04, %xmm1, %xmm3
-            pxor %xmm3, %xmm1
-            vpslldq $$0x4, %xmm1, %xmm3
-            pxor %xmm3, %xmm1
-            vpslldq $$0x04, %xmm1, %xmm3
-            pxor %xmm3, %xmm1
-            pxor %xmm2, %xmm1
-            movdqu %xmm1, ($0)
-            add $$0x10, $0
-            ret
-
-            2:
-        "
-        : "=r" (round_keysp)
-        : "r" (keyp), "0" (round_keysp)
-        : "xmm1", "xmm2", "xmm3", "memory"
-        : "volatile"
-        );
+        rust_crypto_aesni_setup_working_key_128(key.as_ptr(), round_key.as_mut_ptr());
 
         match key_type {
             KeyType::Decryption => {
                 // range of rounds keys from #1 to #9; skip the first and last key
                 for i in (1..10) {
-                    aesimc(round_key.get_unchecked_mut(16 * i));
+                    rust_crypto_aesni_aesimc(round_key.get_unchecked_mut(16 * i));
                 }
             }
             KeyType::Encryption => { /* nothing more to do */ }
@@ -161,102 +111,15 @@ fn setup_working_key_aesni_128(key: &[u8], key_type: KeyType, round_key: &mut [u
     }
 }
 
-#[allow(unused_assignments)]
 fn setup_working_key_aesni_192(key: &[u8], key_type: KeyType, round_key: &mut [u8]) {
     unsafe {
-        let mut round_keysp: *mut u8 = round_key.get_unchecked_mut(0);
-        let keyp: *const u8 = key.get_unchecked(0);
-
-        asm!(
-        "
-            movdqu ($1), %xmm1
-            movdqu 16($1), %xmm3
-            movdqu %xmm1, ($0)
-            movdqa %xmm3, %xmm5
-
-            aeskeygenassist $$0x1, %xmm3, %xmm2
-            call 1f
-            shufpd $$0, %xmm1, %xmm5
-            movdqu %xmm5, 16($0)
-            movdqa %xmm1, %xmm6
-            shufpd $$1, %xmm3, %xmm6
-            movdqu %xmm6, 32($0)
-
-            aeskeygenassist $$0x2, %xmm3, %xmm2
-            call 1f
-            movdqu %xmm1, 48($0)
-            movdqa %xmm3, %xmm5
-
-            aeskeygenassist $$0x4, %xmm3, %xmm2
-            call 1f
-            shufpd $$0, %xmm1, %xmm5
-            movdqu %xmm5, 64($0)
-            movdqa %xmm1, %xmm6
-            shufpd $$1, %xmm3, %xmm6
-            movdqu %xmm6, 80($0)
-
-            aeskeygenassist $$0x8, %xmm3, %xmm2
-            call 1f
-            movdqu %xmm1, 96($0)
-            movdqa %xmm3, %xmm5
-
-            aeskeygenassist $$0x10, %xmm3, %xmm2
-            call 1f
-            shufpd $$0, %xmm1, %xmm5
-            movdqu %xmm5, 112($0)
-            movdqa %xmm1, %xmm6
-            shufpd $$1, %xmm3, %xmm6
-            movdqu %xmm6, 128($0)
-
-            aeskeygenassist $$0x20, %xmm3, %xmm2
-            call 1f
-            movdqu %xmm1, 144($0)
-            movdqa %xmm3, %xmm5
-
-            aeskeygenassist $$0x40, %xmm3, %xmm2
-            call 1f
-            shufpd $$0, %xmm1, %xmm5
-            movdqu %xmm5, 160($0)
-            movdqa %xmm1, %xmm6
-            shufpd $$1, %xmm3, %xmm6
-            movdqu %xmm6, 176($0)
-
-            aeskeygenassist $$0x80, %xmm3, %xmm2
-            call 1f
-            movdqu %xmm1, 192($0)
-
-            jmp 2f
-
-            1:
-            pshufd $$0x55, %xmm2, %xmm2
-            movdqu %xmm1, %xmm4
-            pslldq $$4, %xmm4
-            pxor %xmm4, %xmm1
-            pslldq $$4, %xmm4
-            pxor %xmm4, %xmm1
-            pslldq $$4, %xmm4
-            pxor %xmm4, %xmm1
-            pxor %xmm2, %xmm1
-            pshufd $$0xff, %xmm1, %xmm2
-            movdqu %xmm3, %xmm4
-            pslldq $$4, %xmm4
-            pxor %xmm4, %xmm3
-            pxor %xmm2, %xmm3
-            ret
-
-            2:
-        "
-        : "=r" (round_keysp)
-        : "r" (keyp), "0" (round_keysp)
-        : "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "memory"
-        : "volatile"
-        );
+        rust_crypto_aesni_setup_working_key_192(key.as_ptr(), round_key.as_mut_ptr());
 
         match key_type {
             KeyType::Decryption => {
                 // range of rounds keys from #1 to #11; skip the first and last key
                 for i in (1..12) {
-                    aesimc(round_key.get_unchecked_mut(16 * i));
+                    rust_crypto_aesni_aesimc(round_key.get_unchecked_mut(16 * i));
                 }
             }
             KeyType::Encryption => { /* nothing more to do */ }
@@ -264,110 +127,15 @@ fn setup_working_key_aesni_192(key: &[u8], key_type: KeyType, round_key: &mut [u
     }
 }
 
-#[allow(unused_assignments)]
 fn setup_working_key_aesni_256(key: &[u8], key_type: KeyType, round_key: &mut [u8]) {
     unsafe {
-        let mut round_keysp: *mut u8 = round_key.get_unchecked_mut(0);
-        let keyp: *const u8 = key.get_unchecked(0);
-
-        asm!(
-        "
-            movdqu ($1), %xmm1
-            movdqu 16($1), %xmm3
-            movdqu %xmm1, ($0)
-            movdqu %xmm3, 16($0)
-
-            aeskeygenassist $$0x1, %xmm3, %xmm2
-            call 1f
-            movdqu %xmm1, 32($0)
-
-            aeskeygenassist $$0x0, %xmm1, %xmm2
-            call 2f
-            movdqu %xmm3, 48($0)
-
-            aeskeygenassist $$0x2, %xmm3, %xmm2
-            call 1f
-            movdqu %xmm1, 64($0)
-
-            aeskeygenassist $$0x0, %xmm1, %xmm2
-            call 2f
-            movdqu %xmm3, 80($0)
-
-            aeskeygenassist $$0x4, %xmm3, %xmm2
-            call 1f
-            movdqu %xmm1, 96($0)
-
-            aeskeygenassist $$0x0, %xmm1, %xmm2
-            call 2f
-            movdqu %xmm3, 112($0)
-
-            aeskeygenassist $$0x8, %xmm3, %xmm2
-            call 1f
-            movdqu %xmm1, 128($0)
-
-            aeskeygenassist $$0x0, %xmm1, %xmm2
-            call 2f
-            movdqu %xmm3, 144($0)
-
-            aeskeygenassist $$0x10, %xmm3, %xmm2
-            call 1f
-            movdqu %xmm1, 160($0)
-
-            aeskeygenassist $$0x0, %xmm1, %xmm2
-            call 2f
-            movdqu %xmm3, 176($0)
-
-            aeskeygenassist $$0x20, %xmm3, %xmm2
-            call 1f
-            movdqu %xmm1, 192($0)
-
-            aeskeygenassist $$0x0, %xmm1, %xmm2
-            call 2f
-            movdqu %xmm3, 208($0)
-
-            aeskeygenassist $$0x40, %xmm3, %xmm2
-            call 1f
-            movdqu %xmm1, 224($0)
-
-            jmp 3f
-
-            1:
-            pshufd $$0xff, %xmm2, %xmm2
-            movdqa %xmm1, %xmm4
-            pslldq $$4, %xmm4
-            pxor %xmm4, %xmm1
-            pslldq $$4, %xmm4
-            pxor %xmm4, %xmm1
-            pslldq $$4, %xmm4
-            pxor %xmm4, %xmm1
-            pxor %xmm2, %xmm1
-            ret
-
-            2:
-            pshufd $$0xaa, %xmm2, %xmm2
-            movdqa %xmm3, %xmm4
-            pslldq $$4, %xmm4
-            pxor %xmm4, %xmm3
-            pslldq $$4, %xmm4
-            pxor %xmm4, %xmm3
-            pslldq $$4, %xmm4
-            pxor %xmm4, %xmm3
-            pxor %xmm2, %xmm3
-            ret
-
-            3:
-        "
-        : "=r" (round_keysp)
-        : "r" (keyp), "0" (round_keysp)
-        : "xmm1", "xmm2", "xmm3", "xmm4", "memory"
-        : "volatile"
-        );
+        rust_crypto_aesni_setup_working_key_256(key.as_ptr(), round_key.as_mut_ptr());
 
         match key_type {
             KeyType::Decryption => {
                 // range of rounds keys from #1 to #13; skip the first and last key
                 for i in (1..14) {
-                    aesimc(round_key.get_unchecked_mut(16 * i));
+                    rust_crypto_aesni_aesimc(round_key.get_unchecked_mut(16 * i));
                 }
             }
             KeyType::Encryption => { /* nothing more to do */ }
@@ -375,86 +143,22 @@ fn setup_working_key_aesni_256(key: &[u8], key_type: KeyType, round_key: &mut [u
     }
 }
 
-#[allow(unused_assignments)]
 fn encrypt_block_aesni(rounds: u8, input: &[u8], round_keys: &[u8], output: &mut [u8]) {
     unsafe {
-        let mut rounds = rounds;
-        let mut round_keysp: *const u8 = round_keys.get_unchecked(0);
-        let outp: *mut u8 = output.get_unchecked_mut(0);
-        let inp: *const u8 = input.get_unchecked(0);
-
-        asm!(
-        "
-            /* Copy the data to encrypt to xmm1 */
-            movdqu ($2), %xmm1
-
-            /* Perform round 0 - the whitening step */
-            movdqu ($1), %xmm0
-            add $$0x10, $1
-            pxor %xmm0, %xmm1
-
-            /* Perform all remaining rounds (except the final one) */
-            1:
-            movdqu ($1), %xmm0
-            add $$0x10, $1
-            aesenc %xmm0, %xmm1
-            sub $$0x01, $0
-            cmp $$0x01, $0
-            jne 1b
-
-            /* Perform the last round */
-            movdqu ($1), %xmm0
-            aesenclast %xmm0, %xmm1
-
-            /* Finally, move the result from xmm1 to outp */
-            movdqu %xmm1, ($3)
-        "
-        : "=r" (rounds), "=r" (round_keysp) // outputs
-        : "r" (inp), "r" (outp), "0" (rounds), "1" (round_keysp) // inputs
-        : "xmm0", "xmm1", "memory", "cc" // clobbers
-        : "volatile" // options
-        );
+        rust_crypto_aesni_encrypt_block(
+                rounds,
+                input.as_ptr(),
+                round_keys.as_ptr(),
+                output.as_mut_ptr());
     }
 }
 
-#[allow(unused_assignments)]
 fn decrypt_block_aesni(rounds: u8, input: &[u8], round_keys: &[u8], output: &mut [u8]) {
     unsafe {
-        let mut rounds = rounds;
-        let mut round_keysp: *const u8 = round_keys.get_unchecked(round_keys.len() - 16);
-        let outp: *mut u8 = output.get_unchecked_mut(0);
-        let inp: *const u8 = input.get_unchecked(0);
-
-        asm!(
-        "
-            /* Copy the data to decrypt to xmm1 */
-            movdqu ($2), %xmm1
-
-            /* Perform round 0 - the whitening step */
-            movdqu ($1), %xmm0
-            sub $$0x10, $1
-            pxor %xmm0, %xmm1
-
-            /* Perform all remaining rounds (except the final one) */
-            1:
-            movdqu ($1), %xmm0
-            sub $$0x10, $1
-            aesdec %xmm0, %xmm1
-            sub $$0x01, $0
-            cmp $$0x01, $0
-            jne 1b
-
-            /* Perform the last round */
-            movdqu ($1), %xmm0
-            aesdeclast %xmm0, %xmm1
-
-            /* Finally, move the result from xmm1 to outp */
-            movdqu %xmm1, ($3)
-        "
-        : "=r" (rounds), "=r" (round_keysp) // outputs
-        : "r" (inp), "r" (outp), "0" (rounds), "1" (round_keysp) // inputs
-        : "xmm0", "xmm1", "memory", "cc" // clobbers
-        : "volatile" // options
-        );
+        rust_crypto_aesni_decrypt_block(
+                rounds,
+                input.as_ptr(),
+                round_keys.get_unchecked(round_keys.len() - 16),
+                output.as_mut_ptr());
     }
 }
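One subtlety: the encrypt call passes the start of the round-key schedule, while the decrypt call passes a pointer to the final 16-byte round key, because the C routine steps backwards through the schedule (sub $0x10 per round). A plain-Rust sketch of the indexing both routines assume; the round_key helper is hypothetical, for illustration only:

    // Hypothetical helper: round key i of an expanded schedule of (rounds + 1) keys.
    fn round_key(round_keys: &[u8], i: usize) -> &[u8] {
        &round_keys[16 * i..16 * (i + 1)]
    }
    // Encryption consumes round_key(ks, 0), round_key(ks, 1), ... in order;
    // decryption starts at offset round_keys.len() - 16 (the final key) and
    // steps down 16 bytes per round, matching the pointer passed above.
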
diff --git a/src/aesni_helpers.c b/src/aesni_helpers.c
new file mode 100644
index 0000000..d38bdc7
--- /dev/null
+++ b/src/aesni_helpers.c
@@ -0,0 +1,335 @@
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#include <stdint.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+
+void rust_crypto_aesni_aesimc(uint8_t* round_keys) {
+    asm(
+        " \
+            movdqu (%0), %%xmm1; \
+            aesimc %%xmm1, %%xmm1; \
+            movdqu %%xmm1, (%0); \
+        "
+    : // outputs
+    : "r" (round_keys) // inputs
+    : "xmm1", "memory" // clobbers
+    );
+}
+
+void rust_crypto_aesni_setup_working_key_128(
+        uint8_t* key,
+        uint8_t* round_key) {
+    asm(
+        " \
+            movdqu (%1), %%xmm1; \
+            movdqu %%xmm1, (%0); \
+            add $0x10, %0; \
+            \
+            aeskeygenassist $0x01, %%xmm1, %%xmm2; \
+            call 1f; \
+            aeskeygenassist $0x02, %%xmm1, %%xmm2; \
+            call 1f; \
+            aeskeygenassist $0x04, %%xmm1, %%xmm2; \
+            call 1f; \
+            aeskeygenassist $0x08, %%xmm1, %%xmm2; \
+            call 1f; \
+            aeskeygenassist $0x10, %%xmm1, %%xmm2; \
+            call 1f; \
+            aeskeygenassist $0x20, %%xmm1, %%xmm2; \
+            call 1f; \
+            aeskeygenassist $0x40, %%xmm1, %%xmm2; \
+            call 1f; \
+            aeskeygenassist $0x80, %%xmm1, %%xmm2; \
+            call 1f; \
+            aeskeygenassist $0x1b, %%xmm1, %%xmm2; \
+            call 1f; \
+            aeskeygenassist $0x36, %%xmm1, %%xmm2; \
+            call 1f; \
+            \
+            jmp 2f; \
+            \
+            1: \
+            pshufd $0xff, %%xmm2, %%xmm2; \
+            vpslldq $0x04, %%xmm1, %%xmm3; \
+            pxor %%xmm3, %%xmm1; \
+            vpslldq $0x4, %%xmm1, %%xmm3; \
+            pxor %%xmm3, %%xmm1; \
+            vpslldq $0x04, %%xmm1, %%xmm3; \
+            pxor %%xmm3, %%xmm1; \
+            pxor %%xmm2, %%xmm1; \
+            movdqu %%xmm1, (%0); \
+            add $0x10, %0; \
+            ret; \
+            \
+            2: \
+        "
+    : "+r" (round_key)
+    : "r" (key)
+    : "xmm1", "xmm2", "xmm3", "memory"
+    );
+}
+
+void rust_crypto_aesni_setup_working_key_192(
+        uint8_t* key,
+        uint8_t* round_key) {
+    asm(
+        " \
+            movdqu (%1), %%xmm1; \
+            movdqu 16(%1), %%xmm3; \
+            movdqu %%xmm1, (%0); \
+            movdqa %%xmm3, %%xmm5; \
+            \
+            aeskeygenassist $0x1, %%xmm3, %%xmm2; \
+            call 1f; \
+            shufpd $0, %%xmm1, %%xmm5; \
+            movdqu %%xmm5, 16(%0); \
+            movdqa %%xmm1, %%xmm6; \
+            shufpd $1, %%xmm3, %%xmm6; \
+            movdqu %%xmm6, 32(%0); \
+            \
+            aeskeygenassist $0x2, %%xmm3, %%xmm2; \
+            call 1f; \
+            movdqu %%xmm1, 48(%0); \
+            movdqa %%xmm3, %%xmm5; \
+            \
+            aeskeygenassist $0x4, %%xmm3, %%xmm2; \
+            call 1f; \
+            shufpd $0, %%xmm1, %%xmm5; \
+            movdqu %%xmm5, 64(%0); \
+            movdqa %%xmm1, %%xmm6; \
+            shufpd $1, %%xmm3, %%xmm6; \
+            movdqu %%xmm6, 80(%0); \
+            \
+            aeskeygenassist $0x8, %%xmm3, %%xmm2; \
+            call 1f; \
+            movdqu %%xmm1, 96(%0); \
+            movdqa %%xmm3, %%xmm5; \
+            \
+            aeskeygenassist $0x10, %%xmm3, %%xmm2; \
+            call 1f; \
+            shufpd $0, %%xmm1, %%xmm5; \
+            movdqu %%xmm5, 112(%0); \
+            movdqa %%xmm1, %%xmm6; \
+            shufpd $1, %%xmm3, %%xmm6; \
+            movdqu %%xmm6, 128(%0); \
+            \
+            aeskeygenassist $0x20, %%xmm3, %%xmm2; \
+            call 1f; \
+            movdqu %%xmm1, 144(%0); \
+            movdqa %%xmm3, %%xmm5; \
+            \
+            aeskeygenassist $0x40, %%xmm3, %%xmm2; \
+            call 1f; \
+            shufpd $0, %%xmm1, %%xmm5; \
+            movdqu %%xmm5, 160(%0); \
+            movdqa %%xmm1, %%xmm6; \
+            shufpd $1, %%xmm3, %%xmm6; \
+            movdqu %%xmm6, 176(%0); \
+            \
+            aeskeygenassist $0x80, %%xmm3, %%xmm2; \
+            call 1f; \
+            movdqu %%xmm1, 192(%0); \
+            \
+            jmp 2f; \
+            \
+            1: \
+            pshufd $0x55, %%xmm2, %%xmm2; \
+            movdqu %%xmm1, %%xmm4; \
+            pslldq $4, %%xmm4; \
+            pxor %%xmm4, %%xmm1; \
+            pslldq $4, %%xmm4; \
+            pxor %%xmm4, %%xmm1; \
+            pslldq $4, %%xmm4; \
+            pxor %%xmm4, %%xmm1; \
+            pxor %%xmm2, %%xmm1; \
+            pshufd $0xff, %%xmm1, %%xmm2; \
+            movdqu %%xmm3, %%xmm4; \
+            pslldq $4, %%xmm4; \
+            pxor %%xmm4, %%xmm3; \
+            pxor %%xmm2, %%xmm3; \
+            ret; \
+            \
+            2: \
+        "
+    : "+r" (round_key)
+    : "r" (key)
+    : "xmm1", "xmm2", "xmm3", "memory"
+    );
+}
+
+void rust_crypto_aesni_setup_working_key_256(
+        uint8_t* key,
+        uint8_t* round_key) {
+    asm(
+        " \
+            movdqu (%1), %%xmm1; \
+            movdqu 16(%1), %%xmm3; \
+            movdqu %%xmm1, (%0); \
+            movdqu %%xmm3, 16(%0); \
+            \
+            aeskeygenassist $0x1, %%xmm3, %%xmm2; \
+            call 1f; \
+            movdqu %%xmm1, 32(%0); \
+            \
+            aeskeygenassist $0x0, %%xmm1, %%xmm2; \
+            call 2f; \
+            movdqu %%xmm3, 48(%0); \
+            \
+            aeskeygenassist $0x2, %%xmm3, %%xmm2; \
+            call 1f; \
+            movdqu %%xmm1, 64(%0); \
+            \
+            aeskeygenassist $0x0, %%xmm1, %%xmm2; \
+            call 2f; \
+            movdqu %%xmm3, 80(%0); \
+            \
+            aeskeygenassist $0x4, %%xmm3, %%xmm2; \
+            call 1f; \
+            movdqu %%xmm1, 96(%0); \
+            \
+            aeskeygenassist $0x0, %%xmm1, %%xmm2; \
+            call 2f; \
+            movdqu %%xmm3, 112(%0); \
+            \
+            aeskeygenassist $0x8, %%xmm3, %%xmm2; \
+            call 1f; \
+            movdqu %%xmm1, 128(%0); \
+            \
+            aeskeygenassist $0x0, %%xmm1, %%xmm2; \
+            call 2f; \
+            movdqu %%xmm3, 144(%0); \
+            \
+            aeskeygenassist $0x10, %%xmm3, %%xmm2; \
+            call 1f; \
+            movdqu %%xmm1, 160(%0); \
+            \
+            aeskeygenassist $0x0, %%xmm1, %%xmm2; \
+            call 2f; \
+            movdqu %%xmm3, 176(%0); \
+            \
+            aeskeygenassist $0x20, %%xmm3, %%xmm2; \
+            call 1f; \
+            movdqu %%xmm1, 192(%0); \
+            \
+            aeskeygenassist $0x0, %%xmm1, %%xmm2; \
+            call 2f; \
+            movdqu %%xmm3, 208(%0); \
+            \
+            aeskeygenassist $0x40, %%xmm3, %%xmm2; \
+            call 1f; \
+            movdqu %%xmm1, 224(%0); \
+            \
+            jmp 3f; \
+            \
+            1: \
+            pshufd $0xff, %%xmm2, %%xmm2; \
+            movdqa %%xmm1, %%xmm4; \
+            pslldq $4, %%xmm4; \
+            pxor %%xmm4, %%xmm1; \
+            pslldq $4, %%xmm4; \
+            pxor %%xmm4, %%xmm1; \
+            pslldq $4, %%xmm4; \
+            pxor %%xmm4, %%xmm1; \
+            pxor %%xmm2, %%xmm1; \
+            ret; \
+            \
+            2: \
+            pshufd $0xaa, %%xmm2, %%xmm2; \
+            movdqa %%xmm3, %%xmm4; \
+            pslldq $4, %%xmm4; \
+            pxor %%xmm4, %%xmm3; \
+            pslldq $4, %%xmm4; \
+            pxor %%xmm4, %%xmm3; \
+            pslldq $4, %%xmm4; \
+            pxor %%xmm4, %%xmm3; \
+            pxor %%xmm2, %%xmm3; \
+            ret; \
+            \
+            3: \
+        "
+    : "+r" (round_key)
+    : "r" (key)
+    : "xmm1", "xmm2", "xmm3", "memory"
+    );
+}
+
+void rust_crypto_aesni_encrypt_block(
+            uint8_t rounds,
+            uint8_t* input,
+            uint8_t* round_keys,
+            uint8_t* output) {
+    asm(
+    " \
+        /* Copy the data to encrypt to xmm1 */ \
+        movdqu (%2), %%xmm1; \
+        \
+        /* Perform round 0 - the whitening step */ \
+        movdqu (%1), %%xmm0; \
+        add $0x10, %1; \
+        pxor %%xmm0, %%xmm1; \
+        \
+        /* Perform all remaining rounds (except the final one) */ \
+        1: \
+        movdqu (%1), %%xmm0; \
+        add $0x10, %1; \
+        aesenc %%xmm0, %%xmm1; \
+        sub $0x01, %0; \
+        cmp $0x01, %0; \
+        jne 1b; \
+        \
+        /* Perform the last round */ \
+        movdqu (%1), %%xmm0; \
+        aesenclast %%xmm0, %%xmm1; \
+        \
+        /* Finally, move the result from xmm1 to outp */ \
+        movdqu %%xmm1, (%3); \
+    "
+    : "+&r" (rounds), "+&r" (round_keys) // outputs
+    : "r" (input), "r" (output) // inputs
+    : "xmm0", "xmm1", "memory", "cc" // clobbers
+    );
+}
+
+void rust_crypto_aesni_decrypt_block(
+            uint8_t rounds,
+            uint8_t* input,
+            uint8_t* round_keys,
+            uint8_t* output) {
+    asm(
+        " \
+            /* Copy the data to decrypt to xmm1 */ \
+            movdqu (%2), %%xmm1; \
+            \
+            /* Perform round 0 - the whitening step */ \
+            movdqu (%1), %%xmm0; \
+            sub $0x10, %1; \
+            pxor %%xmm0, %%xmm1; \
+            \
+            /* Perform all remaining rounds (except the final one) */ \
+            1: \
+            movdqu (%1), %%xmm0; \
+            sub $0x10, %1; \
+            aesdec %%xmm0, %%xmm1; \
+            sub $0x01, %0; \
+            cmp $0x01, %0; \
+            jne 1b; \
+            \
+            /* Perform the last round */ \
+            movdqu (%1), %%xmm0; \
+            aesdeclast %%xmm0, %%xmm1; \
+            \
+            /* Finally, move the result from xmm1 to outp */ \
+            movdqu %%xmm1, (%3); \
+        "
+    : "+&r" (rounds), "+&r" (round_keys) // outputs
+    : "r" (input), "r" (output) // inputs
+    : "xmm0", "xmm1", "memory", "cc" // clobbers
+    );
+}
+
+#endif
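The call 1f / ret blocks above implement the key-expansion round as a local subroutine within the asm string. As a plain-Rust reference for what the 128-bit subroutine computes, here is a sketch; it assumes assist already holds the relevant aeskeygenassist output word (SubWord(RotWord(w3)) XOR Rcon), which is what the pshufd $0xff broadcast extracts:

    // Each word of the next round key is the running XOR of the previous
    // key's words, folded with the assist word (the pslldq/pxor chain).
    fn expand_128_step(prev: [u32; 4], assist: u32) -> [u32; 4] {
        let mut next = [0u32; 4];
        next[0] = prev[0] ^ assist;
        next[1] = prev[1] ^ next[0];
        next[2] = prev[2] ^ next[1];
        next[3] = prev[3] ^ next[2];
        next
    }
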
diff --git a/src/aessafe.rs b/src/aessafe.rs
index 78c5c97..d3e9183 100644
--- a/src/aessafe.rs
+++ b/src/aessafe.rs
@@ -127,16 +127,10 @@ use std::ops::{BitAnd, BitXor, Not};
 use std::default::Default;
 
 use cryptoutil::{read_u32v_le, write_u32_le};
+use simd::u32x4;
+use step_by::RangeExt;
 use symmetriccipher::{BlockEncryptor, BlockEncryptorX8, BlockDecryptor, BlockDecryptorX8};
 
-// Using std::unstable::simd::u32x4 results in issues creating static arrays of u32x4 values.
-// Defining the type here avoids that problem. Additionally, we need to implement various trait from
-// libstd which wouldn't be possible if we used that type directly.
-#[simd]
-#[derive(Copy, Eq, PartialEq)]
-#[allow(non_camel_case_types)]
-pub struct u32x4(u32, u32, u32, u32);
-
 const U32X4_0: u32x4 = u32x4(0, 0, 0, 0);
 const U32X4_1: u32x4 = u32x4(-1, -1, -1, -1);
 
@@ -373,7 +367,7 @@ fn create_round_keys(key: &[u8], key_type: KeyType, round_keys: &mut [[u32; 4]])
 
     // The key is copied directly into the first few round keys
     let mut j = 0;
-    for i in (0..key.len()).step_by(4) {
+    for i in (0..key.len()).step_up(4) {
         round_keys[j / 4][j % 4] =
             (key[i] as u32) |
             ((key[i+1] as u32) << 8) |
@@ -1195,22 +1189,6 @@ impl u32x4 {
     }
 }
 
-impl BitXor for u32x4 {
-    type Output = u32x4;
-
-    fn bitxor(self, rhs: u32x4) -> u32x4 {
-        self ^ rhs
-    }
-}
-
-impl BitAnd for u32x4 {
-    type Output = u32x4;
-
-    fn bitand(self, rhs: u32x4) -> u32x4 {
-        self & rhs
-    }
-}
-
 impl Not for u32x4 {
     type Output = u32x4;
 
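The removed BitXor and BitAnd impls are now supplied by the shared simd module's u32x4 (the software fake shown later in this patch), which implements the operators lane by lane. A sketch of the assumed shape of that shared impl:

    impl BitXor for u32x4 {
        type Output = u32x4;
        fn bitxor(self, rhs: u32x4) -> u32x4 {
            let (u32x4(a0, a1, a2, a3), u32x4(b0, b1, b2, b3)) = (self, rhs);
            u32x4(a0 ^ b0, a1 ^ b1, a2 ^ b2, a3 ^ b3)
        }
    }
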
diff --git a/src/bcrypt.rs b/src/bcrypt.rs
index 8375bb8..e44ece7 100644
--- a/src/bcrypt.rs
+++ b/src/bcrypt.rs
@@ -6,6 +6,7 @@
 
 use blowfish::Blowfish;
 use cryptoutil::{write_u32_be};
+use step_by::RangeExt;
 
 fn setup(cost: u32, salt: &[u8], key: &[u8]) -> Blowfish {
     let mut state = Blowfish::init_state();
@@ -27,7 +28,7 @@ pub fn bcrypt(cost: u32, salt: &[u8], password: &[u8], output: &mut [u8]) {
     let state = setup(cost, salt, password);
     // OrpheanBeholderScryDoubt
     let mut ctext = [0x4f727068, 0x65616e42, 0x65686f6c, 0x64657253, 0x63727944, 0x6f756274];
-    for i in (0..6).step_by(2) {
+    for i in (0..6).step_up(2) {
         for _ in (0..64) {
             let (l, r) = state.encrypt(ctext[i], ctext[i+1]);
             ctext[i] = l;
diff --git a/src/bcrypt_pbkdf.rs b/src/bcrypt_pbkdf.rs
index 4366e9b..60e6708 100644
--- a/src/bcrypt_pbkdf.rs
+++ b/src/bcrypt_pbkdf.rs
@@ -8,6 +8,7 @@ use blowfish::Blowfish;
 use cryptoutil::{read_u32v_be, write_u32_be, write_u32_le};
 use sha2::Sha512;
 use digest::Digest;
+use step_by::RangeExt;
 
 fn bcrypt_hash(hpass: &[u8], hsalt: &[u8], output: &mut [u8; 32]) {
     let mut bf = Blowfish::init_state();
@@ -21,7 +22,7 @@ fn bcrypt_hash(hpass: &[u8], hsalt: &[u8], output: &mut [u8; 32]) {
     let mut buf = [0u32; 8];
     read_u32v_be(&mut buf, b"OxychromaticBlowfishSwatDynamite");
 
-    for i in (0..8).step_by(2) {
+    for i in (0..8).step_up(2) {
         for _ in (0..64) {
             let (l, r) = bf.encrypt(buf[i], buf[i+1]);
             buf[i] = l;
diff --git a/src/blake2b.rs b/src/blake2b.rs
index 1f8aa5d..6c8ed0a 100644
--- a/src/blake2b.rs
+++ b/src/blake2b.rs
@@ -5,11 +5,10 @@
 // except according to those terms.
 
 use std::iter::repeat;
-use cryptoutil::{read_u64v_le, write_u64v_le};
-use std::slice::bytes::{copy_memory};
-use std::intrinsics::volatile_set_memory;
+use cryptoutil::{copy_memory, read_u64v_le, write_u64v_le};
 use digest::Digest;
 use mac::{Mac, MacResult};
+use util::secure_memset;
 
 static IV : [u64; 8] = [
   0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
@@ -185,9 +184,7 @@ impl Blake2b {
         let mut block : [u8; BLAKE2B_BLOCKBYTES] = [0; BLAKE2B_BLOCKBYTES];
         copy_memory(&self.key[..self.key_length as usize], &mut block);
         self.update(&block);
-        unsafe {
-            volatile_set_memory(block.as_mut_ptr(), 0, block.len());
-        }
+        secure_memset(&mut block[..], 0);
     }
 
     pub fn new_keyed(outlen: usize, key: &[u8] ) -> Blake2b {
diff --git a/src/blockmodes.rs b/src/blockmodes.rs
index 85ef732..c66e4c6 100644
--- a/src/blockmodes.rs
+++ b/src/blockmodes.rs
@@ -10,12 +10,11 @@
 
 use std::cmp;
 use std::iter::repeat;
-use std::slice;
 
 use buffer::{ReadBuffer, WriteBuffer, OwnedReadBuffer, OwnedWriteBuffer, BufferResult,
     RefReadBuffer, RefWriteBuffer};
 use buffer::BufferResult::{BufferUnderflow, BufferOverflow};
-use cryptoutil::symm_enc_or_dec;
+use cryptoutil::{self, symm_enc_or_dec};
 use symmetriccipher::{BlockEncryptor, BlockEncryptorX8, Encryptor, BlockDecryptor, Decryptor,
     SynchronousStreamCipher, SymmetricCipherError};
 use symmetriccipher::SymmetricCipherError::{InvalidPadding, InvalidLength};
@@ -93,13 +92,13 @@ struct BlockEngine<P, X> {
 fn update_history(in_hist: &mut [u8], out_hist: &mut [u8], last_in: &[u8], last_out: &[u8]) {
     let in_hist_len = in_hist.len();
     if in_hist_len > 0 {
-        slice::bytes::copy_memory(
+        cryptoutil::copy_memory(
             &last_in[last_in.len() - in_hist_len..],
             in_hist);
     }
     let out_hist_len = out_hist.len();
     if out_hist_len > 0 {
-        slice::bytes::copy_memory(
+        cryptoutil::copy_memory(
             &last_out[last_out.len() - out_hist_len..],
             out_hist);
     }
@@ -412,8 +411,8 @@ impl <P: BlockProcessor, X: PaddingProcessor> BlockEngine<P, X> {
     }
     fn reset_with_history(&mut self, in_hist: &[u8], out_hist: &[u8]) {
         self.reset();
-        slice::bytes::copy_memory(in_hist, &mut self.in_hist);
-        slice::bytes::copy_memory(out_hist, &mut self.out_hist);
+        cryptoutil::copy_memory(in_hist, &mut self.in_hist);
+        cryptoutil::copy_memory(out_hist, &mut self.out_hist);
     }
 }
 
@@ -690,7 +689,7 @@ impl <A: BlockEncryptor> CtrMode<A> {
         }
     }
     pub fn reset(&mut self, ctr: &[u8]) {
-        slice::bytes::copy_memory(ctr, &mut self.ctr);
+        cryptoutil::copy_memory(ctr, &mut self.ctr);
         self.bytes.reset();
     }
     fn process(&mut self, input: &[u8], output: &mut [u8]) {
@@ -744,7 +743,7 @@ pub struct CtrModeX8<A> {
 
 fn construct_ctr_x8(in_ctr: &[u8], out_ctr_x8: &mut [u8]) {
     for (i, ctr_i) in out_ctr_x8.chunks_mut(in_ctr.len()).enumerate() {
-        slice::bytes::copy_memory(in_ctr, ctr_i);
+        cryptoutil::copy_memory(in_ctr, ctr_i);
         add_ctr(ctr_i, i as u8);
     }
 }
diff --git a/src/blowfish.rs b/src/blowfish.rs
index 179b4f8..07a123c 100644
--- a/src/blowfish.rs
+++ b/src/blowfish.rs
@@ -6,6 +6,7 @@
 
 use cryptoutil::{read_u32v_be, write_u32_be};
 use symmetriccipher::{BlockEncryptor, BlockDecryptor};
+use step_by::RangeExt;
 
 #[derive(Copy)]
 pub struct Blowfish {
@@ -222,7 +223,7 @@ impl Blowfish {
         }
         let mut l = 0u32;
         let mut r = 0u32;
-        for i in (0..18).step_by(2) {
+        for i in (0..18).step_up(2) {
             let (new_l, new_r) = self.encrypt(l, r);
             l = new_l;
             r = new_r;
@@ -230,7 +231,7 @@ impl Blowfish {
             self.p[i+1] = r;
         }
         for i in (0..4) {
-            for j in (0..256).step_by(2) {
+            for j in (0..256).step_up(2) {
                 let (new_l, new_r) = self.encrypt(l, r);
                 l = new_l;
                 r = new_r;
@@ -249,7 +250,7 @@ impl Blowfish {
         let mut l = 0u32;
         let mut r = 0u32;
         let mut salt_pos = 0;
-        for i in (0..18).step_by(2) {
+        for i in (0..18).step_up(2) {
             let (new_l, new_r) = self.encrypt(l ^ next_u32_wrap(salt, &mut salt_pos), r ^ next_u32_wrap(salt, &mut salt_pos));
             l = new_l;
             r = new_r;
@@ -257,7 +258,7 @@ impl Blowfish {
             self.p[i+1] = r;
         }
         for i in (0..4) {
-            for j in (0..256).step_by(4) {
+            for j in (0..256).step_up(4) {
                 let (new_l, new_r) = self.encrypt(l ^ next_u32_wrap(salt, &mut salt_pos), r ^ next_u32_wrap(salt, &mut salt_pos));
                 l = new_l;
                 r = new_r;
@@ -279,7 +280,7 @@ impl Blowfish {
 
     // Public for bcrypt.
     pub fn encrypt(&self, mut l: u32, mut r: u32) -> (u32, u32) {
-        for i in (0..16).step_by(2) {
+        for i in (0..16).step_up(2) {
             l ^= self.p[i];
             r ^= self.round_function(l);
             r ^= self.p[i+1];
diff --git a/src/buffer.rs b/src/buffer.rs
index 1293c6a..cd9476c 100644
--- a/src/buffer.rs
+++ b/src/buffer.rs
@@ -5,7 +5,8 @@
 // except according to those terms.
 
 use std::cmp;
-use std::slice;
+
+use cryptoutil;
 
 #[derive(Copy)]
 pub enum BufferResult {
@@ -37,7 +38,7 @@ pub trait ReadBuffer {
 
     fn push_to<W: WriteBuffer>(&mut self, output: &mut W) {
         let count = cmp::min(output.remaining(), self.remaining());
-        slice::bytes::copy_memory(self.take_next(count), output.take_next(count));
+        cryptoutil::copy_memory(self.take_next(count), output.take_next(count));
     }
 }
 
diff --git a/src/cryptoutil.rs b/src/cryptoutil.rs
index f2ee239..b84fdf7 100644
--- a/src/cryptoutil.rs
+++ b/src/cryptoutil.rs
@@ -11,7 +11,6 @@
 use std;
 use std::{io, mem};
 use std::ptr;
-use std::slice::bytes::{MutableByteVector, copy_memory};
 
 use buffer::{ReadBuffer, WriteBuffer, BufferResult};
 use buffer::BufferResult::{BufferUnderflow, BufferOverflow};
@@ -189,6 +188,25 @@ pub fn xor_keystream(dst: &mut[u8], plaintext: &[u8], keystream: &[u8]) {
     }
 }
 
+/// Copy bytes from src to dest
+#[inline]
+pub fn copy_memory(src: &[u8], dst: &mut [u8]) {
+    assert!(dst.len() >= src.len());
+    unsafe {
+        let srcp = src.as_ptr();
+        let dstp = dst.as_mut_ptr();
+        ptr::copy_nonoverlapping(srcp, dstp, src.len());
+    }
+}
+
+/// Zero all bytes in dst
+#[inline]
+pub fn zero(dst: &mut [u8]) {
+    unsafe {
+        ptr::write_bytes(dst.as_mut_ptr(), 0, dst.len());
+    }
+}
+
 /// An extension trait to implement a few useful serialization
 /// methods on types that implement Write
 pub trait WriteExt {
@@ -392,7 +410,7 @@ macro_rules! impl_fixed_buffer( ($name:ident, $size:expr) => (
 
         fn zero_until(&mut self, idx: usize) {
             assert!(idx >= self.buffer_idx);
-            &mut self.buffer[self.buffer_idx..idx].set_memory(0);
+            zero(&mut self.buffer[self.buffer_idx..idx]);
             self.buffer_idx = idx;
         }
 
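These two helpers replace std::slice::bytes::copy_memory and MutableByteVector::set_memory, both removed from the standard library. A usage sketch:

    let src = [1u8, 2, 3];
    let mut dst = [0u8; 8];
    copy_memory(&src, &mut dst); // copies src.len() bytes; the assert rejects a too-short dst
    zero(&mut dst);              // overwrites every byte of dst with 0
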
diff --git a/src/curve25519.rs b/src/curve25519.rs
index a0f4efd..7c007ef 100644
--- a/src/curve25519.rs
+++ b/src/curve25519.rs
@@ -1,6 +1,7 @@
 use std::ops::{Add, Sub, Mul};
 use std::cmp::{Eq, PartialEq,min};
 use util::{fixed_time_eq};
+use step_by::RangeExt;
 
 /*
 fe means field element.
@@ -1479,7 +1480,7 @@ pub fn ge_scalarmult_base(a: &[u8]) -> GeP3 {
     /* each es[i] is between -8 and 8 */
 
     let mut h = GeP3::zero();
-    for i in (1..64).step_by(2) {
+    for i in (1..64).step_up(2) {
         t = GePrecomp::select(i/2, es[i]);
         r = h + t;
         h = r.to_p3();
@@ -1490,7 +1491,7 @@ pub fn ge_scalarmult_base(a: &[u8]) -> GeP3 {
     r = s.dbl(); s = r.to_p2();
     r = s.dbl(); h = r.to_p3();
 
-    for i in (0..64).step_by(2) {
+    for i in (0..64).step_up(2) {
         t = GePrecomp::select(i/2, es[i]);
         r = h + t;
         h = r.to_p3();
diff --git a/src/ed25519.rs b/src/ed25519.rs
index bb95399..286899f 100644
--- a/src/ed25519.rs
+++ b/src/ed25519.rs
@@ -82,9 +82,15 @@ fn check_s_lt_l(s: &[u8]) -> bool
     let mut c: u8 = 0;
     let mut n: u8 = 1;
 
-    for i in (31..-1).step_by(-1) {
+    let mut i = 31;
+    loop {
         c |= ((((s[i] as i32) - (l[i] as i32)) >> 8) as u8) & n;
         n &= (((((s[i] ^ l[i]) as i32)) - 1) >> 8) as u8;
+        if i == 0 {
+            break;
+        } else {
+            i -= 1;
+        }
     }
 
     c == 0
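The manual countdown replaces (31..-1).step_by(-1), which depended on the unstable negative-step iterator and is ill-formed for an unsigned index. A sketch of an equivalent formulation over the same indices using stable iterator reversal:

    // Iterate i = 31 down to 0 over the same byte comparisons.
    for i in (0..32).rev() {
        c |= ((((s[i] as i32) - (l[i] as i32)) >> 8) as u8) & n;
        n &= (((((s[i] ^ l[i]) as i32)) - 1) >> 8) as u8;
    }
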
diff --git a/src/fortuna.rs b/src/fortuna.rs
index 40b2e52..6248e1b 100644
--- a/src/fortuna.rs
+++ b/src/fortuna.rs
@@ -44,7 +44,7 @@
  * say) then you need to EXPLICITLY RESEED THE RNG AFTER FORKING.
  */
 
-use std::slice::bytes::copy_memory;
+use cryptoutil::copy_memory;
 
 use rand::{Rng, SeedableRng};
 use time::precise_time_s;
diff --git a/src/ghash.rs b/src/ghash.rs
index ed99561..2f7c1bc 100644
--- a/src/ghash.rs
+++ b/src/ghash.rs
@@ -17,7 +17,7 @@
 
 use std::ops::BitXor;
 use std::mem;
-use std::slice::bytes::copy_memory;
+use cryptoutil::copy_memory;
 
 use cryptoutil::{read_u32_be, write_u32_be};
 use mac::{Mac, MacResult};
@@ -56,21 +56,6 @@ impl Gf128 {
 
     // Multiply the element by x modulo x^128
     // This is equivalent to a rightshift in the bit representation
-    #[cfg(all(target_arch = "x86_64",ndebug))]
-    fn times_x(mut self) -> Gf128 {
-        unsafe {
-            asm!("
-                movdqa $0, %xmm1
-                psrlq $$1, $0
-                psllq $$63, %xmm1
-                pshufd $$0x0c, %xmm1, %xmm1
-                por %xmm1, $0
-                " : "+x" (self.d) : : "xmm1" );
-        }
-        self
-    }
-
-    #[cfg(any(not(target_arch = "x86_64"),not(ndebug)))]
     fn times_x(self) -> Gf128 {
         let simd::u32x4(a, b, c, d) = self.d;
         Gf128::new(a >> 1 | b << 31, b >> 1 | c << 31, c >> 1 |  d << 31, d >> 1)
@@ -96,24 +81,6 @@ impl Gf128 {
     }
 
     // This XORs the value of y with x if the LSB of self is set, otherwise y is returned
-    #[cfg(all(target_arch = "x86_64",ndebug))]
-    fn cond_xor(self, x: Gf128, mut y: Gf128) -> Gf128 {
-        let lsb = simd::u32x4(1, 0, 0, 0);
-        unsafe {
-            asm!("
-                movdqa $1, %xmm1
-                pand $3, %xmm1
-                pcmpeqd $3, %xmm1
-                pshufd $$0x00, %xmm1, %xmm1
-                pand $2, %xmm1
-                pxor %xmm1, $0
-                " : "+x" (y.d) : "x" (self.d), "x" (x.d), "x" (lsb) : "xmm1"
-            );
-        }
-        y
-    }
-
-    #[cfg(any(not(target_arch = "x86_64"),not(ndebug)))]
     fn cond_xor(self, x: Gf128, y: Gf128) -> Gf128 {
         use simd::SimdExt;
         let lsb = simd::u32x4(1, 0, 0, 0);
diff --git a/src/hkdf.rs b/src/hkdf.rs
index c6c7a58..84461b1 100644
--- a/src/hkdf.rs
+++ b/src/hkdf.rs
@@ -8,7 +8,7 @@
 //! Derivation Function as specified by  https://tools.ietf.org/html/rfc5869.
 
 use std::iter::repeat;
-use std::slice::bytes::copy_memory;
+use cryptoutil::copy_memory;
 
 use digest::Digest;
 use hmac::Hmac;
@@ -68,7 +68,7 @@ pub fn hkdf_expand<D: Digest>(mut digest: D, prk: &[u8], info: &[u8], okm: &mut
 
 #[cfg(test)]
 mod test {
-    use std::iter::{repeat, range_inclusive};
+    use std::iter::repeat;
 
     use digest::Digest;
     use sha1::Sha1;
@@ -92,8 +92,8 @@ mod test {
             TestVector{
                 digest: Sha256::new(),
                 ikm: repeat(0x0b).take(22).collect(),
-                salt: range_inclusive(0x00, 0x0c).collect(),
-                info: range_inclusive(0xf0, 0xf9).collect(),
+                salt: (0x00..0x0c + 1).collect(),
+                info: (0xf0..0xf9 + 1).collect(),
                 l: 42,
                 prk: vec![
                     0x07, 0x77, 0x09, 0x36, 0x2c, 0x2e, 0x32, 0xdf,
@@ -110,9 +110,9 @@ mod test {
             },
             TestVector{
                 digest: Sha256::new(),
-                ikm: range_inclusive(0x00, 0x4f).collect(),
-                salt: range_inclusive(0x60, 0xaf).collect(),
-                info: range_inclusive(0xb0, 0xff).collect(),
+                ikm: (0x00..0x4f + 1).collect(),
+                salt: (0x60..0xaf + 1).collect(),
+                info: (0xb0..0xff + 1).map(|x| x as u8).collect(),
                 l: 82,
                 prk: vec![
                     0x06, 0xa6, 0xb8, 0x8c, 0x58, 0x53, 0x36, 0x1a,
@@ -171,8 +171,8 @@ mod test {
             TestVector{
                 digest: Sha1::new(),
                 ikm: repeat(0x0b).take(11).collect(),
-                salt: range_inclusive(0x00, 0x0c).collect(),
-                info: range_inclusive(0xf0, 0xf9).collect(),
+                salt: (0x00..0x0c + 1).collect(),
+                info: (0xf0..0xf9 + 1).collect(),
                 l: 42,
                 prk: vec![
                     0x9b, 0x6c, 0x18, 0xc4, 0x32, 0xa7, 0xbf, 0x8f,
@@ -188,9 +188,9 @@ mod test {
             },
             TestVector{
                 digest: Sha1::new(),
-                ikm: range_inclusive(0x00, 0x4f).collect(),
-                salt: range_inclusive(0x60, 0xaf).collect(),
-                info: range_inclusive(0xb0, 0xff).collect(),
+                ikm: (0x00..0x4f + 1).collect(),
+                salt: (0x60..0xaf + 1).collect(),
+                info: (0xb0..0xff + 1).map(|x| x as u8).collect(),
                 l: 82,
                 prk: vec![
                     0x8a, 0xda, 0xe0, 0x9a, 0x2a, 0x30, 0x70, 0x59,
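Since range_inclusive was removed from std, the test vectors are rebuilt from half-open ranges with an explicit + 1 upper bound; the 0xff cases additionally map each element back to u8 because the literal 0x100 would overflow a u8 range. A sketch of that second pattern:

    // 0xff + 1 == 0x100 does not fit in u8, so the range is built over a
    // wider integer type and each element is narrowed back down.
    let info: Vec<u8> = (0xb0..0x100).map(|x| x as u8).collect();
    assert_eq!(info.len(), 0x50);
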
diff --git a/src/hmac.rs b/src/hmac.rs
index 401457f..1fc7744 100644
--- a/src/hmac.rs
+++ b/src/hmac.rs
@@ -9,8 +9,8 @@
  */
 
 use std::iter::repeat;
-use std::slice;
 
+use cryptoutil;
 use digest::Digest;
 use mac::{Mac, MacResult};
 
@@ -38,7 +38,7 @@ fn expand_key<D: Digest>(digest: &mut D, key: &[u8]) -> Vec<u8> {
     let mut expanded_key: Vec<u8> = repeat(0).take(bs).collect();
 
     if key.len() <= bs {
-        slice::bytes::copy_memory(key, &mut expanded_key);
+        cryptoutil::copy_memory(key, &mut expanded_key);
     } else {
         let output_size = digest.output_bytes();
         digest.input(key);
diff --git a/src/lib.rs b/src/lib.rs
index 8e3bf47..cb9a3d2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,17 +4,13 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-#![feature(asm)]
-#![feature(core)]
-#![feature(simd)]
-#![feature(step_by)]
-#![feature(convert)]
 #![cfg_attr(test, feature(test))]
 
 extern crate rand;
 extern crate rustc_serialize as serialize;
 extern crate time;
 #[cfg(test)] extern crate test;
+extern crate libc;
 
 pub mod aead;
 pub mod aes;
@@ -49,6 +45,7 @@ pub mod sha1;
 pub mod sha2;
 mod simd;
 pub mod sosemanuk;
+mod step_by;
 pub mod symmetriccipher;
 pub mod util;
 pub mod whirlpool;
diff --git a/src/md5.rs b/src/md5.rs
index 2cf8c9e..589cf85 100644
--- a/src/md5.rs
+++ b/src/md5.rs
@@ -10,6 +10,7 @@
 
 use cryptoutil::{write_u32_le, read_u32v_le, FixedBuffer, FixedBuffer64, StandardPadding};
 use digest::Digest;
+use step_by::RangeExt;
 
 
 // A structure that represents that state of a digest computation for the MD5 digest function
@@ -80,7 +81,7 @@ impl Md5State {
         read_u32v_le(&mut data, input);
 
         // round 1
-        for i in (0..16).step_by(4) {
+        for i in (0..16).step_up(4) {
             a = op_f(a, b, c, d, data[i].wrapping_add(C1[i]), 7);
             d = op_f(d, a, b, c, data[i + 1].wrapping_add(C1[i + 1]), 12);
             c = op_f(c, d, a, b, data[i + 2].wrapping_add(C1[i + 2]), 17);
@@ -89,7 +90,7 @@ impl Md5State {
 
         // round 2
         let mut t = 1;
-        for i in (0..16).step_by(4) {
+        for i in (0..16).step_up(4) {
             a = op_g(a, b, c, d, data[t & 0x0f].wrapping_add(C2[i]), 5);
             d = op_g(d, a, b, c, data[(t + 5) & 0x0f].wrapping_add(C2[i + 1]), 9);
             c = op_g(c, d, a, b, data[(t + 10) & 0x0f].wrapping_add(C2[i + 2]), 14);
@@ -99,7 +100,7 @@ impl Md5State {
 
         // round 3
         t = 5;
-        for i in (0..16).step_by(4) {
+        for i in (0..16).step_up(4) {
             a = op_h(a, b, c, d, data[t & 0x0f].wrapping_add(C3[i]), 4);
             d = op_h(d, a, b, c, data[(t + 3) & 0x0f].wrapping_add(C3[i + 1]), 11);
             c = op_h(c, d, a, b, data[(t + 6) & 0x0f].wrapping_add(C3[i + 2]), 16);
@@ -109,7 +110,7 @@ impl Md5State {
 
         // round 4
         t = 0;
-        for i in (0..16).step_by(4) {
+        for i in (0..16).step_up(4) {
             a = op_i(a, b, c, d, data[t & 0x0f].wrapping_add(C4[i]), 6);
             d = op_i(d, a, b, c, data[(t + 7) & 0x0f].wrapping_add(C4[i + 1]), 10);
             c = op_i(c, d, a, b, data[(t + 14) & 0x0f].wrapping_add(C4[i + 2]), 15);
diff --git a/src/pbkdf2.rs b/src/pbkdf2.rs
index 3216f30..d2cbac3 100644
--- a/src/pbkdf2.rs
+++ b/src/pbkdf2.rs
@@ -11,7 +11,7 @@
 
 use std::iter::repeat;
 use std::io;
-use std::slice::bytes::copy_memory;
+use cryptoutil::copy_memory;
 
 use rand::{OsRng, Rng};
 use serialize::base64;
diff --git a/src/scrypt.rs b/src/scrypt.rs
index a956bff..2a58347 100644
--- a/src/scrypt.rs
+++ b/src/scrypt.rs
@@ -12,11 +12,11 @@
  *       http://www.tarsnap.com/scrypt/scrypt.pdf
  */
 
+use std;
 use std::iter::repeat;
 use std::io;
-use std::num::{Int, ToPrimitive};
 use std::mem::size_of;
-use std::slice::bytes::copy_memory;
+use cryptoutil::copy_memory;
 
 use rand::{OsRng, Rng};
 use serialize::base64;
@@ -165,9 +165,10 @@ impl ScryptParams {
         assert!(p > 0);
         assert!(log_n > 0);
         assert!((log_n as usize) < size_of::<usize>() * 8);
+        assert!(size_of::<usize>() >= size_of::<u32>() || (r <= std::usize::MAX as u32 && p <= std::usize::MAX as u32));
 
-        let r = r.to_usize().unwrap();
-        let p = p.to_usize().unwrap();
+        let r = r as usize;
+        let p = p as usize;
 
         let n: usize = 1 << log_n;
 
diff --git a/src/simd.rs b/src/simd.rs
index c1896a1..6138d42 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -4,17 +4,12 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-#[cfg(not(ndebug))]
 pub use self::fake::*;
 
-#[cfg(ndebug)]
-pub use self::real::*;
-
 pub trait SimdExt {
     fn simd_eq(self, rhs: Self) -> Self;
 }
 
-#[cfg(not(ndebug))]
 impl SimdExt for fake::u32x4 {
     fn simd_eq(self, rhs: Self) -> Self {
         if self == rhs {
@@ -25,14 +20,6 @@ impl SimdExt for fake::u32x4 {
     }
 }
 
-#[cfg(ndebug)]
-impl SimdExt for real::u32x4 {
-    fn simd_eq(self, rhs: Self) -> Self {
-        self == rhs
-    }
-}
-
-#[cfg(not(ndebug))]
 mod fake {
     use std::ops::{Add, BitAnd, BitOr, BitXor, Shl, Shr, Sub};
 
@@ -133,9 +120,3 @@ mod fake {
     }
 }
 
-#[cfg(ndebug)]
-mod real {
-    pub use std::simd::u32x4;
-    pub use std::simd::u64x2;
-}
-
diff --git a/src/sosemanuk.rs b/src/sosemanuk.rs
index 96f5782..5b050d7 100644
--- a/src/sosemanuk.rs
+++ b/src/sosemanuk.rs
@@ -9,7 +9,7 @@ use buffer::{BufferResult, RefReadBuffer, RefWriteBuffer};
 use symmetriccipher::{Encryptor, Decryptor, SynchronousStreamCipher, SymmetricCipherError};
 use cryptoutil::{read_u32_le, symm_enc_or_dec, write_u32v_le};
  
-use std::slice::bytes::copy_memory;
+use cryptoutil::copy_memory;
 
 
 const ALPHA_MUL_TABLE : [u32; 256] = 
diff --git a/src/step_by.rs b/src/step_by.rs
new file mode 100644
index 0000000..ac12d4b
--- /dev/null
+++ b/src/step_by.rs
@@ -0,0 +1,50 @@
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+/// This module just implements a simple version of step_by() since
+/// the function from the standard library is currently unstable.
+/// This should be removed once that function becomes stable.
+
+use std::ops::{Add, Range};
+
+#[derive(Clone)]
+pub struct StepUp<T> {
+    next: T,
+    end: T,
+    amount: T
+}
+
+impl <T> Iterator for StepUp<T> where
+        T: Add<T, Output = T> + PartialOrd + Copy {
+    type Item = T;
+
+    #[inline]
+    fn next(&mut self) -> Option<T> {
+        if self.next < self.end {
+            let n = self.next;
+            self.next = self.next + self.amount;
+            Some(n)
+        } else {
+            None
+        }
+    }
+}
+
+pub trait RangeExt<T> {
+    fn step_up(self, amount: T) -> StepUp<T>;
+}
+
+impl <T> RangeExt<T> for Range<T> where
+        T: Add<T, Output = T> + PartialOrd + Copy {
+    fn step_up(self, amount: T) -> StepUp<T> {
+        StepUp {
+            next: self.start,
+            end: self.end,
+            amount: amount
+        }
+    }
+}
+
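A quick usage sketch of the replacement iterator, matching how the call sites above were migrated:

    use step_by::RangeExt;

    let idx: Vec<usize> = (0..16).step_up(4).collect();
    assert_eq!(idx, [0, 4, 8, 12]); // the same sequence the unstable step_by(4) produced
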
diff --git a/src/util.rs b/src/util.rs
index 5f913df..b3403e5 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -4,83 +4,38 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use libc;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+extern {
+    pub fn rust_crypto_util_supports_aesni() -> u32;
+}
+
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 pub fn supports_aesni() -> bool {
-    let mut flags: u32;
     unsafe {
-        asm!(
-        "
-        mov $$1, %eax;
-        cpuid;
-        mov %ecx, $0;
-        "
-        : "=r" (flags) // output
-        : // input
-        : "eax", "ebx", "ecx", "edx" // clobbers
-        );
-        // No idea why, but on 32-bit targets, the compiler complains
-        // about not having enough registers. Adding in this dummy
-        // section, however, seems to fix it.
-        asm!("");
+        rust_crypto_util_supports_aesni() != 0
     }
-
-    (flags & 0x02000000) != 0
 }
 
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[allow(unused_assignments)]
-#[allow(unused_variables)]
-unsafe fn fixed_time_eq_asm(mut lhsp: *const u8, mut rhsp: *const u8, mut count: usize) -> bool {
-    let mut result: u8 = 0;
-
-    asm!(
-        "
-            1:
-
-            mov ($1), %cl
-            xor ($2), %cl
-            or %cl, $0
-
-            inc $1
-            inc $2
-            dec $3
-            jnz 1b
-        "
-        : "+r" (result), "+r" (lhsp), "+r" (rhsp), "+r" (count) // all input and output
-        : // input
-        : "cl", "cc" // clobbers
-        : "volatile" // flags
-    );
-
-    result == 0
+extern {
+    pub fn rust_crypto_util_fixed_time_eq_asm(
+            lhsp: *const u8,
+            rhsp: *const u8,
+            count: libc::size_t) -> u32;
+    pub fn rust_crypto_util_secure_memset(
+            dst: *mut u8,
+            val: libc::uint8_t,
+            count: libc::size_t);
 }
 
-#[cfg(target_arch = "arm")]
-#[allow(unused_assignments)]
-unsafe fn fixed_time_eq_asm(mut lhsp: *const u8, mut rhsp: *const u8, mut count: usize) -> bool {
-    let mut result: u8 = 0;
-
-    asm!(
-        "
-            1:
-
-            ldrb r4, [$1]
-            ldrb r5, [$2]
-            eor r4, r4, r5
-            orr $0, $0, r4
-
-            add $1, $1, #1
-            add $2, $2, #1
-            subs $3, $3, #1
-            bne 1b
-        "
-        : "+r" (result), "+r" (lhsp), "+r" (rhsp), "+r" (count) // all input and output
-        : // input
-        : "r4", "r5", "cc" // clobbers
-        : "volatile" // flags
-    );
-
-    result == 0
+pub fn secure_memset(dst: &mut [u8], val: u8) {
+    unsafe {
+        rust_crypto_util_secure_memset(
+            dst.as_mut_ptr(),
+            val,
+            dst.len() as libc::size_t);
+    }
 }
 
 /// Compare two vectors using a fixed number of operations. If the two vectors are not of equal
@@ -91,12 +46,12 @@ pub fn fixed_time_eq(lhs: &[u8], rhs: &[u8]) -> bool {
     } else if lhs.len() == 0 {
         true
     } else {
-        let count = lhs.len();
+        let count = lhs.len() as libc::size_t;
 
         unsafe {
             let lhsp = lhs.get_unchecked(0);
             let rhsp = rhs.get_unchecked(0);
-            fixed_time_eq_asm(lhsp, rhsp, count)
+            rust_crypto_util_fixed_time_eq_asm(lhsp, rhsp, count) == 0
         }
     }
 }
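Note the inverted sense at the call site: the C helper ORs together the XOR of every byte pair, so it returns zero exactly when the slices are equal, and its running time depends only on the length. Usage sketch:

    assert!(fixed_time_eq(b"secret", b"secret"));
    assert!(!fixed_time_eq(b"secret", b"secreT")); // costs the same as the equal case
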
diff --git a/src/util_helpers.c b/src/util_helpers.c
new file mode 100644
index 0000000..5a20fa2
--- /dev/null
+++ b/src/util_helpers.c
@@ -0,0 +1,79 @@
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+uint32_t rust_crypto_util_supports_aesni() {
+    uint32_t flags;
+    asm(
+        "mov $1, %%eax; cpuid;"
+        : "=c" (flags) // output
+        : // input
+        : "eax", "ebx", "edx" // clobbers
+    );
+    return flags & 0x02000000;
+}
+#endif
+
+#if defined(__i386__) || defined(__x86_64__)
+uint32_t rust_crypto_util_fixed_time_eq_asm(uint8_t* lhsp, uint8_t* rhsp, size_t count) {
+    uint8_t result = 0;
+    asm(
+        " \
+            1: \
+            \
+            mov (%1), %%cl; \
+            xor (%2), %%cl; \
+            or %%cl, %0; \
+            \
+            inc %1; \
+            inc %2; \
+            dec %3; \
+            jnz 1b; \
+        "
+        : "+&r" (result), "+&r" (lhsp), "+&r" (rhsp), "+&r" (count) // all input and output
+        : // input
+        : "cl", "cc" // clobbers
+    );
+
+    return result;
+}
+#endif
+
+#ifdef __arm__
+uint32_t rust_crypto_util_fixed_time_eq_asm(uint8_t* lhsp, uint8_t* rhsp, size_t count) {
+    uint8_t result = 0;
+    asm(
+        " \
+            1: \
+            \
+            ldrb r4, [%1]; \
+            ldrb r5, [%2]; \
+            eor r4, r4, r5; \
+            orr %0, %0, r4; \
+            \
+            add %1, %1, #1; \
+            add %2, %2, #1; \
+            subs %3, %3, #1; \
+            bne 1b; \
+        "
+        : "+&r" (result), "+&r" (lhsp), "+&r" (rhsp), "+&r" (count) // all input and output
+        : // input
+        : "r4", "r5", "cc" // clobbers
+    );
+
+    return result;
+}
+#endif
+
+void rust_crypto_util_secure_memset(uint8_t* dst, uint8_t val, size_t count) {
+    memset(dst, val, count);
+    asm("" : : "g" (dst) : "memory");
+}
+