From 381d2ed70d3f3c2913e19a950dee0da0149dae1d Mon Sep 17 00:00:00 2001
From: Ulrik Sverdrup
Date: Sat, 25 Jul 2015 11:52:58 +0200
Subject: [PATCH 1/4] siphash: Add more benchmarks

---
 src/libcoretest/hash/sip.rs | 90 +++++++++++++++++++++++++++++++++++--
 1 file changed, 87 insertions(+), 3 deletions(-)

diff --git a/src/libcoretest/hash/sip.rs b/src/libcoretest/hash/sip.rs
index 7832985d3f1c1..9b6cedd25b741 100644
--- a/src/libcoretest/hash/sip.rs
+++ b/src/libcoretest/hash/sip.rs
@@ -7,7 +7,7 @@
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
-use test::Bencher;
+use test::{Bencher, black_box};
 
 use core::hash::{Hash, Hasher};
 use core::hash::SipHasher;
@@ -57,6 +57,12 @@ fn hash_with_keys<T: Hash>(k1: u64, k2: u64, x: &T) -> u64 {
     st.finish()
 }
 
+fn hash_bytes(x: &[u8]) -> u64 {
+    let mut s = SipHasher::default();
+    Hasher::write(&mut s, x);
+    s.finish()
+}
+
 #[test]
 #[allow(unused_must_use)]
 fn test_siphash() {
@@ -266,10 +272,88 @@ officia deserunt mollit anim id est laborum.";
     })
 }
 
+#[bench]
+fn bench_u32(b: &mut Bencher) {
+    let u = 162629500u32;
+    let u = black_box(u);
+    b.iter(|| {
+        hash(&u)
+    });
+    b.bytes = 8;
+}
+
+#[bench]
+fn bench_u32_keyed(b: &mut Bencher) {
+    let u = 162629500u32;
+    let u = black_box(u);
+    let k1 = black_box(0x1);
+    let k2 = black_box(0x2);
+    b.iter(|| {
+        hash_with_keys(k1, k2, &u)
+    });
+    b.bytes = 8;
+}
+
 #[bench]
 fn bench_u64(b: &mut Bencher) {
     let u = 16262950014981195938u64;
+    let u = black_box(u);
     b.iter(|| {
-        assert_eq!(hash(&u), 5254097107239593357);
-    })
+        hash(&u)
+    });
+    b.bytes = 8;
+}
+
+#[bench]
+fn bench_bytes_4(b: &mut Bencher) {
+    let data = black_box([b' '; 4]);
+    b.iter(|| {
+        hash_bytes(&data)
+    });
+    b.bytes = 4;
+}
+
+#[bench]
+fn bench_bytes_7(b: &mut Bencher) {
+    let data = black_box([b' '; 7]);
+    b.iter(|| {
+        hash_bytes(&data)
+    });
+    b.bytes = 7;
+}
+
+#[bench]
+fn bench_bytes_8(b: &mut Bencher) {
+    let data = black_box([b' '; 8]);
+    b.iter(|| {
+        hash_bytes(&data)
+    });
+    b.bytes = 8;
+}
+
+#[bench]
+fn bench_bytes_a_16(b: &mut Bencher) {
+    let data = black_box([b' '; 16]);
+    b.iter(|| {
+        hash_bytes(&data)
+    });
+    b.bytes = 16;
+}
+
+#[bench]
+fn bench_bytes_b_32(b: &mut Bencher) {
+    let data = black_box([b' '; 32]);
+    b.iter(|| {
+        hash_bytes(&data)
+    });
+    b.bytes = 32;
+}
+
+#[bench]
+fn bench_bytes_c_128(b: &mut Bencher) {
+    let data = black_box([b' '; 128]);
+    b.iter(|| {
+        hash_bytes(&data)
+    });
+    b.bytes = 128;
 }
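The benchmarks above route their inputs through black_box so that LLVM
cannot constant-fold the hash of a compile-time-known value; without it,
b.iter can end up timing an empty loop. A minimal standalone sketch of the
same pattern, assuming current Rust (std::hint::black_box is the stable
successor of the unstable test::black_box used in the patch, and SipHasher
has since been deprecated in std):

    #![allow(deprecated)] // SipHasher was deprecated after this patch era

    use std::hash::{Hash, Hasher, SipHasher};
    use std::hint::black_box;

    fn hash<T: Hash>(x: &T) -> u64 {
        let mut st = SipHasher::new();
        x.hash(&mut st);
        st.finish()
    }

    fn main() {
        // black_box hides the value from the optimizer, so the hash
        // cannot be precomputed and a measurement loop stays honest.
        let u = black_box(162629500u32);
        println!("{:x}", hash(&u));
    }
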
From f910d27f87419e17cc59034265f6795db5247dfa Mon Sep 17 00:00:00 2001
From: Ulrik Sverdrup
Date: Sat, 25 Jul 2015 11:55:26 +0200
Subject: [PATCH 2/4] siphash: Use ptr::copy_nonoverlapping for efficient
 data loading

Use `ptr::copy_nonoverlapping` (aka memcpy) to load a u64 from the
byte stream. This is correct for any alignment, and the compiler will
use the appropriate instruction to load the data.

Also use unchecked indexing. Together this results in a large
improvement in throughput (hashed bytes / second) for long data. The
maximum improvement benches at a 70% increase in throughput for large
values (> 256 bytes), but values of 16 bytes or larger already improve.

Unchecked indexing is introduced to reach the best possible throughput;
using ptr::copy_nonoverlapping with checked indexing would land the
improvement some 20-30 percentage points lower. We use a debug
assertion so that the test suite checks our use of unchecked indexing.
---
 src/libcore/hash/sip.rs | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/libcore/hash/sip.rs b/src/libcore/hash/sip.rs
index d26e9ab707205..fae14da22c4c6 100644
--- a/src/libcore/hash/sip.rs
+++ b/src/libcore/hash/sip.rs
@@ -10,6 +10,7 @@
 
 //! An implementation of SipHash 2-4.
 
+use ptr;
 use prelude::*;
 
 use super::Hasher;
@@ -65,6 +66,20 @@ macro_rules! u8to64_le {
     });
 }
 
+/// Load a full u64 word from a byte stream, in LE order. Use
+/// `copy_nonoverlapping` to let the compiler generate the most efficient way
+/// to load u64 from a possibly unaligned address.
+///
+/// Unsafe because: unchecked indexing at i..i+8
+#[inline]
+unsafe fn load_u64_le(buf: &[u8], i: usize) -> u64 {
+    debug_assert!(i + 8 <= buf.len());
+    let mut data = 0u64;
+    ptr::copy_nonoverlapping(buf.get_unchecked(i),
+                             &mut data as *mut _ as *mut u8, 8);
+    data.to_le()
+}
+
 macro_rules! rotl {
     ($x:expr, $b:expr) =>
     (($x << $b) | ($x >> (64_i32.wrapping_sub($b))))
@@ -151,7 +166,7 @@ impl SipHasher {
 
         let mut i = needed;
         while i < end {
-            let mi = u8to64_le!(msg, i);
+            let mi = unsafe { load_u64_le(msg, i) };
 
             self.v3 ^= mi;
             compress!(self.v0, self.v1, self.v2, self.v3);

From 5f6a61e16524025a690ac5512669583145db94b1 Mon Sep 17 00:00:00 2001
From: Ulrik Sverdrup
Date: Sat, 25 Jul 2015 11:57:02 +0200
Subject: [PATCH 3/4] siphash: Remove one variable

Without this temporary variable, codegen improves slightly and fewer
registers are spilled to the stack in SipHash::write.
---
 src/libcore/hash/sip.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/libcore/hash/sip.rs b/src/libcore/hash/sip.rs
index fae14da22c4c6..5b6fd46f677af 100644
--- a/src/libcore/hash/sip.rs
+++ b/src/libcore/hash/sip.rs
@@ -161,11 +161,10 @@ impl SipHasher {
 
         // Buffered tail is now flushed, process new input.
         let len = length - needed;
-        let end = len & (!0x7);
         let left = len & 0x7;
 
         let mut i = needed;
-        while i < end {
+        while i < len - left {
             let mi = unsafe { load_u64_le(msg, i) };
 
             self.v3 ^= mi;

From 27c44ce9c3be36d49b829e3dfbdcc983bddd727d Mon Sep 17 00:00:00 2001
From: Ulrik Sverdrup
Date: Sat, 25 Jul 2015 11:59:06 +0200
Subject: [PATCH 4/4] siphash: Reorder hash state in the struct

If the state variables are ordered v0, v2, v1, v3, the compiler can
find a few SIMD optimizations by itself. The new optimization I
observed on x86-64 was the use of 128-bit registers for the
v = key ^ constant operations in new / reset.
---
 src/libcore/hash/sip.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/libcore/hash/sip.rs b/src/libcore/hash/sip.rs
index 5b6fd46f677af..93bdadff549ae 100644
--- a/src/libcore/hash/sip.rs
+++ b/src/libcore/hash/sip.rs
@@ -32,9 +32,13 @@ pub struct SipHasher {
     k0: u64,
     k1: u64,
     length: usize, // how many bytes we've processed
+    // v0, v2 and v1, v3 show up in pairs in the algorithm,
+    // and simd implementations of SipHash will use vectors
+    // of v02 and v13. By placing them in this order in the struct,
+    // the compiler can pick up on just a few simd optimizations by itself.
     v0: u64, // hash state
-    v1: u64,
     v2: u64,
+    v1: u64,
     v3: u64,
     tail: u64, // unprocessed bytes le
     ntail: usize, // how many bytes in tail are valid
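
The load routine from patch 2 can be exercised outside libcore. Below is a
minimal self-contained sketch: load_u64_le mirrors the patch, while the
surrounding test harness is invented for this example. It checks the
memcpy-based load against a byte-by-byte reference at a deliberately
unaligned offset:

    use std::ptr;

    /// Load a u64 from `buf[i..i + 8]` in little-endian order. The
    /// `copy_nonoverlapping` (memcpy) load is correct for any alignment;
    /// the function is unsafe because of the unchecked indexing at i..i+8.
    #[inline]
    unsafe fn load_u64_le(buf: &[u8], i: usize) -> u64 {
        debug_assert!(i + 8 <= buf.len());
        let mut data = 0u64;
        ptr::copy_nonoverlapping(buf.get_unchecked(i),
                                 &mut data as *mut u64 as *mut u8, 8);
        data.to_le()
    }

    fn main() {
        let buf: Vec<u8> = (1u8..=16).collect();
        // Offset 3 is not 8-byte aligned; the memcpy-based load handles it.
        let fast = unsafe { load_u64_le(&buf, 3) };
        // Reference: assemble the same word one byte at a time.
        let slow = buf[3..11].iter().rev()
                             .fold(0u64, |acc, &b| (acc << 8) | u64::from(b));
        assert_eq!(fast, slow);
        println!("{:#018x}", fast);
    }

On current Rust the same load would more likely be written as
u64::from_le_bytes(buf[i..i + 8].try_into().unwrap()); the formulation
above is the 2015-era equivalent that the patch introduces.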