diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index c7ce09f..f3002de 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -15,18 +15,30 @@ jobs: - uses: actions/checkout@v4 - name: Benchmark - run: cargo bench --bench throughput --features 'bench-plot' + run: cargo bench --bench throughput --features bench-plot + + - uses: actions/upload-artifact@v3 + with: + name: benches + path: benches/throughput/x86_64.svg + + benchmark-x86-avx2: + name: Benchmark X86 AVX2 + runs-on: buildjet-2vcpu-ubuntu-2204 + + steps: + - uses: actions/checkout@v4 - name: Switch to nightly rust run: rustup default nightly - - name: Benchmark AVX2 (nightly) - run: cargo bench --bench throughput --features 'bench-plot avx2' + - name: Benchmark + run: cargo bench --bench throughput --features bench-plot - uses: actions/upload-artifact@v3 with: name: benches - path: benches/throughput/*.svg + path: benches/throughput/x86_64-hybrid.svg benchmark-arm: name: Benchmark ARM @@ -36,17 +48,17 @@ jobs: - uses: actions/checkout@v4 - name: Benchmark - run: cargo bench --bench throughput --features 'bench-plot' + run: cargo bench --bench throughput --features bench-plot - uses: actions/upload-artifact@v3 with: name: benches - path: benches/throughput/*.svg + path: benches/throughput/aarch64.svg commit: name: Commit & Push runs-on: buildjet-2vcpu-ubuntu-2204 - needs: [benchmark-x86, benchmark-arm] + needs: [benchmark-x86, benchmark-x86-avx2, benchmark-arm] permissions: contents: write diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index a178298..45e04cf 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -10,14 +10,44 @@ env: CARGO_TERM_COLOR: always jobs: - build_test: + build_test_x86: + name: Build & Test X86 + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 - name: Build & Test + - name: Build + run: cargo build --release + + - name: Test + run: cargo test --release + + build_test_x86_avx2: + name: Build & Test X86 AVX2 runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 + + - name: Switch to nightly rust + run: rustup default nightly + - name: Build run: cargo build --release + + - name: Test + run: cargo test --release + + build_test_arm: + name: Build & Test ARM + runs-on: buildjet-2vcpu-ubuntu-2204-arm + + steps: + - uses: actions/checkout@v3 + + - name: Build + run: cargo build --release + - name: Test run: cargo test --release \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 42ce8c3..289de9c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "gxhash" authors = ["Olivier Giniaux"] -version = "2.3.0" +version = "3.0.0" edition = "2021" description = "GxHash non-cryptographic algorithm" license = "MIT" @@ -13,10 +13,6 @@ categories = ["algorithms", "data-structures", "no-std"] exclude = ["article/*"] [features] -# The 256-bit state GxHash is faster for large inputs than the default 128-bit state implementation, but faster on smaller hashes. -# Please not however that the 256-bit GxHash and the 128-bit GxHash don't generate the same hashes for a same input. -# Requires AVX2 and VAES (X86). 
-avx2 = [] # Only relevant for throughput benchmarks bench-csv = [] bench-md = [] @@ -39,6 +35,9 @@ seahash = "4.1.0" metrohash = "1.0.6" fnv = "1.0.3" +[build-dependencies] +rustc_version = "0.4.0" + [dev-dependencies.plotters] version = "0.3.5" default-features = false diff --git a/README.md b/README.md index a18522d..eac8029 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ GxHash is compatible with: > Other platforms are currently not supported (there is no fallback). The behavior on these platforms is undefined. ### Hashes Stability -All generated hashes for a given version of GxHash are stable, meaning that for a given input the output hash will be the same across all supported platforms. An exception to this is the AVX2 version of GxHash (nightly). +All generated hashes for a given version of GxHash are stable, meaning that for a given input the output hash will be the same across all supported platforms. ## Benchmarks @@ -74,7 +74,7 @@ GxHash is continuously benchmarked on X86 and ARM Github runners. GxHash is a seeded hashing algorithm, meaning that depending on the seed used, it will generate completely different hashes. The default `HasherBuilder` (`GxHasherBuilder::default()`) uses seed randomization, making any `HashMap`/`HashSet` more DOS resistant, as it will make it much more difficult for attackers to be able to predict which hashes may collide without knowing the seed used. This does not mean however that it is completely DOS resistant. This has to be analyzed further. ### Multicollisions Resistance -GxHash uses a 128-bit internal state (and even 256-bit with the `avx2` feature). This makes GxHash [a widepipe construction](https://en.wikipedia.org/wiki/Merkle%E2%80%93Damg%C3%A5rd_construction#Wide_pipe_construction) when generating hashes of size 64-bit or smaller, which had amongst other properties to be inherently more resistant to multicollision attacks. See [this paper](https://www.iacr.org/archive/crypto2004/31520306/multicollisions.pdf) for more details. +GxHash uses a 128-bit internal state. This makes GxHash [a widepipe construction](https://en.wikipedia.org/wiki/Merkle%E2%80%93Damg%C3%A5rd_construction#Wide_pipe_construction) when generating hashes of size 64-bit or smaller, which had amongst other properties to be inherently more resistant to multicollision attacks. See [this paper](https://www.iacr.org/archive/crypto2004/31520306/multicollisions.pdf) for more details. ### Cryptographic Properties GxHash is a non-cryptographic hashing algorithm, thus it is not recommended to use it as a cryptographic algorithm (it is not a replacement for SHA). It has not been assessed if GxHash is preimage resistant and how difficult it is to be reversed. 
@@ -103,4 +103,4 @@ Publication: [PDF](https://github.com/ogxd/gxhash-rust/blob/main/article/article.pdf) Cite this publication / algorithm: -[![DOI](https://zenodo.org/badge/690754256.svg)](https://zenodo.org/badge/latestdoi/690754256) +[![DOI](https://zenodo.org/badge/690754256.svg)](https://zenodo.org/badge/latestdoi/690754256) \ No newline at end of file diff --git a/benches/throughput/aarch64.svg b/benches/throughput/aarch64.svg index 63c9d27..541e304 100644 --- a/benches/throughput/aarch64.svg +++ b/benches/throughput/aarch64.svg @@ -4,12 +4,13 @@ Throughput (aarch64) - - - - - - + + + + + + + Throughput (MiB/s) @@ -31,41 +32,46 @@ Input Size (bytes) - - - - - - + + + + + + + 0 - + 5000 - - + + 10000 - - + + 15000 - - + + 20000 - - + + 25000 - - + + 30000 - + + +35000 + + 4 @@ -123,14 +129,14 @@ Input Size (bytes) 32768 - - - - - - - - + + + + + + + + diff --git a/benches/throughput/main.rs b/benches/throughput/main.rs index 3b78fa7..3a48963 100644 --- a/benches/throughput/main.rs +++ b/benches/throughput/main.rs @@ -38,7 +38,7 @@ fn main() { }; // GxHash - let gxhash_name = if cfg!(feature = "avx2") { "gxhash-avx2" } else { "gxhash" }; + let gxhash_name = if cfg!(hybrid) { "gxhash-hybrid" } else { "gxhash" }; benchmark(processor.as_mut(), slice, gxhash_name, |data: &[u8], seed: i64| -> u64 { gxhash64(data, seed) }); diff --git a/benches/throughput/result_processor.rs b/benches/throughput/result_processor.rs index 49bece6..2b15546 100644 --- a/benches/throughput/result_processor.rs +++ b/benches/throughput/result_processor.rs @@ -120,8 +120,8 @@ impl ResultProcessor for OutputPlot { fn finish(&self) { let mut arch = std::env::consts::ARCH.to_string(); - if cfg!(feature = "avx2") { - arch += "-avx2"; + if cfg!(hybrid) { + arch += "-hybrid"; } let file_name = format!("benches/throughput/{}.svg", arch); @@ -176,6 +176,6 @@ impl ResultProcessor for OutputPlot { // To avoid the IO failure being ignored silently, we manually call the present function canvas.present().expect("Unable to write result to file, please make sure 'plotters-doc-data' dir exists under current dir"); - println!("Finished"); + println!("Finished: '{}'", file_name); } } \ No newline at end of file diff --git a/benches/throughput/x86_64-avx2.svg b/benches/throughput/x86_64-hybrid.svg similarity index 72% rename from benches/throughput/x86_64-avx2.svg rename to benches/throughput/x86_64-hybrid.svg index 3e769ce..918f5c7 100644 --- a/benches/throughput/x86_64-avx2.svg +++ b/benches/throughput/x86_64-hybrid.svg @@ -1,17 +1,16 @@ -Throughput (x86_64-avx2) +Throughput (x86_64-hybrid) - - - - - - - - + + + + + + + Throughput (MiB/s) @@ -33,51 +32,46 @@ Input Size (bytes) - - - - - - - - + + + + + + + 0 - + 20000 - - + + 40000 - - + + 60000 - - + + 80000 - - + + 100000 - - + + 120000 - - + + 140000 - - -160000 - - + 4 @@ -135,46 +129,46 @@ Input Size (bytes) 32768 - - - - - - - - - - - -gxhash-avx2 - - + + + + + + + + + + + +gxhash-hybrid + + xxhash - + ahash - + t1ha0 - + seahash - + metrohash - + highwayhash - + fnv-1a - - - - - - - - + + + + + + + + diff --git a/benches/throughput/x86_64.svg b/benches/throughput/x86_64.svg index 16a5047..0a34d6c 100644 --- a/benches/throughput/x86_64.svg +++ b/benches/throughput/x86_64.svg @@ -4,16 +4,11 @@ Throughput (x86_64) - - - - - - - - - - + + + + + Throughput (MiB/s) @@ -35,61 +30,36 @@ Input Size (bytes) - - - - - - - - - - + + + + + 0 - -10000 - - - + 20000 - - -30000 - - - + + 40000 - - -50000 - - - + + 60000 - - -70000 - - - + + 80000 - - -90000 - - - + + 100000 - + 4 @@ 
-147,14 +117,14 @@ Input Size (bytes) 32768 - - - - - - - - + + + + + + + + diff --git a/benches/throughput_criterion.rs b/benches/throughput_criterion.rs index f999da6..7d355fa 100644 --- a/benches/throughput_criterion.rs +++ b/benches/throughput_criterion.rs @@ -44,8 +44,8 @@ fn benchmark_all(c: &mut Criterion) { group.plot_config(plot_config); // GxHash - let algo_name = if cfg!(feature = "avx2") { "gxhash-avx2" } else { "gxhash" }; - benchmark(&mut group, slice, algo_name, |data: &[u8], seed: i32| -> u64 { + let gxhash_name = if cfg!(hybrid) { "gxhash-hybrid" } else { "gxhash" }; + benchmark(&mut group, slice, gxhash_name, |data: &[u8], seed: i32| -> u64 { gxhash64(data, seed as i64) }); diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..dd11580 --- /dev/null +++ b/build.rs @@ -0,0 +1,11 @@ +extern crate rustc_version; +use rustc_version::{version_meta, Channel}; + +fn main() { + if version_meta().unwrap().channel == Channel::Nightly + && cfg!(target_arch = "x86_64") + && cfg!(target_feature = "avx2") + && cfg!(target_feature = "vaes") { + println!("cargo:rustc-cfg=hybrid"); + } +} \ No newline at end of file diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs index a3d7fd2..dd117ae 100644 --- a/src/gxhash/mod.rs +++ b/src/gxhash/mod.rs @@ -63,37 +63,36 @@ macro_rules! load_unaligned { }; } +pub(crate) use load_unaligned; + #[inline(always)] pub(crate) unsafe fn gxhash(input: &[u8], seed: State) -> State { - finalize(compress_fast(compress_all(input), seed)) + finalize(aes_encrypt(compress_all(input), seed)) } #[inline(always)] pub(crate) unsafe fn compress_all(input: &[u8]) -> State { let len = input.len(); + let mut ptr = input.as_ptr() as *const State; if len == 0 { return create_empty(); } - let mut ptr = input.as_ptr() as *const State; - if len <= VECTOR_SIZE { // Input fits on a single SIMD vector, however we might read beyond the input message // Thus we need this safe method that checks if it can safely read beyond or must copy return get_partial(ptr, len); } - let extra_bytes_count = len % VECTOR_SIZE; - let remaining_bytes: usize; + let mut hash_vector: State; + let end = ptr as usize + len; - // The input does not fit on a single SIMD vector - let hash_vector: State; + let extra_bytes_count = len % VECTOR_SIZE; if extra_bytes_count == 0 { load_unaligned!(ptr, v0); hash_vector = v0; - remaining_bytes = len - VECTOR_SIZE; } else { // If the input length does not match the length of a whole number of SIMD vectors, // it means we'll need to read a partial vector. 
We can start with the partial vector first, @@ -101,57 +100,53 @@ pub(crate) unsafe fn compress_all(input: &[u8]) -> State { // the input hash_vector = get_partial_unsafe(ptr, extra_bytes_count); ptr = ptr.cast::().add(extra_bytes_count).cast(); - remaining_bytes = len - extra_bytes_count; } - #[allow(unused_assignments)] - if len <= VECTOR_SIZE * 2 { - // Fast path when input length > 16 and <= 32 - load_unaligned!(ptr, v0); - compress(hash_vector, v0) - } else if len <= VECTOR_SIZE * 3 { + load_unaligned!(ptr, v0); + + if len > VECTOR_SIZE * 2 { // Fast path when input length > 32 and <= 48 - load_unaligned!(ptr, v0, v1); - compress(hash_vector, compress(v0, v1)) - } else { - // Input message is large and we can use the high ILP loop - compress_many(ptr, hash_vector, remaining_bytes) + load_unaligned!(ptr, v); + v0 = aes_encrypt(v0, v); + + if len > VECTOR_SIZE * 3 { + // Fast path when input length > 48 and <= 64 + load_unaligned!(ptr, v); + v0 = aes_encrypt(v0, v); + + if len > VECTOR_SIZE * 4 { + // Input message is large and we can use the high ILP loop + hash_vector = compress_many(ptr, end, hash_vector, len); + } + } } + + return aes_encrypt_last(hash_vector, + aes_encrypt(aes_encrypt(v0, ld(KEYS.as_ptr())), ld(KEYS.as_ptr().offset(4)))); } #[inline(always)] -unsafe fn compress_many(mut ptr: *const State, hash_vector: State, remaining_bytes: usize) -> State { +unsafe fn compress_many(mut ptr: *const State, end: usize, hash_vector: State, len: usize) -> State { const UNROLL_FACTOR: usize = 8; - let unrollable_blocks_count: usize = remaining_bytes / (VECTOR_SIZE * UNROLL_FACTOR) * UNROLL_FACTOR; - let end_address = ptr.add(unrollable_blocks_count) as usize; - let mut hash_vector = hash_vector; - while (ptr as usize) < end_address { + let remaining_bytes = end - ptr as usize; - load_unaligned!(ptr, v0, v1, v2, v3, v4, v5, v6, v7); - - let mut tmp: State; - tmp = compress_fast(v0, v1); - tmp = compress_fast(tmp, v2); - tmp = compress_fast(tmp, v3); - tmp = compress_fast(tmp, v4); - tmp = compress_fast(tmp, v5); - tmp = compress_fast(tmp, v6); - tmp = compress_fast(tmp, v7); - - hash_vector = compress(hash_vector, tmp); - } + let unrollable_blocks_count: usize = remaining_bytes / (VECTOR_SIZE * UNROLL_FACTOR) * UNROLL_FACTOR; let remaining_bytes = remaining_bytes - unrollable_blocks_count * VECTOR_SIZE; let end_address = ptr.add(remaining_bytes / VECTOR_SIZE) as usize; + // Process first individual blocks until we have an whole number of 8 blocks + let mut hash_vector = hash_vector; while (ptr as usize) < end_address { load_unaligned!(ptr, v0); - hash_vector = compress(hash_vector, v0); + hash_vector = aes_encrypt(hash_vector, v0); } - hash_vector + // Process the remaining n * 8 blocks + // This part may use 128-bit or 256-bit + compress_8(ptr, end, hash_vector, len) } #[cfg(test)] @@ -294,7 +289,7 @@ mod tests { fn is_stable() { assert_eq!(456576800, gxhash32(&[0u8; 0], 0)); assert_eq!(978957914, gxhash32(&[0u8; 1], 0)); - assert_eq!(3325885698, gxhash32(&[0u8; 1000], 0)); - assert_eq!(3805815999, gxhash32(&[42u8; 4242], 42)); + assert_eq!(2252110365, gxhash32(&[0u8; 1000], 0)); + assert_eq!(2426107958, gxhash32(&[42u8; 4242], 42)); } } diff --git a/src/gxhash/platform/arm_128.rs b/src/gxhash/platform/aarch64.rs similarity index 59% rename from src/gxhash/platform/arm_128.rs rename to src/gxhash/platform/aarch64.rs index 4cb452d..b314a61 100644 --- a/src/gxhash/platform/arm_128.rs +++ b/src/gxhash/platform/aarch64.rs @@ -4,15 +4,6 @@ use super::*; pub type State = int8x16_t; 
-#[repr(C)] -union ReinterpretUnion { - int64: int64x2_t, - int32: int32x4_t, - uint32: uint32x4_t, - int8: int8x16_t, - uint8: uint8x16_t, -} - #[inline(always)] pub unsafe fn create_empty() -> State { vdupq_n_s8(0) @@ -52,45 +43,33 @@ pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State { pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State { let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr()); let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices); - let partial_vector = vandq_s8(load_unaligned(data), ReinterpretUnion { uint8: mask }.int8); + let partial_vector = vandq_s8(load_unaligned(data), vreinterpretq_s8_u8(mask)); vaddq_s8(partial_vector, vdupq_n_s8(len as i8)) } #[inline(always)] -pub unsafe fn compress(a: int8x16_t, b: int8x16_t) -> int8x16_t { - let keys_1 = vld1q_u32([0xFC3BC28E, 0x89C222E5, 0xB09D3E21, 0xF2784542].as_ptr()); - let keys_2 = vld1q_u32([0x03FCE279, 0xCB6B2E9B, 0xB361DC58, 0x39136BD9].as_ptr()); - - let mut bs = vreinterpretq_u8_s8(b); - bs = aes_encrypt(bs, vreinterpretq_u8_u32(keys_1)); - bs = aes_encrypt(bs, vreinterpretq_u8_u32(keys_2)); - - vreinterpretq_s8_u8(aes_encrypt_last(vreinterpretq_u8_s8(a), bs)) -} - -#[inline(always)] -pub unsafe fn compress_fast(a: int8x16_t, b: int8x16_t) -> int8x16_t { - vreinterpretq_s8_u8(aes_encrypt(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b))) +pub unsafe fn ld(array: *const u32) -> State { + vreinterpretq_s8_u32(vld1q_u32(array)) } #[inline(always)] // See https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a -unsafe fn aes_encrypt(data: uint8x16_t, keys: uint8x16_t) -> uint8x16_t { +pub unsafe fn aes_encrypt(data: State, keys: State) -> State { // Encrypt - let encrypted = vaeseq_u8(data, vdupq_n_u8(0)); + let encrypted = vaeseq_u8(vreinterpretq_u8_s8(data), vdupq_n_u8(0)); // Mix columns let mixed = vaesmcq_u8(encrypted); // Xor keys - veorq_u8(mixed, keys) + vreinterpretq_s8_u8(veorq_u8(mixed, vreinterpretq_u8_s8(keys))) } #[inline(always)] // See https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a -unsafe fn aes_encrypt_last(data: uint8x16_t, keys: uint8x16_t) -> uint8x16_t { +pub unsafe fn aes_encrypt_last(data: State, keys: State) -> State { // Encrypt - let encrypted = vaeseq_u8(data, vdupq_n_u8(0)); + let encrypted = vaeseq_u8(vreinterpretq_u8_s8(data), vdupq_n_u8(0)); // Xor keys - veorq_u8(encrypted, keys) + vreinterpretq_s8_u8(veorq_u8(encrypted, vreinterpretq_u8_s8(keys))) } #[inline(always)] @@ -101,12 +80,52 @@ pub unsafe fn finalize(hash: State) -> State { let keys_3 = vld1q_u32([0xC78B122B, 0x5544B1B7, 0x689D2B7D, 0xD0012E32].as_ptr()); // 3 rounds of AES - let mut hash = ReinterpretUnion { int8: hash }.uint8; - hash = aes_encrypt(hash, ReinterpretUnion { uint32: keys_1 }.uint8); - hash = aes_encrypt(hash, ReinterpretUnion { uint32: keys_2 }.uint8); - hash = aes_encrypt_last(hash, ReinterpretUnion { uint32: keys_3 }.uint8); + let mut hash = hash; + hash = aes_encrypt(hash, vreinterpretq_s8_u32(keys_1)); + hash = aes_encrypt(hash, vreinterpretq_s8_u32(keys_2)); + hash = aes_encrypt_last(hash, vreinterpretq_s8_u32(keys_3)); + + hash +} + +#[inline(always)] +pub unsafe fn compress_8(mut ptr: *const State, end_address: usize, hash_vector: State, len: usize) -> State { + + // Disambiguation vectors + let mut t1: State = create_empty(); + let mut t2: State = create_empty(); + + // Hash is processed in two separate 128-bit parallel lanes + // This allows the same processing to be 
applied using 256-bit V-AES instrinsics + // so that hashes are stable in both cases. + let mut lane1 = hash_vector; + let mut lane2 = hash_vector; + + while (ptr as usize) < end_address { + + crate::gxhash::load_unaligned!(ptr, v0, v1, v2, v3, v4, v5, v6, v7); - ReinterpretUnion { uint8: hash }.int8 + let mut tmp1 = aes_encrypt(v0, v2); + let mut tmp2 = aes_encrypt(v1, v3); + + tmp1 = aes_encrypt(tmp1, v4); + tmp2 = aes_encrypt(tmp2, v5); + + tmp1 = aes_encrypt(tmp1, v6); + tmp2 = aes_encrypt(tmp2, v7); + + t1 = vaddq_s8(t1, ld(KEYS.as_ptr())); + t2 = vaddq_s8(t2, ld(KEYS.as_ptr().offset(4))); + + lane1 = aes_encrypt_last(aes_encrypt(tmp1, t1), lane1); + lane2 = aes_encrypt_last(aes_encrypt(tmp2, t2), lane2); + } + // For 'Zeroes' test + let len_vec = vreinterpretq_s8_u32(vdupq_n_u32(len as u32)); + lane1 = vaddq_s8(lane1, len_vec); + lane2 = vaddq_s8(lane2, len_vec); + // Merge lanes + aes_encrypt(lane1, lane2) } #[inline(always)] diff --git a/src/gxhash/platform/mod.rs b/src/gxhash/platform/mod.rs index adbfe87..97ecbab 100644 --- a/src/gxhash/platform/mod.rs +++ b/src/gxhash/platform/mod.rs @@ -1,19 +1,15 @@ #[cfg(target_arch = "aarch64")] -#[path = "arm_128.rs"] +#[path = "aarch64.rs"] mod platform; -#[cfg(all(feature = "avx2", target_arch = "x86_64", target_feature = "avx2"))] -#[path = "x86_256.rs"] +#[cfg(target_arch = "x86_64")] +#[path = "x86_64.rs"] mod platform; -#[cfg(all(not(feature = "avx2"), target_arch = "x86_64"))] -#[path = "x86_128.rs"] -mod platform; +pub use platform::*; use std::mem::size_of; -pub use platform::*; - pub(crate) const VECTOR_SIZE: usize = size_of::(); // 4KiB is the default page size for most systems, and conservative for other systems such as MacOS ARM (16KiB) const PAGE_SIZE: usize = 0x1000; @@ -26,3 +22,5 @@ unsafe fn check_same_page(ptr: *const State) -> bool { // Check if the 16nd byte from the current offset exceeds the page boundary offset_within_page < PAGE_SIZE - VECTOR_SIZE } + +pub const KEYS: [u32; 8] = [0xF2784542, 0xB09D3E21, 0x89C222E5, 0xFC3BC28E, 0x03FCE279, 0xCB6B2E9B, 0xB361DC58, 0x39132BD9]; \ No newline at end of file diff --git a/src/gxhash/platform/x86_128.rs b/src/gxhash/platform/x86_128.rs deleted file mode 100644 index 248d814..0000000 --- a/src/gxhash/platform/x86_128.rs +++ /dev/null @@ -1,135 +0,0 @@ -use core::arch::x86_64::*; - -use super::*; - -pub type State = __m128i; - -#[inline(always)] -pub unsafe fn create_empty() -> State { - _mm_setzero_si128() -} - -#[inline(always)] -pub unsafe fn create_seed(seed: i64) -> State { - _mm_set1_epi64x(seed) -} - -#[inline(always)] -pub unsafe fn load_unaligned(p: *const State) -> State { - _mm_loadu_si128(p) -} - -#[inline(always)] -pub unsafe fn get_partial(p: *const State, len: usize) -> State { - // Safety check - if check_same_page(p) { - get_partial_unsafe(p, len) - } else { - get_partial_safe(p, len) - } -} - -#[inline(always)] -pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State { - // Temporary buffer filled with zeros - let mut buffer = [0i8; VECTOR_SIZE]; - // Copy data into the buffer - std::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len); - // Load the buffer into a __m256i vector - let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State); - _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8)) -} - -#[inline(always)] -pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State { - let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), 
indices); - let partial_vector = _mm_and_si128(_mm_loadu_si128(data), mask); - _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8)) -} - -#[inline(always)] -#[allow(overflowing_literals)] -pub unsafe fn compress(a: State, b: State) -> State { - let keys_1 = _mm_set_epi32(0xF2784542, 0xB09D3E21, 0x89C222E5, 0xFC3BC28E); - let keys_2 = _mm_set_epi32(0x39136BD9, 0xB361DC58, 0xCB6B2E9B, 0x03FCE279); - - // 2+1 rounds of AES for compression - let mut b = _mm_aesenc_si128(b, keys_1); - b = _mm_aesenc_si128(b, keys_2); - _mm_aesenclast_si128(a, b) -} - -#[inline(always)] -#[allow(overflowing_literals)] -pub unsafe fn compress_fast(a: State, b: State) -> State { - _mm_aesenc_si128(a, b) -} - -#[inline(always)] -#[allow(overflowing_literals)] -pub unsafe fn finalize(hash: State) -> State { - // Hardcoded AES keys - let keys_1 = _mm_set_epi32(0x85459F85, 0xAF163956, 0x8F2F35DB, 0x713B01D0); - let keys_2 = _mm_set_epi32(0xB89C054F, 0x3DD99ACA, 0x92CFA39C, 0x1DE09647); - let keys_3 = _mm_set_epi32(0xD0012E32, 0x689D2B7D, 0x5544B1B7, 0xC78B122B); - - // 4 rounds of AES - let mut hash = _mm_aesenc_si128(hash, keys_1); - hash = _mm_aesenc_si128(hash, keys_2); - hash = _mm_aesenclast_si128(hash, keys_3); - - hash -} - -#[inline(always)] -pub unsafe fn load_u8(x: u8) -> State { - _mm_set1_epi8(x as i8) -} - -#[inline(always)] -pub unsafe fn load_u16(x: u16) -> State { - _mm_set1_epi16(x as i16) -} - -#[inline(always)] -pub unsafe fn load_u32(x: u32) -> State { - _mm_set1_epi32(x as i32) -} - -#[inline(always)] -pub unsafe fn load_u64(x: u64) -> State { - _mm_set1_epi64x(x as i64) -} - -#[inline(always)] -pub unsafe fn load_u128(x: u128) -> State { - let ptr = &x as *const u128 as *const State; - _mm_loadu_si128(ptr) -} - -#[inline(always)] -pub unsafe fn load_i8(x: i8) -> State { - _mm_set1_epi8(x) -} - -#[inline(always)] -pub unsafe fn load_i16(x: i16) -> State { - _mm_set1_epi16(x) -} - -#[inline(always)] -pub unsafe fn load_i32(x: i32) -> State { - _mm_set1_epi32(x) -} - -#[inline(always)] -pub unsafe fn load_i64(x: i64) -> State { - _mm_set1_epi64x(x) -} - -#[inline(always)] -pub unsafe fn load_i128(x: i128) -> State { - let ptr = &x as *const i128 as *const State; - _mm_loadu_si128(ptr) -} \ No newline at end of file diff --git a/src/gxhash/platform/x86_256.rs b/src/gxhash/platform/x86_256.rs deleted file mode 100644 index 0b5b785..0000000 --- a/src/gxhash/platform/x86_256.rs +++ /dev/null @@ -1,138 +0,0 @@ -use core::arch::x86_64::*; - -use super::*; - -pub type State = __m256i; - -#[inline(always)] -pub unsafe fn create_empty() -> State { - _mm256_setzero_si256() -} - -#[inline(always)] -pub unsafe fn create_seed(seed: i64) -> State { - _mm256_set1_epi64x(seed) -} - -#[inline(always)] -pub unsafe fn load_unaligned(p: *const State) -> State { - _mm256_loadu_si256(p) -} - -#[inline(always)] -pub unsafe fn get_partial(p: *const State, len: usize) -> State { - // Safety check - if check_same_page(p) { - get_partial_unsafe(p, len as usize) - } else { - get_partial_safe(p, len as usize) - } -} - -#[inline(always)] -pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State { - // Temporary buffer filled with zeros - let mut buffer = [0i8; VECTOR_SIZE]; - // Copy data into the buffer - std::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len); - // Load the buffer into a __m256i vector - let partial_vector = _mm256_loadu_si256(buffer.as_ptr() as *const State); - _mm256_add_epi8(partial_vector, _mm256_set1_epi8(len as i8)) -} - -#[inline(always)] -pub unsafe fn get_partial_unsafe(data: 
*const State, len: usize) -> State { - let indices = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - let mask = _mm256_cmpgt_epi8(_mm256_set1_epi8(len as i8), indices); - let partial_vector = _mm256_and_si256(_mm256_loadu_si256(data), mask); - _mm256_add_epi8(partial_vector, _mm256_set1_epi8(len as i8)) -} - -#[inline(always)] -#[allow(overflowing_literals)] -pub unsafe fn compress(a: State, b: State) -> State { - let keys_1 = _mm256_set_epi32(0xFC3BC28E, 0x89C222E5, 0xB09D3E21, 0xF2784542, 0x4155EE07, 0xC897CCE2, 0x780AF2C3, 0x8A72B781); - let keys_2 = _mm256_set_epi32(0x03FCE279, 0xCB6B2E9B, 0xB361DC58, 0x39136BD9, 0x7A83D76B, 0xB1E8F9F0, 0x028925A8, 0x3B9A4E71); - - // 2+1 rounds of AES for compression - let mut b = _mm256_aesenc_epi128(b, keys_1); - b = _mm256_aesenc_epi128(b, keys_2); - return _mm256_aesenclast_epi128(a, b); -} - -#[inline(always)] -#[allow(overflowing_literals)] -pub unsafe fn compress_fast(a: State, b: State) -> State { - return _mm256_aesenc_epi128(a, b); -} - -#[inline(always)] -#[allow(overflowing_literals)] -pub unsafe fn finalize(hash: State) -> State { - // Hardcoded AES keys - let keys_1 = _mm256_set_epi32(0x713B01D0, 0x8F2F35DB, 0xAF163956, 0x85459F85, 0xB49D3E21, 0xF2784542, 0x2155EE07, 0xC197CCE2); - let keys_2 = _mm256_set_epi32(0x1DE09647, 0x92CFA39C, 0x3DD99ACA, 0xB89C054F, 0xCB6B2E9B, 0xC361DC58, 0x39136BD9, 0x7A83D76F); - let keys_3 = _mm256_set_epi32(0xC78B122B, 0x5544B1B7, 0x689D2B7D, 0xD0012E32, 0xE2784542, 0x4155EE07, 0xC897CCE2, 0x780BF2C2); - - // 4 rounds of AES - let mut hash = _mm256_aesenc_epi128(hash, keys_1); - hash = _mm256_aesenc_epi128(hash, keys_2); - hash = _mm256_aesenclast_epi128(hash, keys_3); - - let permuted = _mm256_permute2x128_si256(hash, hash, 0x21); - _mm256_xor_si256(hash, permuted) -} - -#[inline(always)] -pub unsafe fn load_u8(x: u8) -> State { - _mm256_set1_epi8(x as i8) -} - -#[inline(always)] -pub unsafe fn load_u16(x: u16) -> State { - _mm256_set1_epi16(x as i16) -} - -#[inline(always)] -pub unsafe fn load_u32(x: u32) -> State { - _mm256_set1_epi32(x as i32) -} - -#[inline(always)] -pub unsafe fn load_u64(x: u64) -> State { - _mm256_set1_epi64x(x as i64) -} - -#[inline(always)] -pub unsafe fn load_u128(x: u128) -> State { - let ptr = &x as *const u128 as *const __m128i; - let s128 = _mm_loadu_si128(ptr); - _mm256_set_m128i(s128, s128) -} - -#[inline(always)] -pub unsafe fn load_i8(x: i8) -> State { - _mm256_set1_epi8(x) -} - -#[inline(always)] -pub unsafe fn load_i16(x: i16) -> State { - _mm256_set1_epi16(x) -} - -#[inline(always)] -pub unsafe fn load_i32(x: i32) -> State { - _mm256_set1_epi32(x) -} - -#[inline(always)] -pub unsafe fn load_i64(x: i64) -> State { - _mm256_set1_epi64x(x) -} - -#[inline(always)] -pub unsafe fn load_i128(x: i128) -> State { - let ptr = &x as *const i128 as *const __m128i; - let s128 = _mm_loadu_si128(ptr); - _mm256_set_m128i(s128, s128) -} \ No newline at end of file diff --git a/src/gxhash/platform/x86_64.rs b/src/gxhash/platform/x86_64.rs new file mode 100644 index 0000000..ce737b3 --- /dev/null +++ b/src/gxhash/platform/x86_64.rs @@ -0,0 +1,215 @@ +use core::arch::x86_64::*; + +use super::*; + +pub type State = __m128i; + +#[inline(always)] +pub unsafe fn create_empty() -> State { + _mm_setzero_si128() +} + +#[inline(always)] +pub unsafe fn create_seed(seed: i64) -> State { + _mm_set1_epi64x(seed) +} + +#[inline(always)] +pub unsafe fn load_unaligned(p: *const State) -> State { + _mm_loadu_si128(p) 
+} + +#[inline(always)] +pub unsafe fn get_partial(p: *const State, len: usize) -> State { + // Safety check + if check_same_page(p) { + get_partial_unsafe(p, len) + } else { + get_partial_safe(p, len) + } +} + +#[inline(always)] +pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State { + // Temporary buffer filled with zeros + let mut buffer = [0i8; VECTOR_SIZE]; + // Copy data into the buffer + std::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len); + // Load the buffer into a __m256i vector + let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State); + _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8)) +} + +#[inline(always)] +pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State { + let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices); + let partial_vector = _mm_and_si128(_mm_loadu_si128(data), mask); + _mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8)) +} + +#[inline(always)] +#[allow(dead_code)] +pub unsafe fn aes_encrypt(data: State, keys: State) -> State { + _mm_aesenc_si128(data, keys) +} + +#[inline(always)] +#[allow(dead_code)] +pub unsafe fn aes_encrypt_last(data: State, keys: State) -> State { + _mm_aesenclast_si128(data, keys) +} + +#[inline(always)] +#[allow(dead_code)] +pub unsafe fn ld(array: *const u32) -> State { + _mm_loadu_si128(array as *const State) +} + +#[inline(always)] +#[allow(overflowing_literals)] +pub unsafe fn finalize(hash: State) -> State { + // Hardcoded AES keys + let keys_1 = _mm_set_epi32(0x85459F85, 0xAF163956, 0x8F2F35DB, 0x713B01D0); + let keys_2 = _mm_set_epi32(0xB89C054F, 0x3DD99ACA, 0x92CFA39C, 0x1DE09647); + let keys_3 = _mm_set_epi32(0xD0012E32, 0x689D2B7D, 0x5544B1B7, 0xC78B122B); + + // 4 rounds of AES + let mut hash = _mm_aesenc_si128(hash, keys_1); + hash = _mm_aesenc_si128(hash, keys_2); + hash = _mm_aesenclast_si128(hash, keys_3); + + hash +} + +#[cfg(not(hybrid))] +#[inline(always)] +pub unsafe fn compress_8(mut ptr: *const State, end_address: usize, hash_vector: State, len: usize) -> State { + + // Disambiguation vectors + let mut t1: State = create_empty(); + let mut t2: State = create_empty(); + + // Hash is processed in two separate 128-bit parallel lanes + // This allows the same processing to be applied using 256-bit V-AES instrinsics + // so that hashes are stable in both cases. + let mut lane1 = hash_vector; + let mut lane2 = hash_vector; + + while (ptr as usize) < end_address { + + crate::gxhash::load_unaligned!(ptr, v0, v1, v2, v3, v4, v5, v6, v7); + + let mut tmp1 = aes_encrypt(v0, v2); + let mut tmp2 = aes_encrypt(v1, v3); + + tmp1 = aes_encrypt(tmp1, v4); + tmp2 = aes_encrypt(tmp2, v5); + + tmp1 = aes_encrypt(tmp1, v6); + tmp2 = aes_encrypt(tmp2, v7); + + t1 = _mm_add_epi8(t1, ld(KEYS.as_ptr())); + t2 = _mm_add_epi8(t2, ld(KEYS.as_ptr().offset(4))); + + lane1 = aes_encrypt_last(aes_encrypt(tmp1, t1), lane1); + lane2 = aes_encrypt_last(aes_encrypt(tmp2, t2), lane2); + } + // For 'Zeroes' test + let len_vec = _mm_set1_epi32(len as i32); + lane1 = _mm_add_epi8(lane1, len_vec); + lane2 = _mm_add_epi8(lane2, len_vec); + // Merge lanes + aes_encrypt(lane1, lane2) +} + +#[cfg(hybrid)] +#[inline(always)] +pub unsafe fn compress_8(ptr: *const State, end_address: usize, hash_vector: State, len: usize) -> State { + macro_rules! 
load_unaligned_x2 { + ($ptr:ident, $($var:ident),+) => { + $( + #[allow(unused_mut)] + let mut $var = _mm256_loadu_si256($ptr); + $ptr = ($ptr).offset(1); + )+ + }; + } + + let mut ptr = ptr as *const __m256i; + let mut t = _mm256_setzero_si256(); + let mut lane = _mm256_set_m128i(hash_vector, hash_vector); + while (ptr as usize) < end_address { + + load_unaligned_x2!(ptr, v0, v1, v2, v3); + + let mut tmp = _mm256_aesenc_epi128(v0, v1); + tmp = _mm256_aesenc_epi128(tmp, v2); + tmp = _mm256_aesenc_epi128(tmp, v3); + + t = _mm256_add_epi8(t, _mm256_loadu_si256(KEYS.as_ptr() as *const __m256i)); + + lane = _mm256_aesenclast_epi128(_mm256_aesenc_epi128(tmp, t), lane); + } + // Extract the two 128-bit lanes + let mut lane1 = _mm256_castsi256_si128(lane); + let mut lane2 = _mm256_extracti128_si256(lane, 1); + // For 'Zeroes' test + let len_vec = _mm_set1_epi32(len as i32); + lane1 = _mm_add_epi8(lane1, len_vec); + lane2 = _mm_add_epi8(lane2, len_vec); + // Merge lanes + aes_encrypt(lane1, lane2) +} + +#[inline(always)] +pub unsafe fn load_u8(x: u8) -> State { + _mm_set1_epi8(x as i8) +} + +#[inline(always)] +pub unsafe fn load_u16(x: u16) -> State { + _mm_set1_epi16(x as i16) +} + +#[inline(always)] +pub unsafe fn load_u32(x: u32) -> State { + _mm_set1_epi32(x as i32) +} + +#[inline(always)] +pub unsafe fn load_u64(x: u64) -> State { + _mm_set1_epi64x(x as i64) +} + +#[inline(always)] +pub unsafe fn load_u128(x: u128) -> State { + let ptr = &x as *const u128 as *const State; + _mm_loadu_si128(ptr) +} + +#[inline(always)] +pub unsafe fn load_i8(x: i8) -> State { + _mm_set1_epi8(x) +} + +#[inline(always)] +pub unsafe fn load_i16(x: i16) -> State { + _mm_set1_epi16(x) +} + +#[inline(always)] +pub unsafe fn load_i32(x: i32) -> State { + _mm_set1_epi32(x) +} + +#[inline(always)] +pub unsafe fn load_i64(x: i64) -> State { + _mm_set1_epi64x(x) +} + +#[inline(always)] +pub unsafe fn load_i128(x: i128) -> State { + let ptr = &x as *const i128 as *const State; + _mm_loadu_si128(ptr) +} \ No newline at end of file diff --git a/src/hasher.rs b/src/hasher.rs index 5b82406..2cbc6c7 100644 --- a/src/hasher.rs +++ b/src/hasher.rs @@ -98,7 +98,7 @@ macro_rules! write { #[inline] fn $name(&mut self, value: $type) { self.state = unsafe { - compress_fast($load(value), self.state) + aes_encrypt($load(value), self.state) }; } } @@ -115,7 +115,8 @@ impl Hasher for GxHasher { #[inline] fn write(&mut self, bytes: &[u8]) { - self.state = unsafe { compress_fast(compress_all(bytes), self.state) }; + // Improvement: only compress at this stage and finalize in finish + self.state = unsafe { aes_encrypt(compress_all(bytes), self.state) }; } write!(write_u8, u8, load_u8); @@ -181,14 +182,6 @@ mod tests { assert!(hashset.insert("bye")); } - #[test] - fn hasher_handles_empty_inputs() { - let mut hashset = GxHashSet::default(); - // Getting a ptr from a Vec::::new() return a pointer with address of 1 - // We must make sure we dont SIGSEGV in such case - assert!(hashset.insert(Vec::::new())); - } - // This is important for DOS resistance #[test] fn gxhashset_uses_default_gxhasherbuilder() { diff --git a/src/lib.rs b/src/lib.rs index 7fd9229..0c54243 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ -// Feature 'avx2' currently requires unstable 'stdsimd' -#![cfg_attr(all(feature = "avx2", target_arch = "x86_64"), feature(stdsimd))] +// Hybrid SIMD width usage currently requires unstable 'stdsimd' +#![cfg_attr(hybrid, feature(stdsimd))] #[rustfmt::skip] mod gxhash;
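
Note on verifying the new code paths: build.rs emits `cargo:rustc-cfg=hybrid` only on nightly x86_64 with the `avx2` and `vaes` target features enabled, and the 128-bit and hybrid 256-bit paths are intended to produce identical hashes (hence the two-lane `compress_8`). The sketch below is a standalone check against the reference values from the updated `is_stable` test; it assumes the public `gxhash32` API, and the `RUSTFLAGS` invocation in the comment is only an assumption about how the required target features could be enabled locally, not something this patch configures.

// Standalone sketch (assumed usage of the public `gxhash32` API; not part of this patch).
// To exercise the hybrid path, something like the following is assumed to work on nightly x86_64:
//   RUSTFLAGS="-C target-feature=+avx2,+vaes" cargo +nightly run --release
use gxhash::gxhash32;

fn main() {
    // Reference values taken from the updated `is_stable` test in src/gxhash/mod.rs.
    assert_eq!(456576800, gxhash32(&[0u8; 0], 0));
    assert_eq!(978957914, gxhash32(&[0u8; 1], 0));
    assert_eq!(2252110365, gxhash32(&[0u8; 1000], 0));
    assert_eq!(2426107958, gxhash32(&[42u8; 4242], 42));
    println!("hash values match the reference vectors on this build configuration");
}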