From bc54006330348906f52416eedf6db3dcb08da3ec Mon Sep 17 00:00:00 2001 From: Olivier Giniaux Date: Sun, 22 Oct 2023 00:14:47 +0200 Subject: [PATCH] Split platform code and experiment with compression --- src/gxhash.rs | 353 --------------------------------- src/gxhash/mod.rs | 205 +++++++++++++++++++ src/gxhash/platform/arm_128.rs | 165 +++++++++++++++ src/gxhash/platform/mod.rs | 9 + src/gxhash/platform/x86_256.rs | 90 +++++++++ 5 files changed, 469 insertions(+), 353 deletions(-) delete mode 100644 src/gxhash.rs create mode 100644 src/gxhash/mod.rs create mode 100644 src/gxhash/platform/arm_128.rs create mode 100644 src/gxhash/platform/mod.rs create mode 100644 src/gxhash/platform/x86_256.rs diff --git a/src/gxhash.rs b/src/gxhash.rs deleted file mode 100644 index d259bb8..0000000 --- a/src/gxhash.rs +++ /dev/null @@ -1,353 +0,0 @@ -// For ARM architecture -#[cfg(target_arch = "aarch64")] -mod platform_defs { - use std::mem::size_of; - use core::arch::aarch64::*; - - pub type state = int8x16_t; - - #[repr(C)] - union ReinterpretUnion { - int64: int64x2_t, - int32: int32x4_t, - uint32: uint32x4_t, - int8: int8x16_t, - uint8: uint8x16_t, - } - - #[inline] - pub unsafe fn create_empty() -> state { - vdupq_n_s8(0) - } - - #[inline] - pub unsafe fn prefetch(p: *const state) { - //__pld(p as *const i8); - } - - #[inline] - pub unsafe fn load_unaligned(p: *const state) -> state { - vld1q_s8(p as *const i8) - } - - #[inline] - pub unsafe fn get_partial(p: *const state, len: isize) -> state { - let partial_vector: state; - if check_same_page(p) { - // Unsafe (hence the check) but much faster - let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr()); - let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices); - partial_vector = vandq_s8(load_unaligned(p), ReinterpretUnion { uint8: mask }.int8); - } else { - // Safer but slower, using memcpy - partial_vector = get_partial_safe(p as *const i8, len as usize); - } - // Prevents padded zeroes to introduce bias - return vaddq_s8(partial_vector, vdupq_n_s8(len as i8)); - } - - #[inline] - unsafe fn check_same_page(ptr: *const state) -> bool { - let address = ptr as usize; - // Mask to keep only the last 12 bits (3 bytes) - let offset_within_page = address & 0xFFF; - // Check if the 32nd byte from the current offset exceeds the page boundary - offset_within_page <= (4096 - size_of::() - 1) - } - - #[inline] - unsafe fn get_partial_safe(data: *const i8, len: usize) -> state { - // Temporary buffer filled with zeros - let mut buffer = [0i8; size_of::()]; - // Copy data into the buffer - std::ptr::copy(data, buffer.as_mut_ptr(), len); - // Load the buffer into a __m256i vector - vld1q_s8(buffer.as_ptr()) - } - - #[inline] - pub unsafe fn compress(a: state, b: state) -> state { - let keys_1 = vld1q_u32([0xFC3BC28E, 0x89C222E5, 0xB09D3E21, 0xF2784542].as_ptr()); - let keys_2 = vld1q_u32([0x03FCE279, 0xCB6B2E9B, 0xB361DC58, 0x39136BD9].as_ptr()); - - // 2+1 rounds of AES for compression - let mut b = aes_encrypt(ReinterpretUnion { int8: b }.uint8, ReinterpretUnion { uint32: keys_1 }.uint8); - b = aes_encrypt(b, ReinterpretUnion { uint32: keys_2 }.uint8); - return ReinterpretUnion { uint8: aes_encrypt_last(ReinterpretUnion { int8: a }.uint8, b) }.int8; - } - - #[inline] - // See https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a - unsafe fn aes_encrypt(data: uint8x16_t, keys: uint8x16_t) -> uint8x16_t { - // Encrypt - let encrypted = vaeseq_u8(data, vdupq_n_u8(0)); - // Mix columns - let mixed = 
vaesmcq_u8(encrypted); - // Xor keys - veorq_u8(mixed, keys) - } - - #[inline] - // See https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a - unsafe fn aes_encrypt_last(data: uint8x16_t, keys: uint8x16_t) -> uint8x16_t { - // Encrypt - let encrypted = vaeseq_u8(data, vdupq_n_u8(0)); - // Xor keys - veorq_u8(encrypted, keys) - } - - #[inline] - pub unsafe fn finalize(hash: state, seed: i32) -> state { - // Hardcoded AES keys - let keys_1 = vld1q_u32([0x713B01D0, 0x8F2F35DB, 0xAF163956, 0x85459F85].as_ptr()); - let keys_2 = vld1q_u32([0x1DE09647, 0x92CFA39C, 0x3DD99ACA, 0xB89C054F].as_ptr()); - let keys_3 = vld1q_u32([0xC78B122B, 0x5544B1B7, 0x689D2B7D, 0xD0012E32].as_ptr()); - - // 3 rounds of AES - let mut hash = ReinterpretUnion{ int8: hash }.uint8; - hash = aes_encrypt(hash, ReinterpretUnion{ int32: vdupq_n_s32(seed) }.uint8); - hash = aes_encrypt(hash, ReinterpretUnion{ uint32: keys_1 }.uint8); - hash = aes_encrypt(hash, ReinterpretUnion{ uint32: keys_2 }.uint8); - hash = aes_encrypt_last(hash, ReinterpretUnion{ uint32: keys_3 }.uint8); - return ReinterpretUnion{ uint8: hash }.int8; - } -} - -// For x86 architecture -#[cfg(target_arch = "x86_64")] -mod platform_defs { - use core::arch::x86_64::*; - use std::mem::size_of; - - pub type state = __m256i; - - #[inline] - pub unsafe fn create_empty() -> state { - _mm256_setzero_si256() - } - - #[inline] - pub unsafe fn prefetch(p: *const state) { - _mm_prefetch(p as *const i8, 3); - } - - #[inline] - pub unsafe fn load_unaligned(p: *const state) -> state { - _mm256_loadu_si256(p) - } - - #[inline] - pub unsafe fn get_partial(p: *const state, len: isize) -> state { - let partial_vector: state; - // Safety check - if check_same_page(p) { - let indices = _mm256_setr_epi8( - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31 - ); - - let mask = _mm256_cmpgt_epi8(_mm256_set1_epi8(len as i8), indices); - partial_vector = _mm256_and_si256(_mm256_loadu_si256(p), mask); - } else { - partial_vector = get_partial_safe(p as *const u8, len as usize) - } - // Prevents padded zeroes to introduce bias - _mm256_add_epi32(partial_vector, _mm256_set1_epi32(len as i32)) - } - - #[inline] - unsafe fn check_same_page(ptr: *const state) -> bool { - let address = ptr as usize; - // Mask to keep only the last 12 bits (3 bytes) - let offset_within_page = address & 0xFFF; - // Check if the 32nd byte from the current offset exceeds the page boundary - offset_within_page <= (4096 - size_of::() - 1) - } - - #[inline] - unsafe fn get_partial_safe(data: *const u8, len: usize) -> state { - // Temporary buffer filled with zeros - let mut buffer = [0u8; size_of::()]; - // Copy data into the buffer - std::ptr::copy(data, buffer.as_mut_ptr(), len); - // Load the buffer into a __m256i vector - _mm256_loadu_si256(buffer.as_ptr() as *const state) - } - - #[inline] - #[allow(overflowing_literals)] - pub unsafe fn compress(a: state, b: state) -> state { - let keys_1 = _mm256_set_epi32(0xFC3BC28E, 0x89C222E5, 0xB09D3E21, 0xF2784542, 0x4155EE07, 0xC897CCE2, 0x780AF2C3, 0x8A72B781); - let keys_2 = _mm256_set_epi32(0x03FCE279, 0xCB6B2E9B, 0xB361DC58, 0x39136BD9, 0x7A83D76B, 0xB1E8F9F0, 0x028925A8, 0x3B9A4E71); - - // 2+1 rounds of AES for compression - let mut b = _mm256_aesenc_epi128(b, keys_1); - b = _mm256_aesenc_epi128(b, keys_2); - return _mm256_aesenclast_epi128(a, b); - } - - #[inline] - #[allow(overflowing_literals)] - pub unsafe fn finalize(hash: state, seed: i32) -> state { - // 
Hardcoded AES keys - let keys_1 = _mm256_set_epi32(0x713B01D0, 0x8F2F35DB, 0xAF163956, 0x85459F85, 0xB49D3E21, 0xF2784542, 0x2155EE07, 0xC197CCE2); - let keys_2 = _mm256_set_epi32(0x1DE09647, 0x92CFA39C, 0x3DD99ACA, 0xB89C054F, 0xCB6B2E9B, 0xC361DC58, 0x39136BD9, 0x7A83D76F); - let keys_3 = _mm256_set_epi32(0xC78B122B, 0x5544B1B7, 0x689D2B7D, 0xD0012E32, 0xE2784542, 0x4155EE07, 0xC897CCE2, 0x780BF2C2); - - // 4 rounds of AES - let mut hash = _mm256_aesenc_epi128(hash, _mm256_set1_epi32(seed)); - hash = _mm256_aesenc_epi128(hash, keys_1); - hash = _mm256_aesenc_epi128(hash, keys_2); - hash = _mm256_aesenclast_epi128(hash, keys_3); - - // Merge the two 128 bit lanes entropy, so we can after safely truncate up to 128-bits - let permuted = _mm256_permute2x128_si256(hash, hash, 0x21); - _mm256_xor_si256(hash, permuted) - } -} - -use std::intrinsics::likely; - -pub use platform_defs::*; - -#[inline] // To be disabled when profiling -pub fn gxhash32(input: &[u8], seed: i32) -> u32 { - unsafe { - let p = &gxhash(input, seed) as *const state as *const u32; - *p - } -} - -#[inline] // To be disabled when profiling -pub fn gxhash64(input: &[u8], seed: i32) -> u64 { - unsafe { - let p = &gxhash(input, seed) as *const state as *const u64; - *p - } -} - -#[inline] -fn gxhash(input: &[u8], seed: i32) -> state { - unsafe { - const VECTOR_SIZE: isize = std::mem::size_of::() as isize; - - let len: isize = input.len() as isize; - - let p = input.as_ptr() as *const i8; - let mut v = p as *const state; - - // Quick exit - if len <= VECTOR_SIZE { - let partial_vector = get_partial(v, len); - return finalize(partial_vector, seed); - } - - let mut end_address: usize; - let mut remaining_blocks_count: isize = len / VECTOR_SIZE; - let mut hash_vector: state = create_empty(); - - macro_rules! 
load_unaligned {
-        ($($var:ident),+) => {
-            $(
-                #[allow(unused_mut)]
-                let mut $var = load_unaligned(v);
-                v = v.offset(1);
-            )+
-        };
-    }
-
-    const UNROLL_FACTOR: isize = 8;
-    if len >= VECTOR_SIZE * UNROLL_FACTOR {
-
-        let unrollable_blocks_count: isize = (len / (VECTOR_SIZE * UNROLL_FACTOR)) * UNROLL_FACTOR;
-        end_address = v.offset(unrollable_blocks_count) as usize;
-
-        load_unaligned!(s0, s1, s2, s3, s4, s5, s6, s7);
-
-        while (v as usize) < end_address {
-
-            load_unaligned!(v0, v1, v2, v3, v4, v5, v6, v7);
-
-            prefetch(v);
-
-            s0 = compress(s0, v0);
-            s1 = compress(s1, v1);
-            s2 = compress(s2, v2);
-            s3 = compress(s3, v3);
-            s4 = compress(s4, v4);
-            s5 = compress(s5, v5);
-            s6 = compress(s6, v6);
-            s7 = compress(s7, v7);
-        }
-
-        let a = compress(compress(s0, s1), compress(s2, s3));
-        let b = compress(compress(s4, s5), compress(s6, s7));
-        hash_vector = compress(a, b);
-
-        remaining_blocks_count -= unrollable_blocks_count;
-    }
-
-    end_address = v.offset(remaining_blocks_count) as usize;
-
-    while likely((v as usize) < end_address) {
-        load_unaligned!(v0);
-        hash_vector = compress(hash_vector, v0);
-    }
-
-    let remaining_bytes = len & (VECTOR_SIZE - 1);
-    if likely(remaining_bytes > 0) {
-        let partial_vector = get_partial(v, remaining_bytes);
-        hash_vector = compress(hash_vector, partial_vector);
-    }
-
-    finalize(hash_vector, seed)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-
-    use super::*;
-    use rand::Rng;
-
-    #[test]
-    fn all_blocks_are_consumed() {
-        let mut bytes = [42u8; 1200];
-
-        let ref_hash = gxhash32(&bytes, 0);
-
-        for i in 0..bytes.len() {
-            let swap = bytes[i];
-            bytes[i] = 82;
-            let new_hash = gxhash32(&bytes, 0);
-            bytes[i] = swap;
-
-            assert_ne!(ref_hash, new_hash, "byte {i} not processed");
-        }
-    }
-
-    #[test]
-    fn add_zeroes_mutates_hash() {
-        let mut bytes = [0u8; 1200];
-
-        let mut rng = rand::thread_rng();
-        rng.fill(&mut bytes[..32]);
-
-        let mut ref_hash = 0;
-
-        for i in 32..100 {
-            let new_hash = gxhash32(&mut bytes[..i], 0);
-            assert_ne!(ref_hash, new_hash, "Same hash at size {i} ({new_hash})");
-            ref_hash = new_hash;
-        }
-    }
-
-    #[test]
-    fn hash_of_zero_is_not_zero() {
-        assert_ne!(0, gxhash32(&[0u8; 0], 0));
-        assert_ne!(0, gxhash32(&[0u8; 1], 0));
-        assert_ne!(0, gxhash32(&[0u8; 1200], 0));
-    }
-}
\ No newline at end of file
diff --git a/src/gxhash/mod.rs b/src/gxhash/mod.rs
new file mode 100644
index 0000000..6ec6d9d
--- /dev/null
+++ b/src/gxhash/mod.rs
@@ -0,0 +1,205 @@
+use std::intrinsics::likely;
+
+mod platform;
+
+pub use platform::*;
+
+#[inline] // To be disabled when profiling
+pub fn gxhash32(input: &[u8], seed: i32) -> u32 {
+    unsafe {
+        let p = &gxhash(input, seed) as *const state as *const u32;
+        *p
+    }
+}
+
+#[inline] // To be disabled when profiling
+pub fn gxhash64(input: &[u8], seed: i32) -> u64 {
+    unsafe {
+        let p = &gxhash(input, seed) as *const state as *const u64;
+        *p
+    }
+}
+
+#[inline]
+fn gxhash(input: &[u8], seed: i32) -> state {
+    unsafe {
+        const VECTOR_SIZE: isize = std::mem::size_of::<state>() as isize;
+
+        let len: isize = input.len() as isize;
+
+        let p = input.as_ptr() as *const i8;
+        let mut v = p as *const state;
+
+        // Quick exit
+        if len <= VECTOR_SIZE {
+            let partial_vector = get_partial(v, len);
+            return finalize(partial_vector, seed);
+        }
+
+        let mut end_address: usize;
+        let mut remaining_blocks_count: isize = len / VECTOR_SIZE;
+        let mut hash_vector: state = create_empty();
+
+        macro_rules! load_unaligned {
+            ($($var:ident),+) => {
+                $(
+                    #[allow(unused_mut)]
+                    let mut $var = load_unaligned(v);
+                    v = v.offset(1);
+                )+
+            };
+        }
+
+        const UNROLL_FACTOR: isize = 8;
+        if len >= VECTOR_SIZE * UNROLL_FACTOR {
+
+            let unrollable_blocks_count: isize = (len / (VECTOR_SIZE * UNROLL_FACTOR)) * UNROLL_FACTOR;
+            end_address = v.offset(unrollable_blocks_count) as usize;
+
+            load_unaligned!(s0, s1, s2, s3, s4, s5, s6, s7);
+
+            while (v as usize) < end_address {
+
+                load_unaligned!(v0, v1, v2, v3, v4, v5, v6, v7);
+
+                prefetch(v);
+
+                s0 = compress(s0, v0);
+                s1 = compress(s1, v1);
+                s2 = compress(s2, v2);
+                s3 = compress(s3, v3);
+                s4 = compress(s4, v4);
+                s5 = compress(s5, v5);
+                s6 = compress(s6, v6);
+                s7 = compress(s7, v7);
+            }
+
+            let a = compress(compress(s0, s1), compress(s2, s3));
+            let b = compress(compress(s4, s5), compress(s6, s7));
+            hash_vector = compress(a, b);
+
+            remaining_blocks_count -= unrollable_blocks_count;
+        }
+
+        end_address = v.offset(remaining_blocks_count) as usize;
+
+        while likely((v as usize) < end_address) {
+            load_unaligned!(v0);
+            hash_vector = compress(hash_vector, v0);
+        }
+
+        let remaining_bytes = len & (VECTOR_SIZE - 1);
+        if likely(remaining_bytes > 0) {
+            let partial_vector = get_partial(v, remaining_bytes);
+            hash_vector = compress(hash_vector, partial_vector);
+        }
+
+        finalize(hash_vector, seed)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    use rand::Rng;
+
+    #[test]
+    fn all_blocks_are_consumed() {
+        let mut bytes = [42u8; 1200];
+
+        let ref_hash = gxhash32(&bytes, 0);
+
+        for i in 0..bytes.len() {
+            let swap = bytes[i];
+            bytes[i] = 82;
+            let new_hash = gxhash32(&bytes, 0);
+            bytes[i] = swap;
+
+            assert_ne!(ref_hash, new_hash, "byte {i} not processed");
+        }
+    }
+
+    #[test]
+    fn add_zeroes_mutates_hash() {
+        let mut bytes = [0u8; 1200];
+
+        let mut rng = rand::thread_rng();
+        rng.fill(&mut bytes[..32]);
+
+        let mut ref_hash = 0;
+
+        for i in 32..100 {
+            let new_hash = gxhash32(&bytes[..i], 0);
+            assert_ne!(ref_hash, new_hash, "Same hash at size {i} ({new_hash})");
+            ref_hash = new_hash;
+        }
+    }
+
+    #[test]
+    fn test_collisions_bits() {
+        let mut bytes = [0u8; 120];
+        let bits_to_set = 2;
+
+        let n = bytes.len() * 8;
+        let mut digits: Vec<usize> = vec![0; bits_to_set];
+
+        for i in 0..bits_to_set {
+            digits[i] = i;
+        }
+
+        let mut i = 0;
+        let mut set = std::collections::HashSet::new();
+
+        'stop: loop {
+
+            // Set bits
+            for d in digits.iter() {
+                let bit = 1 << (d % 8);
+                bytes[d / 8] |= bit;
+            }
+
+            i += 1;
+            set.insert(gxhash64(&bytes, 0));
+            // for &byte in bytes.iter() {
+            //     print!("{:08b}", byte);
+            // }
+            // println!();
+
+            // Reset bits
+            for d in digits.iter() {
+                bytes[d / 8] = 0;
+            }
+
+            // Increment the rightmost digit
+            for i in (0..bits_to_set).rev() {
+                digits[i] += 1;
+                if digits[i] == n - bits_to_set + i + 1 {
+                    if i == 0 {
+                        break 'stop;
+                    }
+                    digits[i] = 0;
+                } else {
+                    break;
+                }
+            }
+
+            for i in 1..bits_to_set {
+                if digits[i] < digits[i - 1] {
+                    digits[i] = digits[i - 1] + 1;
+                }
+            }
+        }
+
+        println!("count: {}, collisions: {}", i, i - set.len());
+
+        assert_eq!(0, i - set.len(), "Collisions!");
+    }
+
+    #[test]
+    fn hash_of_zero_is_not_zero() {
+        assert_ne!(0, gxhash32(&[0u8; 0], 0));
+        assert_ne!(0, gxhash32(&[0u8; 1], 0));
+        assert_ne!(0, gxhash32(&[0u8; 1200], 0));
+    }
+}
\ No newline at end of file
diff --git a/src/gxhash/platform/arm_128.rs b/src/gxhash/platform/arm_128.rs
new file mode 100644
index 0000000..d63f499
--- /dev/null
+++ b/src/gxhash/platform/arm_128.rs
@@ -0,0 +1,165 @@
+use std::mem::size_of;
+use core::arch::aarch64::*;
+
+pub type state = int8x16_t;
+
+#[repr(C)]
+union ReinterpretUnion {
+    int64: int64x2_t,
+    int32: int32x4_t,
+    uint32: uint32x4_t,
+    int8: int8x16_t,
+    uint8: uint8x16_t,
+}
+
+#[inline(always)]
+pub unsafe fn create_empty() -> state {
+    vdupq_n_s8(0)
+}
+
+#[inline(always)]
+pub unsafe fn prefetch(p: *const state) {
+    //__pld(p as *const i8);
+}
+
+#[inline(always)]
+pub unsafe fn load_unaligned(p: *const state) -> state {
+    vld1q_s8(p as *const i8)
+}
+
+#[inline(always)]
+pub unsafe fn get_partial(p: *const state, len: isize) -> state {
+    let partial_vector: state;
+    if check_same_page(p) {
+        // Reads past `len` (hence the page check above), but much faster
+        let indices = vld1q_s8([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15].as_ptr());
+        let mask = vcgtq_s8(vdupq_n_s8(len as i8), indices);
+        partial_vector = vandq_s8(load_unaligned(p), ReinterpretUnion { uint8: mask }.int8);
+    } else {
+        // Safer but slower, using memcpy
+        partial_vector = get_partial_safe(p as *const i8, len as usize);
+    }
+    // Prevent the padding zeroes from introducing bias
+    return vaddq_s8(partial_vector, vdupq_n_s8(len as i8));
+}
+
+#[inline(always)]
+unsafe fn check_same_page(ptr: *const state) -> bool {
+    let address = ptr as usize;
+    // Mask to keep only the lower 12 bits (the offset within a 4 KiB page)
+    let offset_within_page = address & 0xFFF;
+    // A full vector read starting at this offset must not cross the page boundary
+    offset_within_page <= (4096 - size_of::<state>() - 1)
+}
+
+#[inline(always)]
+unsafe fn get_partial_safe(data: *const i8, len: usize) -> state {
+    // Temporary buffer filled with zeros
+    let mut buffer = [0i8; size_of::<state>()];
+    // Copy data into the buffer
+    std::ptr::copy(data, buffer.as_mut_ptr(), len);
+    // Load the buffer into a NEON vector
+    vld1q_s8(buffer.as_ptr())
+}
+
+#[inline(always)]
+pub unsafe fn compress(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    // 37 GiB/s
+    let keys_1 = vld1q_u32([0xFC3BC28E, 0x89C222E5, 0xB09D3E21, 0xF2784542].as_ptr());
+    let keys_2 = vld1q_u32([0x03FCE279, 0xCB6B2E9B, 0xB361DC58, 0x39136BD9].as_ptr());
+    let b = aes_encrypt(vreinterpretq_u8_s8(b), vreinterpretq_u8_u32(keys_1));
+    let a = aes_encrypt(vreinterpretq_u8_s8(a), vreinterpretq_u8_u32(keys_2));
+    vreinterpretq_s8_u8(aes_encrypt_last(a, b))
+
+    // 70 GiB/s
+    //vreinterpretq_s8_u8(aes_encrypt(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b)))
+
+    //vreinterpretq_s8_u8(chmuck(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b)))
+    // 26 GiB/s
+    // let keys_1 = vld1q_u32([0x713B01D0, 0x8F2F35DB, 0xAF163956, 0x85459F85].as_ptr());
+    // let keys_2 = vld1q_u32([0x1DE09647, 0x92CFA39C, 0x3DD99ACA, 0xB89C054F].as_ptr());
+    // let keys_3 = vld1q_u32([0xC78B122B, 0x5544B1B7, 0x689D2B7D, 0xD0012E32].as_ptr());
+    // let b1 = vaddq_u32(vreinterpretq_u32_s8(b), keys_2); // Cheap
+    // let b2 = vreinterpretq_s8_u32(vmulq_u32(b1, keys_1)); // Cheap
+    // let b3 = vreinterpretq_u32_s8(vextq_s8(b2, b2, 3)); // Expensive
+    // let b4 = vaddq_u32(b3, keys_3); // Cheap
+    // let b5 = vreinterpretq_s8_u32(vmulq_u32(b4, keys_2)); // Cheap
+    // let b6 = vreinterpretq_u32_s8(vextq_s8(b5, b5, 3)); // Expensive
+    // let b7 = vaddq_u32(b6, keys_1); // Cheap
+    // let b8 = vmulq_u32(b7, keys_3); // Cheap
+    // let b9 = veorq_s8(a, vreinterpretq_s8_u32(b8));
+    // vextq_s8(b9, b9, 7)
+
+    //let primes = vld1q_u32([0x9e3779b9, 0x9e3779b9, 0x9e3779b9, 0x9e3779b9].as_ptr());
+    // let keys_2 = vld1q_u32([0x1DE09647, 0x92CFA39C, 0x3DD99ACA, 0xB89C054F].as_ptr());
+    // let keys_3 = vld1q_u32([0xC78B122B, 0x5544B1B7, 0x689D2B7D, 0xD0012E32].as_ptr());
+    // let b1 = vaddq_u32(vreinterpretq_u32_s8(b), primes); // Cheap
+    // let b2 = vreinterpretq_s8_u32(vmulq_u32(vreinterpretq_u32_s8(b), primes)); // Cheap
+    // let shifted = vshlq_n_s8::<1>(b2);
+    // let b3 = veorq_s8(b, shifted);
+    //let b3: uint32x4_t = vreinterpretq_u32_s8(vextq_s8(b2, b2, 3)); // Expensive
+    // let b4 = vaddq_u32(b3, keys_3); // Cheap
+    // let b5 = vreinterpretq_s8_u32(vmulq_u32(b4, keys_2)); // Cheap
+    // let b6 = vreinterpretq_u32_s8(vextq_s8(b5, b5, 3)); // Expensive
+    // let b7 = vaddq_u32(b6, keys_1); // Cheap
+    // let b8 = vmulq_u32(b7, keys_3); // Cheap
+    // let b9 = vaddq_u32(
+    //     vreinterpretq_u32_s8(b),
+    //     veorq_u32(
+    //         primes,
+    //         vaddq_u32(
+    //             vshrq_n_u32::<2>(vreinterpretq_u32_s8(a)),
+    //             vshlq_n_u32::<6>(vreinterpretq_u32_s8(a)))));
+
+    // vextq_s8(vreinterpretq_s8_u32(b9), vreinterpretq_s8_u32(b9), 1)
+
+    // let mut x = vreinterpretq_u32_s8(b);
+    // // Round 1
+    // x = veorq_u32(x, vshrq_n_u32::<16>(x));
+    // x = vmulq_u32(x, vld1q_u32([0x7feb352d, 0x7feb352d, 0x7feb352d, 0x7feb352d].as_ptr()));
+    // // Round 2
+    // x = veorq_u32(x, vshrq_n_u32::<15>(x));
+    // x = vmulq_u32(x, vld1q_u32([0x846ca68b, 0x846ca68b, 0x846ca68b, 0x846ca68b].as_ptr()));
+    // // Round 3
+    // x = veorq_u32(x, vshrq_n_u32::<16>(x));
+    // let f = vaddq_s8(a, vreinterpretq_s8_u32(x));
+    // vextq_s8(f, f, 1)
+
+    //ve
+}
+
+#[inline(always)]
+// See https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+unsafe fn aes_encrypt(data: uint8x16_t, keys: uint8x16_t) -> uint8x16_t {
+    // Encrypt
+    let encrypted = vaeseq_u8(data, vdupq_n_u8(0));
+    // Mix columns
+    let mixed = vaesmcq_u8(encrypted);
+    // Xor keys
+    veorq_u8(mixed, keys)
+}
+
+#[inline(always)]
+// See https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+unsafe fn aes_encrypt_last(data: uint8x16_t, keys: uint8x16_t) -> uint8x16_t {
+    // Encrypt
+    let encrypted = vaeseq_u8(data, vdupq_n_u8(0));
+    // Xor keys
+    veorq_u8(encrypted, keys)
+}
+
+#[inline(always)]
+pub unsafe fn finalize(hash: state, seed: i32) -> state {
+    // Hardcoded AES keys
+    let keys_1 = vld1q_u32([0x713B01D0, 0x8F2F35DB, 0xAF163956, 0x85459F85].as_ptr());
+    let keys_2 = vld1q_u32([0x1DE09647, 0x92CFA39C, 0x3DD99ACA, 0xB89C054F].as_ptr());
+    let keys_3 = vld1q_u32([0xC78B122B, 0x5544B1B7, 0x689D2B7D, 0xD0012E32].as_ptr());
+
+    // 4 rounds of AES (seed round + three keyed rounds)
+    let mut hash = ReinterpretUnion{ int8: hash }.uint8;
+    hash = aes_encrypt(hash, ReinterpretUnion{ int32: vdupq_n_s32(seed) }.uint8);
+    hash = aes_encrypt(hash, ReinterpretUnion{ uint32: keys_1 }.uint8);
+    hash = aes_encrypt(hash, ReinterpretUnion{ uint32: keys_2 }.uint8);
+    hash = aes_encrypt_last(hash, ReinterpretUnion{ uint32: keys_3 }.uint8);
+    return ReinterpretUnion{ uint8: hash }.int8;
+}
\ No newline at end of file
diff --git a/src/gxhash/platform/mod.rs b/src/gxhash/platform/mod.rs
new file mode 100644
index 0000000..dced5b6
--- /dev/null
+++ b/src/gxhash/platform/mod.rs
@@ -0,0 +1,9 @@
+#[cfg(target_arch = "aarch64")]
+#[path = "arm_128.rs"]
+pub mod platform;
+
+#[cfg(target_arch = "x86_64")]
+#[path = "x86_256.rs"]
+pub mod platform;
+
+pub use platform::*;
\ No newline at end of file
diff --git a/src/gxhash/platform/x86_256.rs b/src/gxhash/platform/x86_256.rs
new file mode 100644
index 0000000..2f557f2
--- /dev/null
+++ b/src/gxhash/platform/x86_256.rs
@@ -0,0 +1,90 @@
+use core::arch::x86_64::*;
+use std::mem::size_of;
+
+pub type state = __m256i;
+
+#[inline]
+pub unsafe fn create_empty() -> state {
+    _mm256_setzero_si256()
+}
+
+#[inline]
+pub unsafe fn prefetch(p: *const state) {
+    _mm_prefetch(p as *const i8, 3); // 3 = _MM_HINT_T0
+}
+
+#[inline]
+pub unsafe fn load_unaligned(p: *const state) -> state {
+    _mm256_loadu_si256(p)
+}
+
+#[inline]
+pub unsafe fn get_partial(p: *const state, len: isize) -> state {
+    let partial_vector: state;
+    // Safety check
+    if check_same_page(p) {
+        let indices = _mm256_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31
+        );
+
+        let mask = _mm256_cmpgt_epi8(_mm256_set1_epi8(len as i8), indices);
+        partial_vector = _mm256_and_si256(_mm256_loadu_si256(p), mask);
+    } else {
+        partial_vector = get_partial_safe(p as *const u8, len as usize);
+    }
+    // Prevent the padding zeroes from introducing bias
+    _mm256_add_epi32(partial_vector, _mm256_set1_epi32(len as i32))
+}
+
+#[inline]
+unsafe fn check_same_page(ptr: *const state) -> bool {
+    let address = ptr as usize;
+    // Mask to keep only the lower 12 bits (the offset within a 4 KiB page)
+    let offset_within_page = address & 0xFFF;
+    // A full 32-byte vector read starting at this offset must not cross the page boundary
+    offset_within_page <= (4096 - size_of::<state>() - 1)
+}
+
+#[inline]
+unsafe fn get_partial_safe(data: *const u8, len: usize) -> state {
+    // Temporary buffer filled with zeros
+    let mut buffer = [0u8; size_of::<state>()];
+    // Copy data into the buffer
+    std::ptr::copy(data, buffer.as_mut_ptr(), len);
+    // Load the buffer into a __m256i vector
+    _mm256_loadu_si256(buffer.as_ptr() as *const state)
+}
+
+#[inline]
+#[allow(overflowing_literals)]
+pub unsafe fn compress(a: state, b: state) -> state {
+    let keys_1 = _mm256_set_epi32(0xFC3BC28E, 0x89C222E5, 0xB09D3E21, 0xF2784542, 0x4155EE07, 0xC897CCE2, 0x780AF2C3, 0x8A72B781);
+    let keys_2 = _mm256_set_epi32(0x03FCE279, 0xCB6B2E9B, 0xB361DC58, 0x39136BD9, 0x7A83D76B, 0xB1E8F9F0, 0x028925A8, 0x3B9A4E71);
+
+    // 2+1 rounds of AES for compression
+    let mut b = _mm256_aesenc_epi128(b, keys_1);
+    b = _mm256_aesenc_epi128(b, keys_2);
+    return _mm256_aesenclast_epi128(a, b);
+}
+
+#[inline]
+#[allow(overflowing_literals)]
+pub unsafe fn finalize(hash: state, seed: i32) -> state {
+    // Hardcoded AES keys
+    let keys_1 = _mm256_set_epi32(0x713B01D0, 0x8F2F35DB, 0xAF163956, 0x85459F85, 0xB49D3E21, 0xF2784542, 0x2155EE07, 0xC197CCE2);
+    let keys_2 = _mm256_set_epi32(0x1DE09647, 0x92CFA39C, 0x3DD99ACA, 0xB89C054F, 0xCB6B2E9B, 0xC361DC58, 0x39136BD9, 0x7A83D76F);
+    let keys_3 = _mm256_set_epi32(0xC78B122B, 0x5544B1B7, 0x689D2B7D, 0xD0012E32, 0xE2784542, 0x4155EE07, 0xC897CCE2, 0x780BF2C2);
+
+    // 4 rounds of AES (seed round + three keyed rounds)
+    let mut hash = _mm256_aesenc_epi128(hash, _mm256_set1_epi32(seed));
+    hash = _mm256_aesenc_epi128(hash, keys_1);
+    hash = _mm256_aesenc_epi128(hash, keys_2);
+    hash = _mm256_aesenclast_epi128(hash, keys_3);
+
+    // Merge the entropy of the two 128-bit lanes so the result can later be safely truncated to 128 bits or fewer
+    let permuted = _mm256_permute2x128_si256(hash, hash, 0x21);
+    _mm256_xor_si256(hash, permuted)
+}
\ No newline at end of file
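
Note on the shared page-boundary trick: both platform files rely on the invariant that an over-long vector load cannot fault as long as it does not cross a 4 KiB page, because memory protection has page granularity. Below is a minimal, portable sketch of the `check_same_page` test (not part of the patch; the `State` alias and the addresses are illustrative):

    use std::mem::size_of;

    /// Stand-in for the platform `state` vector type (16 bytes, as on ARM).
    type State = [u8; 16];

    /// True when a full `State`-sized read starting at `p` stays within one 4 KiB page.
    fn check_same_page(p: *const State) -> bool {
        // Low 12 bits of the address = offset within a 4 KiB page.
        let offset_within_page = p as usize & 0xFFF;
        offset_within_page <= 4096 - size_of::<State>() - 1
    }

    fn main() {
        // Addresses are illustrative and never dereferenced.
        let safe = 0x1000usize as *const State;        // first byte of a page
        let risky = (0x2000usize - 8) as *const State; // 8 bytes before a page ends
        assert!(check_same_page(safe));
        assert!(!check_same_page(risky));
    }

When the check fails, both implementations fall back to `get_partial_safe`, which copies the trailing bytes into a zeroed stack buffer before loading, trading speed for a read that is always in bounds.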