diff --git a/src/index.rs b/src/index.rs index 0bf5f9c1..510c6c1c 100644 --- a/src/index.rs +++ b/src/index.rs @@ -15,7 +15,7 @@ use crate::{ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use std::convert::TryInto; +use std::{cmp::max, convert::TryInto}; // Index chunk consists of 8 64-bit entries. const CHUNK_LEN: usize = CHUNK_ENTRIES * ENTRY_BYTES; // 512 bytes @@ -238,25 +238,27 @@ impl IndexTable { #[cfg(target_feature = "sse2")] fn find_entry(&self, key_prefix: u64, sub_index: usize, chunk: &[u8]) -> (Entry, usize) { assert!(chunk.len() >= CHUNK_ENTRIES * 8); // Bound checking (not done by SIMD instructions) - debug_assert!( - Entry::address_bits(self.id.index_bits()) <= 32, - "To be sure we can use all high 32 bits as key prefix" - ); const _: () = assert!( CHUNK_ENTRIES % 4 == 0, "We assume here we got buffer with a number of elements that is a multiple of 4" ); + let shift = max(32, Entry::address_bits(self.id.index_bits())); unsafe { - let target = _mm_set1_epi32(((key_prefix << self.id.index_bits()) >> 32) as i32); + let target = _mm_set1_epi32(((key_prefix << self.id.index_bits()) >> shift) as i32); + let shift_mask = _mm_set_epi64x(0, shift.into()); let mut i = (sub_index >> 2) << 2; // We keep an alignment of 4 while i + 4 <= CHUNK_ENTRIES { - // We load the value 2 by 2 and move the high bits into the low part of the register - let first_two = _mm_shuffle_epi32::<0b10001101>(_mm_loadu_si128( - chunk[i * 8..].as_ptr() as *const __m128i, + // We load the value 2 by 2 + // Then we remove the address by shifting such that the partial key is in the low + // part + let first_two = _mm_shuffle_epi32::<0b11011000>(_mm_srl_epi64( + _mm_loadu_si128(chunk[i * 8..].as_ptr() as *const __m128i), + shift_mask, )); - let last_two = _mm_shuffle_epi32::<0b10001101>(_mm_loadu_si128( - chunk[(i + 2) * 8..].as_ptr() as *const __m128i, + let last_two = _mm_shuffle_epi32::<0b11011000>(_mm_srl_epi64( + _mm_loadu_si128(chunk[(i + 2) * 8..].as_ptr() as *const __m128i), + shift_mask, )); // We set into current the input low parts let current = _mm_unpacklo_epi64(first_two, last_two); @@ -575,6 +577,7 @@ impl IndexTable { #[cfg(test)] mod test { use super::*; + use std::path::PathBuf; #[test] fn test_entries() { @@ -595,4 +598,42 @@ mod test { assert!(IndexTable::transmute_chunk(chunk2) == chunk); } + + #[test] + fn test_find_entries() { + for index_bits in [16, 18, 20, 22] { + let index_table = IndexTable { + id: TableId(index_bits.into()), + map: RwLock::new(None), + path: PathBuf::new(), + }; + + let data_address = Address::from_u64((1 << index_bits) - 1); + + let mut chunk = [0; CHUNK_ENTRIES * 8]; + chunk[0..8] + .copy_from_slice(&Entry::new(data_address, 1, index_bits).as_u64().to_le_bytes()); + chunk[8..16].copy_from_slice( + &Entry::new(data_address, 1 << 10, index_bits).as_u64().to_le_bytes(), + ); + chunk[16..24].copy_from_slice( + &Entry::new(data_address, 1 << 20, index_bits).as_u64().to_le_bytes(), + ); + + assert_eq!( + index_table + .find_entry(((1 << 10) << CHUNK_ENTRIES_BITS + SIZE_TIERS_BITS), 0, &chunk,) + .0 + .partial_key(index_bits), + 1 << 10 + ); + assert_eq!( + index_table + .find_entry(((1 << 20) << CHUNK_ENTRIES_BITS + SIZE_TIERS_BITS), 1, &chunk,) + .0 + .partial_key(index_bits), + 1 << 20 + ); + } + } }