diff --git a/src/index.rs b/src/index.rs
index b0e17ddf..65323444 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -11,6 +11,10 @@ use crate::{
     table::{key::TableKey, SIZE_TIERS_BITS},
     Key,
 };
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
 use std::convert::TryInto;
 
 // Index chunk consists of 8 64-bit entries.
@@ -231,8 +235,95 @@ impl IndexTable {
         Ok(try_io!(Ok(&map[offset..offset + CHUNK_LEN])))
     }
 
-    #[inline(never)]
+    /// Looks up `key_prefix` in `chunk`, starting the scan at entry `sub_index`.
+    ///
+    /// On x86/x86_64 the SSE2 implementation is used when the CPU reports `sse2`
+    /// at runtime; in every other case the scalar `find_entry_regular` runs.
     fn find_entry(&self, key_prefix: u64, sub_index: usize, chunk: &[u8]) -> (Entry, usize) {
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        if is_x86_feature_detected!("sse2") {
+            return self.find_entry_sse2(key_prefix, sub_index, chunk);
+        }
+        self.find_entry_regular(key_prefix, sub_index, chunk)
+    }
+
+    /// SSE2 implementation of `find_entry` for x86/x86_64.
+    ///
+    /// Treats `chunk` as consecutive 8-byte entries and scans positions
+    /// `sub_index..CHUNK_ENTRIES`. Returns the first entry whose partial key equals the
+    /// one extracted from `key_prefix`, together with its position, or
+    /// `(Entry::empty(), 0)` when no entry matches.
+    ///
+    /// Entries are compared four at a time. `sub_index` may be any value: the fewer
+    /// than four entries left over at the end of the range are checked one by one
+    /// instead of being skipped.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `chunk` is too short to contain a group of entries being examined.
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    fn find_entry_sse2(&self, key_prefix: u64, sub_index: usize, chunk: &[u8]) -> (Entry, usize) {
+        let partial_key = Entry::extract_key(key_prefix, self.id.index_bits());
+        let shift = Entry::address_bits(self.id.index_bits());
+        let mut i = sub_index;
+
+        // SAFETY: the loads are unaligned (`_mm_loadu_si128`) and each reads 16 bytes from
+        // `group`, a bounds-checked 32-byte slice. The visible caller, `find_entry`, only
+        // takes this path after `is_x86_feature_detected!("sse2")` returned true.
+        unsafe {
+            // Only the low 32 bits of the partial key are compared in the vector lanes;
+            // candidates are confirmed against the full partial key below.
+            let target = _mm_set1_epi32(partial_key as i32);
+            let shift_count = _mm_set_epi64x(0, shift.into());
+            while i + 4 <= CHUNK_ENTRIES {
+                let group = &chunk[i * 8..(i + 4) * 8];
+                // Load the entries two by two, apply the same right shift as used for
+                // `partial_key`, and move the low 32 bits of each entry into the low
+                // half of the register.
+                let first_two = _mm_shuffle_epi32::<0b11011000>(_mm_srl_epi64(
+                    _mm_loadu_si128(group.as_ptr() as *const __m128i),
+                    shift_count,
+                ));
+                let last_two = _mm_shuffle_epi32::<0b11011000>(_mm_srl_epi64(
+                    _mm_loadu_si128(group[16..].as_ptr() as *const __m128i),
+                    shift_count,
+                ));
+                // Interleave the low halves so that `current` holds one 32-bit value
+                // per entry of the group.
+                let current = _mm_unpacklo_epi32(first_two, last_two);
+                let cmp = _mm_movemask_epi8(_mm_cmpeq_epi32(current, target));
+                if cmp != 0 {
+                    // At least one lane matched; find which entry it was and validate
+                    // it against the full partial key.
+                    for j in i..(i + 4) {
+                        let entry = Self::read_entry(chunk, j);
+                        if entry.partial_key(self.id.index_bits()) == partial_key {
+                            return (entry, j)
+                        }
+                    }
+                }
+                i += 4;
+            }
+        }
+
+        // Fewer than four entries remain (the range length was not a multiple of
+        // four): check them individually so none is silently skipped.
+        for j in i..CHUNK_ENTRIES {
+            let entry = Self::read_entry(chunk, j);
+            if entry.partial_key(self.id.index_bits()) == partial_key {
+                return (entry, j)
+            }
+        }
+        (Entry::empty(), 0)
+    }
+
+    #[inline(never)]
+    fn find_entry_regular(
+        &self,
+        key_prefix: u64,
+        sub_index: usize,
+        chunk: &[u8],
+    ) -> (Entry, usize) {
         let partial_key = Entry::extract_key(key_prefix, self.id.index_bits());
         for i in sub_index..CHUNK_ENTRIES {
             let entry = Self::read_entry(chunk, i);