Skip to content

Commit

Permalink
Use SSE2 SIMD to accelerate IndexTable::find_entry
Browse files Browse the repository at this point in the history
  • Loading branch information
Tpt committed Jan 31, 2023
1 parent 15df7fe commit 5427684
Showing 1 changed file with 60 additions and 1 deletion.
61 changes: 60 additions & 1 deletion src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ use crate::{
table::{key::TableKey, SIZE_TIERS_BITS},
Key,
};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::convert::TryInto;

// Index chunk consists of 8 64-bit entries.
Expand Down Expand Up @@ -231,8 +235,63 @@ impl IndexTable {
Ok(try_io!(Ok(&map[offset..offset + CHUNK_LEN])))
}

/// Looks up the entry matching `key_prefix` inside `chunk`, starting the search at
/// entry `sub_index`.
///
/// On x86 / x86_64 builds the SSE2 implementation is used whenever the running CPU
/// reports SSE2 support; in every other case the scalar implementation is used.
/// The result is whatever the selected implementation returns.
#[inline(never)]
fn find_entry(&self, key_prefix: u64, sub_index: usize, chunk: &[u8]) -> (Entry, usize) {
    // The whole SIMD dispatch is compiled out on non-x86 targets.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("sse2") {
            return self.find_entry_sse2(key_prefix, sub_index, chunk);
        }
    }

    // Portable fallback.
    self.find_entry_regular(key_prefix, sub_index, chunk)
}

/// SSE2 variant of the entry lookup: scans `chunk` from entry `sub_index`, testing
/// four 64-bit entries per iteration.
///
/// Each entry is shifted right by the address width (the same shift used to derive the
/// partial key) and only the low 32 bits of the result are compared in the vector
/// registers. A vector hit is therefore just a candidate: the four entries of the group
/// are re-checked with the full scalar comparison before anything is returned.
///
/// Entries left over after the last complete group of four are checked one by one, so
/// `sub_index` does not need to leave a multiple of four entries to scan.
///
/// Returns the first confirmed entry at or after `sub_index` together with its index,
/// or `(Entry::empty(), 0)` when nothing matches.
///
/// # Panics
///
/// Panics (slice bounds check) if `chunk` is shorter than a group of entries that is
/// about to be loaded.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn find_entry_sse2(&self, key_prefix: u64, sub_index: usize, chunk: &[u8]) -> (Entry, usize) {
    // Entries are loaded as 64-bit lanes, four of them per loop iteration.
    const ENTRY_LEN: usize = 8;
    const GROUP: usize = 4;

    let index_bits = self.id.index_bits();
    let partial_key = Entry::extract_key(key_prefix, index_bits);
    let shift = Entry::address_bits(index_bits);

    // Full-width scalar check of a single slot.
    // NOTE(review): a slot whose shifted value is 0 matches when `partial_key` is 0;
    // confirm whether empty slots need to be filtered out here as well.
    let confirm = |pos: usize| {
        let entry = Self::read_entry(chunk, pos);
        if entry.partial_key(index_bits) == partial_key {
            Some((entry, pos))
        } else {
            None
        }
    };

    // SAFETY: these intrinsics only build register constants and touch no memory.
    // SSE2 availability is checked by the caller (`is_x86_feature_detected!("sse2")`).
    let (target, shift_count) = unsafe {
        (
            // Deliberate truncation: only the low 32 bits take part in the SIMD filter.
            _mm_set1_epi32(partial_key as i32),
            _mm_set_epi64x(0, shift.into()),
        )
    };

    let mut i = sub_index;
    while i + GROUP <= CHUNK_ENTRIES {
        // Bounds-checked view of exactly the 32 bytes read below; the unaligned loads
        // themselves perform no length check.
        let group = &chunk[i * ENTRY_LEN..(i + GROUP) * ENTRY_LEN];

        // SAFETY: `group` is exactly 32 bytes long, so both 16-byte unaligned loads
        // (at offsets 0 and 16) stay inside it. SSE2 availability: see above.
        let cmp = unsafe {
            let base = group.as_ptr();
            // Load the entries two by two, shift them like the partial key, then
            // reorder the dwords as (0, 2, 1, 3) so that the low halves of both
            // 64-bit lanes end up in the low part of the register.
            let first_two = _mm_shuffle_epi32::<0b11011000>(_mm_srl_epi64(
                _mm_loadu_si128(base as *const __m128i),
                shift_count,
            ));
            let last_two = _mm_shuffle_epi32::<0b11011000>(_mm_srl_epi64(
                _mm_loadu_si128(base.add(2 * ENTRY_LEN) as *const __m128i),
                shift_count,
            ));
            // Interleave the low parts of both registers: one dword per entry.
            let current = _mm_unpacklo_epi32(first_two, last_two);
            _mm_movemask_epi8(_mm_cmpeq_epi32(current, target))
        };

        if cmp != 0 {
            // At least one entry matched on its low 32 bits; validate in order.
            if let Some(found) = (i..i + GROUP).find_map(&confirm) {
                return found;
            }
        }
        i += GROUP;
    }

    // Scalar tail: fewer than four entries remain when the scanned range is not a
    // multiple of four. Previously these entries were silently skipped.
    if let Some(found) = (i..CHUNK_ENTRIES).find_map(&confirm) {
        return found;
    }

    (Entry::empty(), 0)
}

#[inline(never)]
fn find_entry_regular(
&self,
key_prefix: u64,
sub_index: usize,
chunk: &[u8],
) -> (Entry, usize) {
let partial_key = Entry::extract_key(key_prefix, self.id.index_bits());
for i in sub_index..CHUNK_ENTRIES {
let entry = Self::read_entry(chunk, i);
Expand Down

0 comments on commit 5427684

Please sign in to comment.