diff --git a/src/termdict/sstable_termdict/sstable/mod.rs b/src/termdict/sstable_termdict/sstable/mod.rs index 71c387cae1..0afef761cd 100644 --- a/src/termdict/sstable_termdict/sstable/mod.rs +++ b/src/termdict/sstable_termdict/sstable/mod.rs @@ -145,6 +145,12 @@ where } pub fn write_key(&mut self, key: &[u8]) { + // If this is the first key in the block, we use it to + // shorten the key recorded in the index for the previous block, if any. + if self.first_ordinal_of_the_block == self.num_terms { + self.index_builder + .shorten_last_block_key_given_next_key(key); + } let keep_len = common_prefix_len(&self.previous_key, key); let add_len = key.len() - keep_len; let increasing_keys = add_len > 0 && (self.previous_key.len() == keep_len) @@ -273,11 +279,12 @@ mod test { 33u8, 18u8, 19u8, // keep 1 push 1 | 20 17u8, 20u8, 0u8, 0u8, 0u8, 0u8, // no more blocks // index - 161, 102, 98, 108, 111, 99, 107, 115, 129, 162, 104, 108, 97, 115, 116, 95, 107, - 101, 121, 130, 17, 20, 106, 98, 108, 111, 99, 107, 95, 97, 100, 100, 114, 162, 106, - 98, 121, 116, 101, 95, 114, 97, 110, 103, 101, 162, 101, 115, 116, 97, 114, 116, 0, - 99, 101, 110, 100, 11, 109, 102, 105, 114, 115, 116, 95, 111, 114, 100, 105, 110, - 97, 108, 0, 15, 0, 0, 0, 0, 0, 0, 0, // offset for the index + 161, 102, 98, 108, 111, 99, 107, 115, 129, 162, 115, 108, 97, 115, 116, 95, 107, + 101, 121, 95, 111, 114, 95, 103, 114, 101, 97, 116, 101, 114, 130, 17, 20, 106, 98, + 108, 111, 99, 107, 95, 97, 100, 100, 114, 162, 106, 98, 121, 116, 101, 95, 114, 97, + 110, 103, 101, 162, 101, 115, 116, 97, 114, 116, 0, 99, 101, 110, 100, 11, 109, + 102, 105, 114, 115, 116, 95, 111, 114, 100, 105, 110, 97, 108, 0, 15, 0, 0, 0, 0, + 0, 0, 0, // offset for the index 3u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8 // num terms ] ); diff --git a/src/termdict/sstable_termdict/sstable/sstable_index.rs b/src/termdict/sstable_termdict/sstable/sstable_index.rs index e1fceceeed..f239b20205 100644 --- a/src/termdict/sstable_termdict/sstable/sstable_index.rs +++ 
b/src/termdict/sstable_termdict/sstable/sstable_index.rs @@ -4,6 +4,7 @@ use std::ops::Range; use serde::{Deserialize, Serialize}; use crate::error::DataCorruption; +use crate::termdict::sstable_termdict::sstable::common_prefix_len; #[derive(Default, Debug, Serialize, Deserialize)] pub struct SSTableIndex { @@ -19,7 +20,7 @@ impl SSTableIndex { pub fn search(&self, key: &[u8]) -> Option { self.blocks .iter() - .find(|block| &block.last_key[..] >= key) + .find(|block| &block.last_key_or_greater[..] >= key) .map(|block| block.block_addr.clone()) } } @@ -32,7 +33,9 @@ pub struct BlockAddr { #[derive(Debug, Serialize, Deserialize)] struct BlockMeta { - pub last_key: Vec, + /// Any byte string that is lexicographically greater than or equal to the last key in the + /// block, and yet strictly smaller than the first key in the next block. + pub last_key_or_greater: Vec, pub block_addr: BlockAddr, } @@ -41,10 +44,39 @@ pub struct SSTableIndexBuilder { index: SSTableIndex, } +/// Given that `left < right` (asserted), mutates `left` in place into a byte string +/// `left'` that is at most as long as `left` and +/// satisfies `left <= left' < right`. +fn find_shorter_str_in_between(left: &mut Vec, right: &[u8]) { + assert!(&left[..] < right); + let common_len = common_prefix_len(&left, right); + if left.len() == common_len { + return; + } + // Bumping a byte past `common_len` keeps `left' < right`; starting one byte earlier could + // sometimes yield a shorter key, but it is not worth the extra complexity. + for pos in (common_len + 1)..left.len() { + if left[pos] != u8::MAX { + left[pos] += 1; + left.truncate(pos + 1); + return; + } + } +} + impl SSTableIndexBuilder { + /// In order to make the index as light as possible, we try to replace the key + /// recorded for the last block with a possibly shorter one that is still strictly + /// smaller than `next_key`. Does nothing if no block has been added yet. 
+ pub(crate) fn shorten_last_block_key_given_next_key(&mut self, next_key: &[u8]) { + if let Some(last_block) = self.index.blocks.last_mut() { + find_shorter_str_in_between(&mut last_block.last_key_or_greater, next_key); + } + } + pub fn add_block(&mut self, last_key: &[u8], byte_range: Range, first_ordinal: u64) { self.index.blocks.push(BlockMeta { - last_key: last_key.to_vec(), + last_key_or_greater: last_key.to_vec(), block_addr: BlockAddr { byte_range, first_ordinal, @@ -97,4 +129,35 @@ mod tests { "Data corruption: SSTable index is corrupted." ); } + + #[track_caller] + fn test_find_shorter_str_in_between_aux(left: &[u8], right: &[u8]) { + let mut left_buf = left.to_vec(); + super::find_shorter_str_in_between(&mut left_buf, right); + assert!(left_buf.len() <= left.len()); + assert!(left <= &left_buf); + assert!(&left_buf[..] < &right); + } + + #[test] + fn test_find_shorter_str_in_between() { + test_find_shorter_str_in_between_aux(b"", b"hello"); + test_find_shorter_str_in_between_aux(b"abc", b"abcd"); + test_find_shorter_str_in_between_aux(b"abcd", b"abd"); + test_find_shorter_str_in_between_aux(&[0, 0, 0], &[1]); + test_find_shorter_str_in_between_aux(&[0, 0, 0], &[0, 0, 1]); + test_find_shorter_str_in_between_aux(&[0, 0, 255, 255, 255, 0u8], &[0, 1]); + } + + use proptest::prelude::*; + + proptest! { + #![proptest_config(ProptestConfig::with_cases(100))] + #[test] + fn test_proptest_find_shorter_str(left in any::>(), right in any::>()) { + if left < right { + test_find_shorter_str_in_between_aux(&left, &right); + } + } + } }