Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Merge #538
Browse files Browse the repository at this point in the history
538: speedup exact words r=Kerollmops a=MarinPostma

This PR make `exact_words` return an `Option` instead of an empty set, since set creation is costly, as noticed by `@kerollmops.`

I was not convinces that this was the cause for all of the performance drop we measured, and then realized that methods that initialized it were called recursively which caused initialization times to add up. While the first fix solves the issue when not using exact words, using exact word remained way more expensive that it should be. To address this issue, the exact words are cached into the `Context`, so they are only initialized once.


Co-authored-by: ad hoc <postma.marin@protonmail.com>
  • Loading branch information
bors[bot] and MarinPostma authored May 30, 2022
2 parents 9f78e39 + 25fc576 commit 582930d
Showing 4 changed files with 36 additions and 22 deletions.
6 changes: 3 additions & 3 deletions milli/src/index.rs
Original file line number Diff line number Diff line change
@@ -1041,10 +1041,10 @@ impl Index {
}

/// List the words on which typo are not allowed
pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result<fst::Set<Cow<'t, [u8]>>> {
pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result<Option<fst::Set<Cow<'t, [u8]>>>> {
match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? {
Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?),
None => Ok(fst::Set::default().map_data(Cow::Owned)?),
Some(bytes) => Ok(Some(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?)),
None => Ok(None),
}
}

2 changes: 1 addition & 1 deletion milli/src/search/mod.rs
Original file line number Diff line number Diff line change
@@ -118,7 +118,7 @@ impl<'a> Search<'a> {
let before = Instant::now();
let (query_tree, primitive_query, matching_words) = match self.query.as_ref() {
Some(query) => {
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index)?;
builder.optional_words(self.optional_words);

builder.authorize_typos(self.is_typo_authorized()?);
48 changes: 31 additions & 17 deletions milli/src/search/query_tree.rs
Original file line number Diff line number Diff line change
@@ -152,7 +152,7 @@ trait Context {
}
/// Returns the minimum word len for 1 and 2 typos.
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>>;
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
}

/// The query tree builder is the interface to build a query tree.
@@ -162,6 +162,7 @@ pub struct QueryTreeBuilder<'a> {
optional_words: bool,
authorize_typos: bool,
words_limit: Option<usize>,
exact_words: Option<fst::Set<Cow<'a, [u8]>>>,
}

impl<'a> Context for QueryTreeBuilder<'a> {
@@ -183,16 +184,23 @@ impl<'a> Context for QueryTreeBuilder<'a> {
Ok((one, two))
}

fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
self.index.exact_words(self.rtxn)
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
self.exact_words.as_ref()
}
}

impl<'a> QueryTreeBuilder<'a> {
/// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn`
/// and an Index `index`.
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Self {
Self { rtxn, index, optional_words: true, authorize_typos: true, words_limit: None }
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Result<Self> {
Ok(Self {
rtxn,
index,
optional_words: true,
authorize_typos: true,
words_limit: None,
exact_words: index.exact_words(rtxn)?,
})
}

/// if `optional_words` is set to `false` the query tree will be
@@ -277,13 +285,13 @@ pub struct TypoConfig<'a> {
pub max_typos: u8,
pub word_len_one_typo: u8,
pub word_len_two_typo: u8,
pub exact_words: fst::Set<Cow<'a, [u8]>>,
pub exact_words: Option<&'a fst::Set<Cow<'a, [u8]>>>,
}

/// Return the `QueryKind` of a word depending on `authorize_typos`
/// and the provided word length.
fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind {
if authorize_typos && !config.exact_words.contains(&word) {
if authorize_typos && !config.exact_words.map_or(false, |s| s.contains(&word)) {
let count = word.chars().count().min(u8::MAX as usize) as u8;
if count < config.word_len_one_typo {
QueryKind::exact(word)
@@ -342,7 +350,7 @@ fn create_query_tree(
children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
}
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
let exact_words = ctx.exact_words()?;
let exact_words = ctx.exact_words();
let config =
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
children.push(Operation::Query(Query {
@@ -396,7 +404,7 @@ fn create_query_tree(
let concat = words.concat();
let (word_len_one_typo, word_len_two_typo) =
ctx.min_word_len_for_typo()?;
let exact_words = ctx.exact_words()?;
let exact_words = ctx.exact_words();
let config = TypoConfig {
max_typos: 1,
word_len_one_typo,
@@ -501,7 +509,7 @@ fn create_matching_words(
}

let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
let exact_words = ctx.exact_words()?;
let exact_words = ctx.exact_words();
let config =
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };

@@ -579,7 +587,7 @@ fn create_matching_words(
let word = words.concat();
let (word_len_one_typo, word_len_two_typo) =
ctx.min_word_len_for_typo()?;
let exact_words = ctx.exact_words()?;
let exact_words = ctx.exact_words();
let config = TypoConfig {
max_typos: 1,
word_len_one_typo,
@@ -742,8 +750,7 @@ mod test {
struct TestContext {
synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
postings: HashMap<String, RoaringBitmap>,
// Raw bytes for the exact word fst Set
exact_words: Vec<u8>,
exact_words: Option<fst::Set<Cow<'static, [u8]>>>,
}

impl TestContext {
@@ -779,8 +786,8 @@ mod test {
Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
}

fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap())
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
self.exact_words.as_ref()
}
}

@@ -799,6 +806,8 @@ mod test {
}

let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap();
let exact_words =
Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());

TestContext {
synonyms: hashmap! {
@@ -1406,8 +1415,12 @@ mod test {
#[test]
fn test_min_word_len_typo() {
let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap();
let config =
TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words };
let config = TypoConfig {
max_typos: 2,
word_len_one_typo: 5,
word_len_two_typo: 7,
exact_words: Some(&exact_words),
};

assert_eq!(
typos("hello".to_string(), true, config.clone()),
@@ -1433,6 +1446,7 @@ mod test {

let tokens = result.tokens();
let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());
let context = TestContext { exact_words, ..Default::default() };
let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap();

2 changes: 1 addition & 1 deletion milli/src/update/settings.rs
Original file line number Diff line number Diff line change
@@ -1495,7 +1495,7 @@ mod tests {
let words = btreeset! { S("Ab"), S("ac") };
builder.set_exact_words(words);
assert!(builder.execute(|_| ()).is_ok());
let exact_words = index.exact_words(&txn).unwrap();
let exact_words = index.exact_words(&txn).unwrap().unwrap();
for word in exact_words.into_fst().stream().into_str_vec().unwrap() {
assert!(word.0 == "ac" || word.0 == "ab");
}

0 comments on commit 582930d

Please sign in to comment.