Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Merge #720
Browse files Browse the repository at this point in the history
720: Make soft deletion optional in document addition and deletion + add lots of tests r=irevoire a=loiclec

# Pull Request

## What does this PR do?
When debugging recent issues, I created a few unit tests in the hopes reproducing the bugs I was looking for. In the end, I didn't find any, but I thought it would still be good to keep those tests. 

More importantly, I added a field to the `DeleteDocuments` and `IndexDocuments` builders, called `disable_soft_deletion`. If set to `true`, the indexing/deletion will never add documents to the `soft_deleted_documents_ids` and instead perform a real deletion of the documents from the databases.

For the new tests, I have:
- Improved the insta-snapshot format of the `external_documents_ids` structure
- Added more tests for the facet DB indexing, deletion, and search algorithms, making sure to test them when the facet DB contains strings (instead of numbers) as well.
- Added more tests for the incremental indexing of the prefix proximity databases. For example, to see if documents are replaced correctly and if common prefixes are deleted correctly.
- Added tests that mix soft deletion and hard deletion, including when processing batches of document updates. 


Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
  • Loading branch information
bors[bot] and loiclec authored Dec 5, 2022
2 parents d3731dd + cda4ba2 commit 46e26ab
Show file tree
Hide file tree
Showing 91 changed files with 2,724 additions and 64 deletions.
547 changes: 545 additions & 2 deletions milli/src/index.rs

Large diffs are not rendered by default.

93 changes: 91 additions & 2 deletions milli/src/search/facet/facet_sort_ascending.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> {
// that we found all the documents in the sub level iterations already,
// we can pop this level iterator.
if documents_ids.is_empty() {
// break our of the for loop into the end of the 'outer loop, which
// pops the stack
break;
}

Expand Down Expand Up @@ -113,11 +115,14 @@ mod tests {

use crate::milli_snap;
use crate::search::facet::facet_sort_ascending::ascending_facet_sort;
use crate::search::facet::tests::{get_random_looking_index, get_simple_index};
use crate::search::facet::tests::{
get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids,
get_simple_index, get_simple_string_index_with_multiple_field_ids,
};
use crate::snapshot_tests::display_bitmap;

#[test]
fn filter_sort() {
fn filter_sort_ascending() {
let indexes = [get_simple_index(), get_random_looking_index()];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
Expand All @@ -134,4 +139,88 @@ mod tests {
txn.commit().unwrap();
}
}

#[test]
fn filter_sort_ascending_multiple_field_ids() {
let indexes = [
get_simple_string_index_with_multiple_field_ids(),
get_random_looking_string_index_with_multiple_field_ids(),
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
for el in iter {
let docids = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
milli_snap!(results, format!("{i}-0"));

let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap();
for el in iter {
let docids = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
milli_snap!(results, format!("{i}-1"));

txn.commit().unwrap();
}
}

#[test]
fn filter_sort_ascending_with_no_candidates() {
let indexes = [
get_simple_string_index_with_multiple_field_ids(),
get_random_looking_string_index_with_multiple_field_ids(),
];
for (_i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = RoaringBitmap::new();
let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
for el in iter {
let docids = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
assert!(results.is_empty());

let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap();
for el in iter {
let docids = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
assert!(results.is_empty());

txn.commit().unwrap();
}
}

#[test]
fn filter_sort_ascending_with_inexisting_field_id() {
let indexes = [
get_simple_string_index_with_multiple_field_ids(),
get_random_looking_string_index_with_multiple_field_ids(),
];
for (_i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = RoaringBitmap::new();
let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap();
for el in iter {
let docids = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
assert!(results.is_empty());

txn.commit().unwrap();
}
}
}
97 changes: 95 additions & 2 deletions milli/src/search/facet/facet_sort_descending.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,20 @@ mod tests {
use crate::heed_codec::ByteSliceRefCodec;
use crate::milli_snap;
use crate::search::facet::facet_sort_descending::descending_facet_sort;
use crate::search::facet::tests::{get_random_looking_index, get_simple_index};
use crate::search::facet::tests::{
get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids,
get_simple_index, get_simple_index_with_multiple_field_ids,
get_simple_string_index_with_multiple_field_ids,
};
use crate::snapshot_tests::display_bitmap;

#[test]
fn filter_sort_descending() {
let indexes = [get_simple_index(), get_random_looking_index()];
let indexes = [
get_simple_index(),
get_random_looking_index(),
get_simple_index_with_multiple_field_ids(),
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
Expand All @@ -147,4 +155,89 @@ mod tests {
txn.commit().unwrap();
}
}

#[test]
fn filter_sort_descending_multiple_field_ids() {
let indexes = [
get_simple_string_index_with_multiple_field_ids(),
get_random_looking_string_index_with_multiple_field_ids(),
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
let mut results = String::new();
let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap();
for el in iter {
let docids = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
milli_snap!(results, format!("{i}-0"));

let mut results = String::new();

let iter = descending_facet_sort(&txn, db, 1, candidates).unwrap();
for el in iter {
let docids = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
milli_snap!(results, format!("{i}-1"));

txn.commit().unwrap();
}
}
#[test]
fn filter_sort_ascending_with_no_candidates() {
let indexes = [
get_simple_string_index_with_multiple_field_ids(),
get_random_looking_string_index_with_multiple_field_ids(),
];
for (_i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = RoaringBitmap::new();
let mut results = String::new();
let iter = descending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
for el in iter {
let docids = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
assert!(results.is_empty());

let mut results = String::new();
let iter = descending_facet_sort(&txn, index.content, 1, candidates).unwrap();
for el in iter {
let docids = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
assert!(results.is_empty());

txn.commit().unwrap();
}
}

#[test]
fn filter_sort_ascending_with_inexisting_field_id() {
let indexes = [
get_simple_string_index_with_multiple_field_ids(),
get_random_looking_string_index_with_multiple_field_ids(),
];
for (_i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
let candidates = RoaringBitmap::new();
let mut results = String::new();
let iter = descending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap();
for el in iter {
let docids = el.unwrap();
results.push_str(&display_bitmap(&docids));
results.push('\n');
}
assert!(results.is_empty());

txn.commit().unwrap();
}
}
}
42 changes: 41 additions & 1 deletion milli/src/search/facet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ pub(crate) mod tests {
use roaring::RoaringBitmap;

use crate::heed_codec::facet::OrderedF64Codec;
use crate::update::facet::tests::FacetIndex;
use crate::heed_codec::StrRefCodec;
use crate::update::facet::test_helpers::FacetIndex;

pub fn get_simple_index() -> FacetIndex<OrderedF64Codec> {
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
Expand Down Expand Up @@ -147,4 +148,43 @@ pub(crate) mod tests {
txn.commit().unwrap();
index
}
pub fn get_simple_string_index_with_multiple_field_ids() -> FacetIndex<StrRefCodec> {
let index = FacetIndex::<StrRefCodec>::new(4, 8, 5);
let mut txn = index.env.write_txn().unwrap();
for fid in 0..2 {
for i in 0..256u16 {
let mut bitmap = RoaringBitmap::new();
bitmap.insert(i as u32);
if i % 2 == 0 {
index.insert(&mut txn, fid, &format!("{i}").as_str(), &bitmap);
} else {
index.insert(&mut txn, fid, &"", &bitmap);
}
}
}
txn.commit().unwrap();
index
}
pub fn get_random_looking_string_index_with_multiple_field_ids() -> FacetIndex<StrRefCodec> {
let index = FacetIndex::<StrRefCodec>::new(4, 8, 5);
let mut txn = index.env.write_txn().unwrap();

let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
let keys =
std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>();
for fid in 0..2 {
for (_i, &key) in keys.iter().enumerate() {
let mut bitmap = RoaringBitmap::new();
bitmap.insert(key);
bitmap.insert(key + 100);
if key % 2 == 0 {
index.insert(&mut txn, fid, &format!("{key}").as_str(), &bitmap);
} else {
index.insert(&mut txn, fid, &"", &bitmap);
}
}
}
txn.commit().unwrap();
index
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ]
[200, ]
[202, ]
[204, ]
[206, ]
[208, ]
[210, ]
[212, ]
[214, ]
[216, ]
[218, ]
[220, ]
[222, ]
[224, ]
[226, ]
[228, ]
[230, ]
[232, ]
[234, ]
[236, ]
[238, ]
[240, ]
[242, ]
[244, ]
[246, ]
[248, ]
[250, ]
[252, ]
[254, ]

Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ]
[200, ]
[202, ]
[204, ]
[206, ]
[208, ]
[210, ]
[212, ]
[214, ]
[216, ]
[218, ]
[220, ]
[222, ]
[224, ]
[226, ]
[228, ]
[230, ]
[232, ]
[234, ]
[236, ]
[238, ]
[240, ]
[242, ]
[244, ]
[246, ]
[248, ]
[250, ]
[252, ]
[254, ]

Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ]
[202, ]
[224, ]
[230, ]
[236, ]
[244, ]
[250, ]
[256, ]
[258, ]
[260, ]
[262, ]
[264, ]
[278, ]
[282, ]
[286, ]
[292, ]
[206, ]
[208, ]
[210, ]
[216, ]
[220, ]
[226, ]
[238, ]

Loading

0 comments on commit 46e26ab

Please sign in to comment.