Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Fasten the document deletion
Browse files Browse the repository at this point in the history
When a document deletion occurs, instead of deleting the document we mark it as deleted
in the new “soft deleted” bitmap. It is then removed from the search, and all the other
endpoints.
  • Loading branch information
irevoire committed Jun 15, 2022
1 parent 0a5d1a4 commit f4d9cd2
Show file tree
Hide file tree
Showing 10 changed files with 413 additions and 284 deletions.
2 changes: 2 additions & 0 deletions milli/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ pub enum Error {

#[derive(Error, Debug)]
pub enum InternalError {
#[error("Tried to access a soft deleted documents.")]
AccessingSoftDeletedDocument { document_id: DocumentId },
#[error("{}", HeedError::DatabaseClosing)]
DatabaseClosing,
#[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))]
Expand Down
11 changes: 9 additions & 2 deletions milli/src/external_documents_ids.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,30 @@ use std::{fmt, str};

use fst::map::IndexedValue;
use fst::{IntoStreamer, Streamer};
use roaring::RoaringBitmap;

const DELETED_ID: u64 = u64::MAX;

pub struct ExternalDocumentsIds<'a> {
pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
soft_deleted_docids: RoaringBitmap,
}

impl<'a> ExternalDocumentsIds<'a> {
pub fn new(
hard: fst::Map<Cow<'a, [u8]>>,
soft: fst::Map<Cow<'a, [u8]>>,
soft_deleted_docids: RoaringBitmap,
) -> ExternalDocumentsIds<'a> {
ExternalDocumentsIds { hard, soft }
ExternalDocumentsIds { hard, soft, soft_deleted_docids }
}

pub fn into_static(self) -> ExternalDocumentsIds<'static> {
ExternalDocumentsIds {
hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
soft_deleted_docids: self.soft_deleted_docids,
}
}

Expand All @@ -36,7 +40,9 @@ impl<'a> ExternalDocumentsIds<'a> {
pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
let external_id = external_id.as_ref();
match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
Some(id) if id != DELETED_ID => Some(id.try_into().unwrap()),
Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
Some(id.try_into().unwrap())
}
_otherwise => None,
}
}
Expand Down Expand Up @@ -134,6 +140,7 @@ impl Default for ExternalDocumentsIds<'static> {
ExternalDocumentsIds {
hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
soft_deleted_docids: RoaringBitmap::new(),
}
}
}
Expand Down
40 changes: 37 additions & 3 deletions milli/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ pub mod main_key {
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids";
pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields";
pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields";
Expand Down Expand Up @@ -249,6 +250,29 @@ impl Index {
Ok(count.unwrap_or_default())
}

/* deleted documents ids */

/// Writes the documents ids that corresponds to the user-ids-documents-ids FST.
pub(crate) fn put_soft_deleted_documents_ids(
&self,
wtxn: &mut RwTxn,
docids: &RoaringBitmap,
) -> heed::Result<()> {
self.main.put::<_, Str, RoaringBitmapCodec>(
wtxn,
main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY,
docids,
)
}

/// Returns the internal documents ids.
pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> {
Ok(self
.main
.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY)?
.unwrap_or_default())
}

/* primary key */

/// Writes the documents primary key, this is the field name that is used to store the id.
Expand All @@ -275,7 +299,7 @@ impl Index {
wtxn: &mut RwTxn,
external_documents_ids: &ExternalDocumentsIds<'a>,
) -> heed::Result<()> {
let ExternalDocumentsIds { hard, soft } = external_documents_ids;
let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids;
let hard = hard.as_fst().as_bytes();
let soft = soft.as_fst().as_bytes();
self.main.put::<_, Str, ByteSlice>(
Expand Down Expand Up @@ -306,7 +330,8 @@ impl Index {
Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?,
None => fst::Map::default().map_data(Cow::Owned)?,
};
Ok(ExternalDocumentsIds::new(hard, soft))
let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?;
Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids))
}

/* fields ids map */
Expand Down Expand Up @@ -924,9 +949,13 @@ impl Index {
rtxn: &'t RoTxn,
ids: impl IntoIterator<Item = DocumentId>,
) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
let mut documents = Vec::new();

for id in ids {
if soft_deleted_documents.contains(id) {
return Err(InternalError::AccessingSoftDeletedDocument { document_id: id })?;
}
let kv = self
.documents
.get(rtxn, &BEU32::new(id))?
Expand All @@ -942,11 +971,16 @@ impl Index {
&self,
rtxn: &'t RoTxn,
) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReaderU16<'t>)>>> {
let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?;

Ok(self
.documents
.iter(rtxn)?
// we cast the BEU32 to a DocumentId
.map(|document| document.map(|(id, obkv)| (id.get(), obkv))))
.map(|document| document.map(|(id, obkv)| (id.get(), obkv)))
.filter(move |document| {
document.as_ref().map_or(true, |(id, _)| !soft_deleted_docids.contains(*id))
}))
}

pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> {
Expand Down
74 changes: 52 additions & 22 deletions milli/src/search/facet/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ use roaring::RoaringBitmap;

use super::FacetNumberRange;
use crate::error::{Error, UserError};
use crate::heed_codec::facet::{
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
};
use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::{
distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result,
};
Expand Down Expand Up @@ -266,11 +264,12 @@ impl<'a> Filter<'a> {
fn evaluate_operator(
rtxn: &heed::RoTxn,
index: &Index,
numbers_db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
strings_db: heed::Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
field_id: FieldId,
operator: &Condition<'a>,
) -> Result<RoaringBitmap> {
let numbers_db = index.facet_id_f64_docids;
let strings_db = index.facet_id_string_docids;

// Make sure we always bound the ranges with the field id and the level,
// as the facets values are all in the same database and prefixed by the
// field id and the level.
Expand Down Expand Up @@ -314,9 +313,7 @@ impl<'a> Filter<'a> {
};
let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?;
let operator = Condition::Equal(val.clone());
let docids = Self::evaluate_operator(
rtxn, index, numbers_db, strings_db, field_id, &operator,
)?;
let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?;
return Ok((all_numbers_ids | all_strings_ids) - docids);
}
};
Expand Down Expand Up @@ -347,17 +344,27 @@ impl<'a> Filter<'a> {
}

pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
let numbers_db = index.facet_id_f64_docids;
let strings_db = index.facet_id_string_docids;
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time
let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?;
let filterable_fields = index.filterable_fields(rtxn)?;

// and finally we delete all the soft_deleted_documents, again, only once at the very end
self.inner_evaluate(rtxn, index, &filterable_fields)
.map(|result| result - soft_deleted_documents)
}

fn inner_evaluate(
&self,
rtxn: &heed::RoTxn,
index: &Index,
filterable_fields: &HashSet<String>,
) -> Result<RoaringBitmap> {
match &self.condition {
FilterCondition::Condition { fid, op } => {
let filterable_fields = index.filterable_fields(rtxn)?;

if crate::is_faceted(fid.value(), &filterable_fields) {
if crate::is_faceted(fid.value(), filterable_fields) {
let field_ids_map = index.fields_ids_map(rtxn)?;
if let Some(fid) = field_ids_map.id(fid.value()) {
Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op)
Self::evaluate_operator(rtxn, index, fid, &op)
} else {
return Ok(RoaringBitmap::new());
}
Expand All @@ -376,25 +383,47 @@ impl<'a> Filter<'a> {
return Err(fid.as_external_error(
FilterError::AttributeNotFilterable {
attribute,
filterable_fields,
filterable_fields: filterable_fields.clone(),
},
))?;
}
}
}
}
FilterCondition::Or(lhs, rhs) => {
let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?;
let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?;
let lhs = Self::inner_evaluate(
&(lhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
let rhs = Self::inner_evaluate(
&(rhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
Ok(lhs | rhs)
}
FilterCondition::And(lhs, rhs) => {
let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?;
let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?;
let lhs = Self::inner_evaluate(
&(lhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
if lhs.is_empty() {
return Ok(lhs);
}
let rhs = Self::inner_evaluate(
&(rhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
Ok(lhs & rhs)
}
FilterCondition::GeoLowerThan { point, radius } => {
let filterable_fields = index.filterable_fields(rtxn)?;
if filterable_fields.contains("_geo") {
let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?];
if !(-90.0..=90.0).contains(&base_point[0]) {
Expand Down Expand Up @@ -427,16 +456,17 @@ impl<'a> Filter<'a> {
} else {
return Err(point[0].as_external_error(FilterError::AttributeNotFilterable {
attribute: "_geo",
filterable_fields,
filterable_fields: filterable_fields.clone(),
}))?;
}
}
FilterCondition::GeoGreaterThan { point, radius } => {
let result = Self::evaluate(
let result = Self::inner_evaluate(
&FilterCondition::GeoLowerThan { point: point.clone(), radius: radius.clone() }
.into(),
rtxn,
index,
filterable_fields,
)?;
let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?;
Ok(geo_faceted_doc_ids - result)
Expand Down
2 changes: 1 addition & 1 deletion milli/src/search/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ impl<'a> Search<'a> {
) -> Result<SearchResult> {
let mut offset = self.offset;
let mut initial_candidates = RoaringBitmap::new();
let mut excluded_candidates = RoaringBitmap::new();
let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?;
let mut documents_ids = Vec::new();

while let Some(FinalResult { candidates, bucket_candidates, .. }) =
Expand Down
35 changes: 30 additions & 5 deletions milli/src/update/available_documents_ids.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,16 @@ pub struct AvailableDocumentsIds {
}

impl AvailableDocumentsIds {
pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds {
match docids.max() {
pub fn from_documents_ids(
docids: &RoaringBitmap,
soft_deleted_docids: &RoaringBitmap,
) -> AvailableDocumentsIds {
let used_docids = docids | soft_deleted_docids;

match used_docids.max() {
Some(last_id) => {
let mut available = RoaringBitmap::from_iter(0..last_id);
available -= docids;
available -= used_docids;

let iter = match last_id.checked_add(1) {
Some(id) => id..=u32::max_value(),
Expand Down Expand Up @@ -44,7 +49,7 @@ mod tests {
#[test]
fn empty() {
let base = RoaringBitmap::new();
let left = AvailableDocumentsIds::from_documents_ids(&base);
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
let right = 0..=u32::max_value();
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
}
Expand All @@ -57,8 +62,28 @@ mod tests {
base.insert(100);
base.insert(405);

let left = AvailableDocumentsIds::from_documents_ids(&base);
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
}

#[test]
fn soft_deleted() {
let mut base = RoaringBitmap::new();
base.insert(0);
base.insert(10);
base.insert(100);
base.insert(405);

let mut soft_deleted = RoaringBitmap::new();
soft_deleted.insert(1);
soft_deleted.insert(11);
soft_deleted.insert(101);
soft_deleted.insert(406);

let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted);
let right =
(0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n));
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
}
}
10 changes: 6 additions & 4 deletions milli/src/update/clear_documents.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
documents,
} = self.index;

let empty_roaring = RoaringBitmap::default();

// We retrieve the number of documents ids that we are deleting.
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
Expand All @@ -43,16 +45,16 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?;
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;

// We clean all the faceted documents ids.
let empty = RoaringBitmap::default();
for field_id in faceted_fields {
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty)?;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty)?;
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?;
}

// Clear the other databases.
Expand Down
Loading

0 comments on commit f4d9cd2

Please sign in to comment.