From 048e174efb80daa7ca0ac302f53d36fe514d172e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 10:13:48 +0200 Subject: [PATCH 01/55] Do not allocate when parsing CSV headers --- milli/src/documents/builder.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 2be7c1dd8..391175f31 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -108,7 +108,7 @@ impl DocumentBatchBuilder { .headers()? .into_iter() .map(parse_csv_header) - .map(|(k, t)| (this.index.insert(&k), t)) + .map(|(k, t)| (this.index.insert(k), t)) .collect::>(); for (i, record) in records.into_records().enumerate() { @@ -161,16 +161,16 @@ enum AllowedType { Number, } -fn parse_csv_header(header: &str) -> (String, AllowedType) { +fn parse_csv_header(header: &str) -> (&str, AllowedType) { // if there are several separators we only split on the last one. match header.rsplit_once(':') { Some((field_name, field_type)) => match field_type { - "string" => (field_name.to_string(), AllowedType::String), - "number" => (field_name.to_string(), AllowedType::Number), + "string" => (field_name, AllowedType::String), + "number" => (field_name, AllowedType::Number), // if the pattern isn't reconized, we keep the whole field. - _otherwise => (header.to_string(), AllowedType::String), + _otherwise => (header, AllowedType::String), }, - None => (header.to_string(), AllowedType::String), + None => (header, AllowedType::String), } } From eb63af1f1024063afc14eca909e99b4b370b2ff1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 13:20:51 +0200 Subject: [PATCH 02/55] Update grenad to 0.4.2 --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index feddf91c5..d980c6041 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.4.1" -grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } +grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } From 419ce3966c42fcb66fd1dbdb239b4287bf55d74b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:03:48 +0200 Subject: [PATCH 03/55] Rework the DocumentsBatchBuilder/Reader to use grenad --- milli/src/documents/builder.rs | 220 +++++++++++++++--------------- milli/src/documents/mod.rs | 109 ++++++--------- milli/src/documents/reader.rs | 105 ++++++++------ milli/src/documents/serde_impl.rs | 134 ------------------ 4 files changed, 215 insertions(+), 353 deletions(-) delete mode 100644 milli/src/documents/serde_impl.rs diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 391175f31..159afb8d9 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,157 +1,159 @@ -use std::collections::BTreeMap; -use std::io; -use std::io::{Cursor, Write}; +use std::io::{self, Write}; -use byteorder::{BigEndian, WriteBytesExt}; -use serde::Deserializer; -use serde_json::Value; +use grenad::{CompressionType, WriterBuilder}; +use serde_json::{to_writer, Map, Value}; -use super::serde_impl::DocumentVisitor; -use 
super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
-use crate::FieldId;
+use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
 
 /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
 /// format used by milli.
 ///
-/// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to
-/// iterate over the documents.
+/// The writer used by the `DocumentsBatchBuilder` can be read using a `DocumentsBatchReader`
+/// to iterate over the documents.
 ///
 /// ## example:
 /// ```
-/// use milli::documents::DocumentBatchBuilder;
 /// use serde_json::json;
-/// use std::io::Cursor;
+/// use milli::documents::DocumentsBatchBuilder;
 ///
-/// let json = r##"{"id": 1, "name": "foo"}"##;
-/// let mut writer = Cursor::new(Vec::new());
-/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
-/// builder.extend_from_json(&mut json.as_bytes()).unwrap();
-/// builder.finish().unwrap();
+/// let json = json!({ "id": 1, "name": "foo" });
+///
+/// let mut builder = DocumentsBatchBuilder::new(Vec::new());
+/// builder.append_json_object(json.as_object().unwrap()).unwrap();
+/// let _vector = builder.into_inner().unwrap();
 /// ```
-pub struct DocumentBatchBuilder<W> {
-    inner: ByteCounter<W>,
-    index: DocumentsBatchIndex,
+pub struct DocumentsBatchBuilder<W> {
+    /// The inner grenad writer, the last value must always be the `DocumentsBatchIndex`.
+    writer: grenad::Writer<W>,
+    /// A map that creates the relation between field ids and field names.
+    fields_index: DocumentsBatchIndex,
+    /// The number of documents that were added to this builder,
+    /// it doesn't take the primary key of the documents into account at this point.
+    documents_count: u32,
+
+    /// A buffer to store a temporary obkv buffer and avoid reallocating.
     obkv_buffer: Vec<u8>,
+    /// A buffer to serialize the values and avoid reallocating,
+    /// serialized values are stored in an obkv.
     value_buffer: Vec<u8>,
-    values: BTreeMap<FieldId, Value>,
-    count: usize,
 }
 
-impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
-    pub fn new(writer: W) -> Result<Self, Error> {
-        let index = DocumentsBatchIndex::default();
-        let mut writer = ByteCounter::new(writer);
-        // add space to write the offset of the metadata at the end of the writer
-        writer.write_u64::<BigEndian>(0)?;
-
-        Ok(Self {
-            inner: writer,
-            index,
+impl<W: Write> DocumentsBatchBuilder<W> {
+    pub fn new(writer: W) -> DocumentsBatchBuilder<W> {
+        DocumentsBatchBuilder {
+            writer: WriterBuilder::new().compression_type(CompressionType::None).build(writer),
+            fields_index: DocumentsBatchIndex::default(),
+            documents_count: 0,
             obkv_buffer: Vec::new(),
             value_buffer: Vec::new(),
-            values: BTreeMap::new(),
-            count: 0,
-        })
-    }
-
-    /// Returns the number of documents that have been written to the builder.
-    pub fn len(&self) -> usize {
-        self.count
+        }
     }
 
-    /// This method must be called after the document addition is terminated. It will put the
-    /// metadata at the end of the file, and write the metadata offset at the beginning on the
-    /// file.
-    pub fn finish(self) -> Result<usize, Error> {
-        let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self;
-
-        let meta = DocumentsMetadata { count, index };
-
-        bincode::serialize_into(&mut writer, &meta)?;
-
-        writer.seek(io::SeekFrom::Start(0))?;
-        writer.write_u64::<BigEndian>(offset as u64)?;
-
-        writer.flush()?;
-
-        Ok(count)
+    /// Returns the number of documents inserted into this builder.
+    pub fn documents_count(&self) -> u32 {
+        self.documents_count
     }
 
-    /// Extends the builder with json documents from a reader.
-    pub fn extend_from_json<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
-        let mut de = serde_json::Deserializer::from_reader(reader);
+    /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly.
+    pub fn append_json_object(&mut self, object: &Map<String, Value>) -> io::Result<()> {
+        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
+        let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(&k)).collect();
+        fields_ids.sort_unstable();
+
+        self.obkv_buffer.clear();
+        let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer);
+        for field_id in fields_ids {
+            let key = self.fields_index.name(field_id).unwrap();
+            self.value_buffer.clear();
+            to_writer(&mut self.value_buffer, &object[key])?;
+            writer.insert(field_id, &self.value_buffer)?;
+        }
 
-        let mut visitor = DocumentVisitor {
-            inner: &mut self.inner,
-            index: &mut self.index,
-            obkv_buffer: &mut self.obkv_buffer,
-            value_buffer: &mut self.value_buffer,
-            values: &mut self.values,
-            count: &mut self.count,
-        };
+        let internal_id = self.documents_count.to_be_bytes();
+        let document_bytes = writer.into_inner()?;
+        self.writer.insert(internal_id, &document_bytes)?;
+        self.documents_count += 1;
 
-        de.deserialize_any(&mut visitor).map_err(Error::JsonError)?
+        Ok(())
     }
 
-    /// Creates a builder from a reader of CSV documents.
-    ///
-    /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform
-    /// optimisations, and extending from another CSV is not allowed.
-    pub fn from_csv<R: io::Read>(reader: R, writer: W) -> Result<Self, Error> {
-        let mut this = Self::new(writer)?;
-        // Ensure that this is the first and only addition made with this builder
-        debug_assert!(this.index.is_empty());
-
-        let mut records = csv::Reader::from_reader(reader);
-
-        let headers = records
+    /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly.
+    pub fn append_csv<R: io::Read>(&mut self, mut reader: csv::Reader<R>) -> Result<(), Error> {
+        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
+        let mut typed_fields_ids: Vec<_> = reader
             .headers()?
             .into_iter()
             .map(parse_csv_header)
-            .map(|(k, t)| (this.index.insert(k), t))
-            .collect::<Vec<_>>();
-
-        for (i, record) in records.into_records().enumerate() {
-            let record = record?;
-            this.obkv_buffer.clear();
-            let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer);
-            for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) {
-                let value = match ty {
+            .map(|(k, t)| (self.fields_index.insert(k), t))
+            .enumerate()
+            .collect();
+        typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid);
+
+        let mut record = csv::StringRecord::new();
+        let mut line = 0;
+        while reader.read_record(&mut record)? {
+            // We increment here and not at the end of the while loop to take
+            // the header offset into account.
+            line += 1;
+
+            self.obkv_buffer.clear();
+            let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer);
+
+            for (i, (field_id, type_)) in typed_fields_ids.iter() {
+                self.value_buffer.clear();
+
+                let value = &record[*i];
+                match type_ {
                     AllowedType::Number => {
                         if value.trim().is_empty() {
-                            Value::Null
+                            to_writer(&mut self.value_buffer, &Value::Null)?;
                         } else {
-                            value.trim().parse::<f64>().map(Value::from).map_err(|error| {
-                                Error::ParseFloat {
-                                    error,
-                                    // +1 for the header offset.
-                                    line: i + 1,
-                                    value: value.to_string(),
+                            match value.trim().parse::<f64>() {
+                                Ok(float) => {
+                                    to_writer(&mut self.value_buffer, &float)?;
                                 }
-                            })?
+                                Err(error) => {
+                                    return Err(Error::ParseFloat {
+                                        error,
+                                        line,
+                                        value: value.to_string(),
+                                    });
+                                }
+                            }
                         }
                     }
                     AllowedType::String => {
                         if value.is_empty() {
-                            Value::Null
+                            to_writer(&mut self.value_buffer, &Value::Null)?;
                         } else {
-                            Value::String(value.to_string())
+                            to_writer(&mut self.value_buffer, value)?;
                         }
                     }
-                };
+                }
 
-                this.value_buffer.clear();
-                serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?;
-                writer.insert(*fid, &this.value_buffer)?;
+                // We insert into the obkv writer the value buffer that has been filled just above.
+                writer.insert(*field_id, &self.value_buffer)?;
             }
 
-            this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?;
-            this.inner.write_all(&this.obkv_buffer)?;
-
-            this.count += 1;
+            let internal_id = self.documents_count.to_be_bytes();
+            let document_bytes = writer.into_inner()?;
+            self.writer.insert(internal_id, &document_bytes)?;
+            self.documents_count += 1;
         }
 
-        Ok(this)
+        Ok(())
+    }
+
+    /// Flushes the content on disk and stores the final version of the `DocumentsBatchIndex`.
+    pub fn into_inner(mut self) -> io::Result<W> {
+        let DocumentsBatchBuilder { mut writer, fields_index, .. } = self;
+
+        // We serialize and insert the `DocumentsBatchIndex` as the last key of the grenad writer.
+        self.value_buffer.clear();
+        to_writer(&mut self.value_buffer, &fields_index)?;
+        writer.insert(DOCUMENTS_BATCH_INDEX_KEY, &self.value_buffer)?;
+
+        writer.into_inner()
     }
 }
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index 09f15901d..bd0afc6e4 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -1,24 +1,22 @@
 mod builder;
-/// The documents module defines an intermediary document format that milli uses for indexation, and
-/// provides an API to easily build and read such documents.
-///
-/// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can
-/// later be read by milli using the `DocumentBatchReader` interface.
 mod reader;
-mod serde_impl;
 
 use std::fmt::{self, Debug};
 use std::io;
 
 use bimap::BiHashMap;
-pub use builder::DocumentBatchBuilder;
-pub use reader::DocumentBatchReader;
+pub use builder::DocumentsBatchBuilder;
+pub use reader::{DocumentsBatchCursor, DocumentsBatchReader};
 use serde::{Deserialize, Serialize};
 
 use crate::FieldId;
 
+/// The key that is used to store the `DocumentsBatchIndex` datastructure,
+/// it is the absolute last key of the list.
+const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes();
+
 /// A bidirectional map that links field ids to their name in a document batch.
-#[derive(Default, Debug, Serialize, Deserialize)] +#[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct DocumentsBatchIndex(pub BiHashMap); impl DocumentsBatchIndex { @@ -46,8 +44,8 @@ impl DocumentsBatchIndex { self.0.iter() } - pub fn name(&self, id: FieldId) -> Option<&String> { - self.0.get_by_left(&id) + pub fn name(&self, id: FieldId) -> Option<&str> { + self.0.get_by_left(&id).map(AsRef::as_ref) } pub fn recreate_json( @@ -69,50 +67,20 @@ impl DocumentsBatchIndex { } } -#[derive(Debug, Serialize, Deserialize)] -struct DocumentsMetadata { - count: usize, - index: DocumentsBatchIndex, -} - -pub struct ByteCounter { - count: usize, - writer: W, -} - -impl ByteCounter { - fn new(writer: W) -> Self { - Self { count: 0, writer } - } -} - -impl io::Write for ByteCounter { - fn write(&mut self, buf: &[u8]) -> io::Result { - let count = self.writer.write(buf)?; - self.count += count; - Ok(count) - } - - fn flush(&mut self) -> io::Result<()> { - self.writer.flush() - } -} - #[derive(Debug)] pub enum Error { ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, InvalidDocumentFormat, - Custom(String), - JsonError(serde_json::Error), - CsvError(csv::Error), - Serialize(bincode::Error), + Csv(csv::Error), + Json(serde_json::Error), + Serialize(serde_json::Error), + Grenad(grenad::Error), Io(io::Error), - DocumentTooLarge, } impl From for Error { fn from(e: csv::Error) -> Self { - Self::CsvError(e) + Self::Csv(e) } } @@ -122,15 +90,15 @@ impl From for Error { } } -impl From for Error { - fn from(other: bincode::Error) -> Self { - Self::Serialize(other) +impl From for Error { + fn from(other: serde_json::Error) -> Self { + Self::Json(other) } } -impl From for Error { - fn from(other: serde_json::Error) -> Self { - Self::JsonError(other) +impl From for Error { + fn from(other: grenad::Error) -> Self { + Self::Grenad(other) } } @@ -140,13 +108,14 @@ impl fmt::Display for Error { Error::ParseFloat { error, line, value } => { write!(f, "Error parsing number {:?} at line {}: {}", value, line, error) } - Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), - Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), - Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), + Error::InvalidDocumentFormat => { + f.write_str("Invalid document addition format, missing the documents batch index.") + } Error::Io(e) => write!(f, "{}", e), - Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"), Error::Serialize(e) => write!(f, "{}", e), - Error::CsvError(e) => write!(f, "{}", e), + Error::Grenad(e) => write!(f, "{}", e), + Error::Csv(e) => write!(f, "{}", e), + Error::Json(e) => write!(f, "{}", e), } } } @@ -158,15 +127,25 @@ impl std::error::Error for Error {} macro_rules! 
documents {
     ($data:tt) => {{
         let documents = serde_json::json!($data);
-        let mut writer = std::io::Cursor::new(Vec::new());
-        let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
-        let documents = serde_json::to_vec(&documents).unwrap();
-        builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
-        builder.finish().unwrap();
-
-        writer.set_position(0);
+        let documents = match documents {
+            object @ serde_json::Value::Object(_) => vec![object],
+            serde_json::Value::Array(objects) => objects,
+            invalid => {
+                panic!("an array of objects must be specified, {:#?} is not an array", invalid)
+            }
+        };
+
+        let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new());
+        for document in documents {
+            let object = match document {
+                serde_json::Value::Object(object) => object,
+                invalid => panic!("an object must be specified, {:#?} is not an object", invalid),
+            };
+            builder.append_json_object(&object).unwrap();
+        }
 
-        crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
+        let vector = builder.into_inner().unwrap();
+        crate::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap()
     }};
 }
diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs
index 14d7c8ceb..3dff999f5 100644
--- a/milli/src/documents/reader.rs
+++ b/milli/src/documents/reader.rs
@@ -1,11 +1,9 @@
+use std::convert::TryInto;
 use std::io;
-use std::io::{BufReader, Read};
-use std::mem::size_of;
 
-use byteorder::{BigEndian, ReadBytesExt};
 use obkv::KvReader;
 
-use super::{DocumentsBatchIndex, DocumentsMetadata, Error};
+use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
 use crate::FieldId;
 
 /// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with
@@ -13,63 +11,80 @@ use crate::FieldId;
 ///
 /// The documents are returned in the form of `obkv::Reader` where each field is identified with a
 /// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
-pub struct DocumentBatchReader<R> {
-    reader: BufReader<R>,
-    metadata: DocumentsMetadata,
-    buffer: Vec<u8>,
-    seen_documents: usize,
+pub struct DocumentsBatchReader<R> {
+    cursor: grenad::ReaderCursor<R>,
+    fields_index: DocumentsBatchIndex,
 }
 
-impl<R: io::Read + io::Seek> DocumentBatchReader<R> {
+impl<R: io::Read + io::Seek> DocumentsBatchReader<R> {
     /// Construct a `DocumentsReader` from a reader.
     ///
-    /// It first retrieves the index, then moves to the first document. Subsequent calls to
-    /// `next_document` advance the document reader until all the documents have been read.
-    pub fn from_reader(mut reader: R) -> Result<Self, Error> {
-        let mut buffer = Vec::new();
+    /// It first retrieves the index, then moves to the first document. Use the `into_cursor`
+    /// method to iterator over the documents, from the first to the last.
+    pub fn from_reader(reader: R) -> Result<Self, Error> {
+        let reader = grenad::Reader::new(reader)?;
+        let mut cursor = reader.into_cursor()?;
 
-        let meta_offset = reader.read_u64::<BigEndian>()?;
-        reader.seek(io::SeekFrom::Start(meta_offset))?;
-        reader.read_to_end(&mut buffer)?;
-        let metadata: DocumentsMetadata = bincode::deserialize(&buffer)?;
+        let fields_index = match cursor.move_on_key_equal_to(DOCUMENTS_BATCH_INDEX_KEY)? {
+            Some((_, value)) => serde_json::from_slice(value).map_err(Error::Serialize)?,
+            None => return Err(Error::InvalidDocumentFormat),
+        };
 
-        reader.seek(io::SeekFrom::Start(size_of::<u64>() as u64))?;
-        buffer.clear();
+        Ok(DocumentsBatchReader { cursor, fields_index })
+    }
 
-        let reader = BufReader::new(reader);
+    pub fn documents_count(&self) -> u32 {
+        self.cursor.len().saturating_sub(1).try_into().expect("Invalid number of documents")
+    }
 
-        Ok(Self { reader, metadata, buffer, seen_documents: 0 })
+    pub fn is_empty(&self) -> bool {
+        self.cursor.len().saturating_sub(1) == 0
     }
 
-    /// Returns the next document in the reader, and wraps it in an `obkv::KvReader`, along with a
-    /// reference to the addition index.
-    pub fn next_document_with_index<'a>(
-        &'a mut self,
-    ) -> io::Result<Option<(&'a DocumentsBatchIndex, KvReader<'a, FieldId>)>> {
-        if self.seen_documents < self.metadata.count {
-            let doc_len = self.reader.read_u32::<BigEndian>()?;
-            self.buffer.resize(doc_len as usize, 0);
-            self.reader.read_exact(&mut self.buffer)?;
-            self.seen_documents += 1;
+    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
+        &self.fields_index
+    }
 
-            let reader = KvReader::new(&self.buffer);
-            Ok(Some((&self.metadata.index, reader)))
-        } else {
-            Ok(None)
-        }
+    /// This method returns a forward cursor over the documents.
+    pub fn into_cursor(self) -> DocumentsBatchCursor<R> {
+        let DocumentsBatchReader { cursor, fields_index } = self;
+        let mut cursor = DocumentsBatchCursor { cursor, fields_index };
+        cursor.reset();
+        cursor
     }
+}
 
-    /// Return the fields index for the documents batch.
-    pub fn index(&self) -> &DocumentsBatchIndex {
-        &self.metadata.index
+/// A forward cursor over the documents in a `DocumentsBatchReader`.
+pub struct DocumentsBatchCursor<R> {
+    cursor: grenad::ReaderCursor<R>,
+    fields_index: DocumentsBatchIndex,
+}
+
+impl<R> DocumentsBatchCursor<R> {
+    pub fn into_reader(self) -> DocumentsBatchReader<R> {
+        let DocumentsBatchCursor { cursor, fields_index, .. } = self;
+        DocumentsBatchReader { cursor, fields_index }
     }
 
-    /// Returns the number of documents in the reader.
-    pub fn len(&self) -> usize {
-        self.metadata.count
+    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
+        &self.fields_index
    }
 
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
+    /// Resets the cursor to be able to read from the start again.
+    pub fn reset(&mut self) {
+        self.cursor.reset();
+    }
+}
+
+impl<R: io::Read + io::Seek> DocumentsBatchCursor<R> {
+    /// Returns the next document, starting from the first one. Subsequent calls to
+    /// `next_document` advance the document reader until all the documents have been read.
+    pub fn next_document(&mut self) -> Result<Option<KvReader<FieldId>>, grenad::Error> {
+        match self.cursor.move_on_next()? {
+            Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => {
+                Ok(Some(KvReader::new(value)))
+            }
+            _otherwise => Ok(None),
+        }
    }
 }
diff --git a/milli/src/documents/serde_impl.rs b/milli/src/documents/serde_impl.rs
deleted file mode 100644
index d57bf1ffb..000000000
--- a/milli/src/documents/serde_impl.rs
+++ /dev/null
@@ -1,134 +0,0 @@
-use std::collections::BTreeMap;
-use std::fmt;
-use std::io::{Cursor, Write};
-
-use byteorder::WriteBytesExt;
-use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
-use serde::Deserialize;
-use serde_json::Value;
-
-use super::{ByteCounter, DocumentsBatchIndex, Error};
-use crate::FieldId;
-
-macro_rules! 
tri { - ($e:expr) => { - match $e { - Ok(r) => r, - Err(e) => return Ok(Err(e.into())), - } - }; -} - -struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex); - -impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { - type Value = FieldId; - - fn deserialize(self, deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - deserializer.deserialize_str(self) - } -} - -impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { - type Value = FieldId; - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - Ok(self.0.insert(v)) - } - - fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "a string") - } -} - -struct ValueDeserializer; - -impl<'de> DeserializeSeed<'de> for ValueDeserializer { - type Value = serde_json::Value; - - fn deserialize(self, deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - serde_json::Value::deserialize(deserializer) - } -} - -pub struct DocumentVisitor<'a, W> { - pub inner: &'a mut ByteCounter, - pub index: &'a mut DocumentsBatchIndex, - pub obkv_buffer: &'a mut Vec, - pub value_buffer: &'a mut Vec, - pub values: &'a mut BTreeMap, - pub count: &'a mut usize, -} - -impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { - /// This Visitor value is nothing, since it write the value to a file. - type Value = Result<(), Error>; - - fn visit_seq(self, mut seq: A) -> Result - where - A: SeqAccess<'de>, - { - while let Some(v) = seq.next_element_seed(&mut *self)? { - tri!(v) - } - - Ok(Ok(())) - } - - fn visit_map(self, mut map: A) -> Result - where - A: MapAccess<'de>, - { - while let Some((key, value)) = - map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? - { - self.values.insert(key, value); - } - - self.obkv_buffer.clear(); - let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer)); - for (key, value) in self.values.iter() { - self.value_buffer.clear(); - // This is guaranteed to work - tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value)); - tri!(obkv.insert(*key, &self.value_buffer)); - } - - let reader = tri!(obkv.into_inner()).into_inner(); - - tri!(self.inner.write_u32::(reader.len() as u32)); - tri!(self.inner.write_all(reader)); - - *self.count += 1; - self.values.clear(); - - Ok(Ok(())) - } - - fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "a documents, or a sequence of documents.") - } -} - -impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> -where - W: Write, -{ - type Value = Result<(), Error>; - - fn deserialize(self, deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - deserializer.deserialize_map(self) - } -} From e8297ad27e4977f4f43c43f181dc9d7c9ea041dd Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:04:27 +0200 Subject: [PATCH 04/55] Fix the tests for the new DocumentsBatchBuilder/Reader --- milli/src/documents/builder.rs | 330 +++++++++--------- milli/src/documents/mod.rs | 133 +++---- milli/src/search/distinct/mod.rs | 31 +- milli/src/update/index_documents/mod.rs | 65 ++-- milli/src/update/index_documents/transform.rs | 12 +- milli/tests/search/facet_distribution.rs | 26 +- milli/tests/search/mod.rs | 18 +- milli/tests/search/query_criteria.rs | 20 +- milli/tests/search/typo_tolerance.rs | 37 +- 9 files changed, 295 insertions(+), 377 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 159afb8d9..19cc1ce53 100644 --- a/milli/src/documents/builder.rs +++ 
b/milli/src/documents/builder.rs @@ -183,7 +183,8 @@ mod test { use serde_json::{json, Map}; use super::*; - use crate::documents::DocumentBatchReader; + use crate::documents::DocumentsBatchReader; + use crate::FieldId; fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { let mut map = Map::new(); @@ -192,7 +193,7 @@ mod test { let field_name = index.name(fid).unwrap().clone(); let value: Value = serde_json::from_slice(value).unwrap(); - map.insert(field_name, value); + map.insert(field_name.to_string(), value); } Value::Object(map) @@ -200,15 +201,13 @@ mod test { #[test] fn add_single_documents_json() { - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - let json = serde_json::json!({ "id": 1, "field": "hello!", }); - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(json.as_object().unwrap()).unwrap(); let json = serde_json::json!({ "blabla": false, @@ -216,100 +215,64 @@ mod test { "id": 1, }); - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); - - assert_eq!(builder.len(), 2); - - builder.finish().unwrap(); - - cursor.set_position(0); + builder.append_json_object(json.as_object().unwrap()).unwrap(); - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); - assert_eq!(document.iter().count(), 2); + assert_eq!(builder.documents_count(), 2); + let vector = builder.into_inner().unwrap(); - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); assert_eq!(index.len(), 3); - assert_eq!(document.iter().count(), 3); - - assert!(reader.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_documents_seq_json() { - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - let json = serde_json::json!([{ - "id": 1, - "field": "hello!", - },{ - "blabla": false, - "field": "hello!", - "id": 1, - } - ]); - - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); - assert_eq!(builder.len(), 2); - - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 3); - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn add_documents_csv() { - let mut cursor = Cursor::new(Vec::new()); + let csv_content = "id:number,field:string\n1,hello!\n2,blabla"; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let csv = "id:number,field:string\n1,hello!\n2,blabla"; + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + assert_eq!(builder.documents_count(), 2); + let 
vector = builder.into_inner().unwrap(); - let builder = - DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); assert_eq!(index.len(), 2); + + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - let (_index, document) = reader.next_document_with_index().unwrap().unwrap(); + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn simple_csv_document() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -320,22 +283,25 @@ mod test { }) ); - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn coma_in_field() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United, States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -349,17 +315,20 @@ mod test { #[test] fn quote_in_field() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United"" States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = 
reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -373,17 +342,20 @@ mod test { #[test] fn integer_in_field() { - let documents = r#"city,country,pop:number + let csv_content = r#"city,country,pop:number "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -397,17 +369,20 @@ mod test { #[test] fn float_in_field() { - let documents = r#"city,country,pop:number + let csv_content = r#"city,country,pop:number "Boston","United States","4628910.01""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -421,17 +396,20 @@ mod test { #[test] fn several_colon_in_header() { - let documents = r#"city:love:string,country:state,pop + let csv_content = r#"city:love:string,country:state,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( 
val, @@ -445,17 +423,20 @@ mod test { #[test] fn ending_by_colon_in_header() { - let documents = r#"city:,country,pop + let csv_content = r#"city:,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -469,17 +450,20 @@ mod test { #[test] fn starting_by_colon_in_header() { - let documents = r#":city,country,pop + let csv_content = r#":city,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -494,32 +478,36 @@ mod test { #[ignore] #[test] fn starting_by_colon_in_header2() { - let documents = r#":string,country,pop + let csv_content = r#":string,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); - assert!(reader.next_document_with_index().is_err()); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + + assert!(cursor.next_document().is_err()); } #[test] fn double_colon_in_header() { - let documents = r#"city::string,country,pop + let csv_content = r#"city::string,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = 
DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -533,34 +521,32 @@ mod test { #[test] fn bad_type_in_header() { - let documents = r#"city,country:number,pop + let csv_content = r#"city,country:number,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - assert!( - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() - ); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + assert!(builder.append_csv(csv).is_err()); } #[test] fn bad_column_count1() { - let documents = r#"city,country,pop -"Boston","United States","4628910", "too much""#; + let csv_content = r#"city,country,pop +"Boston","United States","4628910", "too much + let csv = csv::Reader::from_reader(Cursor::new(csv_content"#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - assert!( - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() - ); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + assert!(builder.append_csv(csv).is_err()); } #[test] fn bad_column_count2() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United States""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - assert!( - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() - ); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + assert!(builder.append_csv(csv).is_err()); } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index bd0afc6e4..7a34ae13b 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -159,7 +159,7 @@ mod test { #[test] fn create_documents_no_errors() { - let json = json!({ + let value = json!({ "number": 1, "string": "this is a field", "array": ["an", "array"], @@ -169,26 +169,17 @@ mod test { "bool": true }); - let json = serde_json::to_vec(&json).unwrap(); - - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - builder.extend_from_json(Cursor::new(json)).unwrap(); - - builder.finish().unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(value.as_object().unwrap()).unwrap(); + let vector = builder.into_inner().unwrap(); let mut documents = - DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); - - assert_eq!(documents.index().iter().count(), 5); + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 5); - assert!(documents.next_document_with_index().unwrap().is_none()); + assert_eq!(documents.documents_batch_index().iter().count(), 5); + let reader = documents.next_document().unwrap().unwrap(); + assert_eq!(reader.iter().count(), 5); + assert!(documents.next_document().unwrap().is_none()); } #[test] @@ -200,101 +191,55 @@ mod test { "toto": false, }); - let doc1 = serde_json::to_vec(&doc1).unwrap(); - let doc2 = serde_json::to_vec(&doc2).unwrap(); - - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut 
builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - builder.extend_from_json(Cursor::new(doc1)).unwrap(); - builder.extend_from_json(Cursor::new(doc2)).unwrap(); - - builder.finish().unwrap(); - - let mut documents = - DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); - - assert_eq!(documents.index().iter().count(), 2); - - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 1); - assert!(documents.next_document_with_index().unwrap().is_some()); - assert!(documents.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_documents_array() { - let docs = json!([ - { "toto": false }, - { "tata": "hello" }, - ]); - - let docs = serde_json::to_vec(&docs).unwrap(); - - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - builder.extend_from_json(Cursor::new(docs)).unwrap(); - - builder.finish().unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(doc1.as_object().unwrap()).unwrap(); + builder.append_json_object(doc2.as_object().unwrap()).unwrap(); + let vector = builder.into_inner().unwrap(); let mut documents = - DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); - - assert_eq!(documents.index().iter().count(), 2); - - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 1); - assert!(documents.next_document_with_index().unwrap().is_some()); - assert!(documents.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_invalid_document_format() { - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - let docs = json!([[ - { "toto": false }, - { "tata": "hello" }, - ]]); - - let docs = serde_json::to_vec(&docs).unwrap(); - assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); - - let docs = json!("hello"); - let docs = serde_json::to_vec(&docs).unwrap(); - - assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); + DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor(); + assert_eq!(documents.documents_batch_index().iter().count(), 2); + let reader = documents.next_document().unwrap().unwrap(); + assert_eq!(reader.iter().count(), 1); + assert!(documents.next_document().unwrap().is_some()); + assert!(documents.next_document().unwrap().is_none()); } #[test] fn test_nested() { - let mut docs = documents!([{ + let docs_reader = documents!([{ "hello": { "toto": ["hello"] } }]); - let (_index, doc) = docs.next_document_with_index().unwrap().unwrap(); - + let mut cursor = docs_reader.into_cursor(); + let doc = cursor.next_document().unwrap().unwrap(); let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap(); assert_eq!(nested, json!({ "toto": ["hello"] })); } #[test] - fn out_of_order_fields() { + fn out_of_order_json_fields() { let _documents = documents!([ {"id": 1,"b": 0}, {"id": 2,"a": 0,"b": 0}, ]); } + + #[test] + fn out_of_order_csv_fields() { + let csv1_content = "id:number,b\n1,0"; + let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content)); + + let csv2_content = "id:number,a,b\n2,0,0"; + let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv1).unwrap(); + builder.append_csv(csv2).unwrap(); + let 
vector = builder.into_inner().unwrap(); + + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); + } } diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 237fd718a..670fa01ac 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -35,7 +35,7 @@ mod test { use roaring::RoaringBitmap; use serde_json::{json, Value}; - use crate::documents::{DocumentBatchBuilder, DocumentBatchReader}; + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; use crate::index::Index; use crate::update::{ @@ -43,14 +43,11 @@ mod test { }; use crate::{DocumentId, FieldId, BEU32}; - static JSON: Lazy> = Lazy::new(generate_documents); - - fn generate_documents() -> Vec { + static JSON: Lazy> = Lazy::new(|| { let mut rng = rand::thread_rng(); let num_docs = rng.gen_range(10..30); - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); let txts = ["Toto", "Titi", "Tata"]; let cats = (1..10).map(|i| i.to_string()).collect::>(); let cat_ints = (1..10).collect::>(); @@ -63,7 +60,7 @@ mod test { let mut sample_ints = cat_ints.clone(); sample_ints.shuffle(&mut rng); - let doc = json!({ + let json = json!({ "id": i, "txt": txt, "cat-int": rng.gen_range(0..3), @@ -71,13 +68,16 @@ mod test { "cat-ints": sample_ints[..(rng.gen_range(0..3))], }); - let doc = Cursor::new(serde_json::to_vec(&doc).unwrap()); - builder.extend_from_json(doc).unwrap(); + let object = match json { + Value::Object(object) => object, + _ => panic!(), + }; + + builder.append_json_object(&object).unwrap(); } - builder.finish().unwrap(); - cursor.into_inner() - } + builder.into_inner().unwrap() + }); /// Returns a temporary index populated with random test documents, the FieldId for the /// distinct attribute, and the RoaringBitmap with the document ids. @@ -101,7 +101,8 @@ mod test { IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); let reader = - crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); + crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) + .unwrap(); addition.add_documents(reader).unwrap(); addition.execute().unwrap(); @@ -109,8 +110,8 @@ mod test { let fields_map = index.fields_ids_map(&txn).unwrap(); let fid = fields_map.id(&distinct).unwrap(); - let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); - let map = (0..documents.len() as u32).collect(); + let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap(); + let map = (0..documents.documents_count() as u32).collect(); txn.commit().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba428f078..7f6e00b11 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -25,7 +25,7 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use crate::documents::DocumentBatchReader; +use crate::documents::DocumentsBatchReader; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, @@ -121,7 +121,7 @@ where /// builder, and the builder must be discarded. /// /// Returns the number of documents added to the builder. 
- pub fn add_documents(&mut self, reader: DocumentBatchReader) -> Result + pub fn add_documents(&mut self, reader: DocumentsBatchReader) -> Result where R: Read + Seek, { @@ -590,9 +590,8 @@ mod tests { use maplit::hashset; use super::*; - use crate::documents::DocumentBatchBuilder; + use crate::documents::DocumentsBatchBuilder; use crate::update::DeleteDocuments; - use crate::HashMap; #[test] fn simple_document_replacement() { @@ -1252,21 +1251,17 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); - let mut big_object = HashMap::new(); - big_object.insert(S("id"), "wow"); + let mut big_object = serde_json::Map::new(); + big_object.insert(S("id"), serde_json::Value::from("wow")); for i in 0..1000 { let key = i.to_string(); - big_object.insert(key, "I am a text!"); + big_object.insert(key, serde_json::Value::from("I am a text!")); } - let mut cursor = Cursor::new(Vec::new()); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap()); - builder.extend_from_json(big_object).unwrap(); - builder.finish().unwrap(); - cursor.set_position(0); - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(&big_object).unwrap(); + let vector = builder.into_inner().unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); @@ -1288,23 +1283,19 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); - let mut big_object = HashMap::new(); - big_object.insert(S("id"), "wow"); + let mut big_object = serde_json::Map::new(); + big_object.insert(S("id"), serde_json::Value::from("wow")); let content: String = (0..=u16::MAX) .into_iter() .map(|p| p.to_string()) .reduce(|a, b| a + " " + b.as_ref()) .unwrap(); - big_object.insert("content".to_string(), &content); - - let mut cursor = Cursor::new(Vec::new()); + big_object.insert("content".to_string(), serde_json::Value::from(content)); - let big_object = serde_json::to_string(&big_object).unwrap(); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.extend_from_json(&mut big_object.as_bytes()).unwrap(); - builder.finish().unwrap(); - cursor.set_position(0); - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(&big_object).unwrap(); + let vector = builder.into_inner().unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); @@ -1843,18 +1834,20 @@ mod tests { // Create 200 documents with a long text let content = { - let documents: Vec<_> = (0..200i32) + let documents_iter = (0..200i32) .into_iter() .map(|i| serde_json::json!({ "id": i, "script": script })) - .collect(); - - let mut writer = std::io::Cursor::new(Vec::new()); - let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); - let documents = serde_json::to_vec(&documents).unwrap(); - builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); - builder.finish().unwrap(); - writer.set_position(0); - crate::documents::DocumentBatchReader::from_reader(writer).unwrap() + .filter_map(|json| match json { + serde_json::Value::Object(object) => Some(object), + _ => None, + }); + + let mut builder = 
crate::documents::DocumentsBatchBuilder::new(Vec::new()); + for object in documents_iter { + builder.append_json_object(&object).unwrap(); + } + let vector = builder.into_inner().unwrap(); + crate::documents::DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap() }; // Index those 200 long documents diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 12a858024..129357075 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -14,7 +14,7 @@ use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; use super::{IndexDocumentsMethod, IndexerConfig}; -use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; +use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; @@ -152,7 +152,7 @@ impl<'a, 'i> Transform<'a, 'i> { pub fn read_documents( &mut self, - mut reader: DocumentBatchReader, + reader: DocumentsBatchReader, wtxn: &mut heed::RwTxn, progress_callback: F, ) -> Result @@ -160,7 +160,8 @@ impl<'a, 'i> Transform<'a, 'i> { R: Read + Seek, F: Fn(UpdateIndexingStep) + Sync, { - let fields_index = reader.index(); + let mut cursor = reader.into_cursor(); + let fields_index = cursor.documents_batch_index(); let external_documents_ids = self.index.external_documents_ids(wtxn)?; let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; @@ -186,7 +187,8 @@ impl<'a, 'i> Transform<'a, 'i> { let mut documents_count = 0; let mut external_id_buffer = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); - while let Some((addition_index, document)) = reader.next_document_with_index()? { + let addition_index = cursor.documents_batch_index().clone(); + while let Some(document) = cursor.next_document()? 
{ let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -840,7 +842,7 @@ fn update_primary_key<'a>( None => { let mut json = Map::new(); for (key, value) in document.iter() { - let key = addition_index.name(key).cloned(); + let key = addition_index.name(key).map(ToString::to_string); let value = serde_json::from_slice::(&value).ok(); if let Some((k, v)) = key.zip(value) { diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index d3aece2ab..66713de1e 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -3,9 +3,10 @@ use std::io::Cursor; use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{FacetDistribution, Index}; +use serde_json::{Deserializer, Map, Value}; #[test] fn test_facet_distribution_with_no_facet_values() { @@ -30,35 +31,30 @@ fn test_facet_distribution_with_no_facet_values() { let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let mut cursor = Cursor::new(Vec::new()); - let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new( - r#"[ - { + r#"{ "id": 123, "title": "What a week, hu...", "genres": [], "tags": ["blue"] - }, + } { "id": 345, "title": "I am the pig!", "tags": ["red"] - } - ]"#, + }"#, ); - for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); - documents_builder.extend_from_json(doc).unwrap(); + for result in Deserializer::from_reader(reader).into_iter::>() { + let object = result.unwrap(); + documents_builder.append_json_object(&object).unwrap(); } - documents_builder.finish().unwrap(); - - cursor.set_position(0); + let vector = documents_builder.into_inner().unwrap(); // index documents - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 472fbafe0..4cf117dc7 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -6,10 +6,11 @@ use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, DocumentId, Index, Member}; use serde::Deserialize; +use serde_json::{Deserializer, Map, Value}; use slice_group_by::GroupBy; mod distinct; @@ -62,21 +63,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let mut cursor = Cursor::new(Vec::new()); - let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut documents_builder = 
DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); - for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); - documents_builder.extend_from_json(doc).unwrap(); + for result in Deserializer::from_reader(reader).into_iter::>() { + let object = result.unwrap(); + documents_builder.append_json_object(&object).unwrap(); } - documents_builder.finish().unwrap(); - - cursor.set_position(0); + let vector = documents_builder.into_inner().unwrap(); // index documents - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 893d7c30a..89a6a6eec 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -5,7 +5,7 @@ use big_s::S; use heed::EnvOpenOptions; use itertools::Itertools; use maplit::hashset; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; use rand::Rng; @@ -393,8 +393,7 @@ fn criteria_ascdesc() { let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let mut cursor = Cursor::new(Vec::new()); - let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut batch_builder = DocumentsBatchBuilder::new(Vec::new()); (0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| { let mut rng = rand::thread_rng(); @@ -412,16 +411,17 @@ fn criteria_ascdesc() { "age": age, }); - let json = Cursor::new(serde_json::to_vec(&json).unwrap()); - batch_builder.extend_from_json(json).unwrap(); - }); - - batch_builder.finish().unwrap(); + let object = match json { + serde_json::Value::Object(object) => object, + _ => panic!(), + }; - cursor.set_position(0); + batch_builder.append_json_object(&object).unwrap(); + }); - let reader = DocumentBatchReader::from_reader(cursor).unwrap(); + let vector = batch_builder.into_inner().unwrap(); + let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); builder.add_documents(reader).unwrap(); builder.execute().unwrap(); diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 9a7986c5e..63bf22579 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -106,26 +106,23 @@ fn test_typo_disabled_on_word() { options.map_size(4096 * 100); let index = Index::new(options, tmp.path()).unwrap(); - let documents = json!([ - { - "id": 1usize, - "data": "zealand", - }, - { - "id": 2usize, - "data": "zearand", - }, - ]); - - let mut writer = std::io::Cursor::new(Vec::new()); - let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); - let documents = serde_json::to_vec(&documents).unwrap(); - builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); - builder.finish().unwrap(); - - writer.set_position(0); - - let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap(); + let mut builder = milli::documents::DocumentsBatchBuilder::new(Vec::new()); + let doc1 = json!({ + "id": 1usize, + "data": "zealand", + }); + + let doc2 = 
json!({ + "id": 2usize, + "data": "zearand", + }); + + builder.append_json_object(doc1.as_object().unwrap()).unwrap(); + builder.append_json_object(doc2.as_object().unwrap()).unwrap(); + let vector = builder.into_inner().unwrap(); + + let documents = + milli::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap(); let mut txn = index.write_txn().unwrap(); let config = IndexerConfig::default(); From 6d0498df2445975d8cfff3cb8c4b1453bd5e00e6 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:22:59 +0200 Subject: [PATCH 05/55] Fix the fuzz tests --- milli/fuzz/fuzz_targets/indexing.rs | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index b618aabad..5c3b79ed7 100644 --- a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -7,10 +7,10 @@ use anyhow::{bail, Result}; use arbitrary_json::ArbitraryValue; use heed::EnvOpenOptions; use libfuzzer_sys::fuzz_target; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::Index; -use serde_json::Value; +use serde_json::{Map, Value}; #[cfg(target_os = "linux")] #[global_allocator] @@ -19,21 +19,26 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; /// reads json from input and write an obkv batch to writer. pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result { let writer = BufWriter::new(writer); - let mut builder = DocumentBatchBuilder::new(writer)?; - builder.extend_from_json(input)?; + let mut builder = DocumentsBatchBuilder::new(writer); - if builder.len() == 0 { + let values: Vec> = serde_json::from_reader(input)?; + if builder.documents_count() == 0 { bail!("Empty payload"); } - let count = builder.finish()?; + for object in values { + builder.append_json_object(&object)?; + } + + let count = builder.documents_count(); + let vector = builder.into_inner()?; - Ok(count) + Ok(count as usize) } fn index_documents( index: &mut milli::Index, - documents: DocumentBatchReader>>, + documents: DocumentsBatchReader>>, ) -> Result<()> { let config = IndexerConfig::default(); let mut wtxn = index.write_txn()?; @@ -98,7 +103,7 @@ fuzz_target!(|batches: Vec>| { // We ignore all malformed documents if let Ok(_) = read_json(json.as_bytes(), &mut documents) { documents.rewind().unwrap(); - let documents = DocumentBatchReader::from_reader(documents).unwrap(); + let documents = DocumentsBatchReader::from_reader(documents).unwrap(); // A lot of errors can come out of milli and we don't know which ones are normal or not // so we are only going to look for the unexpected panics. 
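            // Illustrative only, assuming the surrounding fuzz setup: the returned
            // `Result` is deliberately discarded, since user-level indexing errors are
            // expected on arbitrary fuzzer input and only panics should fail the run.
            // A more explicit spelling of the same intent would be:
            //
            //     if let Err(_indexing_error) = index_documents(&mut index, documents) {
            //         // expected for malformed-but-parsable batches, not a fuzz failure
            //     }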
let _ = index_documents(&mut index, documents); From a4ceef96246e8e121007c16a0d159cd4c99ede11 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:35:59 +0200 Subject: [PATCH 06/55] Fix the cli for the new DocumentsBatchBuilder/Reader structs --- cli/src/main.rs | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 14bc797af..dcd0f407a 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -8,6 +8,7 @@ use std::time::Instant; use byte_unit::Byte; use eyre::Result; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; @@ -225,9 +226,9 @@ impl Performer for DocumentAddition { DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?, }; - let reader = milli::documents::DocumentBatchReader::from_reader(Cursor::new(documents))?; + let reader = DocumentsBatchReader::from_reader(Cursor::new(documents))?; - println!("Adding {} documents to the index.", reader.len()); + println!("Adding {} documents to the index.", reader.documents_count()); let mut txn = index.write_txn()?; let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() }; @@ -321,35 +322,35 @@ fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBa } fn documents_from_jsonl(reader: impl Read) -> Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let reader = BufReader::new(reader); - let mut buf = String::new(); - let mut reader = BufReader::new(reader); - - while reader.read_line(&mut buf)? 
> 0 { - documents.extend_from_json(&mut buf.as_bytes())?; + for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + let object = result?; + documents.append_json_object(&object)?; } - documents.finish()?; - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_json(reader: impl Read) -> Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let list: Vec> = serde_json::from_reader(reader)?; - documents.extend_from_json(reader)?; - documents.finish()?; + for object in list { + documents.append_json_object(&object)?; + } - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_csv(reader: impl Read) -> Result> { - let mut writer = Cursor::new(Vec::new()); - milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; + let csv = csv::Reader::from_reader(reader); + + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } #[derive(Debug, StructOpt)] From f29114f94aff42592d8f0a932784f578433e0744 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 17:21:54 +0200 Subject: [PATCH 07/55] Fix http-ui to fit with the new DocumentsBatchBuilder/Reader structs --- http-ui/src/main.rs | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index ce4fa7ba5..63b9ee5e0 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -3,7 +3,7 @@ mod update_store; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; -use std::io::{BufRead, BufReader, Cursor, Read}; +use std::io::{BufReader, Cursor, Read}; use std::net::SocketAddr; use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; @@ -18,7 +18,7 @@ use either::Either; use flate2::read::GzDecoder; use futures::{stream, FutureExt, StreamExt}; use heed::EnvOpenOptions; -use milli::documents::DocumentBatchReader; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::tokenizer::TokenizerBuilder; use milli::update::UpdateIndexingStep::*; use milli::update::{ @@ -399,7 +399,7 @@ async fn main() -> anyhow::Result<()> { otherwise => panic!("invalid update format {:?}", otherwise), }; - let documents = DocumentBatchReader::from_reader(Cursor::new(documents))?; + let documents = DocumentsBatchReader::from_reader(Cursor::new(documents))?; builder.add_documents(documents)?; @@ -1032,35 +1032,36 @@ async fn main() -> anyhow::Result<()> { Ok(()) } -fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; +fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let reader = BufReader::new(reader); - for result in BufReader::new(reader).lines() { - let line = result?; - documents.extend_from_json(Cursor::new(line))?; + for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + let object = result?; + documents.append_json_object(&object)?; } - documents.finish()?; - - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } -fn documents_from_json(reader: impl 
io::Read) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; +fn documents_from_json(reader: impl Read) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let list: Vec> = serde_json::from_reader(reader)?; - documents.extend_from_json(reader)?; - documents.finish()?; + for object in list { + documents.append_json_object(&object)?; + } - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } -fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; +fn documents_from_csv(reader: impl Read) -> anyhow::Result> { + let csv = csv::Reader::from_reader(reader); + + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } #[cfg(test)] From a97d4d63b9f1f29edac7f78aa33710682996e63c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:17:48 +0200 Subject: [PATCH 08/55] Fix the benchmarks --- benchmarks/benches/utils.rs | 41 ++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index b2a9966a2..091b9b0f5 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,13 +1,13 @@ #![allow(dead_code)] use std::fs::{create_dir_all, remove_dir_all, File}; -use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; +use std::io::{self, BufReader, Cursor, Read, Seek}; use std::num::ParseFloatError; use std::path::Path; use criterion::BenchmarkId; use heed::EnvOpenOptions; -use milli::documents::DocumentBatchReader; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{ IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; @@ -150,39 +150,38 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader documents_from_jsonl(reader).unwrap(), otherwise => panic!("invalid update format {:?}", otherwise), }; - DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap() + DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap() } -fn documents_from_jsonl(mut reader: impl BufRead) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; +fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let mut buf = String::new(); - - while reader.read_line(&mut buf)? 
> 0 { - documents.extend_from_json(&mut buf.as_bytes())?; - buf.clear(); + for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + let object = result?; + documents.append_json_object(&object)?; } - documents.finish()?; - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_json(reader: impl BufRead) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let list: Vec> = serde_json::from_reader(reader)?; - documents.extend_from_json(reader)?; - documents.finish()?; + for object in list { + documents.append_json_object(&object)?; + } - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_csv(reader: impl BufRead) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; + let csv = csv::Reader::from_reader(reader); + + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } enum AllowedType { From bdc426388379e80e03d8291d3a12b2a6361f0571 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:12:15 +0200 Subject: [PATCH 09/55] Introduce the validate_documents_batch function --- milli/src/update/index_documents/transform.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 129357075..42187fc1e 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -13,7 +13,7 @@ use serde_json::{Map, Value}; use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; -use super::{IndexDocumentsMethod, IndexerConfig}; +use super::{validate_document_id, IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; From cefffde9af704bc9b4146206a852972745941eda Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:07:28 +0200 Subject: [PATCH 10/55] Improve the .gitignore of the fuzz crate --- milli/fuzz/.gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/milli/fuzz/.gitignore b/milli/fuzz/.gitignore index cb73742e4..ebf2c9395 100644 --- a/milli/fuzz/.gitignore +++ b/milli/fuzz/.gitignore @@ -1,2 +1,5 @@ +Cargo.lock +target/ + /corpus/ /artifacts/ From 0146175fe60e104519ebd23fa2e14bd1ff3e8bfc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:12:15 +0200 Subject: [PATCH 11/55] Introduce the validate_documents_batch function --- milli/src/documents/builder.rs | 36 ++--- milli/src/documents/mod.rs | 28 +++- milli/src/error.rs | 6 + .../extract/extract_geo_points.rs | 9 +- milli/src/update/index_documents/mod.rs | 36 +++-- milli/src/update/index_documents/transform.rs | 26 +--- milli/src/update/index_documents/validate.rs | 140 ++++++++++++++++++ 7 files changed, 208 insertions(+), 73 deletions(-) create mode 100644 milli/src/update/index_documents/validate.rs diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 19cc1ce53..15a22090a 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -180,24 +180,10 @@ fn 
parse_csv_header(header: &str) -> (&str, AllowedType) { mod test { use std::io::Cursor; - use serde_json::{json, Map}; + use serde_json::json; use super::*; - use crate::documents::DocumentsBatchReader; - use crate::FieldId; - - fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { - let mut map = Map::new(); - - for (fid, value) in obkv.iter() { - let field_name = index.name(fid).unwrap().clone(); - let value: Value = serde_json::from_slice(value).unwrap(); - - map.insert(field_name.to_string(), value); - } - - Value::Object(map) - } + use crate::documents::{obkv_to_object, DocumentsBatchReader}; #[test] fn add_single_documents_json() { @@ -272,7 +258,7 @@ mod test { DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -301,7 +287,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -328,7 +314,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -355,7 +341,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -382,7 +368,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -409,7 +395,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -436,7 +422,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -463,7 +449,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -507,7 +493,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 7a34ae13b..ee3593bf8 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -6,15 +6,30 @@ use std::io; use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; +use obkv::KvReader; pub use reader::{DocumentsBatchCursor, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; -use crate::FieldId; +use crate::error::{FieldIdMapMissingEntry, 
InternalError}; +use crate::{FieldId, Object, Result}; /// The key that is used to store the `DocumentsBatchIndex` datastructure, /// it is the absolute last key of the list. const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes(); +/// Helper function to convert an obkv reader into a JSON object. +pub fn obkv_to_object(obkv: &KvReader, index: &DocumentsBatchIndex) -> Result { + obkv.iter() + .map(|(field_id, value)| { + let field_name = index.name(field_id).ok_or_else(|| { + FieldIdMapMissingEntry::FieldId { field_id, process: "obkv_to_object" } + })?; + let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; + Ok((field_name.to_string(), value)) + }) + .collect() +} + /// A bidirectional map that links field ids to their name in a document batch. #[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct DocumentsBatchIndex(pub BiHashMap); @@ -48,11 +63,12 @@ impl DocumentsBatchIndex { self.0.get_by_left(&id).map(AsRef::as_ref) } - pub fn recreate_json( - &self, - document: &obkv::KvReaderU16, - ) -> Result, crate::Error> { - let mut map = serde_json::Map::new(); + pub fn id(&self, name: &str) -> Option { + self.0.get_by_right(name).cloned() + } + + pub fn recreate_json(&self, document: &obkv::KvReaderU16) -> Result { + let mut map = Object::new(); for (k, v) in document.iter() { // TODO: TAMO: update the error type diff --git a/milli/src/error.rs b/milli/src/error.rs index 57ae1c85a..d34130210 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -141,10 +141,16 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco #[derive(Error, Debug)] pub enum GeoError { + #[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")] + NotAnObject { document_id: Value, value: Value }, + #[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")] + MissingLatitudeAndLongitude { document_id: Value }, #[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")] MissingLatitude { document_id: Value }, #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")] MissingLongitude { document_id: Value }, + #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{lat}` and `{lng}`.")] + BadLatitudeAndLongitude { document_id: Value, lat: Value, lng: Value }, #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] BadLatitude { document_id: Value, value: Value }, #[error("Could not parse longitude in the document with the id: `{document_id}`. 
Was expecting a number but instead got `{value}`.")] diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index fffae5e77..0f804b93b 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -7,6 +7,7 @@ use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::error::GeoError; +use crate::update::index_documents::extract_float_from_value; use crate::{FieldId, InternalError, Result}; /// Extracts the geographical coordinates contained in each document under the `_geo` field. @@ -61,11 +62,3 @@ pub fn extract_geo_points( Ok(writer_into_reader(writer)?) } - -fn extract_float_from_value(value: Value) -> StdResult { - match value { - Value::Number(ref n) => n.as_f64().ok_or(value), - Value::String(ref s) => s.parse::().map_err(|_| value), - value => Err(value), - } -} diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7f6e00b11..2fb7cbcd9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2,11 +2,13 @@ mod extract; mod helpers; mod transform; mod typed_chunk; +mod validate; use std::collections::HashSet; use std::io::{Cursor, Read, Seek}; use std::iter::FromIterator; use std::num::{NonZeroU32, NonZeroUsize}; +use std::result::Result as StdResult; use crossbeam_channel::{Receiver, Sender}; use heed::types::Str; @@ -25,13 +27,19 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use crate::documents::DocumentsBatchReader; +use self::validate::validate_documents_batch; +pub use self::validate::{ + extract_float_from_value, validate_document_id, validate_document_id_from_json, + validate_geo_from_json, +}; +use crate::documents::{obkv_to_object, DocumentsBatchReader}; +use crate::error::UserError; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, }; -use crate::{Index, Result, RoaringBitmapCodec, UserError}; +use crate::{Index, Result, RoaringBitmapCodec}; static MERGED_DATABASE_COUNT: usize = 7; static PREFIX_DATABASE_COUNT: usize = 5; @@ -117,19 +125,27 @@ where /// Adds a batch of documents to the current builder. /// - /// Since the documents are progressively added to the writer, a failure will cause a stale - /// builder, and the builder must be discarded. + /// Since the documents are progressively added to the writer, a failure will cause only + /// return an error and not the `IndexDocuments` struct as it is invalid to use it afterward. /// /// Returns the number of documents added to the builder. - pub fn add_documents(&mut self, reader: DocumentsBatchReader) -> Result - where - R: Read + Seek, - { + pub fn add_documents( + mut self, + reader: DocumentsBatchReader, + ) -> Result<(Self, StdResult)> { // Early return when there is no document to add if reader.is_empty() { - return Ok(0); + return Ok((self, Ok(0))); } + // We check for user errors in this validator and if there is one, we can return + // the `IndexDocument` struct as it is valid to send more documents into it. + // However, if there is an internal error we throw it away! 
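        // A caller-side sketch of this contract, mirroring how the benchmarks in this
        // series are updated (the `reader` name is assumed; unwraps for brevity only):
        //
        //     let (builder, user_error) = builder.add_documents(reader).unwrap();
        //     user_error.unwrap(); // a user error alone leaves `builder` usable
        //     builder.execute().unwrap();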
+ let reader = match validate_documents_batch(self.wtxn, self.index, reader)? { + Ok(reader) => reader, + Err(user_error) => return Ok((self, Err(user_error))), + }; + let indexed_documents = self .transform .as_mut() @@ -139,7 +155,7 @@ where self.added_documents += indexed_documents; - Ok(indexed_documents) + Ok((self, Ok(indexed_documents))) } #[logging_timer::time("IndexDocuments::{}")] diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 42187fc1e..bc7eefd33 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -17,6 +17,7 @@ use super::{validate_document_id, IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; +use crate::update::index_documents::validate_document_id_from_json; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::{ ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, @@ -782,14 +783,6 @@ fn compute_primary_key_pair( } } -fn validate_document_id(document_id: &str) -> Option<&str> { - let document_id = document_id.trim(); - Some(document_id).filter(|id| { - !id.is_empty() - && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) - }) -} - /// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec`. /// /// The size and alignment of T and U must match. @@ -813,22 +806,7 @@ fn update_primary_key<'a>( ) -> Result> { match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { Some((_, bytes)) => { - let value = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { - Value::String(string) => match validate_document_id(&string) { - Some(s) if s.len() == string.len() => string, - Some(s) => s.to_string(), - None => { - return Err(UserError::InvalidDocumentId { - document_id: Value::String(string), - } - .into()) - } - }, - Value::Number(number) => number.to_string(), - content => { - return Err(UserError::InvalidDocumentId { document_id: content.clone() }.into()) - } - }; + let value = validate_document_id_from_json(bytes)??; serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?; Ok(Cow::Owned(value)) } diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs new file mode 100644 index 000000000..b4c0cb68f --- /dev/null +++ b/milli/src/update/index_documents/validate.rs @@ -0,0 +1,140 @@ +use std::io::{Read, Seek}; +use std::result::Result as StdResult; + +use serde_json::Value; + +use crate::error::{GeoError, InternalError, UserError}; +use crate::update::index_documents::{obkv_to_object, DocumentsBatchReader}; +use crate::{Index, Result}; + +/// This function validates a documents by checking that: +/// - we can infer a primary key, +/// - all the documents id exist and, +/// - the validity of them but also, +/// - the validity of the `_geo` field depending on the settings. +pub fn validate_documents_batch( + rtxn: &heed::RoTxn, + index: &Index, + reader: DocumentsBatchReader, +) -> Result, UserError>> { + let mut cursor = reader.into_cursor(); + let documents_batch_index = cursor.documents_batch_index().clone(); + + // The primary key *field id* that has already been set for this index or the one + // we will guess by searching for the first key that contains "id" as a substring. 
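    // Illustrative example of that guess: with no primary key configured and a batch
    // whose fields are `title`, `product_id` and `uid`, both `product_id` and `uid`
    // contain "id", and the candidate with the smallest field id (the first one that
    // was inserted into the `DocumentsBatchIndex`) is picked as the primary key.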
+ let (primary_key, primary_key_id) = match index.primary_key(rtxn)? { + Some(primary_key) => match documents_batch_index.id(primary_key) { + Some(id) => (primary_key, id), + None => { + return match cursor.next_document()? { + Some(first_document) => Ok(Err(UserError::MissingDocumentId { + primary_key: primary_key.to_string(), + document: obkv_to_object(&first_document, &documents_batch_index)?, + })), + // If there is no document in this batch the best we can do is to return this error. + None => Ok(Err(UserError::MissingPrimaryKey)), + }; + } + }, + None => { + let guessed = documents_batch_index + .iter() + .filter(|(_, name)| name.contains("id")) + .min_by_key(|(fid, _)| *fid); + match guessed { + Some((id, name)) => (name.as_str(), *id), + None => return Ok(Err(UserError::MissingPrimaryKey)), + } + } + }; + + // If the settings specifies that a _geo field must be used therefore we must check the + // validity of it in all the documents of this batch and this is when we return `Some`. + let geo_field_id = match documents_batch_index.id("_geo") { + Some(geo_field_id) if index.sortable_fields(rtxn)?.contains("_geo") => Some(geo_field_id), + _otherwise => None, + }; + + while let Some(document) = cursor.next_document()? { + let document_id = match document.get(primary_key_id) { + Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)? { + Ok(document_id) => document_id, + Err(user_error) => return Ok(Err(user_error)), + }, + None => { + return Ok(Err(UserError::MissingDocumentId { + primary_key: primary_key.to_string(), + document: obkv_to_object(&document, &documents_batch_index)?, + })) + } + }; + + if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { + if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? { + return Ok(Err(UserError::from(user_error))); + } + } + } + + Ok(Ok(cursor.into_reader())) +} + +/// Returns a trimmed version of the document id or `None` if it is invalid. +pub fn validate_document_id(document_id: &str) -> Option<&str> { + let id = document_id.trim(); + if !id.is_empty() + && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) + { + Some(id) + } else { + None + } +} + +/// Parses a Json encoded document id and validate it, returning a user error when it is one. +pub fn validate_document_id_from_json(bytes: &[u8]) -> Result> { + match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { + Value::String(string) => match validate_document_id(&string) { + Some(s) if s.len() == string.len() => Ok(Ok(string)), + Some(s) => Ok(Ok(s.to_string())), + None => { + return Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })) + } + }, + Value::Number(number) => Ok(Ok(number.to_string())), + content => return Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })), + } +} + +/// Try to extract an `f64` from a JSON `Value` and return the `Value` +/// in the `Err` variant if it failed. +pub fn extract_float_from_value(value: Value) -> StdResult { + match value { + Value::Number(ref n) => n.as_f64().ok_or(value), + Value::String(ref s) => s.parse::().map_err(|_| value), + value => Err(value), + } +} + +pub fn validate_geo_from_json(document_id: Value, bytes: &[u8]) -> Result> { + let result = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? 
{ + Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { + (Some(lat), Some(lng)) => { + match (extract_float_from_value(lat), extract_float_from_value(lng)) { + (Ok(_), Ok(_)) => Ok(()), + (Err(value), Ok(_)) => Err(GeoError::BadLatitude { document_id, value }), + (Ok(_), Err(value)) => Err(GeoError::BadLongitude { document_id, value }), + (Err(lat), Err(lng)) => { + Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng }) + } + } + } + (None, Some(_)) => Err(GeoError::MissingLatitude { document_id }), + (Some(_), None) => Err(GeoError::MissingLongitude { document_id }), + (None, None) => Err(GeoError::MissingLatitudeAndLongitude { document_id }), + }, + value => Err(GeoError::NotAnObject { document_id, value }), + }; + + Ok(result) +} From fcfc4caf8c4512f9dce768017d95ac91804a90d2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 15:36:27 +0200 Subject: [PATCH 12/55] Move the Object type in the lib.rs file and use it everywhere --- benchmarks/benches/utils.rs | 20 ++++++++---------- cli/src/main.rs | 9 ++++---- http-ui/src/main.rs | 16 ++++++-------- milli/fuzz/fuzz_targets/indexing.rs | 2 +- milli/src/documents/builder.rs | 5 +++-- milli/src/error.rs | 6 ++---- milli/src/lib.rs | 21 ++++++++++--------- .../extract/extract_geo_points.rs | 1 - milli/src/update/index_documents/transform.rs | 2 +- milli/tests/search/facet_distribution.rs | 6 +++--- milli/tests/search/mod.rs | 6 +++--- 11 files changed, 43 insertions(+), 51 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 091b9b0f5..630e17943 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -11,8 +11,8 @@ use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{ IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; -use milli::{Filter, Index}; -use serde_json::{Map, Value}; +use milli::{Filter, Index, Object}; +use serde_json::Value; pub struct Conf<'a> { /// where we are going to create our database.mmdb directory @@ -96,12 +96,10 @@ pub fn base_setup(conf: &Conf) -> Index { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let documents = documents_from(conf.dataset, conf.dataset_format); - - builder.add_documents(documents).unwrap(); - + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -156,7 +154,7 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { let object = result?; documents.append_json_object(&object)?; } @@ -166,7 +164,7 @@ fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result> { fn documents_from_json(reader: impl BufRead) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec> = serde_json::from_reader(reader)?; + let list: Vec = serde_json::from_reader(reader)?; for object in list { documents.append_json_object(&object)?; @@ -221,14 +219,14 @@ impl CSVDocumentDeserializer { } impl Iterator for 
CSVDocumentDeserializer { - type Item = anyhow::Result>; + type Item = anyhow::Result; fn next(&mut self) -> Option { let csv_document = self.documents.next()?; match csv_document { Ok(csv_document) => { - let mut document = Map::new(); + let mut document = Object::new(); for ((field_name, field_type), value) in self.headers.iter().zip(csv_document.into_iter()) diff --git a/cli/src/main.rs b/cli/src/main.rs index dcd0f407a..db4ca91ab 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -13,8 +13,7 @@ use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; -use milli::Index; -use serde_json::{Map, Value}; +use milli::{Index, Object}; use structopt::StructOpt; #[cfg(target_os = "linux")] @@ -325,7 +324,7 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); let reader = BufReader::new(reader); - for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { let object = result?; documents.append_json_object(&object)?; } @@ -335,7 +334,7 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { fn documents_from_json(reader: impl Read) -> Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec> = serde_json::from_reader(reader)?; + let list: Vec = serde_json::from_reader(reader)?; for object in list { documents.append_json_object(&object)?; @@ -424,7 +423,7 @@ impl Search { filter: &Option, offset: &Option, limit: &Option, - ) -> Result>> { + ) -> Result> { let txn = index.read_txn()?; let mut search = index.search(&txn); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 63b9ee5e0..8167076c6 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -26,11 +26,11 @@ use milli::update::{ }; use milli::{ obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index, - MatcherBuilder, SearchResult, SortError, + MatcherBuilder, Object, SearchResult, SortError, }; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value}; +use serde_json::Value; use structopt::StructOpt; use tokio::fs::File as TFile; use tokio::io::AsyncWriteExt; @@ -169,11 +169,7 @@ impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> { } } - fn highlight_record( - &self, - object: &mut Map, - attributes_to_highlight: &HashSet, - ) { + fn highlight_record(&self, object: &mut Object, attributes_to_highlight: &HashSet) { // TODO do we need to create a string for element that are not and needs to be highlight? 
for (key, value) in object.iter_mut() { if attributes_to_highlight.contains(key) { @@ -708,7 +704,7 @@ async fn main() -> anyhow::Result<()> { #[derive(Debug, Serialize)] #[serde(rename_all = "camelCase")] struct Answer { - documents: Vec>, + documents: Vec, number_of_candidates: u64, facets: BTreeMap>, } @@ -1036,7 +1032,7 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); let reader = BufReader::new(reader); - for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { let object = result?; documents.append_json_object(&object)?; } @@ -1046,7 +1042,7 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { fn documents_from_json(reader: impl Read) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec> = serde_json::from_reader(reader)?; + let list: Vec = serde_json::from_reader(reader)?; for object in list { documents.append_json_object(&object)?; diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index 5c3b79ed7..e4f42655e 100644 --- a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -21,7 +21,7 @@ pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result { let writer = BufWriter::new(writer); let mut builder = DocumentsBatchBuilder::new(writer); - let values: Vec> = serde_json::from_reader(input)?; + let values: Vec = serde_json::from_reader(input)?; if builder.documents_count() == 0 { bail!("Empty payload"); } diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 15a22090a..589e52269 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,9 +1,10 @@ use std::io::{self, Write}; use grenad::{CompressionType, WriterBuilder}; -use serde_json::{to_writer, Map, Value}; +use serde_json::{to_writer, Value}; use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY}; +use crate::Object; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary /// format used by milli. @@ -55,7 +56,7 @@ impl DocumentsBatchBuilder { } /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly. - pub fn append_json_object(&mut self, object: &Map) -> io::Result<()> { + pub fn append_json_object(&mut self, object: &Object) -> io::Result<()> { // Make sure that we insert the fields ids in order as the obkv writer has this requirement. 
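        // Put differently (illustrative): the obkv writer only accepts keys written in
        // ascending order, so for an object whose keys map to the field ids [2, 0, 1]
        // the ids are collected and sorted to [0, 1, 2] before their values are
        // serialized and written below.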
let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(&k)).collect(); fields_ids.sort_unstable(); diff --git a/milli/src/error.rs b/milli/src/error.rs index d34130210..a23472951 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -4,12 +4,10 @@ use std::{io, str}; use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; -use serde_json::{Map, Value}; +use serde_json::Value; use thiserror::Error; -use crate::{CriterionError, DocumentId, FieldId, SortError}; - -pub type Object = Map; +use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { ["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 81cd057d5..a7be87183 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -20,7 +20,7 @@ use std::hash::BuildHasherDefault; pub use filter_parser::{Condition, FilterCondition}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; -use serde_json::{Map, Value}; +use serde_json::Value; pub use {charabia as tokenizer, heed}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; @@ -43,20 +43,21 @@ pub use self::search::{ pub type Result = std::result::Result; -pub type FastMap4 = HashMap>; -pub type FastMap8 = HashMap>; -pub type SmallString32 = smallstr::SmallString<[u8; 32]>; -pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; -pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; -pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; +pub type Attribute = u32; pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; -pub type Attribute = u32; pub type DocumentId = u32; +pub type FastMap4 = HashMap>; +pub type FastMap8 = HashMap>; +pub type FieldDistribution = BTreeMap; pub type FieldId = u16; +pub type Object = serde_json::Map; pub type Position = u32; pub type RelativePosition = u16; -pub type FieldDistribution = BTreeMap; +pub type SmallString32 = smallstr::SmallString<[u8; 32]>; +pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; +pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; +pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; /// A GeoPoint is a point in cartesian plan, called xyz_point in the code. Its metadata /// is a tuple composed of 1. the DocumentId of the associated document and 2. 
the original point @@ -82,7 +83,7 @@ pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, obkv: obkv::KvReaderU16, -) -> Result> { +) -> Result { displayed_fields .iter() .copied() diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 0f804b93b..46ef9ba9b 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -1,6 +1,5 @@ use std::fs::File; use std::io; -use std::result::Result as StdResult; use concat_arrays::concat_arrays; use serde_json::Value; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index bc7eefd33..4ece58509 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -13,7 +13,7 @@ use serde_json::{Map, Value}; use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; -use super::{validate_document_id, IndexDocumentsMethod, IndexerConfig}; +use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index 66713de1e..8890285e7 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -5,8 +5,8 @@ use heed::EnvOpenOptions; use maplit::hashset; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{FacetDistribution, Index}; -use serde_json::{Deserializer, Map, Value}; +use milli::{FacetDistribution, Index, Object}; +use serde_json::Deserializer; #[test] fn test_facet_distribution_with_no_facet_values() { @@ -46,7 +46,7 @@ fn test_facet_distribution_with_no_facet_values() { }"#, ); - for result in Deserializer::from_reader(reader).into_iter::>() { + for result in Deserializer::from_reader(reader).into_iter::() { let object = result.unwrap(); documents_builder.append_json_object(&object).unwrap(); } diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 4cf117dc7..0b6ce80cc 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -8,9 +8,9 @@ use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{AscDesc, Criterion, DocumentId, Index, Member}; +use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object}; use serde::Deserialize; -use serde_json::{Deserializer, Map, Value}; +use serde_json::Deserializer; use slice_group_by::GroupBy; mod distinct; @@ -66,7 +66,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); - for result in Deserializer::from_reader(reader).into_iter::>() { + for result in Deserializer::from_reader(reader).into_iter::() { let object = result.unwrap(); documents_builder.append_json_object(&object).unwrap(); } From 399eec5c0101780f74b9794957865c3fe8b3d519 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 14:35:19 +0200 Subject: [PATCH 
13/55] Fix the indexation tests --- benchmarks/benches/indexing.rs | 110 ++++++---- cli/src/main.rs | 7 +- http-ui/src/main.rs | 6 +- milli/src/index.rs | 25 ++- milli/src/search/distinct/mod.rs | 5 +- milli/src/search/facet/filter.rs | 5 +- milli/src/update/clear_documents.rs | 5 +- milli/src/update/delete_documents.rs | 18 +- milli/src/update/index_documents/mod.rs | 205 +++++++++++-------- milli/src/update/index_documents/validate.rs | 1 + milli/src/update/settings.rs | 70 ++++--- milli/tests/search/facet_distribution.rs | 6 +- milli/tests/search/mod.rs | 6 +- milli/tests/search/query_criteria.rs | 6 +- milli/tests/search/typo_tolerance.rs | 7 +- 15 files changed, 288 insertions(+), 194 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 3ae0a1a84..1b501b21a 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -132,12 +132,13 @@ fn indexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -223,11 +224,12 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -279,11 +281,12 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -294,19 +297,21 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); - 
builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -339,13 +344,14 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -377,12 +383,13 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -415,12 +422,13 @@ fn indexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -507,11 +515,12 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -564,12 +573,13 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -581,24 +591,26 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) 
.unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -631,12 +643,13 @@ fn indexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -720,11 +733,12 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -775,12 +789,13 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { // as we don't care about the time it takes. 
let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -791,21 +806,23 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -861,12 +878,13 @@ fn indexing_nested_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -922,11 +940,12 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -984,12 +1003,13 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1021,12 +1041,13 @@ fn indexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut 
builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1113,11 +1134,12 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/cli/src/main.rs b/cli/src/main.rs index db4ca91ab..0d197af17 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -255,7 +255,7 @@ impl Performer for DocumentAddition { let bar = progesses.add(bar); bars.push(bar); } - let mut addition = milli::update::IndexDocuments::new( + let addition = milli::update::IndexDocuments::new( &mut txn, &index, &config, @@ -263,7 +263,10 @@ impl Performer for DocumentAddition { |step| indexing_callback(step, &bars), ) .unwrap(); - addition.add_documents(reader)?; + let (addition, user_error) = addition.add_documents(reader)?; + if let Err(error) = user_error { + return Err(error.into()); + } std::thread::spawn(move || { progesses.join().unwrap(); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 8167076c6..117aa31e8 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -374,7 +374,7 @@ async fn main() -> anyhow::Result<()> { }); }; - let mut builder = milli::update::IndexDocuments::new( + let builder = milli::update::IndexDocuments::new( &mut wtxn, &index_cloned, GLOBAL_CONFIG.get().unwrap(), @@ -397,8 +397,8 @@ async fn main() -> anyhow::Result<()> { let documents = DocumentsBatchReader::from_reader(Cursor::new(documents))?; - builder.add_documents(documents)?; - + let (builder, user_error) = builder.add_documents(documents)?; + let _count = user_error?; let result = builder.execute(); match result { diff --git a/milli/src/index.rs b/milli/src/index.rs index 9637b4103..272877912 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1212,10 +1212,11 @@ pub(crate) mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1234,7 +1235,7 @@ pub(crate) mod tests { // we add all the documents a second time. 
we are supposed to get the same // field_distribution in the end let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); let content = documents!([ @@ -1242,7 +1243,8 @@ pub(crate) mod tests { { "id": 2, "name": "bob", "age": 20 }, { "id": 2, "name": "bob", "age": 20 }, ]); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1265,10 +1267,11 @@ pub(crate) mod tests { ]); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1333,10 +1336,11 @@ pub(crate) mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1390,10 +1394,11 @@ pub(crate) mod tests { ]); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 670fa01ac..1a9c56cf3 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -97,14 +97,15 @@ mod test { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut addition = + let addition = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); let reader = crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) .unwrap(); - addition.add_documents(reader).unwrap(); + let (addition, user_error) = addition.add_documents(reader).unwrap(); + user_error.unwrap(); addition.execute().unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index d89413f62..41e2f0657 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -648,10 +648,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index d1939df7b..3fe57eeae 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -100,9 +100,10 @@ mod tests { ]); let indexing_config = 
IndexDocumentsConfig::default(); let config = IndexerConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // Clear all documents from the database. diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 3b519c101..49e7de8ae 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -657,13 +657,13 @@ mod tests { fn insert_documents<'t, R: std::io::Read + std::io::Seek>( wtxn: &mut RwTxn<'t, '_>, index: &'t Index, - documents: crate::documents::DocumentBatchReader, + documents: crate::documents::DocumentsBatchReader, ) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(documents).unwrap(); + let builder = IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); } @@ -701,9 +701,10 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // delete those documents, ids are synchronous therefore 0, 1, and 2. @@ -736,9 +737,10 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // Delete not all of the documents but some of them. diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2fb7cbcd9..ae42483f0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -141,7 +141,12 @@ where // We check for user errors in this validator and if there is one, we can return // the `IndexDocument` struct as it is valid to send more documents into it. // However, if there is an internal error we throw it away! - let reader = match validate_documents_batch(self.wtxn, self.index, reader)? { + let reader = match validate_documents_batch( + self.wtxn, + self.index, + self.config.autogenerate_docids, + reader, + )? 
{ Ok(reader) => reader, Err(user_error) => return Ok((self, Err(user_error))), }; @@ -626,10 +631,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -642,10 +648,11 @@ mod tests { // Second we send 1 document with id 1, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "name": "updated kevin" } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -662,9 +669,11 @@ mod tests { { "id": 2, "name": "updated kevina" }, { "id": 3, "name": "updated benoit" } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -694,10 +703,11 @@ mod tests { update_method: IndexDocumentsMethod::UpdateDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -722,9 +732,10 @@ mod tests { // Second we send 1 document with id 1, to force it to be merged with the previous one. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "age": 25 } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -765,7 +776,7 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); assert!(builder.add_documents(content).is_err()); wtxn.commit().unwrap(); @@ -794,10 +805,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -814,9 +826,10 @@ mod tests { // Second we send 1 document with the generated uuid, to erase the previous ones. 
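[Editor's sketch, not part of any patch in this series] The benchmarks, cli, http-ui and test changes above all migrate to the same call-site pattern. A minimal consolidated sketch of that flow follows; every name in it comes from the diffs themselves, while the surrounding setup (an already-open `Index` named `index` and a `documents!` batch named `documents`) is assumed, and error handling is reduced to `unwrap()` as in the tests.

    let config = IndexerConfig::default();
    let indexing_config = IndexDocumentsConfig::default();
    let mut wtxn = index.write_txn().unwrap();

    let builder =
        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();

    // `add_documents` now validates the batch up front and returns the builder
    // together with the user-facing result, so a rejected batch does not consume
    // the builder and further batches can still be added before `execute`.
    let (builder, user_error) = builder.add_documents(documents).unwrap();
    user_error.unwrap(); // e.g. MissingDocumentId or InvalidDocumentId

    builder.execute().unwrap();
    wtxn.commit().unwrap();
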
let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -856,9 +869,10 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -873,9 +887,10 @@ mod tests { let content = documents!([ { "name": "new kevin" } ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -898,9 +913,10 @@ mod tests { let content = documents!([]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -924,7 +940,7 @@ mod tests { let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); assert!(builder.add_documents(content).is_err()); @@ -934,9 +950,10 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. 
let content = documents!([ { "id": 32, "name": "kevin" } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -963,9 +980,10 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1009,9 +1027,10 @@ mod tests { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1020,7 +1039,7 @@ mod tests { update_method: IndexDocumentsMethod::UpdateDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let documents = documents!([ { @@ -1030,7 +1049,8 @@ mod tests { } ]); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); } @@ -1057,9 +1077,10 @@ mod tests { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1099,10 +1120,11 @@ mod tests { { "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" }, { "id": 3, "_geo.lat": 31, "_geo.lng": "42" }, ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1138,10 +1160,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lng": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1151,10 +1174,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lat": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1164,10 +1188,11 @@ mod tests { 
let documents = documents!([ { "id": 0, "_geo": { "lat": "lol", "lng": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1177,10 +1202,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1190,10 +1216,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lat": 12, "lng": "hello" } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1217,10 +1244,11 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); @@ -1237,10 +1265,11 @@ mod tests { { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); assert!(external_documents_ids.get("30").is_some()); @@ -1249,10 +1278,11 @@ mod tests { { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1281,10 +1311,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1315,10 +1346,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - 
builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1373,10 +1405,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1426,10 +1459,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1558,10 +1592,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1628,10 +1663,11 @@ mod tests { // index the documents let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1720,10 +1756,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1737,10 +1774,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1759,10 +1797,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1787,10 +1826,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let 
indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1832,10 +1872,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); } @@ -1870,10 +1911,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // Create one long document @@ -1884,10 +1926,11 @@ mod tests { // Index this one long document let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1901,7 +1944,7 @@ mod tests { let index = Index::new(options, tmp).unwrap(); let mut wtxn = index.write_txn().unwrap(); let indexer_config = IndexerConfig::default(); - let mut builder = IndexDocuments::new( + let builder = IndexDocuments::new( &mut wtxn, &index, &indexer_config, @@ -1930,8 +1973,10 @@ mod tests { "branch_id_number": 0 }]}; - builder.add_documents(doc1).unwrap(); - builder.add_documents(doc2).unwrap(); + let (builder, user_error) = builder.add_documents(doc1).unwrap(); + user_error.unwrap(); + let (builder, user_error) = builder.add_documents(doc2).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index b4c0cb68f..0ed1f1cc0 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -15,6 +15,7 @@ use crate::{Index, Result}; pub fn validate_documents_batch( rtxn: &heed::RoTxn, index: &Index, + autogenerate_docids: bool, reader: DocumentsBatchReader, ) -> Result, UserError>> { let mut cursor = reader.into_cursor(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ccf29eb49..5f39579b7 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -735,10 +735,11 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); 
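[Editor's sketch, not part of any patch in this series] The settings tests above and below all build `IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }` because their `documents!` batches carry no field usable as a document id. The following hedged illustration shows the difference that flag makes in the reworked validator (the fallback behaviour is the one added by patch 14/55 further down; `index` is an assumed already-open `Index`).

    let config = IndexerConfig::default();
    let mut wtxn = index.write_txn().unwrap();

    // A batch with no "id"-like field at all.
    let content = documents!([ { "name": "kevin" }, { "name": "kevina" } ]);

    // Without autogenerate_docids the validator can neither find nor guess a
    // primary key, so the batch comes back as a user error and nothing is indexed.
    let indexing_config = IndexDocumentsConfig::default();
    let builder =
        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
    let (_, user_error) = builder.add_documents(content).unwrap();
    assert!(user_error.is_err()); // MissingPrimaryKey

    // With autogenerate_docids enabled the validator falls back to an "id"
    // primary key and generates the missing document ids itself.
    let content = documents!([ { "name": "kevin" }, { "name": "kevina" } ]);
    let indexing_config =
        IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
    let builder =
        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
    let (builder, user_error) = builder.add_documents(content).unwrap();
    user_error.unwrap();
    builder.execute().unwrap();
    wtxn.commit().unwrap();
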
wtxn.commit().unwrap(); @@ -798,10 +799,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -850,10 +852,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -880,10 +883,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // In the same transaction we change the displayed fields to be only the age. @@ -934,10 +938,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -974,10 +979,11 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1016,10 +1022,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1067,10 +1074,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1110,10 +1118,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, 
&config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1142,10 +1151,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1172,10 +1182,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // In the same transaction we provide some stop_words @@ -1251,10 +1262,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // In the same transaction provide some synonyms @@ -1389,10 +1401,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1452,10 +1465,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index 8890285e7..83d692d7f 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -29,8 +29,7 @@ fn test_facet_distribution_with_no_facet_values() { let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new( r#"{ @@ -55,7 +54,8 @@ fn test_facet_distribution_with_no_facet_values() { // index documents let content = 
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 0b6ce80cc..3b8960fcc 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -61,8 +61,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); @@ -75,7 +74,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { // index documents let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 89a6a6eec..a96366f5e 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -390,8 +390,7 @@ fn criteria_ascdesc() { // index documents let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut batch_builder = DocumentsBatchBuilder::new(Vec::new()); @@ -422,7 +421,8 @@ fn criteria_ascdesc() { let vector = batch_builder.into_inner().unwrap(); let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - builder.add_documents(reader).unwrap(); + let (builder, user_error) = builder.add_documents(reader).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 63bf22579..7c4cf8971 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -127,11 +127,10 @@ fn test_typo_disabled_on_word() { let mut txn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); - - builder.add_documents(documents).unwrap(); + let builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); txn.commit().unwrap(); From 2ceeb51c37a98e8233edab175467e11101ef5dd2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 15:14:20 +0200 Subject: [PATCH 14/55] Support the auto-generated ids when validating documents --- milli/src/update/index_documents/mod.rs | 6 ++++-- 
milli/src/update/index_documents/validate.rs | 12 +++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ae42483f0..1a1fc9a0e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -778,7 +778,8 @@ mod tests { let indexing_config = IndexDocumentsConfig::default(); let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - assert!(builder.add_documents(content).is_err()); + let (_builder, user_error) = builder.add_documents(content).unwrap(); + assert!(user_error.is_err()); wtxn.commit().unwrap(); // Check that there is no document. @@ -943,7 +944,8 @@ mod tests { let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - assert!(builder.add_documents(content).is_err()); + let (_builder, user_error) = builder.add_documents(content).unwrap(); + assert!(user_error.is_err()); wtxn.commit().unwrap(); // First we send 1 document with a valid id. diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 0ed1f1cc0..0cb0b4aff 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -19,20 +19,20 @@ pub fn validate_documents_batch( reader: DocumentsBatchReader, ) -> Result, UserError>> { let mut cursor = reader.into_cursor(); - let documents_batch_index = cursor.documents_batch_index().clone(); + let mut documents_batch_index = cursor.documents_batch_index().clone(); // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. let (primary_key, primary_key_id) = match index.primary_key(rtxn)? { Some(primary_key) => match documents_batch_index.id(primary_key) { Some(id) => (primary_key, id), + None if autogenerate_docids => (primary_key, documents_batch_index.insert(primary_key)), None => { return match cursor.next_document()? { Some(first_document) => Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), document: obkv_to_object(&first_document, &documents_batch_index)?, })), - // If there is no document in this batch the best we can do is to return this error. None => Ok(Err(UserError::MissingPrimaryKey)), }; } @@ -40,10 +40,11 @@ pub fn validate_documents_batch( None => { let guessed = documents_batch_index .iter() - .filter(|(_, name)| name.contains("id")) + .filter(|(_, name)| name.to_lowercase().contains("id")) .min_by_key(|(fid, _)| *fid); match guessed { Some((id, name)) => (name.as_str(), *id), + None if autogenerate_docids => ("id", documents_batch_index.insert("id")), None => return Ok(Err(UserError::MissingPrimaryKey)), } } @@ -56,12 +57,16 @@ pub fn validate_documents_batch( _otherwise => None, }; + let mut count = 0; while let Some(document) = cursor.next_document()? { let document_id = match document.get(primary_key_id) { Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)? 
{ Ok(document_id) => document_id, Err(user_error) => return Ok(Err(user_error)), }, + None if autogenerate_docids => { + format!("{{auto-generated id of the {}nth document}}", count) + } None => { return Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), @@ -75,6 +80,7 @@ pub fn validate_documents_batch( return Ok(Err(UserError::from(user_error))); } } + count += 1; } Ok(Ok(cursor.into_reader())) From 19eb3b4708c1cd2619d2a77cefbcda7ff5e4bda5 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 16:06:52 +0200 Subject: [PATCH 15/55] Make sur that we do not accept floats as documents ids --- milli/src/update/index_documents/mod.rs | 47 ++++++++++++++++++++ milli/src/update/index_documents/validate.rs | 2 +- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 1a1fc9a0e..5bce3b851 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1987,4 +1987,51 @@ mod tests { assert_eq!(ids.len(), map.len()); } + + #[test] + fn primary_key_must_not_contain_floats() { + let tmp = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(4096 * 100); + let index = Index::new(options, tmp).unwrap(); + let mut wtxn = index.write_txn().unwrap(); + let indexer_config = IndexerConfig::default(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &indexer_config, + IndexDocumentsConfig::default(), + |_| (), + ) + .unwrap(); + + let doc1 = documents! {[{ + "id": -228142, + "title": "asdsad", + }]}; + + let doc2 = documents! {[{ + "id": 228143.56, + "title": "something", + }]}; + + let doc3 = documents! {[{ + "id": -228143.56, + "title": "something", + }]}; + + let doc4 = documents! 
{[{ + "id": 2.0, + "title": "something", + }]}; + + let (builder, user_error) = builder.add_documents(doc1).unwrap(); + user_error.unwrap(); + let (builder, user_error) = builder.add_documents(doc2).unwrap(); + assert!(user_error.is_err()); + let (builder, user_error) = builder.add_documents(doc3).unwrap(); + assert!(user_error.is_err()); + let (_builder, user_error) = builder.add_documents(doc4).unwrap(); + assert!(user_error.is_err()); + } } diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 0cb0b4aff..c69c754ac 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -108,7 +108,7 @@ pub fn validate_document_id_from_json(bytes: &[u8]) -> Result Ok(Ok(number.to_string())), + Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), content => return Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })), } } From 8ebf5eed0d80beb623a056d4e70a5d0535cd7181 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 17:58:52 +0200 Subject: [PATCH 16/55] Make the nested primary key work --- milli/src/error.rs | 2 + milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 5 +- milli/src/update/index_documents/validate.rs | 228 ++++++++++++++---- 4 files changed, 191 insertions(+), 46 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index a23472951..d05acbe1c 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -121,6 +121,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco MaxDatabaseSizeReached, #[error("Document doesn't have a `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] MissingDocumentId { primary_key: String, document: Object }, + #[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] + TooManyDocumentIds { primary_key: String, document: Object }, #[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")] MissingPrimaryKey, #[error("There is no more space left on the device. 
Consider increasing the size of the disk/partition.")] diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 5bce3b851..ba1064684 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -29,7 +29,7 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use self::validate::validate_documents_batch; pub use self::validate::{ - extract_float_from_value, validate_document_id, validate_document_id_from_json, + extract_float_from_value, validate_document_id, validate_document_id_value, validate_geo_from_json, }; use crate::documents::{obkv_to_object, DocumentsBatchReader}; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 4ece58509..38f6dc8ff 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -17,7 +17,7 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; -use crate::update::index_documents::validate_document_id_from_json; +use crate::update::index_documents::validate_document_id_value; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::{ ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, @@ -806,7 +806,8 @@ fn update_primary_key<'a>( ) -> Result> { match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { Some((_, bytes)) => { - let value = validate_document_id_from_json(bytes)??; + let document_id = serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)?; + let value = validate_document_id_value(document_id)??; serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?; Ok(Cow::Owned(value)) } diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index c69c754ac..32e8de03f 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -1,11 +1,16 @@ use std::io::{Read, Seek}; +use std::iter; use std::result::Result as StdResult; use serde_json::Value; +use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{GeoError, InternalError, UserError}; -use crate::update::index_documents::{obkv_to_object, DocumentsBatchReader}; -use crate::{Index, Result}; +use crate::update::index_documents::obkv_to_object; +use crate::{FieldId, Index, Object, Result}; + +/// The symbol used to define levels in a nested primary key. +const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; /// This function validates a documents by checking that: /// - we can infer a primary key, @@ -23,10 +28,15 @@ pub fn validate_documents_batch( // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. - let (primary_key, primary_key_id) = match index.primary_key(rtxn)? { + let primary_key = match index.primary_key(rtxn)? 
{ + Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => { + PrimaryKey::nested(primary_key) + } Some(primary_key) => match documents_batch_index.id(primary_key) { - Some(id) => (primary_key, id), - None if autogenerate_docids => (primary_key, documents_batch_index.insert(primary_key)), + Some(id) => PrimaryKey::flat(primary_key, id), + None if autogenerate_docids => { + PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key)) + } None => { return match cursor.next_document()? { Some(first_document) => Ok(Err(UserError::MissingDocumentId { @@ -43,8 +53,10 @@ pub fn validate_documents_batch( .filter(|(_, name)| name.to_lowercase().contains("id")) .min_by_key(|(fid, _)| *fid); match guessed { - Some((id, name)) => (name.as_str(), *id), - None if autogenerate_docids => ("id", documents_batch_index.insert("id")), + Some((id, name)) => PrimaryKey::flat(name.as_str(), *id), + None if autogenerate_docids => { + PrimaryKey::flat("id", documents_batch_index.insert("id")) + } None => return Ok(Err(UserError::MissingPrimaryKey)), } } @@ -59,20 +71,15 @@ pub fn validate_documents_batch( let mut count = 0; while let Some(document) = cursor.next_document()? { - let document_id = match document.get(primary_key_id) { - Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)? { - Ok(document_id) => document_id, - Err(user_error) => return Ok(Err(user_error)), - }, - None if autogenerate_docids => { - format!("{{auto-generated id of the {}nth document}}", count) - } - None => { - return Ok(Err(UserError::MissingDocumentId { - primary_key: primary_key.to_string(), - document: obkv_to_object(&document, &documents_batch_index)?, - })) - } + let document_id = match fetch_document_id( + &document, + &documents_batch_index, + primary_key, + autogenerate_docids, + count, + )? { + Ok(document_id) => document_id, + Err(user_error) => return Ok(Err(user_error)), }; if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { @@ -86,30 +93,167 @@ pub fn validate_documents_batch( Ok(Ok(cursor.into_reader())) } +/// Retrieve the document id after validating it, returning a `UserError` +/// if the id is invalid or can't be guessed. +fn fetch_document_id( + document: &obkv::KvReader, + documents_batch_index: &DocumentsBatchIndex, + primary_key: PrimaryKey, + autogenerate_docids: bool, + count: usize, +) -> Result> { + match primary_key { + PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { + match document.get(primary_key_id) { + Some(document_id_bytes) => { + let document_id = serde_json::from_slice(document_id_bytes) + .map_err(InternalError::SerdeJson)?; + match validate_document_id_value(document_id)? { + Ok(document_id) => Ok(Ok(document_id)), + Err(user_error) => Ok(Err(user_error)), + } + } + None if autogenerate_docids => { + Ok(Ok(format!("{{auto-generated id of the {}nth document}}", count))) + } + None => Ok(Err(UserError::MissingDocumentId { + primary_key: primary_key.to_string(), + document: obkv_to_object(&document, &documents_batch_index)?, + })), + } + } + nested @ PrimaryKey::Nested { .. 
} => { + let mut matching_documents_ids = Vec::new(); + for (first_level_name, right) in nested.possible_level_names() { + if let Some(field_id) = documents_batch_index.id(first_level_name) { + if let Some(value_bytes) = document.get(field_id) { + let object = serde_json::from_slice(value_bytes) + .map_err(InternalError::SerdeJson)?; + fetch_matching_values(object, right, &mut matching_documents_ids); + + if matching_documents_ids.len() >= 2 { + return Ok(Err(UserError::TooManyDocumentIds { + primary_key: nested.primary_key().to_string(), + document: obkv_to_object(&document, &documents_batch_index)?, + })); + } + } + } + } + + match matching_documents_ids.pop() { + Some(document_id) => match validate_document_id_value(document_id)? { + Ok(document_id) => Ok(Ok(document_id)), + Err(user_error) => Ok(Err(user_error)), + }, + None => Ok(Err(UserError::MissingDocumentId { + primary_key: nested.primary_key().to_string(), + document: obkv_to_object(&document, &documents_batch_index)?, + })), + } + } + } +} + +/// A type that represent the type of primary key that has been set +/// for this index, a classic flat one or a nested one. +#[derive(Debug, Clone, Copy)] +enum PrimaryKey<'a> { + Flat { name: &'a str, field_id: FieldId }, + Nested { name: &'a str }, +} + +impl PrimaryKey<'_> { + fn flat(name: &str, field_id: FieldId) -> PrimaryKey { + PrimaryKey::Flat { name, field_id } + } + + fn nested(name: &str) -> PrimaryKey { + PrimaryKey::Nested { name } + } + + fn primary_key(&self) -> &str { + match self { + PrimaryKey::Flat { name, .. } => name, + PrimaryKey::Nested { name } => name, + } + } + + /// Returns an `Iterator` that gives all the possible fields names the primary key + /// can have depending of the first level name and deepnes of the objects. + fn possible_level_names(&self) -> impl Iterator + '_ { + let name = self.primary_key(); + iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL)) + } +} + +fn contained_in(selector: &str, key: &str) -> bool { + selector.starts_with(key) + && selector[key.len()..] + .chars() + .next() + .map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL) + .unwrap_or(true) +} + +pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) { + match value { + Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), + otherwise => output.push(otherwise), + } +} + +pub fn fetch_matching_values_in_object( + object: Object, + selector: &str, + base_key: &str, + output: &mut Vec, +) { + for (key, value) in object { + let base_key = if base_key.is_empty() { + key.to_string() + } else { + format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) + }; + + // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` + // so we check the contained_in on both side. + let should_continue = + contained_in(selector, &base_key) || contained_in(&base_key, selector); + + if should_continue { + match value { + Value::Object(object) => { + fetch_matching_values_in_object(object, selector, &base_key, output) + } + value => output.push(value), + } + } + } +} + /// Returns a trimmed version of the document id or `None` if it is invalid. 
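// A standalone sketch (std only, hypothetical helper names) of how the two
// nested-primary-key helpers above behave. `possible_level_names` starts from
// the full selector and repeatedly splits the previous first-level name at its
// right-most `.`, while `contained_in` accepts a key that is either equal to
// the selector or a `.`-separated prefix of it.
fn level_names_sketch(name: &str) -> Vec<(&str, &str)> {
    std::iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once('.')).collect()
}

fn contained_in_sketch(selector: &str, key: &str) -> bool {
    selector.starts_with(key)
        && selector[key.len()..].chars().next().map(|c| c == '.').unwrap_or(true)
}

fn main() {
    assert_eq!(
        level_names_sketch("person.address.id"),
        vec![("person.address.id", ""), ("person.address", "id"), ("person", "address")]
    );
    assert!(contained_in_sketch("person.id", "person"));    // prefix ending on a `.` boundary
    assert!(contained_in_sketch("person.id", "person.id")); // exact match
    assert!(!contained_in_sketch("person.id", "pers"));     // not a whole segment
}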
pub fn validate_document_id(document_id: &str) -> Option<&str> { - let id = document_id.trim(); - if !id.is_empty() - && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) + let document_id = document_id.trim(); + if !document_id.is_empty() + && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) { - Some(id) + Some(document_id) } else { None } } /// Parses a Json encoded document id and validate it, returning a user error when it is one. -pub fn validate_document_id_from_json(bytes: &[u8]) -> Result> { - match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { +pub fn validate_document_id_value(document_id: Value) -> Result> { + match document_id { Value::String(string) => match validate_document_id(&string) { Some(s) if s.len() == string.len() => Ok(Ok(string)), Some(s) => Ok(Ok(s.to_string())), - None => { - return Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })) - } + None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), }, Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), - content => return Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })), + content => Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })), } } @@ -124,24 +268,22 @@ pub fn extract_float_from_value(value: Value) -> StdResult { } pub fn validate_geo_from_json(document_id: Value, bytes: &[u8]) -> Result> { - let result = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { + match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { (Some(lat), Some(lng)) => { match (extract_float_from_value(lat), extract_float_from_value(lng)) { - (Ok(_), Ok(_)) => Ok(()), - (Err(value), Ok(_)) => Err(GeoError::BadLatitude { document_id, value }), - (Ok(_), Err(value)) => Err(GeoError::BadLongitude { document_id, value }), + (Ok(_), Ok(_)) => Ok(Ok(())), + (Err(value), Ok(_)) => Ok(Err(GeoError::BadLatitude { document_id, value })), + (Ok(_), Err(value)) => Ok(Err(GeoError::BadLongitude { document_id, value })), (Err(lat), Err(lng)) => { - Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng }) + Ok(Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng })) } } } - (None, Some(_)) => Err(GeoError::MissingLatitude { document_id }), - (Some(_), None) => Err(GeoError::MissingLongitude { document_id }), - (None, None) => Err(GeoError::MissingLatitudeAndLongitude { document_id }), + (None, Some(_)) => Ok(Err(GeoError::MissingLatitude { document_id })), + (Some(_), None) => Ok(Err(GeoError::MissingLongitude { document_id })), + (None, None) => Ok(Err(GeoError::MissingLatitudeAndLongitude { document_id })), }, - value => Err(GeoError::NotAnObject { document_id, value }), - }; - - Ok(result) + value => Ok(Err(GeoError::NotAnObject { document_id, value })), + } } From dc3f092d0782fd7b39bb29aa424caa04ada8de3c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 16 Jun 2022 12:03:43 +0200 Subject: [PATCH 17/55] Do not leak an internal grenad Error --- milli/src/documents/mod.rs | 2 +- milli/src/documents/reader.rs | 32 ++++++++++++++++++++++++++++++-- milli/src/error.rs | 7 +++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index ee3593bf8..66a05b7b6 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -7,7 +7,7 @@ use 
std::io; use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; use obkv::KvReader; -pub use reader::{DocumentsBatchCursor, DocumentsBatchReader}; +pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; use crate::error::{FieldIdMapMissingEntry, InternalError}; diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 3dff999f5..720b403b9 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,5 +1,5 @@ use std::convert::TryInto; -use std::io; +use std::{error, fmt, io}; use obkv::KvReader; @@ -79,7 +79,9 @@ impl DocumentsBatchCursor { impl DocumentsBatchCursor { /// Returns the next document, starting from the first one. Subsequent calls to /// `next_document` advance the document reader until all the documents have been read. - pub fn next_document(&mut self) -> Result>, grenad::Error> { + pub fn next_document( + &mut self, + ) -> Result>, DocumentsBatchCursorError> { match self.cursor.move_on_next()? { Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => { Ok(Some(KvReader::new(value))) @@ -88,3 +90,29 @@ impl DocumentsBatchCursor { } } } + +/// The possible error thrown by the `DocumentsBatchCursor` when iterating on the documents. +#[derive(Debug)] +pub struct DocumentsBatchCursorError { + inner: grenad::Error, +} + +impl From for DocumentsBatchCursorError { + fn from(error: grenad::Error) -> DocumentsBatchCursorError { + DocumentsBatchCursorError { inner: error } + } +} + +impl Into for DocumentsBatchCursorError { + fn into(self) -> grenad::Error { + self.inner + } +} + +impl error::Error for DocumentsBatchCursorError {} + +impl fmt::Display for DocumentsBatchCursorError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.inner.fmt(f) + } +} diff --git a/milli/src/error.rs b/milli/src/error.rs index d05acbe1c..d9dca287d 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -7,6 +7,7 @@ use rayon::ThreadPoolBuildError; use serde_json::Value; use thiserror::Error; +use crate::documents::DocumentsBatchCursorError; use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { @@ -209,6 +210,12 @@ where } } +impl From for Error { + fn from(error: DocumentsBatchCursorError) -> Error { + Error::from(Into::::into(error)) + } +} + impl From for Error { fn from(_error: Infallible) -> Error { unreachable!() From ea852200bbb9d9520ab7e13e5f6019c3b7c7fed3 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 22 Jun 2022 10:28:49 +0200 Subject: [PATCH 18/55] Fix the format used for a geo deleting benchmark --- benchmarks/benches/indexing.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 1b501b21a..80c7ba0ed 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -1137,7 +1137,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); - let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "json"); + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); builder.execute().unwrap(); From 6a0a0ae94f38ee04ffed9bd252444128163b1550 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 20 Jun 2022 13:48:02 +0200 Subject: [PATCH 19/55] Make the 
Transform read from an EnrichedDocumentsBatchReader --- milli/src/documents/enriched.rs | 103 ++++++++++++++++++ milli/src/documents/mod.rs | 13 +++ milli/src/documents/reader.rs | 20 ++-- milli/src/error.rs | 10 +- milli/src/update/index_documents/mod.rs | 6 +- milli/src/update/index_documents/transform.rs | 8 +- milli/src/update/index_documents/validate.rs | 22 ++-- 7 files changed, 158 insertions(+), 24 deletions(-) create mode 100644 milli/src/documents/enriched.rs diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs new file mode 100644 index 000000000..8645e06c4 --- /dev/null +++ b/milli/src/documents/enriched.rs @@ -0,0 +1,103 @@ +use std::fs::File; +use std::{io, str}; + +use obkv::KvReader; + +use super::{ + DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader, + Error, +}; +use crate::FieldId; + +/// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have +/// been created with a `DocumentsBatchWriter` and, for the enriched data, +/// a simple `grenad::Reader`. +/// +/// The documents are returned in the form of `obkv::Reader` where each field is identified with a +/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index. +pub struct EnrichedDocumentsBatchReader { + documents: DocumentsBatchReader, + external_ids: grenad::ReaderCursor, +} + +impl EnrichedDocumentsBatchReader { + pub fn new( + documents: DocumentsBatchReader, + external_ids: grenad::Reader, + ) -> Result { + if documents.documents_count() as u64 == external_ids.len() { + Ok(EnrichedDocumentsBatchReader { + documents, + external_ids: external_ids.into_cursor()?, + }) + } else { + Err(Error::InvalidEnrichedData) + } + } + + pub fn documents_count(&self) -> u32 { + self.documents.documents_count() + } + + pub fn is_empty(&self) -> bool { + self.documents.is_empty() + } + + pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { + self.documents.documents_batch_index() + } + + /// This method returns a forward cursor over the enriched documents. + pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor { + let EnrichedDocumentsBatchReader { documents, mut external_ids } = self; + external_ids.reset(); + EnrichedDocumentsBatchCursor { documents: documents.into_cursor(), external_ids } + } +} + +#[derive(Debug, Clone, Copy)] +pub struct EnrichedDocument<'a> { + pub document: KvReader<'a, FieldId>, + pub external_id: &'a str, +} + +pub struct EnrichedDocumentsBatchCursor { + documents: DocumentsBatchCursor, + external_ids: grenad::ReaderCursor, +} + +impl EnrichedDocumentsBatchCursor { + pub fn into_reader(self) -> EnrichedDocumentsBatchReader { + let EnrichedDocumentsBatchCursor { documents, external_ids } = self; + EnrichedDocumentsBatchReader { documents: documents.into_reader(), external_ids } + } + + pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { + self.documents.documents_batch_index() + } + + /// Resets the cursor to be able to read from the start again. + pub fn reset(&mut self) { + self.documents.reset(); + self.external_ids.reset(); + } +} + +impl EnrichedDocumentsBatchCursor { + /// Returns the next document, starting from the first one. Subsequent calls to + /// `next_document` advance the document reader until all the documents have been read. + pub fn next_enriched_document( + &mut self, + ) -> Result, DocumentsBatchCursorError> { + let document = self.documents.next_document()?; + let external_id = match self.external_ids.move_on_next()? 
{ + Some((_, bytes)) => Some(str::from_utf8(bytes)?), + None => None, + }; + + match document.zip(external_id) { + Some((document, external_id)) => Ok(Some(EnrichedDocument { document, external_id })), + None => Ok(None), + } + } +} diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 66a05b7b6..43bfc1c20 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,11 +1,14 @@ mod builder; +mod enriched; mod reader; use std::fmt::{self, Debug}; use std::io; +use std::str::Utf8Error; use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; +pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; use obkv::KvReader; pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; @@ -87,6 +90,8 @@ impl DocumentsBatchIndex { pub enum Error { ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, InvalidDocumentFormat, + InvalidEnrichedData, + InvalidUtf8(Utf8Error), Csv(csv::Error), Json(serde_json::Error), Serialize(serde_json::Error), @@ -118,6 +123,12 @@ impl From for Error { } } +impl From for Error { + fn from(other: Utf8Error) -> Self { + Self::InvalidUtf8(other) + } +} + impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -127,6 +138,8 @@ impl fmt::Display for Error { Error::InvalidDocumentFormat => { f.write_str("Invalid document addition format, missing the documents batch index.") } + Error::InvalidEnrichedData => f.write_str("Invalid enriched data."), + Error::InvalidUtf8(e) => write!(f, "{}", e), Error::Io(e) => write!(f, "{}", e), Error::Serialize(e) => write!(f, "{}", e), Error::Grenad(e) => write!(f, "{}", e), diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 720b403b9..7bd6dbd51 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,5 +1,5 @@ use std::convert::TryInto; -use std::{error, fmt, io}; +use std::{error, fmt, io, str}; use obkv::KvReader; @@ -93,19 +93,20 @@ impl DocumentsBatchCursor { /// The possible error thrown by the `DocumentsBatchCursor` when iterating on the documents. 
#[derive(Debug)] -pub struct DocumentsBatchCursorError { - inner: grenad::Error, +pub enum DocumentsBatchCursorError { + Grenad(grenad::Error), + Utf8(str::Utf8Error), } impl From for DocumentsBatchCursorError { fn from(error: grenad::Error) -> DocumentsBatchCursorError { - DocumentsBatchCursorError { inner: error } + DocumentsBatchCursorError::Grenad(error) } } -impl Into for DocumentsBatchCursorError { - fn into(self) -> grenad::Error { - self.inner +impl From for DocumentsBatchCursorError { + fn from(error: str::Utf8Error) -> DocumentsBatchCursorError { + DocumentsBatchCursorError::Utf8(error) } } @@ -113,6 +114,9 @@ impl error::Error for DocumentsBatchCursorError {} impl fmt::Display for DocumentsBatchCursorError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.inner.fmt(f) + match self { + DocumentsBatchCursorError::Grenad(e) => e.fmt(f), + DocumentsBatchCursorError::Utf8(e) => e.fmt(f), + } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index d9dca287d..0419ceeda 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -7,7 +7,7 @@ use rayon::ThreadPoolBuildError; use serde_json::Value; use thiserror::Error; -use crate::documents::DocumentsBatchCursorError; +use crate::documents::{self, DocumentsBatchCursorError}; use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { @@ -36,6 +36,8 @@ pub enum InternalError { FieldIdMappingMissingEntry { key: FieldId }, #[error(transparent)] Fst(#[from] fst::Error), + #[error(transparent)] + DocumentsError(#[from] documents::Error), #[error("Invalid compression type have been specified to grenad.")] GrenadInvalidCompressionType, #[error("Invalid grenad file with an invalid version format.")] @@ -185,6 +187,7 @@ macro_rules! error_from_sub_error { error_from_sub_error! { FieldIdMapMissingEntry => InternalError, fst::Error => InternalError, + documents::Error => InternalError, str::Utf8Error => InternalError, ThreadPoolBuildError => InternalError, SerializationError => InternalError, @@ -212,7 +215,10 @@ where impl From for Error { fn from(error: DocumentsBatchCursorError) -> Error { - Error::from(Into::::into(error)) + match error { + DocumentsBatchCursorError::Grenad(e) => Error::from(e), + DocumentsBatchCursorError::Utf8(e) => Error::from(e), + } } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba1064684..fe3bd1f8f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -27,7 +27,7 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use self::validate::validate_documents_batch; +use self::validate::validate_and_enrich_documents_batch; pub use self::validate::{ extract_float_from_value, validate_document_id, validate_document_id_value, validate_geo_from_json, @@ -141,7 +141,7 @@ where // We check for user errors in this validator and if there is one, we can return // the `IndexDocument` struct as it is valid to send more documents into it. // However, if there is an internal error we throw it away! - let reader = match validate_documents_batch( + let enriched_documents_reader = match validate_and_enrich_documents_batch( self.wtxn, self.index, self.config.autogenerate_docids, @@ -155,7 +155,7 @@ where .transform .as_mut() .expect("Invalid document addition state") - .read_documents(reader, self.wtxn, &self.progress)? 
+ .read_documents(enriched_documents_reader, self.wtxn, &self.progress)? as u64; self.added_documents += indexed_documents; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 38f6dc8ff..4d0a4c311 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -14,7 +14,7 @@ use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; use super::{IndexDocumentsMethod, IndexerConfig}; -use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; +use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; use crate::update::index_documents::validate_document_id_value; @@ -153,7 +153,7 @@ impl<'a, 'i> Transform<'a, 'i> { pub fn read_documents( &mut self, - reader: DocumentsBatchReader, + reader: EnrichedDocumentsBatchReader, wtxn: &mut heed::RwTxn, progress_callback: F, ) -> Result @@ -189,7 +189,9 @@ impl<'a, 'i> Transform<'a, 'i> { let mut external_id_buffer = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); let addition_index = cursor.documents_batch_index().clone(); - while let Some(document) = cursor.next_document()? { + while let Some(enriched_document) = cursor.next_enriched_document()? { + let EnrichedDocument { document, external_id } = enriched_document; + let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 32e8de03f..8b68532cb 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -4,27 +4,28 @@ use std::result::Result as StdResult; use serde_json::Value; -use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; +use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; use crate::error::{GeoError, InternalError, UserError}; -use crate::update::index_documents::obkv_to_object; +use crate::update::index_documents::{obkv_to_object, writer_into_reader}; use crate::{FieldId, Index, Object, Result}; /// The symbol used to define levels in a nested primary key. const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; -/// This function validates a documents by checking that: +/// This function validates and enrich the documents by checking that: /// - we can infer a primary key, -/// - all the documents id exist and, +/// - all the documents id exist and are extracted, /// - the validity of them but also, /// - the validity of the `_geo` field depending on the settings. -pub fn validate_documents_batch( +pub fn validate_and_enrich_documents_batch( rtxn: &heed::RoTxn, index: &Index, autogenerate_docids: bool, reader: DocumentsBatchReader, -) -> Result, UserError>> { +) -> Result, UserError>> { let mut cursor = reader.into_cursor(); let mut documents_batch_index = cursor.documents_batch_index().clone(); + let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. 
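// A std-only sketch of why the external-ids file is keyed by the document
// count encoded as big-endian bytes (see the `insert` call in the hunk just
// below): for unsigned integers, big-endian byte order and numeric order
// coincide, so the sorted key-value file hands the ids back in exactly the
// order the documents were read, ready to be zipped with them again.
fn main() {
    let keys: Vec<[u8; 4]> = (0u32..1000).map(|n| n.to_be_bytes()).collect();
    let mut sorted = keys.clone();
    sorted.sort();
    assert_eq!(keys, sorted); // insertion order == lexicographic key order
}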
@@ -82,6 +83,8 @@ pub fn validate_documents_batch( Err(user_error) => return Ok(Err(user_error)), }; + external_ids.insert(count.to_be_bytes(), &document_id)?; + if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? { return Ok(Err(UserError::from(user_error))); @@ -90,7 +93,10 @@ pub fn validate_documents_batch( count += 1; } - Ok(Ok(cursor.into_reader())) + let external_ids = writer_into_reader(external_ids)?; + let reader = EnrichedDocumentsBatchReader::new(cursor.into_reader(), external_ids)?; + + Ok(Ok(reader)) } /// Retrieve the document id after validating it, returning a `UserError` @@ -100,7 +106,7 @@ fn fetch_document_id( documents_batch_index: &DocumentsBatchIndex, primary_key: PrimaryKey, autogenerate_docids: bool, - count: usize, + count: u32, ) -> Result> { match primary_key { PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { From 5f1bfb73eeeb1da17a1d2598d01916b08022aa8c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 10:45:25 +0200 Subject: [PATCH 20/55] Extract the primary key name and make it accessible --- milli/src/documents/enriched.rs | 28 +++++++++++++++++--- milli/src/update/index_documents/validate.rs | 6 ++++- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs index 8645e06c4..918b47c95 100644 --- a/milli/src/documents/enriched.rs +++ b/milli/src/documents/enriched.rs @@ -17,17 +17,20 @@ use crate::FieldId; /// `FieldId`. The mapping between the field ids and the field names is done thanks to the index. pub struct EnrichedDocumentsBatchReader { documents: DocumentsBatchReader, + primary_key: String, external_ids: grenad::ReaderCursor, } impl EnrichedDocumentsBatchReader { pub fn new( documents: DocumentsBatchReader, + primary_key: String, external_ids: grenad::Reader, ) -> Result { if documents.documents_count() as u64 == external_ids.len() { Ok(EnrichedDocumentsBatchReader { documents, + primary_key, external_ids: external_ids.into_cursor()?, }) } else { @@ -39,6 +42,10 @@ impl EnrichedDocumentsBatchReader { self.documents.documents_count() } + pub fn primary_key(&self) -> &str { + &self.primary_key + } + pub fn is_empty(&self) -> bool { self.documents.is_empty() } @@ -49,9 +56,13 @@ impl EnrichedDocumentsBatchReader { /// This method returns a forward cursor over the enriched documents. 
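// A std-only sketch of the invariant checked by `EnrichedDocumentsBatchReader::new`
// in the previous patch: there must be exactly one external id per document,
// because the enriched cursor simply zips the two streams back together and a
// plain zip would silently stop at the shorter side instead of reporting the
// mismatch.
fn main() {
    let documents = ["doc0", "doc1", "doc2"];
    let external_ids = ["azur", "bleu"]; // one id is missing
    assert_ne!(documents.len(), external_ids.len()); // `new` refuses such a pair
    // zipping alone would only hide the problem:
    assert_eq!(documents.iter().zip(external_ids.iter()).count(), 2);
}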
pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor { - let EnrichedDocumentsBatchReader { documents, mut external_ids } = self; + let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self; external_ids.reset(); - EnrichedDocumentsBatchCursor { documents: documents.into_cursor(), external_ids } + EnrichedDocumentsBatchCursor { + documents: documents.into_cursor(), + primary_key, + external_ids, + } } } @@ -63,13 +74,22 @@ pub struct EnrichedDocument<'a> { pub struct EnrichedDocumentsBatchCursor { documents: DocumentsBatchCursor, + primary_key: String, external_ids: grenad::ReaderCursor, } impl EnrichedDocumentsBatchCursor { pub fn into_reader(self) -> EnrichedDocumentsBatchReader { - let EnrichedDocumentsBatchCursor { documents, external_ids } = self; - EnrichedDocumentsBatchReader { documents: documents.into_reader(), external_ids } + let EnrichedDocumentsBatchCursor { documents, primary_key, external_ids } = self; + EnrichedDocumentsBatchReader { + documents: documents.into_reader(), + primary_key, + external_ids, + } + } + + pub fn primary_key(&self) -> &str { + &self.primary_key } pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 8b68532cb..83d7ef38e 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -94,7 +94,11 @@ pub fn validate_and_enrich_documents_batch( } let external_ids = writer_into_reader(external_ids)?; - let reader = EnrichedDocumentsBatchReader::new(cursor.into_reader(), external_ids)?; + let reader = EnrichedDocumentsBatchReader::new( + cursor.into_reader(), + primary_key.primary_key().to_string(), + external_ids, + )?; Ok(Ok(reader)) } From 742543091e9b33084952ad27e00eff60655d7bf0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 10:48:07 +0200 Subject: [PATCH 21/55] Constify the default primary key name --- milli/src/update/index_documents/validate.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 83d7ef38e..4e52f4cb9 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -12,6 +12,9 @@ use crate::{FieldId, Index, Object, Result}; /// The symbol used to define levels in a nested primary key. const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; +/// The default primary that is used when not specified. 
+const DEFAULT_PRIMARY_KEY: &str = "id"; + /// This function validates and enrich the documents by checking that: /// - we can infer a primary key, /// - all the documents id exist and are extracted, @@ -51,13 +54,14 @@ pub fn validate_and_enrich_documents_batch( None => { let guessed = documents_batch_index .iter() - .filter(|(_, name)| name.to_lowercase().contains("id")) + .filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY)) .min_by_key(|(fid, _)| *fid); match guessed { Some((id, name)) => PrimaryKey::flat(name.as_str(), *id), - None if autogenerate_docids => { - PrimaryKey::flat("id", documents_batch_index.insert("id")) - } + None if autogenerate_docids => PrimaryKey::flat( + DEFAULT_PRIMARY_KEY, + documents_batch_index.insert(DEFAULT_PRIMARY_KEY), + ), None => return Ok(Err(UserError::MissingPrimaryKey)), } } From 905af2a2e9f69b2341d532b105bbf1b3baff819e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 11:12:51 +0200 Subject: [PATCH 22/55] Use the primary key and external id in the transform --- milli/src/update/index_documents/transform.rs | 312 +++++------------- 1 file changed, 90 insertions(+), 222 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 4d0a4c311..e82556ec7 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -9,7 +9,7 @@ use heed::RoTxn; use itertools::Itertools; use obkv::{KvReader, KvWriter}; use roaring::RoaringBitmap; -use serde_json::{Map, Value}; +use serde_json::Value; use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; @@ -17,15 +17,12 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; -use crate::update::index_documents::validate_document_id_value; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::{ ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, }; -const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; - pub struct TransformOutput { pub primary_key: String, pub fields_ids_map: FieldsIdsMap, @@ -85,18 +82,6 @@ fn create_fields_mapping( .collect() } -/// Look for a key containing the [DEFAULT_PRIMARY_KEY_NAME] in the fields. -/// It doesn't look in the subfield because we don't want to enable the -/// primary key inference on nested objects. -fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { - index - .iter() - .sorted_by_key(|(k, _)| *k) - .map(|(_, v)| v) - .find(|v| v.to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME)) - .map(String::as_str) -} - impl<'a, 'i> Transform<'a, 'i> { pub fn new( wtxn: &mut heed::RwTxn, @@ -167,28 +152,15 @@ impl<'a, 'i> Transform<'a, 'i> { let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; - let alternative_name = self - .index - .primary_key(wtxn)? 
- .or_else(|| find_primary_key(fields_index)) - .map(String::from); - - let (primary_key_id, primary_key_name) = compute_primary_key_pair( - self.index.primary_key(wtxn)?, - &mut self.fields_ids_map, - alternative_name, - self.autogenerate_docids, - )?; - - let primary_key_id_nested = primary_key_name.contains('.'); + let primary_key = cursor.primary_key().to_string(); + self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; + let primary_key_id_nested = primary_key.contains('.'); let mut flattened_document = None; let mut obkv_buffer = Vec::new(); let mut flattened_obkv_buffer = Vec::new(); let mut documents_count = 0; - let mut external_id_buffer = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); - let addition_index = cursor.documents_batch_index().clone(); while let Some(enriched_document) = cursor.next_enriched_document()? { let EnrichedDocument { document, external_id } = enriched_document; @@ -210,8 +182,7 @@ impl<'a, 'i> Transform<'a, 'i> { // it, transform it into a string and validate it, and then update it in the // document. If none is found, and we were told to generate missing document ids, then // we create the missing field, and update the new document. - let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; - let external_id = if primary_key_id_nested { + if primary_key_id_nested { let mut field_buffer_cache = field_buffer_cache.clone(); self.flatten_from_field_mapping( &mapping, @@ -220,29 +191,6 @@ impl<'a, 'i> Transform<'a, 'i> { &mut field_buffer_cache, )?; flattened_document = Some(&flattened_obkv_buffer); - let document = KvReader::new(&flattened_obkv_buffer); - - update_primary_key( - document, - &addition_index, - primary_key_id, - &primary_key_name, - &mut uuid_buffer, - &mut field_buffer_cache, - &mut external_id_buffer, - self.autogenerate_docids, - )? - } else { - update_primary_key( - document, - &addition_index, - primary_key_id, - &primary_key_name, - &mut uuid_buffer, - &mut field_buffer_cache, - &mut external_id_buffer, - self.autogenerate_docids, - )? }; // Insertion in a obkv need to be done with keys ordered. For now they are ordered @@ -318,7 +266,6 @@ impl<'a, 'i> Transform<'a, 'i> { }); field_buffer = drop_and_reuse(field_buffer_cache); - external_id_buffer.clear(); obkv_buffer.clear(); } @@ -327,7 +274,7 @@ impl<'a, 'i> Transform<'a, 'i> { }); self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; - self.index.put_primary_key(wtxn, &primary_key_name)?; + self.index.put_primary_key(wtxn, &primary_key)?; self.documents_count += documents_count; // Now that we have a valid sorter that contains the user id and the obkv we // give it to the last transforming function which returns the TransformOutput. @@ -749,42 +696,6 @@ impl<'a, 'i> Transform<'a, 'i> { } } -/// Given an optional primary key and an optional alternative name, returns the (field_id, attr_name) -/// for the primary key according to the following rules: -/// - if primary_key is `Some`, returns the id and the name, else -/// - if alternative_name is Some, adds alternative to the fields_ids_map, and returns the pair, else -/// - if autogenerate_docids is true, insert the default id value in the field ids map ("id") and -/// returns the pair, else -/// - returns an error. 
-fn compute_primary_key_pair( - primary_key: Option<&str>, - fields_ids_map: &mut FieldsIdsMap, - alternative_name: Option, - autogenerate_docids: bool, -) -> Result<(FieldId, String)> { - match primary_key { - Some(primary_key) => { - let id = fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; - Ok((id, primary_key.to_string())) - } - None => { - let name = match alternative_name { - Some(key) => key, - None => { - if !autogenerate_docids { - // If there is no primary key in the current document batch, we must - // return an error and not automatically generate any document id. - return Err(UserError::MissingPrimaryKey.into()); - } - DEFAULT_PRIMARY_KEY_NAME.to_string() - } - }; - let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; - Ok((id, name)) - } - } -} - /// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec`. /// /// The size and alignment of T and U must match. @@ -796,49 +707,6 @@ fn drop_and_reuse(mut vec: Vec) -> Vec { vec.into_iter().map(|_| unreachable!()).collect() } -fn update_primary_key<'a>( - document: KvReader<'a, FieldId>, - addition_index: &DocumentsBatchIndex, - primary_key_id: FieldId, - primary_key_name: &str, - uuid_buffer: &'a mut [u8; uuid::fmt::Hyphenated::LENGTH], - field_buffer_cache: &mut Vec<(u16, Cow<'a, [u8]>)>, - mut external_id_buffer: &'a mut Vec, - autogenerate_docids: bool, -) -> Result> { - match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { - Some((_, bytes)) => { - let document_id = serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)?; - let value = validate_document_id_value(document_id)??; - serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?; - Ok(Cow::Owned(value)) - } - None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); - serde_json::to_writer(&mut external_id_buffer, &uuid) - .map_err(InternalError::SerdeJson)?; - field_buffer_cache.push((primary_key_id, external_id_buffer.as_slice().into())); - Ok(Cow::Borrowed(&*uuid)) - } - None => { - let mut json = Map::new(); - for (key, value) in document.iter() { - let key = addition_index.name(key).map(ToString::to_string); - let value = serde_json::from_slice::(&value).ok(); - - if let Some((k, v)) = key.zip(value) { - json.insert(k, v); - } - } - - Err(UserError::MissingDocumentId { - primary_key: primary_key_name.to_string(), - document: json, - })? 
- } - } -} - impl TransformOutput { // find and insert the new field ids pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result> { @@ -853,87 +721,87 @@ impl TransformOutput { } } -#[cfg(test)] -mod test { - use super::*; - - mod compute_primary_key { - use big_s::S; - - use super::{compute_primary_key_pair, FieldsIdsMap}; - - #[test] - fn should_return_primary_key_if_is_some() { - let mut fields_map = FieldsIdsMap::new(); - fields_map.insert("toto").unwrap(); - let result = compute_primary_key_pair( - Some("toto"), - &mut fields_map, - Some("tata".to_string()), - false, - ); - assert_eq!(result.unwrap(), (0, "toto".to_string())); - assert_eq!(fields_map.len(), 1); - - // and with nested fields - let mut fields_map = FieldsIdsMap::new(); - fields_map.insert("toto.tata").unwrap(); - let result = compute_primary_key_pair( - Some("toto.tata"), - &mut fields_map, - Some(S("titi")), - false, - ); - assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); - assert_eq!(fields_map.len(), 1); - } - - #[test] - fn should_return_alternative_if_primary_is_none() { - let mut fields_map = FieldsIdsMap::new(); - let result = - compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); - assert_eq!(result.unwrap(), (0, S("tata"))); - assert_eq!(fields_map.len(), 1); - } - - #[test] - fn should_return_default_if_both_are_none() { - let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair(None, &mut fields_map, None, true); - assert_eq!(result.unwrap(), (0, S("id"))); - assert_eq!(fields_map.len(), 1); - } - - #[test] - fn should_return_err_if_both_are_none_and_recompute_is_false() { - let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair(None, &mut fields_map, None, false); - assert!(result.is_err()); - assert_eq!(fields_map.len(), 0); - } - } - - mod primary_key_inference { - use big_s::S; - use bimap::BiHashMap; - - use crate::documents::DocumentsBatchIndex; - use crate::update::index_documents::transform::find_primary_key; - - #[test] - fn primary_key_infered_on_first_field() { - // We run the test multiple times to change the order in which the fields are iterated upon. 
- for _ in 1..50 { - let mut map = BiHashMap::new(); - map.insert(1, S("fakeId")); - map.insert(2, S("fakeId")); - map.insert(3, S("fakeId")); - map.insert(4, S("fakeId")); - map.insert(0, S("realId")); - - assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); - } - } - } -} +// #[cfg(test)] +// mod test { +// use super::*; + +// mod compute_primary_key { +// use big_s::S; + +// use super::{compute_primary_key_pair, FieldsIdsMap}; + +// #[test] +// fn should_return_primary_key_if_is_some() { +// let mut fields_map = FieldsIdsMap::new(); +// fields_map.insert("toto").unwrap(); +// let result = compute_primary_key_pair( +// Some("toto"), +// &mut fields_map, +// Some("tata".to_string()), +// false, +// ); +// assert_eq!(result.unwrap(), (0, "toto".to_string())); +// assert_eq!(fields_map.len(), 1); + +// // and with nested fields +// let mut fields_map = FieldsIdsMap::new(); +// fields_map.insert("toto.tata").unwrap(); +// let result = compute_primary_key_pair( +// Some("toto.tata"), +// &mut fields_map, +// Some(S("titi")), +// false, +// ); +// assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); +// assert_eq!(fields_map.len(), 1); +// } + +// #[test] +// fn should_return_alternative_if_primary_is_none() { +// let mut fields_map = FieldsIdsMap::new(); +// let result = +// compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); +// assert_eq!(result.unwrap(), (0, S("tata"))); +// assert_eq!(fields_map.len(), 1); +// } + +// #[test] +// fn should_return_default_if_both_are_none() { +// let mut fields_map = FieldsIdsMap::new(); +// let result = compute_primary_key_pair(None, &mut fields_map, None, true); +// assert_eq!(result.unwrap(), (0, S("id"))); +// assert_eq!(fields_map.len(), 1); +// } + +// #[test] +// fn should_return_err_if_both_are_none_and_recompute_is_false() { +// let mut fields_map = FieldsIdsMap::new(); +// let result = compute_primary_key_pair(None, &mut fields_map, None, false); +// assert!(result.is_err()); +// assert_eq!(fields_map.len(), 0); +// } +// } + +// mod primary_key_inference { +// use big_s::S; +// use bimap::BiHashMap; + +// use crate::documents::DocumentsBatchIndex; +// use crate::update::index_documents::transform::find_primary_key; + +// #[test] +// fn primary_key_infered_on_first_field() { +// // We run the test multiple times to change the order in which the fields are iterated upon. 
+// for _ in 1..50 { +// let mut map = BiHashMap::new(); +// map.insert(1, S("fakeId")); +// map.insert(2, S("fakeId")); +// map.insert(3, S("fakeId")); +// map.insert(4, S("fakeId")); +// map.insert(0, S("realId")); + +// assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); +// } +// } +// } +// } From c8ebf0de47e09964a9b4060b1da9982a593040e1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 11:14:14 +0200 Subject: [PATCH 23/55] Rename the validate function as an enriching function --- .../index_documents/{validate.rs => enrich.rs} | 2 +- milli/src/update/index_documents/mod.rs | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) rename milli/src/update/index_documents/{validate.rs => enrich.rs} (99%) diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/enrich.rs similarity index 99% rename from milli/src/update/index_documents/validate.rs rename to milli/src/update/index_documents/enrich.rs index 4e52f4cb9..e3c3bd6f6 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -20,7 +20,7 @@ const DEFAULT_PRIMARY_KEY: &str = "id"; /// - all the documents id exist and are extracted, /// - the validity of them but also, /// - the validity of the `_geo` field depending on the settings. -pub fn validate_and_enrich_documents_batch( +pub fn enrich_documents_batch( rtxn: &heed::RoTxn, index: &Index, autogenerate_docids: bool, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index fe3bd1f8f..db1a768e6 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1,8 +1,8 @@ +mod enrich; mod extract; mod helpers; mod transform; mod typed_chunk; -mod validate; use std::collections::HashSet; use std::io::{Cursor, Read, Seek}; @@ -19,6 +19,11 @@ use serde::{Deserialize, Serialize}; use slice_group_by::GroupBy; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; +use self::enrich::enrich_documents_batch; +pub use self::enrich::{ + extract_float_from_value, validate_document_id, validate_document_id_value, + validate_geo_from_json, +}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, @@ -27,11 +32,6 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use self::validate::validate_and_enrich_documents_batch; -pub use self::validate::{ - extract_float_from_value, validate_document_id, validate_document_id_value, - validate_geo_from_json, -}; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; pub use crate::update::index_documents::helpers::CursorClonableMmap; @@ -141,7 +141,7 @@ where // We check for user errors in this validator and if there is one, we can return // the `IndexDocument` struct as it is valid to send more documents into it. // However, if there is an internal error we throw it away! 
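// A standalone sketch (hypothetical names) of the nested-result convention the
// comment above describes: the outer `Result` carries internal errors that
// abort the whole addition, while the inner one carries user errors that are
// reported back but leave the builder usable for more documents.
enum UserFault {
    MissingDocumentId,
}

#[allow(dead_code)]
enum InternalFault {
    Io(std::io::Error),
}

fn check_document(raw: &str) -> Result<Result<String, UserFault>, InternalFault> {
    if raw.is_empty() {
        // user error: surfaced to the caller, indexing can go on afterwards
        return Ok(Err(UserFault::MissingDocumentId));
    }
    // success: the enriched value travels through both layers
    Ok(Ok(raw.to_uppercase()))
}

fn main() {
    match check_document("movie-107") {
        Ok(Ok(id)) => println!("enriched id: {}", id),
        Ok(Err(_user_error)) => println!("user error, builder still usable"),
        Err(_internal_error) => println!("internal error, abort"),
    }
}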
- let enriched_documents_reader = match validate_and_enrich_documents_batch( + let enriched_documents_reader = match enrich_documents_batch( self.wtxn, self.index, self.config.autogenerate_docids, From d1a4da98127207b7ce560280911480a4e0a2ced4 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 11:16:59 +0200 Subject: [PATCH 24/55] Generate a real UUIDv4 when ids are auto-generated --- milli/src/update/index_documents/enrich.rs | 93 ++++++++++++++----- .../extract/extract_geo_points.rs | 14 +-- 2 files changed, 78 insertions(+), 29 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index e3c3bd6f6..5d00565a8 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -1,6 +1,6 @@ use std::io::{Read, Seek}; -use std::iter; use std::result::Result as StdResult; +use std::{fmt, iter}; use serde_json::Value; @@ -29,6 +29,7 @@ pub fn enrich_documents_batch( let mut cursor = reader.into_cursor(); let mut documents_batch_index = cursor.documents_batch_index().clone(); let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; + let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. @@ -76,31 +77,33 @@ pub fn enrich_documents_batch( let mut count = 0; while let Some(document) = cursor.next_document()? { - let document_id = match fetch_document_id( + let document_id = match fetch_or_generate_document_id( &document, &documents_batch_index, primary_key, autogenerate_docids, + &mut uuid_buffer, count, )? { Ok(document_id) => document_id, Err(user_error) => return Ok(Err(user_error)), }; - external_ids.insert(count.to_be_bytes(), &document_id)?; + external_ids.insert(count.to_be_bytes(), document_id.value())?; if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { - if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? { + if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? { return Ok(Err(UserError::from(user_error))); } } + count += 1; } let external_ids = writer_into_reader(external_ids)?; let reader = EnrichedDocumentsBatchReader::new( cursor.into_reader(), - primary_key.primary_key().to_string(), + primary_key.name().to_string(), external_ids, )?; @@ -109,13 +112,14 @@ pub fn enrich_documents_batch( /// Retrieve the document id after validating it, returning a `UserError` /// if the id is invalid or can't be guessed. -fn fetch_document_id( +fn fetch_or_generate_document_id( document: &obkv::KvReader, documents_batch_index: &DocumentsBatchIndex, primary_key: PrimaryKey, autogenerate_docids: bool, + uuid_buffer: &mut [u8; uuid::adapter::Hyphenated::LENGTH], count: u32, -) -> Result> { +) -> Result> { match primary_key { PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { match document.get(primary_key_id) { @@ -123,12 +127,13 @@ fn fetch_document_id( let document_id = serde_json::from_slice(document_id_bytes) .map_err(InternalError::SerdeJson)?; match validate_document_id_value(document_id)? 
{ - Ok(document_id) => Ok(Ok(document_id)), + Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), Err(user_error) => Ok(Err(user_error)), } } None if autogenerate_docids => { - Ok(Ok(format!("{{auto-generated id of the {}nth document}}", count))) + let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer); + Ok(Ok(DocumentId::generated(uuid.to_string(), count))) } None => Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), @@ -147,7 +152,7 @@ fn fetch_document_id( if matching_documents_ids.len() >= 2 { return Ok(Err(UserError::TooManyDocumentIds { - primary_key: nested.primary_key().to_string(), + primary_key: nested.name().to_string(), document: obkv_to_object(&document, &documents_batch_index)?, })); } @@ -157,11 +162,11 @@ fn fetch_document_id( match matching_documents_ids.pop() { Some(document_id) => match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(document_id)), + Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), Err(user_error) => Ok(Err(user_error)), }, None => Ok(Err(UserError::MissingDocumentId { - primary_key: nested.primary_key().to_string(), + primary_key: nested.name().to_string(), document: obkv_to_object(&document, &documents_batch_index)?, })), } @@ -186,7 +191,7 @@ impl PrimaryKey<'_> { PrimaryKey::Nested { name } } - fn primary_key(&self) -> &str { + fn name(&self) -> &str { match self { PrimaryKey::Flat { name, .. } => name, PrimaryKey::Nested { name } => name, @@ -196,11 +201,53 @@ impl PrimaryKey<'_> { /// Returns an `Iterator` that gives all the possible fields names the primary key /// can have depending of the first level name and deepnes of the objects. fn possible_level_names(&self) -> impl Iterator + '_ { - let name = self.primary_key(); + let name = self.name(); iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL)) } } +/// A type that represents a document id that has been retrieved from a document or auto-generated. +/// +/// In case the document id has been auto-generated, the document nth is kept to help +/// users debug if there is an issue with the document itself. +#[derive(Clone)] +pub enum DocumentId { + Retrieved { value: String }, + Generated { value: String, document_nth: u32 }, +} + +impl DocumentId { + fn retrieved(value: String) -> DocumentId { + DocumentId::Retrieved { value } + } + + fn generated(value: String, document_nth: u32) -> DocumentId { + DocumentId::Generated { value, document_nth } + } + + fn value(&self) -> &str { + match self { + DocumentId::Retrieved { value } => value, + DocumentId::Generated { value, .. } => value, + } + } + + fn debug(&self) -> String { + format!("{:?}", self) + } +} + +impl fmt::Debug for DocumentId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DocumentId::Retrieved { value } => write!(f, "{:?}", value), + DocumentId::Generated { value, document_nth } => { + write!(f, "{{{:?}}} of the {}nth document", value, document_nth) + } + } + } +} + fn contained_in(selector: &str, key: &str) -> bool { selector.starts_with(key) && selector[key.len()..] @@ -281,23 +328,25 @@ pub fn extract_float_from_value(value: Value) -> StdResult { } } -pub fn validate_geo_from_json(document_id: Value, bytes: &[u8]) -> Result> { +pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result> { + use GeoError::*; + let debug_id = || Value::from(id.debug()); match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? 
{ Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { (Some(lat), Some(lng)) => { match (extract_float_from_value(lat), extract_float_from_value(lng)) { (Ok(_), Ok(_)) => Ok(Ok(())), - (Err(value), Ok(_)) => Ok(Err(GeoError::BadLatitude { document_id, value })), - (Ok(_), Err(value)) => Ok(Err(GeoError::BadLongitude { document_id, value })), + (Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })), + (Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })), (Err(lat), Err(lng)) => { - Ok(Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng })) + Ok(Err(BadLatitudeAndLongitude { document_id: debug_id(), lat, lng })) } } } - (None, Some(_)) => Ok(Err(GeoError::MissingLatitude { document_id })), - (Some(_), None) => Ok(Err(GeoError::MissingLongitude { document_id })), - (None, None) => Ok(Err(GeoError::MissingLatitudeAndLongitude { document_id })), + (None, Some(_)) => Ok(Err(MissingLatitude { document_id: debug_id() })), + (Some(_), None) => Ok(Err(MissingLongitude { document_id: debug_id() })), + (None, None) => Ok(Err(MissingLatitudeAndLongitude { document_id: debug_id() })), }, - value => Ok(Err(GeoError::NotAnObject { document_id, value })), + value => Ok(Err(NotAnObject { document_id: debug_id(), value })), } } diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 46ef9ba9b..5a6de236b 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -29,9 +29,9 @@ pub fn extract_geo_points( let obkv = obkv::KvReader::new(value); // since we only needs the primary key when we throw an error we create this getter to // lazily get it when needed - let primary_key = || -> Value { - let primary_key = obkv.get(primary_key_id).unwrap(); - serde_json::from_slice(primary_key).unwrap() + let document_id = || -> Value { + let document_id = obkv.get(primary_key_id).unwrap(); + serde_json::from_slice(document_id).unwrap() }; // first we get the two fields @@ -43,19 +43,19 @@ pub fn extract_geo_points( let lat = extract_float_from_value( serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, ) - .map_err(|lat| GeoError::BadLatitude { document_id: primary_key(), value: lat })?; + .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; let lng = extract_float_from_value( serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, ) - .map_err(|lng| GeoError::BadLongitude { document_id: primary_key(), value: lng })?; + .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; writer.insert(docid_bytes, bytes)?; } else if lat.is_none() && lng.is_some() { - return Err(GeoError::MissingLatitude { document_id: primary_key() })?; + return Err(GeoError::MissingLatitude { document_id: document_id() })?; } else if lat.is_some() && lng.is_none() { - return Err(GeoError::MissingLongitude { document_id: primary_key() })?; + return Err(GeoError::MissingLongitude { document_id: document_id() })?; } } From 0bbcc7b1808061355202efe040c38f64de64ba50 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 14:41:19 +0200 Subject: [PATCH 25/55] Expose the `DocumentId` struct to be sure to inject the generated ids --- milli/src/documents/enriched.rs | 13 ++++++----- milli/src/documents/reader.rs | 12 +++++----- 
milli/src/error.rs | 2 +- milli/src/update/index_documents/enrich.rs | 22 ++++++++++++------- milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 13 +++++++++-- milli/src/update/mod.rs | 2 +- 7 files changed, 41 insertions(+), 25 deletions(-) diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs index 918b47c95..4f45a891a 100644 --- a/milli/src/documents/enriched.rs +++ b/milli/src/documents/enriched.rs @@ -7,6 +7,7 @@ use super::{ DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader, Error, }; +use crate::update::DocumentId; use crate::FieldId; /// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have @@ -66,10 +67,10 @@ impl EnrichedDocumentsBatchReader { } } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone)] pub struct EnrichedDocument<'a> { pub document: KvReader<'a, FieldId>, - pub external_id: &'a str, + pub document_id: DocumentId, } pub struct EnrichedDocumentsBatchCursor { @@ -110,13 +111,13 @@ impl EnrichedDocumentsBatchCursor { &mut self, ) -> Result, DocumentsBatchCursorError> { let document = self.documents.next_document()?; - let external_id = match self.external_ids.move_on_next()? { - Some((_, bytes)) => Some(str::from_utf8(bytes)?), + let document_id = match self.external_ids.move_on_next()? { + Some((_, bytes)) => serde_json::from_slice(bytes).map(Some)?, None => None, }; - match document.zip(external_id) { - Some((document, external_id)) => Ok(Some(EnrichedDocument { document, external_id })), + match document.zip(document_id) { + Some((document, document_id)) => Ok(Some(EnrichedDocument { document, document_id })), None => Ok(None), } } diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 7bd6dbd51..70b8b0131 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,5 +1,5 @@ use std::convert::TryInto; -use std::{error, fmt, io, str}; +use std::{error, fmt, io}; use obkv::KvReader; @@ -95,7 +95,7 @@ impl DocumentsBatchCursor { #[derive(Debug)] pub enum DocumentsBatchCursorError { Grenad(grenad::Error), - Utf8(str::Utf8Error), + SerdeJson(serde_json::Error), } impl From for DocumentsBatchCursorError { @@ -104,9 +104,9 @@ impl From for DocumentsBatchCursorError { } } -impl From for DocumentsBatchCursorError { - fn from(error: str::Utf8Error) -> DocumentsBatchCursorError { - DocumentsBatchCursorError::Utf8(error) +impl From for DocumentsBatchCursorError { + fn from(error: serde_json::Error) -> DocumentsBatchCursorError { + DocumentsBatchCursorError::SerdeJson(error) } } @@ -116,7 +116,7 @@ impl fmt::Display for DocumentsBatchCursorError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { DocumentsBatchCursorError::Grenad(e) => e.fmt(f), - DocumentsBatchCursorError::Utf8(e) => e.fmt(f), + DocumentsBatchCursorError::SerdeJson(e) => e.fmt(f), } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 0419ceeda..0abb41eec 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -217,7 +217,7 @@ impl From for Error { fn from(error: DocumentsBatchCursorError) -> Error { match error { DocumentsBatchCursorError::Grenad(e) => Error::from(e), - DocumentsBatchCursorError::Utf8(e) => Error::from(e), + DocumentsBatchCursorError::SerdeJson(e) => Error::from(InternalError::from(e)), } } } diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 5d00565a8..1a0c31c24 100644 --- 
a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -2,6 +2,7 @@ use std::io::{Read, Seek}; use std::result::Result as StdResult; use std::{fmt, iter}; +use serde::{Deserialize, Serialize}; use serde_json::Value; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; @@ -89,14 +90,15 @@ pub fn enrich_documents_batch( Err(user_error) => return Ok(Err(user_error)), }; - external_ids.insert(count.to_be_bytes(), document_id.value())?; - if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? { return Ok(Err(UserError::from(user_error))); } } + let document_id = serde_json::to_vec(&document_id).map_err(InternalError::SerdeJson)?; + external_ids.insert(count.to_be_bytes(), document_id)?; + count += 1; } @@ -210,7 +212,7 @@ impl PrimaryKey<'_> { /// /// In case the document id has been auto-generated, the document nth is kept to help /// users debug if there is an issue with the document itself. -#[derive(Clone)] +#[derive(Serialize, Deserialize, Clone)] pub enum DocumentId { Retrieved { value: String }, Generated { value: String, document_nth: u32 }, @@ -225,16 +227,20 @@ impl DocumentId { DocumentId::Generated { value, document_nth } } - fn value(&self) -> &str { + fn debug(&self) -> String { + format!("{:?}", self) + } + + pub fn is_generated(&self) -> bool { + matches!(self, DocumentId::Generated { .. }) + } + + pub fn value(&self) -> &str { match self { DocumentId::Retrieved { value } => value, DocumentId::Generated { value, .. } => value, } } - - fn debug(&self) -> String { - format!("{:?}", self) - } } impl fmt::Debug for DocumentId { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index db1a768e6..615e1dfc7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -22,7 +22,7 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; use self::enrich::enrich_documents_batch; pub use self::enrich::{ extract_float_from_value, validate_document_id, validate_document_id_value, - validate_geo_from_json, + validate_geo_from_json, DocumentId, }; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index e82556ec7..a34295a50 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -153,8 +153,9 @@ impl<'a, 'i> Transform<'a, 'i> { let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; let primary_key = cursor.primary_key().to_string(); - self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; let primary_key_id_nested = primary_key.contains('.'); + let primary_key_id = + self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; let mut flattened_document = None; let mut obkv_buffer = Vec::new(); @@ -162,7 +163,7 @@ impl<'a, 'i> Transform<'a, 'i> { let mut documents_count = 0; let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); while let Some(enriched_document) = cursor.next_enriched_document()? 
{ - let EnrichedDocument { document, external_id } = enriched_document; + let EnrichedDocument { document, document_id } = enriched_document; let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { @@ -171,6 +172,14 @@ impl<'a, 'i> Transform<'a, 'i> { }); } + // When the document id has been auto-generated by the `enrich_documents_batch` + // we must insert this document id into the remaped document. + let external_id = document_id.value(); + if document_id.is_generated() { + let docid = serde_json::to_vec(external_id).map_err(InternalError::SerdeJson)?; + field_buffer_cache.push((primary_key_id, Cow::from(docid))); + } + for (k, v) in document.iter() { let mapped_id = *mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 965ed4fd2..1bf27a5f0 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -3,7 +3,7 @@ pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; pub use self::facets::Facets; pub use self::index_documents::{ - DocumentAdditionResult, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, + DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; pub use self::indexer_config::IndexerConfig; pub use self::settings::{Setting, Settings}; From 5d149d631f7736443d5c871a3442bfd99dbaeb48 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 30 Jun 2022 15:13:50 +0200 Subject: [PATCH 26/55] Remove tests for a function that no more exists --- milli/src/update/index_documents/transform.rs | 85 ------------------- 1 file changed, 85 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index a34295a50..6bf3dde43 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -729,88 +729,3 @@ impl TransformOutput { .collect()) } } - -// #[cfg(test)] -// mod test { -// use super::*; - -// mod compute_primary_key { -// use big_s::S; - -// use super::{compute_primary_key_pair, FieldsIdsMap}; - -// #[test] -// fn should_return_primary_key_if_is_some() { -// let mut fields_map = FieldsIdsMap::new(); -// fields_map.insert("toto").unwrap(); -// let result = compute_primary_key_pair( -// Some("toto"), -// &mut fields_map, -// Some("tata".to_string()), -// false, -// ); -// assert_eq!(result.unwrap(), (0, "toto".to_string())); -// assert_eq!(fields_map.len(), 1); - -// // and with nested fields -// let mut fields_map = FieldsIdsMap::new(); -// fields_map.insert("toto.tata").unwrap(); -// let result = compute_primary_key_pair( -// Some("toto.tata"), -// &mut fields_map, -// Some(S("titi")), -// false, -// ); -// assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); -// assert_eq!(fields_map.len(), 1); -// } - -// #[test] -// fn should_return_alternative_if_primary_is_none() { -// let mut fields_map = FieldsIdsMap::new(); -// let result = -// compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); -// assert_eq!(result.unwrap(), (0, S("tata"))); -// assert_eq!(fields_map.len(), 1); -// } - -// #[test] -// fn should_return_default_if_both_are_none() { -// let mut fields_map = FieldsIdsMap::new(); -// let result = compute_primary_key_pair(None, &mut fields_map, None, true); -// assert_eq!(result.unwrap(), (0, S("id"))); -// 
assert_eq!(fields_map.len(), 1); -// } - -// #[test] -// fn should_return_err_if_both_are_none_and_recompute_is_false() { -// let mut fields_map = FieldsIdsMap::new(); -// let result = compute_primary_key_pair(None, &mut fields_map, None, false); -// assert!(result.is_err()); -// assert_eq!(fields_map.len(), 0); -// } -// } - -// mod primary_key_inference { -// use big_s::S; -// use bimap::BiHashMap; - -// use crate::documents::DocumentsBatchIndex; -// use crate::update::index_documents::transform::find_primary_key; - -// #[test] -// fn primary_key_infered_on_first_field() { -// // We run the test multiple times to change the order in which the fields are iterated upon. -// for _ in 1..50 { -// let mut map = BiHashMap::new(); -// map.insert(1, S("fakeId")); -// map.insert(2, S("fakeId")); -// map.insert(3, S("fakeId")); -// map.insert(4, S("fakeId")); -// map.insert(0, S("realId")); - -// assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); -// } -// } -// } -// } From 2eec290424b7508c3953be301e070028fdf900f7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 11 Jul 2022 16:36:23 +0200 Subject: [PATCH 27/55] Check the validity of the latitude and longitude numbers --- milli/src/error.rs | 6 ++--- milli/src/update/index_documents/enrich.rs | 24 ++++++++++++++----- .../extract/extract_geo_points.rs | 6 ++--- milli/src/update/index_documents/mod.rs | 2 +- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 0abb41eec..80c923bd9 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -152,11 +152,11 @@ pub enum GeoError { MissingLatitude { document_id: Value }, #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")] MissingLongitude { document_id: Value }, - #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{lat}` and `{lng}`.")] + #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting finite numbers but instead got `{lat}` and `{lng}`.")] BadLatitudeAndLongitude { document_id: Value, lat: Value, lng: Value }, - #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] + #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a finite number but instead got `{value}`.")] BadLatitude { document_id: Value, value: Value }, - #[error("Could not parse longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] + #[error("Could not parse longitude in the document with the id: `{document_id}`.
Was expecting a finite number but instead got `{value}`.")] BadLongitude { document_id: Value, value: Value }, } diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 1a0c31c24..28318881b 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -326,11 +326,23 @@ pub fn validate_document_id_value(document_id: Value) -> Result StdResult { - match value { - Value::Number(ref n) => n.as_f64().ok_or(value), - Value::String(ref s) => s.parse::().map_err(|_| value), - value => Err(value), +pub fn extract_finite_float_from_value(value: Value) -> StdResult { + let number = match value { + Value::Number(ref n) => match n.as_f64() { + Some(number) => number, + None => return Err(value), + }, + Value::String(ref s) => match s.parse::() { + Ok(number) => number, + Err(_) => return Err(value), + }, + value => return Err(value), + }; + + if number.is_finite() { + Ok(number) + } else { + Err(value) } } @@ -340,7 +352,7 @@ pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result match (object.remove("lat"), object.remove("lng")) { (Some(lat), Some(lng)) => { - match (extract_float_from_value(lat), extract_float_from_value(lng)) { + match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) { (Ok(_), Ok(_)) => Ok(Ok(())), (Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })), (Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })), diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 5a6de236b..47085144a 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -6,7 +6,7 @@ use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::error::GeoError; -use crate::update::index_documents::extract_float_from_value; +use crate::update::index_documents::extract_finite_float_from_value; use crate::{FieldId, InternalError, Result}; /// Extracts the geographical coordinates contained in each document under the `_geo` field. 
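A minimal sketch of the stricter parsing rule (not part of the diff), assuming the re-exported `extract_finite_float_from_value` helper is in scope and using made-up sample values: inputs that parse to non-finite floats are now rejected just like non-numeric ones.

    use serde_json::json;

    // Plain numbers and numeric strings still pass through unchanged.
    assert_eq!(extract_finite_float_from_value(json!(12.5)), Ok(12.5));
    assert_eq!(extract_finite_float_from_value(json!("12.5")), Ok(12.5));
    // "NaN" parses to an f64 but is not finite, so it is now reported as a bad value.
    assert!(extract_finite_float_from_value(json!("NaN")).is_err());
    assert!(extract_finite_float_from_value(json!(null)).is_err());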
@@ -40,12 +40,12 @@ pub fn extract_geo_points( if let Some((lat, lng)) = lat.zip(lng) { // then we extract the values - let lat = extract_float_from_value( + let lat = extract_finite_float_from_value( serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, ) .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; - let lng = extract_float_from_value( + let lng = extract_finite_float_from_value( serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, ) .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 615e1dfc7..652c1e72b 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -21,7 +21,7 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; use self::enrich::enrich_documents_batch; pub use self::enrich::{ - extract_float_from_value, validate_document_id, validate_document_id_value, + extract_finite_float_from_value, validate_document_id, validate_document_id_value, validate_geo_from_json, DocumentId, }; pub use self::helpers::{ From dc61105554954801cce37cc4cca32b1ef4814347 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 11 Jul 2022 17:44:08 +0200 Subject: [PATCH 28/55] Fix the nested document id fetching function --- milli/src/update/index_documents/enrich.rs | 4 +++- milli/src/update/index_documents/mod.rs | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 28318881b..0298f0532 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -204,7 +204,9 @@ impl PrimaryKey<'_> { /// can have depending of the first level name and deepnes of the objects. fn possible_level_names(&self) -> impl Iterator + '_ { let name = self.name(); - iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL)) + name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) + .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) + .chain(iter::once((name, ""))) } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 652c1e72b..54599acce 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1198,7 +1198,7 @@ mod tests { let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), - r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `"lol"`."# + r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `"lol"`."# ); let documents = documents!([ @@ -1212,7 +1212,7 @@ mod tests { let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), - r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `[12,13]`."# + r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `[12,13]`."# ); let documents = documents!([ @@ -1226,7 +1226,7 @@ mod tests { let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), - r#"Could not parse longitude in the document with the id: `0`. Was expecting a number but instead got `"hello"`."# + r#"Could not parse longitude in the document with the id: `0`. 
Was expecting a finite number but instead got `"hello"`."# ); } From a892a4a79c5933a2eedb04d6b46bd19d66229729 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 11 Jul 2022 18:38:50 +0200 Subject: [PATCH 29/55] Introduce a function to extend from a JSON array of objects --- benchmarks/benches/utils.rs | 5 +- cli/src/main.rs | 5 +- http-ui/src/main.rs | 5 +- milli/src/documents/builder.rs | 9 ++++ milli/src/documents/mod.rs | 1 + milli/src/documents/serde_impl.rs | 76 +++++++++++++++++++++++++++++++ 6 files changed, 89 insertions(+), 12 deletions(-) create mode 100644 milli/src/documents/serde_impl.rs diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 630e17943..51178b43b 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -164,11 +164,8 @@ fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result> { fn documents_from_json(reader: impl BufRead) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec = serde_json::from_reader(reader)?; - for object in list { - documents.append_json_object(&object)?; - } + documents.append_json_array(reader)?; documents.into_inner().map_err(Into::into) } diff --git a/cli/src/main.rs b/cli/src/main.rs index 0d197af17..35fef95c6 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -337,11 +337,8 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { fn documents_from_json(reader: impl Read) -> Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec = serde_json::from_reader(reader)?; - for object in list { - documents.append_json_object(&object)?; - } + documents.append_json_array(reader)?; documents.into_inner().map_err(Into::into) } diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 117aa31e8..83fce9a9c 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1042,11 +1042,8 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { fn documents_from_json(reader: impl Read) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec = serde_json::from_reader(reader)?; - for object in list { - documents.append_json_object(&object)?; - } + documents.append_json_array(reader)?; documents.into_inner().map_err(Into::into) } diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 589e52269..bb9d6aa68 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,9 +1,11 @@ use std::io::{self, Write}; use grenad::{CompressionType, WriterBuilder}; +use serde::de::Deserializer; use serde_json::{to_writer, Value}; use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY}; +use crate::documents::serde_impl::DocumentVisitor; use crate::Object; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary @@ -78,6 +80,13 @@ impl DocumentsBatchBuilder { Ok(()) } + /// Appends a new JSON array of objects into the batch and updates the `DocumentsBatchIndex` accordingly. + pub fn append_json_array(&mut self, reader: R) -> Result<(), Error> { + let mut de = serde_json::Deserializer::from_reader(reader); + let mut visitor = DocumentVisitor::new(self); + de.deserialize_any(&mut visitor)? + } + /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly. pub fn append_csv(&mut self, mut reader: csv::Reader) -> Result<(), Error> { // Make sure that we insert the fields ids in order as the obkv writer has this requirement. 
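A minimal usage sketch of the new `append_json_array` method (not part of the diff), with made-up document contents and assuming the caller runs inside a `Result`-returning function so `?` can be used:

    let json = br#"[{ "id": 1, "title": "foo" }, { "id": 2, "title": "bar" }]"#;
    let mut builder = DocumentsBatchBuilder::new(Vec::new());
    builder.append_json_array(&json[..])?;
    let batch = builder.into_inner()?;

The array is streamed object by object through the `DocumentVisitor` introduced below, so the intermediate `Vec<Object>` that the previous callers built is never allocated.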
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 43bfc1c20..c5ff7a120 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,6 +1,7 @@ mod builder; mod enriched; mod reader; +mod serde_impl; use std::fmt::{self, Debug}; use std::io; diff --git a/milli/src/documents/serde_impl.rs b/milli/src/documents/serde_impl.rs new file mode 100644 index 000000000..d4abdc844 --- /dev/null +++ b/milli/src/documents/serde_impl.rs @@ -0,0 +1,76 @@ +use std::fmt; +use std::io::Write; + +use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor}; + +use super::Error; +use crate::documents::DocumentsBatchBuilder; +use crate::Object; + +macro_rules! tri { + ($e:expr) => { + match $e { + Ok(r) => r, + Err(e) => return Ok(Err(e.into())), + } + }; +} + +pub struct DocumentVisitor<'a, W> { + inner: &'a mut DocumentsBatchBuilder, + object: Object, +} + +impl<'a, W> DocumentVisitor<'a, W> { + pub fn new(inner: &'a mut DocumentsBatchBuilder) -> Self { + DocumentVisitor { inner, object: Object::new() } + } +} + +impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { + /// This Visitor value is nothing, since it write the value to a file. + type Value = Result<(), Error>; + + fn visit_seq(self, mut seq: A) -> Result + where + A: SeqAccess<'de>, + { + while let Some(v) = seq.next_element_seed(&mut *self)? { + tri!(v) + } + + Ok(Ok(())) + } + + fn visit_map(self, mut map: A) -> Result + where + A: MapAccess<'de>, + { + self.object.clear(); + while let Some((key, value)) = map.next_entry()? { + self.object.insert(key, value); + } + + tri!(self.inner.append_json_object(&self.object)); + + Ok(Ok(())) + } + + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "a documents, or a sequence of documents.") + } +} + +impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> +where + W: Write, +{ + type Value = Result<(), Error>; + + fn deserialize(self, deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_map(self) + } +} From 192793ee38c7d819b5b542c8c7d0f1ae473d9619 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 12 Jul 2022 12:42:06 +0200 Subject: [PATCH 30/55] Add some tests to check for the nested documents ids --- milli/src/update/index_documents/enrich.rs | 9 ++-- milli/src/update/index_documents/mod.rs | 52 ++++++++++++++++++++++ 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 0298f0532..d7ab89faa 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -257,12 +257,9 @@ impl fmt::Debug for DocumentId { } fn contained_in(selector: &str, key: &str) -> bool { - selector.starts_with(key) - && selector[key.len()..] 
- .chars() - .next() - .map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL) - .unwrap_or(true) + selector.strip_prefix(key).map_or(false, |tail| { + tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) + }) } pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 54599acce..c9890f93f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1627,6 +1627,58 @@ mod tests { assert_eq!(documents_ids, vec![3]); } + #[test] + fn retrieve_a_b_nested_document_id() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key("a.b".to_owned()); + builder.execute(|_| ()).unwrap(); + + let content = documents!({ "a" : { "b" : { "c" : 1 }}}); + let indexing_config = IndexDocumentsConfig::default(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + let (_builder, user_error) = builder.add_documents(content).unwrap(); + + // There must be an issue with the primary key no present in the given document + user_error.unwrap_err(); + } + + #[test] + fn retrieve_a_b_c_nested_document_id() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key("a.b.c".to_owned()); + builder.execute(|_| ()).unwrap(); + + let content = documents!({ "a" : { "b" : { "c" : 1 }}}); + let indexing_config = IndexDocumentsConfig::default(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let external_documents_ids = index.external_documents_ids(&rtxn).unwrap(); + assert!(external_documents_ids.get("1").is_some()); + } + #[test] fn test_facets_generation() { let path = tempfile::tempdir().unwrap(); From 25e768f31c32d8718132be124ab2e32c94c15f8f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 12 Jul 2022 14:41:33 +0200 Subject: [PATCH 31/55] Fix another issue with the nested primary key selector --- milli/src/update/index_documents/enrich.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index d7ab89faa..56f8fa4c0 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -256,7 +256,7 @@ impl fmt::Debug for DocumentId { } } -fn contained_in(selector: &str, key: &str) -> bool { +fn starts_with(selector: &str, key: &str) -> bool { selector.strip_prefix(key).map_or(false, |tail| { tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) }) @@ -282,12 +282,7 @@ pub fn fetch_matching_values_in_object( format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, 
key) }; - // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` - // so we check the contained_in on both side. - let should_continue = - contained_in(selector, &base_key) || contained_in(&base_key, selector); - - if should_continue { + if starts_with(selector, &base_key) { match value { Value::Object(object) => { fetch_matching_values_in_object(object, selector, &base_key, output) From 448114cc1c1edd1781a830590f2c888dcdab775d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 12 Jul 2022 15:22:09 +0200 Subject: [PATCH 32/55] Fix the benchmarks with the new indexation API --- benchmarks/benches/indexing.rs | 40 +++++++++++++--------- benchmarks/benches/utils.rs | 4 +-- milli/src/update/index_documents/enrich.rs | 6 ++-- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 80c7ba0ed..81b21b5ea 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -170,12 +170,13 @@ fn reindexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -185,12 +186,13 @@ fn reindexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -460,12 +462,13 @@ fn reindexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -476,12 +479,13 @@ fn reindexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -680,12 +684,13 @@ fn reindexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); 
let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -695,12 +700,13 @@ fn reindexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1079,12 +1085,13 @@ fn reindexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1095,12 +1102,13 @@ fn reindexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 51178b43b..fba05edbe 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,7 +1,7 @@ #![allow(dead_code)] use std::fs::{create_dir_all, remove_dir_all, File}; -use std::io::{self, BufReader, Cursor, Read, Seek}; +use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; use std::num::ParseFloatError; use std::path::Path; @@ -138,7 +138,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { } } -pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader { +pub fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader { let reader = File::open(filename).expect(&format!("could not find the dataset in: {}", filename)); let reader = BufReader::new(reader); diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 56f8fa4c0..51495c598 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -30,7 +30,7 @@ pub fn enrich_documents_batch( let mut cursor = reader.into_cursor(); let mut documents_batch_index = cursor.documents_batch_index().clone(); let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; - let mut uuid_buffer = [0; 
uuid::adapter::Hyphenated::LENGTH]; + let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. @@ -119,7 +119,7 @@ fn fetch_or_generate_document_id( documents_batch_index: &DocumentsBatchIndex, primary_key: PrimaryKey, autogenerate_docids: bool, - uuid_buffer: &mut [u8; uuid::adapter::Hyphenated::LENGTH], + uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], count: u32, ) -> Result> { match primary_key { @@ -134,7 +134,7 @@ fn fetch_or_generate_document_id( } } None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer); + let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); Ok(Ok(DocumentId::generated(uuid.to_string(), count))) } None => Ok(Err(UserError::MissingDocumentId { From ab1571cdec615d025eeab8c924df74a5c485dceb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 12:41:58 +0200 Subject: [PATCH 33/55] Simplify Transform::read_documents, enabled by enriched documents reader --- milli/src/update/index_documents/transform.rs | 97 +++---------------- 1 file changed, 11 insertions(+), 86 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 6bf3dde43..d03a803fd 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -153,18 +153,18 @@ impl<'a, 'i> Transform<'a, 'i> { let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; let primary_key = cursor.primary_key().to_string(); - let primary_key_id_nested = primary_key.contains('.'); let primary_key_id = self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; - let mut flattened_document = None; let mut obkv_buffer = Vec::new(); - let mut flattened_obkv_buffer = Vec::new(); let mut documents_count = 0; + let mut docid_buffer: Vec = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); while let Some(enriched_document) = cursor.next_enriched_document()? { let EnrichedDocument { document, document_id } = enriched_document; + // drop_and_reuse is called instead of .clear() to communicate to the compiler that field_buffer + // does not keep references from the cursor between loop iterations let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -176,8 +176,9 @@ impl<'a, 'i> Transform<'a, 'i> { // we must insert this document id into the remaped document. let external_id = document_id.value(); if document_id.is_generated() { - let docid = serde_json::to_vec(external_id).map_err(InternalError::SerdeJson)?; - field_buffer_cache.push((primary_key_id, Cow::from(docid))); + serde_json::to_writer(&mut docid_buffer, external_id) + .map_err(InternalError::SerdeJson)?; + field_buffer_cache.push((primary_key_id, Cow::from(&docid_buffer))); } for (k, v) in document.iter() { @@ -186,22 +187,6 @@ impl<'a, 'i> Transform<'a, 'i> { field_buffer_cache.push((mapped_id, Cow::from(v))); } - // We need to make sure that every document has a primary key. After we have remapped - // all the fields in the document, we try to find the primary key value. 
If we can find - // it, transform it into a string and validate it, and then update it in the - // document. If none is found, and we were told to generate missing document ids, then - // we create the missing field, and update the new document. - if primary_key_id_nested { - let mut field_buffer_cache = field_buffer_cache.clone(); - self.flatten_from_field_mapping( - &mapping, - &document, - &mut flattened_obkv_buffer, - &mut field_buffer_cache, - )?; - flattened_document = Some(&flattened_obkv_buffer); - }; - // Insertion in a obkv need to be done with keys ordered. For now they are ordered // according to the document addition key order, so we sort it according to the // fieldids map keys order. @@ -256,18 +241,12 @@ impl<'a, 'i> Transform<'a, 'i> { } // We use the extracted/generated user id as the key for this document. - self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?; + self.original_sorter.insert(&docid.to_be_bytes(), &obkv_buffer)?; documents_count += 1; - if let Some(flatten) = flattened_document { - self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; - } else { - match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { - Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, - None => { - self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())? - } - } + match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { + Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, + None => self.flattened_sorter.insert(docid.to_be_bytes(), &obkv_buffer)?, } progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -275,6 +254,7 @@ impl<'a, 'i> Transform<'a, 'i> { }); field_buffer = drop_and_reuse(field_buffer_cache); + docid_buffer.clear(); obkv_buffer.clear(); } @@ -345,61 +325,6 @@ impl<'a, 'i> Transform<'a, 'i> { Ok(Some(buffer)) } - // Flatten a document from a field mapping generated by [create_fields_mapping] - fn flatten_from_field_mapping( - &mut self, - mapping: &HashMap, - obkv: &KvReader, - output_buffer: &mut Vec, - field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>, - ) -> Result<()> { - // store the keys and values of the json + the original obkv - let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new(); - - // if the primary_key is nested we need to flatten the document before being able to do anything - let mut doc = serde_json::Map::new(); - - // we recreate a json containing only the fields that needs to be flattened. - // all the raw values get inserted directly in the `key_value` vec. - for (key, value) in obkv.iter() { - if json_depth_checker::should_flatten_from_unchecked_slice(value) { - let key = - mapping.get(&key).ok_or(InternalError::FieldIdMappingMissingEntry { key })?; - let key = - self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: *key, - process: "Flatten from field mapping.", - })?; - let value = serde_json::from_slice::(value) - .map_err(InternalError::SerdeJson)?; - doc.insert(key.to_string(), value); - } else { - key_value.push((key, value.into())); - } - } - - let flattened = flatten_serde_json::flatten(&doc); - - // Once we have the flattened version we insert all the new generated fields_ids - // (if any) in the fields ids map and serialize the value. 
- for (key, value) in flattened.into_iter() { - let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; - let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; - key_value.push((fid, value.clone().into())); - - if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() { - field_buffer_cache.push((fid, value.into())); - } - } - - // we sort the key. If there was a conflict between the obkv and the new generated value the - // keys will be consecutive. - key_value.sort_unstable_by_key(|(key, _)| *key); - - Self::create_obkv_from_key_value(&mut key_value, output_buffer)?; - Ok(()) - } - /// Generate an obkv from a slice of key / value sorted by key. fn create_obkv_from_key_value( key_value: &mut [(FieldId, Cow<[u8]>)], From fc9f3f31e74edda92ad1beb31226e9cb100781e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 16:08:01 +0200 Subject: [PATCH 34/55] Change DocumentsBatchReader to access cursor and index at same time Otherwise it is not possible to iterate over all documents while using the fields index at the same time. --- milli/src/documents/builder.rs | 71 ++++++++++--------- milli/src/documents/enriched.rs | 25 ++----- milli/src/documents/mod.rs | 16 +++-- milli/src/documents/reader.rs | 20 ++---- milli/src/update/index_documents/enrich.rs | 9 +-- milli/src/update/index_documents/transform.rs | 6 +- 6 files changed, 65 insertions(+), 82 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index bb9d6aa68..dc027e1b7 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -216,9 +216,9 @@ mod test { assert_eq!(builder.documents_count(), 2); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); assert_eq!(index.len(), 3); let document = cursor.next_document().unwrap().unwrap(); @@ -240,9 +240,9 @@ mod test { assert_eq!(builder.documents_count(), 2); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); assert_eq!(index.len(), 2); let document = cursor.next_document().unwrap().unwrap(); @@ -264,9 +264,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -292,9 +292,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = 
cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -319,9 +319,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -346,9 +346,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -373,9 +373,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -400,9 +400,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -427,9 +427,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -454,9 +454,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -482,8 +482,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let (mut cursor, _) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); assert!(cursor.next_document().is_err()); } 
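A minimal sketch of the pattern this API change enables (not part of the diff), assuming `vector` holds a serialized batch as in the tests above: the cursor and the fields index are now returned together, so a caller can iterate over every document while still resolving field ids.

    let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
        .unwrap()
        .into_cursor_and_fields_index();
    while let Some(document) = cursor.next_document().unwrap() {
        // The fields index stays usable alongside the cursor borrow.
        let object = obkv_to_object(&document, &index).unwrap();
        println!("{:?}", object);
    }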
@@ -498,9 +499,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs index 4f45a891a..fa21c0f87 100644 --- a/milli/src/documents/enriched.rs +++ b/milli/src/documents/enriched.rs @@ -56,14 +56,13 @@ impl EnrichedDocumentsBatchReader { } /// This method returns a forward cursor over the enriched documents. - pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor { + pub fn into_cursor_and_fields_index( + self, + ) -> (EnrichedDocumentsBatchCursor, DocumentsBatchIndex) { let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self; + let (documents, fields_index) = documents.into_cursor_and_fields_index(); external_ids.reset(); - EnrichedDocumentsBatchCursor { - documents: documents.into_cursor(), - primary_key, - external_ids, - } + (EnrichedDocumentsBatchCursor { documents, primary_key, external_ids }, fields_index) } } @@ -80,23 +79,9 @@ pub struct EnrichedDocumentsBatchCursor { } impl EnrichedDocumentsBatchCursor { - pub fn into_reader(self) -> EnrichedDocumentsBatchReader { - let EnrichedDocumentsBatchCursor { documents, primary_key, external_ids } = self; - EnrichedDocumentsBatchReader { - documents: documents.into_reader(), - primary_key, - external_ids, - } - } - pub fn primary_key(&self) -> &str { &self.primary_key } - - pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { - self.documents.documents_batch_index() - } - /// Resets the cursor to be able to read from the start again. 
pub fn reset(&mut self) { self.documents.reset(); diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index c5ff7a120..e766e29cf 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -203,10 +203,11 @@ mod test { builder.append_json_object(value.as_object().unwrap()).unwrap(); let vector = builder.into_inner().unwrap(); - let mut documents = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let (mut documents, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); - assert_eq!(documents.documents_batch_index().iter().count(), 5); + assert_eq!(index.iter().count(), 5); let reader = documents.next_document().unwrap().unwrap(); assert_eq!(reader.iter().count(), 5); assert!(documents.next_document().unwrap().is_none()); @@ -226,9 +227,10 @@ mod test { builder.append_json_object(doc2.as_object().unwrap()).unwrap(); let vector = builder.into_inner().unwrap(); - let mut documents = - DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor(); - assert_eq!(documents.documents_batch_index().iter().count(), 2); + let (mut documents, index) = DocumentsBatchReader::from_reader(io::Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + assert_eq!(index.iter().count(), 2); let reader = documents.next_document().unwrap().unwrap(); assert_eq!(reader.iter().count(), 1); assert!(documents.next_document().unwrap().is_some()); @@ -243,7 +245,7 @@ mod test { } }]); - let mut cursor = docs_reader.into_cursor(); + let (mut cursor, _) = docs_reader.into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap(); assert_eq!(nested, json!({ "toto": ["hello"] })); diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 70b8b0131..a8a4c662d 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -17,6 +17,10 @@ pub struct DocumentsBatchReader { } impl DocumentsBatchReader { + pub fn new(cursor: DocumentsBatchCursor, fields_index: DocumentsBatchIndex) -> Self { + Self { cursor: cursor.cursor, fields_index } + } + /// Construct a `DocumentsReader` from a reader. /// /// It first retrieves the index, then moves to the first document. Use the `into_cursor` @@ -46,30 +50,20 @@ impl DocumentsBatchReader { } /// This method returns a forward cursor over the documents. - pub fn into_cursor(self) -> DocumentsBatchCursor { + pub fn into_cursor_and_fields_index(self) -> (DocumentsBatchCursor, DocumentsBatchIndex) { let DocumentsBatchReader { cursor, fields_index } = self; - let mut cursor = DocumentsBatchCursor { cursor, fields_index }; + let mut cursor = DocumentsBatchCursor { cursor }; cursor.reset(); - cursor + (cursor, fields_index) } } /// A forward cursor over the documents in a `DocumentsBatchReader`. pub struct DocumentsBatchCursor { cursor: grenad::ReaderCursor, - fields_index: DocumentsBatchIndex, } impl DocumentsBatchCursor { - pub fn into_reader(self) -> DocumentsBatchReader { - let DocumentsBatchCursor { cursor, fields_index, .. } = self; - DocumentsBatchReader { cursor, fields_index } - } - - pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { - &self.fields_index - } - /// Resets the cursor to be able to read from the start again. 
pub fn reset(&mut self) { self.cursor.reset(); diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 51495c598..7c9a016d8 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -27,8 +27,8 @@ pub fn enrich_documents_batch( autogenerate_docids: bool, reader: DocumentsBatchReader, ) -> Result, UserError>> { - let mut cursor = reader.into_cursor(); - let mut documents_batch_index = cursor.documents_batch_index().clone(); + let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index(); + let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; @@ -103,9 +103,10 @@ pub fn enrich_documents_batch( } let external_ids = writer_into_reader(external_ids)?; + let primary_key_name = primary_key.name().to_string(); let reader = EnrichedDocumentsBatchReader::new( - cursor.into_reader(), - primary_key.name().to_string(), + DocumentsBatchReader::new(cursor, documents_batch_index), + primary_key_name, external_ids, )?; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index d03a803fd..0de90924a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -146,11 +146,11 @@ impl<'a, 'i> Transform<'a, 'i> { R: Read + Seek, F: Fn(UpdateIndexingStep) + Sync, { - let mut cursor = reader.into_cursor(); - let fields_index = cursor.documents_batch_index(); + let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); + let external_documents_ids = self.index.external_documents_ids(wtxn)?; - let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; + let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; let primary_key = cursor.primary_key().to_string(); let primary_key_id = From 453d593ce804eac35af4f7abd1b3a88c218941d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 09:30:19 +0200 Subject: [PATCH 35/55] Add a database containing the docids where each field exists --- infos/src/main.rs | 15 ++ milli/src/heed_codec/facet/mod.rs | 40 +++++ milli/src/index.rs | 24 ++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 11 +- .../extract/extract_facet_exists_docids.rs | 42 +++++ .../extract/extract_fid_docid_facet_values.rs | 29 +++- .../src/update/index_documents/extract/mod.rs | 47 ++++-- milli/src/update/index_documents/mod.rs | 149 ++++++++++++++++++ .../src/update/index_documents/typed_chunk.rs | 13 ++ 10 files changed, 350 insertions(+), 22 deletions(-) create mode 100644 milli/src/update/index_documents/extract/extract_facet_exists_docids.rs diff --git a/infos/src/main.rs b/infos/src/main.rs index 29a87cdcf..89aec6182 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -384,6 +384,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, + facet_id_exists_docids, exact_word_docids, exact_word_prefix_docids, field_id_docid_facet_f64s: _, @@ -402,6 +403,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let field_id_word_count_docids_name = "field_id_word_count_docids"; let facet_id_f64_docids_name = "facet_id_f64_docids"; let facet_id_string_docids_name = "facet_id_string_docids"; + let facet_id_exists_docids_name = "facet_id_exists_docids"; let 
documents_name = "documents"; let mut heap = BinaryHeap::with_capacity(limit + 1); @@ -544,6 +546,17 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho heap.pop(); } } + + // List the docids where the facet exists + let db = facet_id_exists_docids.remap_data_type::(); + for result in facet_values_iter(rtxn, db, facet_id)? { + let (_fid, value) = result?; + let key = format!("{}", facet_name); + heap.push(Reverse((value.len(), key, facet_id_exists_docids_name))); + if heap.len() > limit { + heap.pop(); + } + } } for result in index.all_documents(rtxn)? { @@ -984,6 +997,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a facet_id_string_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, + facet_id_exists_docids, exact_word_prefix_docids, exact_word_docids, .. @@ -1007,6 +1021,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a FIELD_ID_WORD_COUNT_DOCIDS => field_id_word_count_docids.as_polymorph(), FACET_ID_F64_DOCIDS => facet_id_f64_docids.as_polymorph(), FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(), + FACET_ID_EXISTS_DOCIDS => facet_id_exists_docids.as_polymorph(), FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(), FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(), EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(), diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index e93fb57b9..79dbffa1d 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -25,3 +25,43 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { None } } + +use crate::{try_split_array_at, DocumentId, FieldId}; +use std::borrow::Cow; +use std::convert::TryInto; + +pub struct FieldIdCodec; + +impl<'a> heed::BytesDecode<'a> for FieldIdCodec { + type DItem = FieldId; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id_bytes, _) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + Some(field_id) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldIdCodec { + type EItem = FieldId; + + fn bytes_encode(field_id: &Self::EItem) -> Option> { + Some(Cow::Owned(field_id.to_be_bytes().to_vec())) + } +} + +pub struct FieldIdDocIdCodec; + +impl<'a> heed::BytesDecode<'a> for FieldIdDocIdCodec { + type DItem = (FieldId, DocumentId); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + + let document_id_bytes = bytes[..4].try_into().ok()?; + let document_id = u32::from_be_bytes(document_id_bytes); + + Some((field_id, document_id)) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 9637b4103..816112178 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -15,7 +15,7 @@ use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, }; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, @@ -75,6 +75,7 @@ pub mod db_name { pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; pub const FACET_ID_F64_DOCIDS: &str = 
"facet-id-f64-docids"; + pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; @@ -116,6 +117,9 @@ pub struct Index { /// Maps the position of a word prefix with all the docids where this prefix appears. pub word_prefix_position_docids: Database, + /// Maps the facet field id and the docids for which this field exists + pub facet_id_exists_docids: Database, + /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database, /// Maps the facet field id and the string with the original string and docids that corresponds to it. @@ -134,7 +138,7 @@ impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { use db_name::*; - options.max_dbs(16); + options.max_dbs(17); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -152,6 +156,9 @@ impl Index { let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; + let facet_id_exists_docids: Database = + env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; + let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_strings = env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; @@ -174,6 +181,7 @@ impl Index { field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, + facet_id_exists_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -806,6 +814,18 @@ impl Index { } } + /// Retrieve all the documents which contain this field id + pub fn exists_faceted_documents_ids( + &self, + rtxn: &RoTxn, + field_id: FieldId, + ) -> heed::Result { + match self.facet_id_exists_docids.get(rtxn, &field_id)? 
{ + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + /* distinct field */ pub(crate) fn put_distinct_field( diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index d1939df7b..db438d019 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -30,6 +30,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_position_docids, facet_id_f64_docids, facet_id_string_docids, + facet_id_exists_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -69,6 +70,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { field_id_word_count_docids.clear(self.wtxn)?; word_prefix_position_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?; + facet_id_exists_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 3b519c101..6dfdb9a7c 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -170,6 +170,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_position_docids, word_prefix_position_docids, facet_id_f64_docids, + facet_id_exists_docids, facet_id_string_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, @@ -424,11 +425,17 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_field_id_number_docids( + remove_docids_from_facet_field_id_docids( self.wtxn, facet_id_f64_docids, &self.to_delete_docids, )?; + // We delete the documents ids that are under the facet field id values. + remove_docids_from_facet_field_id_docids( + self.wtxn, + facet_id_exists_docids, + &self.to_delete_docids, + )?; remove_docids_from_facet_field_id_string_docids( self.wtxn, @@ -618,7 +625,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( Ok(()) } -fn remove_docids_from_facet_field_id_number_docids<'a, C>( +fn remove_docids_from_facet_field_id_docids<'a, C>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, to_remove: &RoaringBitmap, diff --git a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs new file mode 100644 index 000000000..e7a001c08 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs @@ -0,0 +1,42 @@ +use std::fs::File; +use std::io; + +use heed::{BytesDecode, BytesEncode}; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, +}; +use crate::heed_codec::facet::{FieldIdCodec, FieldIdDocIdCodec}; +use crate::Result; + +/// Extracts the documents ids where this field appears. +/// +/// Returns a grenad reader whose key is the field id encoded +/// with `FieldIdCodec` and the value is a document_id (u32) +/// encoded as native-endian bytes. 
+#[logging_timer::time] +pub fn extract_facet_exists_docids( + docid_fid_facet_number: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut facet_exists_docids_sorter = create_sorter( + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut cursor = docid_fid_facet_number.into_cursor()?; + while let Some((key_bytes, _)) = cursor.move_on_next()? { + let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap(); + + let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap(); + + facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + } + + sorter_into_reader(facet_exists_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 628636f78..d93bde500 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -20,7 +20,7 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader)> { +) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -28,7 +28,7 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 2), + max_memory.map(|m| m / 3), ); let mut fid_docid_facet_strings_sorter = create_sorter( @@ -36,7 +36,15 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 2), + max_memory.map(|m| m / 3), + ); + + let mut fid_docid_facet_exists_sorter = create_sorter( + keep_first, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 3), ); let mut key_buffer = Vec::new(); @@ -46,15 +54,19 @@ pub fn extract_fid_docid_facet_values( for (field_id, field_bytes) in obkv.iter() { if faceted_fields.contains(&field_id) { - let value = - serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - let (numbers, strings) = extract_facet_values(&value); - key_buffer.clear(); + // here, we know already that the document must be added to the “field id exists” database // prefix key with the field_id and the document_id + key_buffer.extend_from_slice(&field_id.to_be_bytes()); key_buffer.extend_from_slice(&docid_bytes); + fid_docid_facet_exists_sorter.insert(&key_buffer, ().as_bytes())?; + + let value = + serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + + let (numbers, strings) = extract_facet_values(&value); // insert facet numbers in sorter for number in numbers { @@ -79,7 +91,8 @@ pub fn extract_fid_docid_facet_values( Ok(( sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, - sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, + sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?, + sorter_into_reader(fid_docid_facet_exists_sorter, indexer)?, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index c3c2033a6..7d26e0984 100644 
--- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -1,4 +1,5 @@ mod extract_docid_word_positions; +mod extract_facet_exists_docids; mod extract_facet_number_docids; mod extract_facet_string_docids; mod extract_fid_docid_facet_values; @@ -16,6 +17,7 @@ use log::debug; use rayon::prelude::*; use self::extract_docid_word_positions::extract_docid_word_positions; +use self::extract_facet_exists_docids::extract_facet_exists_docids; use self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_string_docids::extract_facet_string_docids; use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; @@ -53,7 +55,7 @@ pub(crate) fn data_from_obkv_documents( }) .collect::>()?; - let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = flattened_obkv_chunks + let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks .par_bridge() .map(|flattened_obkv_chunks| { send_and_extract_flattened_documents_data( @@ -72,7 +74,10 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, - (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), + ( + docid_fid_facet_numbers_chunks, + (docid_fid_facet_strings_chunks, docid_fid_facet_exists_chunks), + ), ) = result?; spawn_extraction_task::<_, _, Vec>>( @@ -137,6 +142,15 @@ pub(crate) fn data_from_obkv_documents( TypedChunk::FieldIdFacetNumberDocids, "field-id-facet-number-docids", ); + spawn_extraction_task::<_, _, Vec>>( + docid_fid_facet_exists_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_facet_exists_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::FieldIdFacetExistsDocids, + "field-id-facet-exists-docids", + ); Ok(()) } @@ -197,6 +211,7 @@ fn send_original_documents_data( /// - docid_word_positions /// - docid_fid_facet_numbers /// - docid_fid_facet_strings +/// - docid_fid_facet_exists fn send_and_extract_flattened_documents_data( flattened_documents_chunk: Result>, indexer: GrenadParameters, @@ -209,7 +224,10 @@ fn send_and_extract_flattened_documents_data( max_positions_per_attributes: Option, ) -> Result<( grenad::Reader, - (grenad::Reader, grenad::Reader), + ( + grenad::Reader, + (grenad::Reader, grenad::Reader), + ), )> { let flattened_documents_chunk = flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; @@ -250,12 +268,15 @@ fn send_and_extract_flattened_documents_data( Ok(docid_word_positions_chunk) }, || { - let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = - extract_fid_docid_facet_values( - flattened_documents_chunk.clone(), - indexer.clone(), - faceted_fields, - )?; + let ( + docid_fid_facet_numbers_chunk, + docid_fid_facet_strings_chunk, + docid_fid_facet_exists_chunk, + ) = extract_fid_docid_facet_values( + flattened_documents_chunk.clone(), + indexer.clone(), + faceted_fields, + )?; // send docid_fid_facet_numbers_chunk to DB writer let docid_fid_facet_numbers_chunk = @@ -273,7 +294,13 @@ fn send_and_extract_flattened_documents_data( docid_fid_facet_strings_chunk.clone(), ))); - Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) + let docid_fid_facet_exists_chunk = + unsafe { as_cloneable_grenad(&docid_fid_facet_exists_chunk)? 
}; + + Ok(( + docid_fid_facet_numbers_chunk, + (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk), + )) }, ); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba428f078..82457762e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1931,4 +1931,153 @@ mod tests { assert_eq!(ids.len(), map.len()); } + + #[test] + fn index_documents_check_exists_database_reindex() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + } + ]); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + let faceted_fields = hashset!(S("colour")); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); + + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + + let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); + + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); + } + + #[test] + fn index_documents_check_exists_database() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let config = IndexerConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + let faceted_fields = hashset!(S("colour")); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let content = documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + } + ]); + + let indexing_config = IndexDocumentsConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + 
assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); + + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + + let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); + + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); + } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 26b97c3a0..e501e5efd 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -35,6 +35,7 @@ pub(crate) enum TypedChunk { WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), + FieldIdFacetExistsDocids(grenad::Reader), GeoPoints(grenad::Reader), } @@ -146,6 +147,18 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids_iter) => { + append_entries_into_database( + facet_id_exists_docids_iter, + &index.facet_id_exists_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + ) + .unwrap(); + is_merged_database = true; + } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { append_entries_into_database( word_pair_proximity_docids_iter, From a8641b42a7963666eedb9846b6aa69481a3f92ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Jun 2022 15:05:49 +0200 Subject: [PATCH 36/55] Modify flatten_serde_json to keep dummy value for all object keys Example: ```json { "id": 0, "colour" : { "green": 1 } } ``` becomes: ```json { "id": 0, "colour" : [], "colour.green": 1 } ``` to retain the information the key "colour" exists in the original json value. 
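A minimal Rust sketch of the same behaviour, written in the style of the crate's own tests (it assumes the crate is consumed as `flatten_serde_json` and uses only the `flatten` function touched by the diff below):

```rust
use serde_json::{json, Map, Value};

// Hypothetical usage sketch: flatten the example document from above and
// check that the parent key "colour" is kept with a dummy `[]` value, so
// the fact that "colour" exists in the original document is not lost.
fn main() {
    let mut base: Value = json!({
        "id": 0,
        "colour": { "green": 1 }
    });
    let json: Map<String, Value> = std::mem::take(base.as_object_mut().unwrap());
    let flat = flatten_serde_json::flatten(&json);

    assert_eq!(
        &flat,
        json!({
            "id": 0,
            "colour": [],
            "colour.green": 1
        })
        .as_object()
        .unwrap()
    );
}
```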
--- flatten-serde-json/src/lib.rs | 53 ++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/flatten-serde-json/src/lib.rs b/flatten-serde-json/src/lib.rs index 8312f5bd6..e1b2b20c7 100644 --- a/flatten-serde-json/src/lib.rs +++ b/flatten-serde-json/src/lib.rs @@ -4,7 +4,11 @@ use serde_json::{Map, Value}; pub fn flatten(json: &Map) -> Map { let mut obj = Map::new(); - insert_object(&mut obj, None, json); + let mut all_keys = vec![]; + insert_object(&mut obj, None, json, &mut all_keys); + for key in all_keys { + obj.entry(key).or_insert(Value::Array(vec![])); + } obj } @@ -12,26 +16,32 @@ fn insert_object( base_json: &mut Map, base_key: Option<&str>, object: &Map, + all_keys: &mut Vec, ) { for (key, value) in object { let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); - + all_keys.push(new_key.clone()); if let Some(array) = value.as_array() { - insert_array(base_json, &new_key, array); + insert_array(base_json, &new_key, array, all_keys); } else if let Some(object) = value.as_object() { - insert_object(base_json, Some(&new_key), object); + insert_object(base_json, Some(&new_key), object, all_keys); } else { insert_value(base_json, &new_key, value.clone()); } } } -fn insert_array(base_json: &mut Map, base_key: &str, array: &Vec) { +fn insert_array( + base_json: &mut Map, + base_key: &str, + array: &Vec, + all_keys: &mut Vec, +) { for value in array { if let Some(object) = value.as_object() { - insert_object(base_json, Some(base_key), object); + insert_object(base_json, Some(base_key), object, all_keys); } else if let Some(sub_array) = value.as_array() { - insert_array(base_json, base_key, sub_array); + insert_array(base_json, base_key, sub_array, all_keys); } else { insert_value(base_json, base_key, value.clone()); } @@ -103,6 +113,7 @@ mod tests { assert_eq!( &flat, json!({ + "a": [], "a.b": "c", "a.d": "e", "a.f": "g" @@ -116,6 +127,10 @@ mod tests { fn flatten_array() { let mut base: Value = json!({ "a": [ + 1, + "b", + [], + [{}], { "b": "c" }, { "b": "d" }, { "b": "e" }, @@ -127,6 +142,7 @@ mod tests { assert_eq!( &flat, json!({ + "a": [1, "b"], "a.b": ["c", "d", "e"], }) .as_object() @@ -154,6 +170,28 @@ mod tests { .as_object() .unwrap() ); + + // here we must keep 42 in "a" + let mut base: Value = json!({ + "a": [ + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + null, + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": null, + "a.b": ["c", "d", "e"], + }) + .as_object() + .unwrap() + ); } #[test] @@ -170,6 +208,7 @@ mod tests { assert_eq!( &flat, json!({ + "a": [], "a.b": ["c", "d"], }) .as_object() From 72452f0cb270d5225e83b31ec4d707845afd1eea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 25 May 2022 11:55:16 +0200 Subject: [PATCH 37/55] Implements the EXIST filter operator --- filter-parser/src/condition.rs | 13 ++++++++++++- filter-parser/src/error.rs | 4 ++-- filter-parser/src/lib.rs | 27 ++++++++++++++++++++++----- milli/src/search/facet/filter.rs | 19 +++++++++++++++++++ 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 264787055..0ece99a0d 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -8,7 +8,7 @@ use nom::branch::alt; use nom::bytes::complete::tag; use nom::combinator::cut; -use nom::sequence::tuple; +use nom::sequence::{terminated, tuple}; use 
Condition::*; use crate::{parse_value, FilterCondition, IResult, Span, Token}; @@ -19,6 +19,8 @@ pub enum Condition<'a> { GreaterThanOrEqual(Token<'a>), Equal(Token<'a>), NotEqual(Token<'a>), + Exist, + NotExist, LowerThan(Token<'a>), LowerThanOrEqual(Token<'a>), Between { from: Token<'a>, to: Token<'a> }, @@ -33,6 +35,8 @@ impl<'a> Condition<'a> { GreaterThanOrEqual(n) => (LowerThan(n), None), Equal(s) => (NotEqual(s), None), NotEqual(s) => (Equal(s), None), + Exist => (NotExist, None), + NotExist => (Exist, None), LowerThan(n) => (GreaterThanOrEqual(n), None), LowerThanOrEqual(n) => (GreaterThan(n), None), Between { from, to } => (LowerThan(from), Some(GreaterThan(to))), @@ -58,6 +62,13 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } +/// exist = value EXIST +pub fn parse_exist(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("EXIST"))(input)?; + + Ok((input, FilterCondition::Condition { fid: key.into(), op: Exist })) +} + /// to = value value TO value pub fn parse_to(input: Span) -> IResult { let (input, (key, from, _, to)) = diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index ddf7bea47..8136732c8 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -128,10 +128,10 @@ impl<'a> Display for Error<'a> { writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)? } ErrorKind::InvalidPrimary if input.trim().is_empty() => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing.")? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing.")? } ErrorKind::InvalidPrimary => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `{}`.", escaped_input)? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `{}`.", escaped_input)? } ErrorKind::ExpectedEof => { writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 243d1a3f4..ee4edc122 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -6,8 +6,9 @@ //! or = and (~ "OR" ~ and) //! and = not (~ "AND" not)* //! not = ("NOT" ~ not) | primary -//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to +//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | exist | to //! condition = value ("==" | ">" ...) value +//! exist = value EXIST //! to = value value TO value //! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* //! 
singleQuoted = "'" .* all but quotes "'" @@ -42,6 +43,7 @@ mod value; use std::fmt::Debug; use std::str::FromStr; +use condition::parse_exist; pub use condition::{parse_condition, parse_to, Condition}; use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; @@ -248,6 +250,7 @@ fn parse_primary(input: Span) -> IResult { ), parse_geo_radius, parse_condition, + parse_exist, parse_to, // the next lines are only for error handling and are written at the end to have the less possible performance impact parse_geo_point, @@ -420,6 +423,20 @@ pub mod tests { op: Condition::LowerThan(rtok("NOT subscribers >= ", "1000")), }, ), + ( + "subscribers EXIST", + Fc::Condition { + fid: rtok("", "subscribers"), + op: Condition::Exist, + }, + ), + ( + "NOT subscribers EXIST", + Fc::Condition { + fid: rtok("NOT ", "subscribers"), + op: Condition::NotExist, + }, + ), ( "subscribers 100 TO 1000", Fc::Condition { @@ -577,10 +594,10 @@ pub mod tests { ("channel = ", "Was expecting a value but instead got nothing."), ("channel = 🐻", "Was expecting a value but instead got `🐻`."), ("channel = 🐻 AND followers < 100", "Was expecting a value but instead got `🐻`."), - ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `OR`."), - ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `AND`."), - ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing."), + ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `OR`."), + ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `AND`."), + ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `channel Ponce`."), + ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index d89413f62..a5c13ec2a 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -280,6 +280,25 @@ impl<'a> Filter<'a> { Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse()?)), Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)), Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)), + Condition::Exist => { + let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; + return Ok(exist); + } + Condition::NotExist => { + let all_ids = index.documents_ids(rtxn)?; + + let exist = Self::evaluate_operator( + rtxn, + index, + numbers_db, + strings_db, + field_id, + &Condition::Exist, + )?; + + let notexist = all_ids - exist; + return Ok(notexist); + } Condition::Equal(val) => { let (_original_value, string_docids) = strings_db .get(rtxn, &(field_id, &val.value().to_lowercase()))? From dc64170a69e0be95b2c9e5bb96e1a5be0b58ec06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Jun 2022 16:42:09 +0200 Subject: [PATCH 38/55] =?UTF-8?q?Improve=20syntax=20of=20EXISTS=20filter,?= =?UTF-8?q?=20allow=20=E2=80=9Cvalue=20NOT=20EXISTS=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- filter-parser/src/condition.rs | 22 ++++++++++------ filter-parser/src/error.rs | 4 +-- filter-parser/src/lib.rs | 45 +++++++++++++++++++------------- milli/src/search/facet/filter.rs | 6 ++--- 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 0ece99a0d..b57d68b75 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -19,8 +19,8 @@ pub enum Condition<'a> { GreaterThanOrEqual(Token<'a>), Equal(Token<'a>), NotEqual(Token<'a>), - Exist, - NotExist, + Exists, + NotExists, LowerThan(Token<'a>), LowerThanOrEqual(Token<'a>), Between { from: Token<'a>, to: Token<'a> }, @@ -35,8 +35,8 @@ impl<'a> Condition<'a> { GreaterThanOrEqual(n) => (LowerThan(n), None), Equal(s) => (NotEqual(s), None), NotEqual(s) => (Equal(s), None), - Exist => (NotExist, None), - NotExist => (Exist, None), + Exists => (NotExists, None), + NotExists => (Exists, None), LowerThan(n) => (GreaterThanOrEqual(n), None), LowerThanOrEqual(n) => (GreaterThan(n), None), Between { from, to } => (LowerThan(from), Some(GreaterThan(to))), @@ -62,11 +62,17 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } -/// exist = value EXIST -pub fn parse_exist(input: Span) -> IResult { - let (input, key) = terminated(parse_value, tag("EXIST"))(input)?; +/// exist = value NOT EXISTS +pub fn parse_exists(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; - Ok((input, FilterCondition::Condition { fid: key.into(), op: Exist })) + Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) +} +/// exist = value NOT EXISTS +pub fn parse_not_exists(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("NOT EXISTS"))(input)?; + + Ok((input, FilterCondition::Condition { fid: key.into(), op: NotExists })) } /// to = value value TO value diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index 8136732c8..a3720f7bf 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ 
-128,10 +128,10 @@ impl<'a> Display for Error<'a> { writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)? } ErrorKind::InvalidPrimary if input.trim().is_empty() => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing.")? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing.")? } ErrorKind::InvalidPrimary => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `{}`.", escaped_input)? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `{}`.", escaped_input)? } ErrorKind::ExpectedEof => { writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index ee4edc122..69215798c 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -3,23 +3,24 @@ //! ```text //! filter = expression ~ EOF //! expression = or -//! or = and (~ "OR" ~ and) -//! and = not (~ "AND" not)* -//! not = ("NOT" ~ not) | primary -//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | exist | to +//! or = and ("OR" and) +//! and = not ("AND" not)* +//! not = ("NOT" not) | primary +//! primary = (WS* "(" expression ")" WS*) | geoRadius | condition | exists | not_exists | to //! condition = value ("==" | ">" ...) value -//! exist = value EXIST +//! exists = value EXISTS +//! not_exists = value NOT EXISTS //! to = value value TO value -//! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +//! value = WS* ( word | singleQuoted | doubleQuoted) ~ WS* //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* ~ "_geoRadius(" ~ WS* ~ float ~ WS* ~ "," ~ WS* ~ float ~ WS* ~ "," float ~ WS* ~ ")" +//! geoRadius = WS* ~ "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" //! ``` //! //! Other BNF grammar used to handle some specific errors: //! ```text -//! geoPoint = WS* ~ "_geoPoint(" ~ (float ~ ",")* ~ ")" +//! geoPoint = WS* "_geoPoint(" (float ",")* ")" //! ``` //! //! 
Specific errors: @@ -43,8 +44,8 @@ mod value; use std::fmt::Debug; use std::str::FromStr; -use condition::parse_exist; pub use condition::{parse_condition, parse_to, Condition}; +use condition::{parse_exists, parse_not_exists}; use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; @@ -250,7 +251,8 @@ fn parse_primary(input: Span) -> IResult { ), parse_geo_radius, parse_condition, - parse_exist, + parse_exists, + parse_not_exists, parse_to, // the next lines are only for error handling and are written at the end to have the less possible performance impact parse_geo_point, @@ -424,17 +426,24 @@ pub mod tests { }, ), ( - "subscribers EXIST", + "subscribers EXISTS", Fc::Condition { fid: rtok("", "subscribers"), - op: Condition::Exist, + op: Condition::Exists, }, ), ( - "NOT subscribers EXIST", + "NOT subscribers EXISTS", Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Condition::NotExist, + op: Condition::NotExists, + }, + ), + ( + "subscribers NOT EXISTS", + Fc::Condition { + fid: rtok("", "subscribers"), + op: Condition::NotExists, }, ), ( @@ -594,10 +603,10 @@ pub mod tests { ("channel = ", "Was expecting a value but instead got nothing."), ("channel = 🐻", "Was expecting a value but instead got `🐻`."), ("channel = 🐻 AND followers < 100", "Was expecting a value but instead got `🐻`."), - ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `OR`."), - ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `AND`."), - ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing."), + ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `OR`."), + ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `AND`."), + ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`."), + ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index a5c13ec2a..7f3b928dd 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -280,11 +280,11 @@ impl<'a> Filter<'a> { Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse()?)), Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)), Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)), - Condition::Exist => { + Condition::Exists => { let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; return Ok(exist); } - Condition::NotExist => { + Condition::NotExists => { let all_ids = index.documents_ids(rtxn)?; let exist = Self::evaluate_operator( @@ -293,7 +293,7 @@ impl<'a> Filter<'a> { numbers_db, strings_db, field_id, - &Condition::Exist, + &Condition::Exists, )?; let notexist = all_ids - exist; From 0388b2d46382e5578824dd5b2b522cead764f35b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 08:58:41 +0200 Subject: [PATCH 39/55] Run cargo fmt --- milli/src/heed_codec/facet/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 79dbffa1d..8c5a4c118 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -26,10 +26,11 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { } } -use crate::{try_split_array_at, DocumentId, FieldId}; use std::borrow::Cow; use std::convert::TryInto; +use crate::{try_split_array_at, DocumentId, FieldId}; + pub struct FieldIdCodec; impl<'a> heed::BytesDecode<'a> for FieldIdCodec { From a5c916225077b0dfe8ba35cea702af730f692b1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 09:14:19 +0200 Subject: [PATCH 40/55] Improve parser for NOT EXISTS filter Allow multiple spaces between NOT and EXISTS --- filter-parser/src/condition.rs | 10 ++++---- filter-parser/src/lib.rs | 43 +++++++++++++++++++++++----------- filter-parser/src/value.rs | 2 +- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index b57d68b75..6a5ecbe0a 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -7,11 +7,12 @@ use nom::branch::alt; use nom::bytes::complete::tag; +use nom::character::complete::multispace1; use nom::combinator::cut; use nom::sequence::{terminated, tuple}; use Condition::*; -use crate::{parse_value, FilterCondition, IResult, Span, Token}; +use crate::{parse_value, ws, FilterCondition, IResult, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Condition<'a> { @@ -62,16 +63,17 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } -/// exist = value NOT EXISTS +/// exist = value "EXISTS" pub fn parse_exists(input: Span) -> IResult { let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) } -/// exist = value NOT EXISTS +/// exist = value "NOT" WS* "EXISTS" pub fn parse_not_exists(input: Span) -> IResult { - let (input, key) = terminated(parse_value, tag("NOT EXISTS"))(input)?; + let (input, key) = parse_value(input)?; + let (input, _) = tuple((tag("NOT"), multispace1, tag("EXISTS")))(input)?; Ok((input, FilterCondition::Condition { fid: key.into(), 
op: NotExists })) } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 69215798c..e40519e87 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -1,21 +1,21 @@ //! BNF grammar: //! //! ```text -//! filter = expression ~ EOF +//! filter = expression EOF //! expression = or //! or = and ("OR" and) //! and = not ("AND" not)* //! not = ("NOT" not) | primary //! primary = (WS* "(" expression ")" WS*) | geoRadius | condition | exists | not_exists | to //! condition = value ("==" | ">" ...) value -//! exists = value EXISTS -//! not_exists = value NOT EXISTS -//! to = value value TO value -//! value = WS* ( word | singleQuoted | doubleQuoted) ~ WS* +//! exists = value "EXISTS" +//! not_exists = value "NOT" WS* "EXISTS" +//! to = value value "TO" value +//! value = WS* ( word | singleQuoted | doubleQuoted) WS* //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* ~ "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" +//! geoRadius = WS* "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" //! ``` //! //! Other BNF grammar used to handle some specific errors: @@ -31,7 +31,7 @@ //! field < 12 AND _geoPoint(1, 2) //! ``` //! -//! - If a user try to use a geoRadius as a value we must throw an error. +//! - If a user try to use a geoRadius as a value we must throw an error. //! ```text //! field = _geoRadius(12, 13, 14) //! ``` @@ -170,7 +170,7 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and (~ "OR" ~ and) +/// or = and ("OR" and) fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; // if we found a `OR` then we MUST find something next @@ -182,7 +182,7 @@ fn parse_or(input: Span) -> IResult { Ok((input, expr)) } -/// and = not (~ "AND" not)* +/// and = not ("AND" not)* fn parse_and(input: Span) -> IResult { let (input, lhs) = parse_not(input)?; // if we found a `AND` then we MUST find something next @@ -193,14 +193,14 @@ fn parse_and(input: Span) -> IResult { Ok((input, expr)) } -/// not = ("NOT" ~ not) | primary +/// not = ("NOT" not) | primary /// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. /// If we parse a `NOT` we MUST parse something behind. fn parse_not(input: Span) -> IResult { alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) } -/// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) +/// geoRadius = WS* "_geoRadius(float "," float "," float) /// If we parse `_geoRadius` we MUST parse the rest of the expression. 
fn parse_geo_radius(input: Span) -> IResult { // we want to forbid space BEFORE the _geoRadius but not after @@ -224,7 +224,7 @@ fn parse_geo_radius(input: Span) -> IResult { Ok((input, res)) } -/// geoPoint = WS* ~ "_geoPoint(float ~ "," ~ float ~ "," float) +/// geoPoint = WS* "_geoPoint(float "," float "," float) fn parse_geo_point(input: Span) -> IResult { // we want to forbid space BEFORE the _geoPoint but not after tuple(( @@ -238,7 +238,7 @@ fn parse_geo_point(input: Span) -> IResult { Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) } -/// primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to +/// primary = (WS* "(" expression ")" WS*) | geoRadius | condition | to fn parse_primary(input: Span) -> IResult { alt(( // if we find a first parenthesis, then we must parse an expression and find the closing parenthesis @@ -266,7 +266,7 @@ pub fn parse_expression(input: Span) -> IResult { parse_or(input) } -/// filter = expression ~ EOF +/// filter = expression EOF pub fn parse_filter(input: Span) -> IResult { terminated(parse_expression, eof)(input) } @@ -446,6 +446,20 @@ pub mod tests { op: Condition::NotExists, }, ), + ( + "NOT subscribers NOT EXISTS", + Fc::Condition { + fid: rtok("NOT ", "subscribers"), + op: Condition::Exists, + }, + ), + ( + "subscribers NOT EXISTS", + Fc::Condition { + fid: rtok("", "subscribers"), + op: Condition::NotExists, + }, + ), ( "subscribers 100 TO 1000", Fc::Condition { @@ -616,6 +630,7 @@ pub mod tests { ("channel = \"ponce", "Expression `\\\"ponce` is missing the following closing delimiter: `\"`."), ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delimiter: `)`."), ("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`. 
You probably forgot an `OR` or an `AND` rule."), + ("colour NOT EXIST", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`."), ]; for (input, expected) in test_case { diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 84dd21902..18ae58ae5 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -48,7 +48,7 @@ fn quoted_by(quote: char, input: Span) -> IResult { )) } -/// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +/// value = WS* ( word | singleQuoted | doubleQuoted) WS* pub fn parse_value<'a>(input: Span<'a>) -> IResult> { // to get better diagnostic message we are going to strip the left whitespaces from the input right now let (input, _) = take_while(char::is_whitespace)(input)?; From 722db7b088a86f2f4cabef93927e6815a791136e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 09:28:21 +0200 Subject: [PATCH 41/55] Ignore target directory of filter-parser/fuzz crate --- filter-parser/fuzz/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/filter-parser/fuzz/.gitignore b/filter-parser/fuzz/.gitignore index cb73742e4..9a2e1d58c 100644 --- a/filter-parser/fuzz/.gitignore +++ b/filter-parser/fuzz/.gitignore @@ -1,2 +1,3 @@ /corpus/ /artifacts/ +/target/ \ No newline at end of file From bd15f5625af5d3032f386d8c67cfb9a4d5f8330d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 13:31:51 +0200 Subject: [PATCH 42/55] Fix compiler warning --- filter-parser/src/condition.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 6a5ecbe0a..c63f1d926 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -12,7 +12,7 @@ use nom::combinator::cut; use nom::sequence::{terminated, tuple}; use Condition::*; -use crate::{parse_value, ws, FilterCondition, IResult, Span, Token}; +use crate::{parse_value, FilterCondition, IResult, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Condition<'a> { From 392472f4bb9f1f16aa27e6b9feef249f2d1f31f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 06:19:33 +0200 Subject: [PATCH 43/55] Apply suggestions from code review Co-authored-by: Tamo --- filter-parser/src/lib.rs | 6 +++--- infos/src/main.rs | 2 +- milli/src/index.rs | 3 +-- .../index_documents/extract/extract_facet_exists_docids.rs | 2 -- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index e40519e87..5cce5f4c3 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -170,7 +170,7 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and ("OR" and) +/// or = and ("OR" and)* fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; // if we found a `OR` then we MUST find something next @@ -200,7 +200,7 @@ fn parse_not(input: Span) -> IResult { alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) } -/// geoRadius = WS* "_geoRadius(float "," float "," float) +/// geoRadius = WS* "_geoRadius(float WS* "," WS* float WS* "," WS* float) /// If we parse `_geoRadius` we MUST parse the rest of the expression. 
fn parse_geo_radius(input: Span) -> IResult { // we want to forbid space BEFORE the _geoRadius but not after @@ -224,7 +224,7 @@ fn parse_geo_radius(input: Span) -> IResult { Ok((input, res)) } -/// geoPoint = WS* "_geoPoint(float "," float "," float) +/// geoPoint = WS* "_geoPoint(float WS* "," WS* float WS* "," WS* float) fn parse_geo_point(input: Span) -> IResult { // we want to forbid space BEFORE the _geoPoint but not after tuple(( diff --git a/infos/src/main.rs b/infos/src/main.rs index 89aec6182..feec17557 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -551,7 +551,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let db = facet_id_exists_docids.remap_data_type::(); for result in facet_values_iter(rtxn, db, facet_id)? { let (_fid, value) = result?; - let key = format!("{}", facet_name); + let key = facet_name.to_string(); heap.push(Reverse((value.len(), key, facet_id_exists_docids_name))); if heap.len() > limit { heap.pop(); diff --git a/milli/src/index.rs b/milli/src/index.rs index 816112178..b0897271e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -156,8 +156,7 @@ impl Index { let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; - let facet_id_exists_docids: Database = - env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; + let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_strings = diff --git a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs index e7a001c08..d25c57aea 100644 --- a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs @@ -32,9 +32,7 @@ pub fn extract_facet_exists_docids( let mut cursor = docid_fid_facet_number.into_cursor()?; while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap(); - let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap(); - facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } From 30bd4db0fcbcf31d80baec5892d28076c16b8577 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 08:24:16 +0200 Subject: [PATCH 44/55] Simplify indexing task for facet_exists_docids database --- milli/src/heed_codec/facet/field_id_codec.rs | 25 +++++++++++ milli/src/heed_codec/facet/mod.rs | 43 +------------------ milli/src/lib.rs | 1 + .../extract/extract_facet_exists_docids.rs | 40 ----------------- .../extract/extract_fid_docid_facet_values.rs | 24 +++++++---- .../src/update/index_documents/extract/mod.rs | 22 +++------- 6 files changed, 50 insertions(+), 105 deletions(-) create mode 100644 milli/src/heed_codec/facet/field_id_codec.rs delete mode 100644 milli/src/update/index_documents/extract/extract_facet_exists_docids.rs diff --git a/milli/src/heed_codec/facet/field_id_codec.rs b/milli/src/heed_codec/facet/field_id_codec.rs new file mode 100644 index 000000000..d147423f2 --- /dev/null +++ b/milli/src/heed_codec/facet/field_id_codec.rs @@ -0,0 +1,25 @@ +use crate::{FieldId, BEU16}; +use heed::zerocopy::AsBytes; +use std::{borrow::Cow, convert::TryInto}; + +pub struct FieldIdCodec; + +impl<'a> heed::BytesDecode<'a> for FieldIdCodec { + type DItem = FieldId; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let bytes: [u8; 2] = bytes[..2].try_into().ok()?; + let field_id = BEU16::from(bytes).get(); + Some(field_id) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldIdCodec { + type EItem = FieldId; + + fn bytes_encode(field_id: &Self::EItem) -> Option> { + let field_id = BEU16::new(*field_id); + let bytes = field_id.as_bytes(); + Some(Cow::Owned(bytes.to_vec())) + } +} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 8c5a4c118..384991fd7 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -5,6 +5,7 @@ mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; +mod field_id_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; @@ -15,6 +16,7 @@ pub use self::facet_string_level_zero_value_codec::{ pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; +pub use self::field_id_codec::FieldIdCodec; /// Tries to split a slice in half at the given middle point, /// `None` if the slice is too short. 
@@ -25,44 +27,3 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { None } } - -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::{try_split_array_at, DocumentId, FieldId}; - -pub struct FieldIdCodec; - -impl<'a> heed::BytesDecode<'a> for FieldIdCodec { - type DItem = FieldId; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, _) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - Some(field_id) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldIdCodec { - type EItem = FieldId; - - fn bytes_encode(field_id: &Self::EItem) -> Option> { - Some(Cow::Owned(field_id.to_be_bytes().to_vec())) - } -} - -pub struct FieldIdDocIdCodec; - -impl<'a> heed::BytesDecode<'a> for FieldIdDocIdCodec { - type DItem = (FieldId, DocumentId); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - - let document_id_bytes = bytes[..4].try_into().ok()?; - let document_id = u32::from_be_bytes(document_id_bytes); - - Some((field_id, document_id)) - } -} diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 81cd057d5..20fdceaec 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -49,6 +49,7 @@ pub type SmallString32 = smallstr::SmallString<[u8; 32]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; +pub type BEU16 = heed::zerocopy::U16; pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; pub type Attribute = u32; diff --git a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs deleted file mode 100644 index d25c57aea..000000000 --- a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs +++ /dev/null @@ -1,40 +0,0 @@ -use std::fs::File; -use std::io; - -use heed::{BytesDecode, BytesEncode}; - -use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, -}; -use crate::heed_codec::facet::{FieldIdCodec, FieldIdDocIdCodec}; -use crate::Result; - -/// Extracts the documents ids where this field appears. -/// -/// Returns a grenad reader whose key is the field id encoded -/// with `FieldIdCodec` and the value is a document_id (u32) -/// encoded as native-endian bytes. -#[logging_timer::time] -pub fn extract_facet_exists_docids( - docid_fid_facet_number: grenad::Reader, - indexer: GrenadParameters, -) -> Result> { - let max_memory = indexer.max_memory_by_thread(); - - let mut facet_exists_docids_sorter = create_sorter( - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ); - - let mut cursor = docid_fid_facet_number.into_cursor()?; - while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ - let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap(); - let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap(); - facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; - } - - sorter_into_reader(facet_exists_docids_sorter, indexer) -} diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index d93bde500..c83ac49e0 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,15 +1,16 @@ +use heed::zerocopy::AsBytes; +use serde_json::Value; use std::collections::HashSet; +use std::convert::TryInto; use std::fs::File; use std::io; use std::mem::size_of; -use heed::zerocopy::AsBytes; -use serde_json::Value; - use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::{DocumentId, FieldId, Result}; +use crate::update::index_documents::merge_cbo_roaring_bitmaps; +use crate::{DocumentId, FieldId, Result, BEU32}; /// Extracts the facet values of each faceted field of each document. /// @@ -40,7 +41,7 @@ pub fn extract_fid_docid_facet_values( ); let mut fid_docid_facet_exists_sorter = create_sorter( - keep_first, + merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -56,12 +57,17 @@ pub fn extract_fid_docid_facet_values( if faceted_fields.contains(&field_id) { key_buffer.clear(); - // here, we know already that the document must be added to the “field id exists” database - // prefix key with the field_id and the document_id - + // Set key to the field_id + // Note: this encoding is consistent with FieldIdCodec key_buffer.extend_from_slice(&field_id.to_be_bytes()); + + // Here, we know already that the document must be added to the “field id exists” database + let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); + let document = BEU32::from(document).get(); + fid_docid_facet_exists_sorter.insert(&key_buffer, document.to_ne_bytes())?; + + // For the other extraction tasks, prefix the key with the field_id and the document_id key_buffer.extend_from_slice(&docid_bytes); - fid_docid_facet_exists_sorter.insert(&key_buffer, ().as_bytes())?; let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 7d26e0984..bb695a99f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -1,5 +1,4 @@ mod extract_docid_word_positions; -mod extract_facet_exists_docids; mod extract_facet_number_docids; mod extract_facet_string_docids; mod extract_fid_docid_facet_values; @@ -17,7 +16,6 @@ use log::debug; use rayon::prelude::*; use self::extract_docid_word_positions::extract_docid_word_positions; -use self::extract_facet_exists_docids::extract_facet_exists_docids; use self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_string_docids::extract_facet_string_docids; use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; @@ -142,15 +140,12 @@ pub(crate) fn data_from_obkv_documents( TypedChunk::FieldIdFacetNumberDocids, "field-id-facet-number-docids", ); - spawn_extraction_task::<_, _, Vec>>( - 
docid_fid_facet_exists_chunks.clone(), - indexer.clone(), - lmdb_writer_sx.clone(), - extract_facet_exists_docids, - merge_cbo_roaring_bitmaps, - TypedChunk::FieldIdFacetExistsDocids, - "field-id-facet-exists-docids", - ); + + // spawn extraction task for field-id-facet-exists-docids + rayon::spawn(move || { + let reader = docid_fid_facet_exists_chunks.merge(merge_cbo_roaring_bitmaps, &indexer); + let _ = lmdb_writer_sx.send(reader.map(TypedChunk::FieldIdFacetExistsDocids)); + }); Ok(()) } @@ -226,7 +221,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, grenad::Reader), + (grenad::Reader, grenad::Reader), ), )> { let flattened_documents_chunk = @@ -294,9 +289,6 @@ fn send_and_extract_flattened_documents_data( docid_fid_facet_strings_chunk.clone(), ))); - let docid_fid_facet_exists_chunk = - unsafe { as_cloneable_grenad(&docid_fid_facet_exists_chunk)? }; - Ok(( docid_fid_facet_numbers_chunk, (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk), From c17d616250cee38d34dbad51173188ae8cc54ead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 08:41:33 +0200 Subject: [PATCH 45/55] Refactor index_documents_check_exists_database tests --- milli/src/update/index_documents/mod.rs | 218 +++++++++--------------- 1 file changed, 84 insertions(+), 134 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 82457762e..99f474eb6 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1932,152 +1932,102 @@ mod tests { assert_eq!(ids.len(), map.len()); } - #[test] - fn index_documents_check_exists_database_reindex() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { - "id": 0, - "colour": 0, - }, - { - "id": 1, - "colour": [] - }, - { - "id": 2, - "colour": {} - }, - { - "id": 3, - "colour": null - }, - { - "id": 4, - "colour": [1] - }, - { - "id": 5 - }, - { - "id": 6, - "colour": { - "green": 1 - } - } - ]); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - builder.add_documents(content).unwrap(); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - let faceted_fields = hashset!(S("colour")); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); - - let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); - let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); - - let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); - assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); - - let bitmap_colour_green = - index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); - assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); - } - #[test] fn 
index_documents_check_exists_database() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); + let indexing_config = IndexDocumentsConfig::default(); let faceted_fields = hashset!(S("colour")); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - let content = documents!([ - { - "id": 0, - "colour": 0, - }, - { - "id": 1, - "colour": [] - }, - { - "id": 2, - "colour": {} - }, - { - "id": 3, - "colour": null - }, - { - "id": 4, - "colour": [1] - }, - { - "id": 5 - }, - { - "id": 6, - "colour": { - "green": 1 + let content = || { + documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + }, + { + "id": 7, + "colour": { + "green": { + "blue": [] + } + } } - } - ]); - - let indexing_config = IndexDocumentsConfig::default(); + ]) + }; + let make_index = || { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + Index::new(options, &path).unwrap() + }; - let mut wtxn = index.write_txn().unwrap(); + let set_filterable_fields = |index: &Index| { + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_filterable_fields(faceted_fields.clone()); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + }; + let add_documents = |index: &Index| { + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content()).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - builder.add_documents(content).unwrap(); - builder.execute().unwrap(); + let check_ok = |index: &Index| { + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); - wtxn.commit().unwrap(); + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); - let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); + let bitmap_colour = + index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6, 7]); - let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); - let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6, 7]); + }; - let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); - 
assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); + let index = make_index(); + add_documents(&index); + set_filterable_fields(&index); + check_ok(&index); - let bitmap_colour_green = - index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); - assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); + let index = make_index(); + set_filterable_fields(&index); + add_documents(&index); + check_ok(&index); } } From ea0642c32d799b201c61cc81d8f979e33cc74380 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 09:12:37 +0200 Subject: [PATCH 46/55] Make filter parser more strict regarding spacing around operators OR, AND, NOT, TO must now be followed by spaces --- filter-parser/src/condition.rs | 11 ++++----- filter-parser/src/lib.rs | 45 ++++++++++++++++++++-------------- filter-parser/src/value.rs | 2 +- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index c63f1d926..cbf73b96a 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -44,8 +44,7 @@ impl<'a> Condition<'a> { } } } - -/// condition = value ("==" | ">" ...) value +/// condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value pub fn parse_condition(input: Span) -> IResult { let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); let (input, (fid, op, value)) = tuple((parse_value, operator, cut(parse_value)))(input)?; @@ -69,7 +68,7 @@ pub fn parse_exists(input: Span) -> IResult { Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) } -/// exist = value "NOT" WS* "EXISTS" +/// exist = value "NOT" WS+ "EXISTS" pub fn parse_not_exists(input: Span) -> IResult { let (input, key) = parse_value(input)?; @@ -77,10 +76,10 @@ pub fn parse_not_exists(input: Span) -> IResult { Ok((input, FilterCondition::Condition { fid: key.into(), op: NotExists })) } -/// to = value value TO value +/// to = value value "TO" WS+ value pub fn parse_to(input: Span) -> IResult { - let (input, (key, from, _, to)) = - tuple((parse_value, parse_value, tag("TO"), cut(parse_value)))(input)?; + let (input, (key, from, _, _, to)) = + tuple((parse_value, parse_value, tag("TO"), multispace1, cut(parse_value)))(input)?; Ok((input, FilterCondition::Condition { fid: key, op: Between { from, to } })) } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 5cce5f4c3..01be432d7 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -3,19 +3,19 @@ //! ```text //! filter = expression EOF //! expression = or -//! or = and ("OR" and) -//! and = not ("AND" not)* -//! not = ("NOT" not) | primary -//! primary = (WS* "(" expression ")" WS*) | geoRadius | condition | exists | not_exists | to -//! condition = value ("==" | ">" ...) value +//! or = and ("OR" WS+ and)* +//! and = not ("AND" WS+ not)* +//! not = ("NOT" WS+ not) | primary +//! primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to +//! condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value //! exists = value "EXISTS" -//! not_exists = value "NOT" WS* "EXISTS" -//! to = value value "TO" value -//! value = WS* ( word | singleQuoted | doubleQuoted) WS* +//! not_exists = value "NOT" WS+ "EXISTS" +//! to = value value "TO" WS+ value +//! value = WS* ( word | singleQuoted | doubleQuoted) WS+ //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! 
word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" +//! geoRadius = "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" //! ``` //! //! Other BNF grammar used to handle some specific errors: @@ -50,7 +50,7 @@ use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; use nom::bytes::complete::tag; -use nom::character::complete::{char, multispace0}; +use nom::character::complete::{char, multispace0, multispace1}; use nom::combinator::{cut, eof, map}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; @@ -170,11 +170,11 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and ("OR" and)* +/// or = and ("OR" WS+ and)* fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; // if we found a `OR` then we MUST find something next - let (input, ors) = many0(preceded(ws(tag("OR")), cut(parse_and)))(input)?; + let (input, ors) = many0(preceded(ws(tuple((tag("OR"), multispace1))), cut(parse_and)))(input)?; let expr = ors .into_iter() @@ -186,24 +186,28 @@ fn parse_or(input: Span) -> IResult { fn parse_and(input: Span) -> IResult { let (input, lhs) = parse_not(input)?; // if we found a `AND` then we MUST find something next - let (input, ors) = many0(preceded(ws(tag("AND")), cut(parse_not)))(input)?; + let (input, ors) = + many0(preceded(ws(tuple((tag("AND"), multispace1))), cut(parse_not)))(input)?; let expr = ors .into_iter() .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); Ok((input, expr)) } -/// not = ("NOT" not) | primary -/// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. +/// not = ("NOT" WS+ not) | primary +/// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. /// If we parse a `NOT` we MUST parse something behind. fn parse_not(input: Span) -> IResult { - alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) + alt(( + map(preceded(ws(tuple((tag("NOT"), multispace1))), cut(parse_not)), |e| e.negate()), + parse_primary, + ))(input) } /// geoRadius = WS* "_geoRadius(float WS* "," WS* float WS* "," WS* float) /// If we parse `_geoRadius` we MUST parse the rest of the expression. 
fn parse_geo_radius(input: Span) -> IResult { - // we want to forbid space BEFORE the _geoRadius but not after + // we want to allow space BEFORE the _geoRadius but not after let parsed = preceded( tuple((multispace0, tag("_geoRadius"))), // if we were able to parse `_geoRadius` and can't parse the rest of the input we return a failure @@ -238,7 +242,7 @@ fn parse_geo_point(input: Span) -> IResult { Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) } -/// primary = (WS* "(" expression ")" WS*) | geoRadius | condition | to +/// primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to fn parse_primary(input: Span) -> IResult { alt(( // if we find a first parenthesis, then we must parse an expression and find the closing parenthesis @@ -620,7 +624,7 @@ pub mod tests { ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `OR`."), ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `AND`."), ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing."), + ("channel = Ponce OR", "Found unexpected characters at the end of the filter: `OR`. You probably forgot an `OR` or an `AND` rule."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), @@ -631,6 +635,9 @@ pub mod tests { ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delimiter: `)`."), ("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`. You probably forgot an `OR` or an `AND` rule."), ("colour NOT EXIST", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`."), + ("subscribers 100 TO1000", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`."), + ("channel = ponce ORdog != 'bernese mountain'", "Found unexpected characters at the end of the filter: `ORdog != \\'bernese mountain\\'`. You probably forgot an `OR` or an `AND` rule."), + ("channel = ponce AND'dog' != 'bernese mountain'", "Found unexpected characters at the end of the filter: `AND\\'dog\\' != \\'bernese mountain\\'`. 
You probably forgot an `OR` or an `AND` rule."), ]; for (input, expected) in test_case { diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 18ae58ae5..22da6a0df 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -48,7 +48,7 @@ fn quoted_by(quote: char, input: Span) -> IResult { )) } -/// value = WS* ( word | singleQuoted | doubleQuoted) WS* +/// value = WS* ( word | singleQuoted | doubleQuoted) WS+ pub fn parse_value<'a>(input: Span<'a>) -> IResult> { // to get better diagnostic message we are going to strip the left whitespaces from the input right now let (input, _) = take_while(char::is_whitespace)(input)?; From 80b962b4f4aeacaec68e22279e3349adf3138746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 09:16:53 +0200 Subject: [PATCH 47/55] Run cargo fmt --- milli/src/heed_codec/facet/field_id_codec.rs | 7 +++++-- .../extract/extract_fid_docid_facet_values.rs | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/milli/src/heed_codec/facet/field_id_codec.rs b/milli/src/heed_codec/facet/field_id_codec.rs index d147423f2..871b05a09 100644 --- a/milli/src/heed_codec/facet/field_id_codec.rs +++ b/milli/src/heed_codec/facet/field_id_codec.rs @@ -1,6 +1,9 @@ -use crate::{FieldId, BEU16}; +use std::borrow::Cow; +use std::convert::TryInto; + use heed::zerocopy::AsBytes; -use std::{borrow::Cow, convert::TryInto}; + +use crate::{FieldId, BEU16}; pub struct FieldIdCodec; diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index c83ac49e0..6d66a7a64 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,11 +1,12 @@ -use heed::zerocopy::AsBytes; -use serde_json::Value; use std::collections::HashSet; use std::convert::TryInto; use std::fs::File; use std::io; use std::mem::size_of; +use heed::zerocopy::AsBytes; +use serde_json::Value; + use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; From 4f0bd317dff811a74a86649d4c6e064f31949c15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 4 Jul 2022 08:41:54 +0200 Subject: [PATCH 48/55] Remove custom implementation of BytesEncode/Decode for the FieldId --- milli/src/heed_codec/facet/field_id_codec.rs | 28 -------------------- milli/src/heed_codec/facet/mod.rs | 7 +++-- milli/src/index.rs | 4 +-- 3 files changed, 7 insertions(+), 32 deletions(-) delete mode 100644 milli/src/heed_codec/facet/field_id_codec.rs diff --git a/milli/src/heed_codec/facet/field_id_codec.rs b/milli/src/heed_codec/facet/field_id_codec.rs deleted file mode 100644 index 871b05a09..000000000 --- a/milli/src/heed_codec/facet/field_id_codec.rs +++ /dev/null @@ -1,28 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use heed::zerocopy::AsBytes; - -use crate::{FieldId, BEU16}; - -pub struct FieldIdCodec; - -impl<'a> heed::BytesDecode<'a> for FieldIdCodec { - type DItem = FieldId; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let bytes: [u8; 2] = bytes[..2].try_into().ok()?; - let field_id = BEU16::from(bytes).get(); - Some(field_id) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldIdCodec { - type EItem = FieldId; - - fn bytes_encode(field_id: &Self::EItem) -> Option> { - let field_id = 
BEU16::new(*field_id); - let bytes = field_id.as_bytes(); - Some(Cow::Owned(bytes.to_vec())) - } -} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 384991fd7..51812d97a 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -5,7 +5,9 @@ mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; -mod field_id_codec; + +use crate::BEU16; +use heed::types::OwnedType; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; @@ -16,7 +18,8 @@ pub use self::facet_string_level_zero_value_codec::{ pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; -pub use self::field_id_codec::FieldIdCodec; + +pub type FieldIdCodec = OwnedType; /// Tries to split a slice in half at the given middle point, /// `None` if the slice is too short. diff --git a/milli/src/index.rs b/milli/src/index.rs index b0897271e..aec7aa396 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -21,7 +21,7 @@ use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, - Search, StrBEU32Codec, StrStrU8Codec, BEU32, + Search, StrBEU32Codec, StrStrU8Codec, BEU16, BEU32, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -819,7 +819,7 @@ impl Index { rtxn: &RoTxn, field_id: FieldId, ) -> heed::Result { - match self.facet_id_exists_docids.get(rtxn, &field_id)? { + match self.facet_id_exists_docids.get(rtxn, &BEU16::new(field_id))? 
{ Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), } From 1eb1e73bb3d909d2be54ff835126f008b812aa9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 4 Jul 2022 09:28:23 +0200 Subject: [PATCH 49/55] Add integration tests for the EXISTS filter --- milli/src/heed_codec/facet/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 11 ++++-- milli/tests/assets/test_set.ndjson | 30 +++++++------- milli/tests/search/filters.rs | 6 +++ milli/tests/search/mod.rs | 52 +++++++++++++++++++++++-- 5 files changed, 78 insertions(+), 23 deletions(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 51812d97a..0b2d9186f 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -6,7 +6,6 @@ mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; -use crate::BEU16; use heed::types::OwnedType; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; @@ -18,6 +17,7 @@ pub use self::facet_string_level_zero_value_codec::{ pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; +use crate::BEU16; pub type FieldIdCodec = OwnedType; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 99f474eb6..950b3a417 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -592,7 +592,7 @@ mod tests { use super::*; use crate::documents::DocumentBatchBuilder; use crate::update::DeleteDocuments; - use crate::HashMap; + use crate::{HashMap, BEU16}; #[test] fn simple_document_replacement() { @@ -2012,11 +2012,14 @@ mod tests { let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); let bitmap_colour = - index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + index.facet_id_exists_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap(); assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6, 7]); - let bitmap_colour_green = - index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + let bitmap_colour_green = index + .facet_id_exists_docids + .get(&rtxn, &BEU16::new(colour_green_id)) + .unwrap() + .unwrap(); assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6, 7]); }; diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 6383d274e..427daca8c 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -1,17 +1,17 @@ -{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":""} -{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":""} 
-{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":""} -{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":""} -{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":""} -{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":""} -{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":""} -{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":""} +{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":"", "opt1": [null]} +{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":"", "opt1": []} +{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":"", "opt1": null} +{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the 
construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":"", "opt1": 4} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":"", "opt1": "E"} +{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":"", "opt1": ["F"]} +{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":"", "opt1": [7]} +{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":"", "opt1": ["H", 8]} {"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":""} -{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":""} -{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""} -{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":""} -{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using 
the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":""} -{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":""} -{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":""} -{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":""} +{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":"", "opt1": {}} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":"", "opt1": [{"opt2": 11}] } +{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":"", "opt1": {"opt2": [12]}} +{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":"", "opt1": [13, [{"opt2": null}]]} +{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":"", "opt1": {"a": 1, "opt2": {"opt3": 14}} } +{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":"", "opt1": [[[[]]]] } 
+{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":"", "opt1.opt2": 16} {"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""} diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index fe926d17a..1700a1478 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -80,3 +80,9 @@ test_filter!( lower_complex_filter_2, vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])] ); +test_filter!(exists_filter_1, vec![Right("opt1 EXISTS")]); +test_filter!(exists_filter_1_not, vec![Right("opt1 NOT EXISTS")]); +test_filter!(exists_filter_1_not_alt, vec![Right("NOT opt1 EXISTS")]); +test_filter!(exists_filter_1_double_not, vec![Right("NOT opt1 NOT EXISTS")]); + +test_filter!(exists_filter_2, vec![Right("opt1.opt2 EXISTS")]); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 472fbafe0..ec784bfc0 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -9,7 +9,7 @@ use maplit::{hashmap, hashset}; use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, DocumentId, Index, Member}; -use serde::Deserialize; +use serde::{Deserialize, Deserializer}; use slice_group_by::GroupBy; mod distinct; @@ -43,6 +43,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("tag"), S("asc_desc_rank"), S("_geo"), + S("opt1"), + S("opt1.opt2") }); builder.set_sortable_fields(hashset! 
{ S("tag"), @@ -196,12 +198,44 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { id = (document.geo_rank < 100000).then(|| document.id.clone()); } else if filter.starts_with("NOT _geoRadius") { id = (document.geo_rank > 1000000).then(|| document.id.clone()); + } else if matches!(filter, "opt1 EXISTS" | "NOT opt1 NOT EXISTS") { + id = document.opt1.is_some().then(|| document.id.clone()); + } else if matches!(filter, "NOT opt1 EXISTS" | "opt1 NOT EXISTS") { + id = document.opt1.is_none().then(|| document.id.clone()); + } else if matches!(filter, "opt1.opt2 EXISTS") { + if document.opt1opt2.is_some() { + id = Some(document.id.clone()); + } else if let Some(opt1) = &document.opt1 { + id = contains_key_rec(opt1, "opt2").then(|| document.id.clone()); + } } id } +pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool { + match v { + serde_json::Value::Array(v) => { + for v in v.iter() { + if contains_key_rec(v, key) { + return true; + } + } + false + } + serde_json::Value::Object(v) => { + for (k, v) in v.iter() { + if k == key || contains_key_rec(v, key) { + return true; + } + } + false + } + _ => false, + } +} + pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet { - let dataset: HashSet = + let dataset: Vec = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); let mut filtered_ids: HashSet<_> = dataset.iter().map(|d| d.id.clone()).collect(); @@ -229,7 +263,7 @@ pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet, + #[serde(default, deserialize_with = "some_option", rename = "opt1.opt2")] + pub opt1opt2: Option, +} + +fn some_option<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let result = serde_json::Value::deserialize(deserializer)?; + Ok(Some(result)) } From aed8c69bcb94b83ccb2957b741584f908c22d094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 09:57:28 +0200 Subject: [PATCH 50/55] Refactor indexation of the "facet-id-exists-docids" database The idea is to directly create a sorted and merged list of bitmaps in the form of a BTreeMap instead of creating a grenad::Reader where the keys are field_id and the values are docids. Then we send that BTreeMap to the thing that handles TypedChunks, which inserts its content into the database. --- .../extract/extract_fid_docid_facet_values.rs | 23 +++--- .../src/update/index_documents/extract/mod.rs | 35 +++++---- .../src/update/index_documents/typed_chunk.rs | 73 ++++++++++++++++--- 3 files changed, 92 insertions(+), 39 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 6d66a7a64..368378792 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,16 +1,16 @@ -use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::convert::TryInto; use std::fs::File; use std::io; use std::mem::size_of; use heed::zerocopy::AsBytes; +use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{DocumentId, FieldId, Result, BEU32}; /// Extracts the facet values of each faceted field of each document. 
@@ -22,7 +22,7 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { +) -> Result<(grenad::Reader, grenad::Reader, BTreeMap)> { let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -30,7 +30,7 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 3), + max_memory.map(|m| m / 2), ); let mut fid_docid_facet_strings_sorter = create_sorter( @@ -38,16 +38,10 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 3), + max_memory.map(|m| m / 2), ); - let mut fid_docid_facet_exists_sorter = create_sorter( - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory.map(|m| m / 3), - ); + let mut facet_exists_docids = BTreeMap::::new(); let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; @@ -65,7 +59,8 @@ pub fn extract_fid_docid_facet_values( // Here, we know already that the document must be added to the “field id exists” database let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); let document = BEU32::from(document).get(); - fid_docid_facet_exists_sorter.insert(&key_buffer, document.to_ne_bytes())?; + + facet_exists_docids.entry(field_id).or_default().insert(document); // For the other extraction tasks, prefix the key with the field_id and the document_id key_buffer.extend_from_slice(&docid_bytes); @@ -99,7 +94,7 @@ pub fn extract_fid_docid_facet_values( Ok(( sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?, - sorter_into_reader(fid_docid_facet_exists_sorter, indexer)?, + facet_exists_docids, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index bb695a99f..76d968919 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -8,12 +8,13 @@ mod extract_word_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; -use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::fs::File; use crossbeam_channel::Sender; use log::debug; use rayon::prelude::*; +use roaring::RoaringBitmap; use self::extract_docid_word_positions::extract_docid_word_positions; use self::extract_facet_number_docids::extract_facet_number_docids; @@ -72,12 +73,24 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, - ( - docid_fid_facet_numbers_chunks, - (docid_fid_facet_strings_chunks, docid_fid_facet_exists_chunks), - ), + (docid_fid_facet_numbers_chunks, (docid_fid_facet_strings_chunks, facet_exists_docids)), ) = result?; + // merge facet_exists_docids hashmaps and send them as a typed chunk + { + let lmdb_writer_sx = lmdb_writer_sx.clone(); + rayon::spawn(move || { + let mut all = BTreeMap::default(); + for facet_exists_docids in facet_exists_docids { + for (field_id, docids) in facet_exists_docids { + let docids0 = all.entry(field_id).or_default(); + *docids0 |= docids; + } + } + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(all))); + }); + } + spawn_extraction_task::<_, _, Vec>>( 
docid_word_positions_chunks.clone(), indexer.clone(), @@ -141,12 +154,6 @@ pub(crate) fn data_from_obkv_documents( "field-id-facet-number-docids", ); - // spawn extraction task for field-id-facet-exists-docids - rayon::spawn(move || { - let reader = docid_fid_facet_exists_chunks.merge(merge_cbo_roaring_bitmaps, &indexer); - let _ = lmdb_writer_sx.send(reader.map(TypedChunk::FieldIdFacetExistsDocids)); - }); - Ok(()) } @@ -221,7 +228,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, grenad::Reader), + (grenad::Reader, BTreeMap), ), )> { let flattened_documents_chunk = @@ -266,7 +273,7 @@ fn send_and_extract_flattened_documents_data( let ( docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk, - docid_fid_facet_exists_chunk, + facet_exists_docids, ) = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), indexer.clone(), @@ -291,7 +298,7 @@ fn send_and_extract_flattened_documents_data( Ok(( docid_fid_facet_numbers_chunk, - (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk), + (docid_fid_facet_strings_chunk, facet_exists_docids), )) }, ); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e501e5efd..e1fd8f98d 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,11 +1,12 @@ use std::borrow::Cow; +use std::collections::BTreeMap; use std::convert::TryInto; use std::fs::File; use std::io; use grenad::MergerBuilder; use heed::types::ByteSlice; -use heed::{BytesDecode, RwTxn}; +use heed::{BytesDecode, BytesEncode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ @@ -16,8 +17,8 @@ use super::{ClonableMmap, MergeFn}; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ - lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, - Result, + error, lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, + GeoPoint, Index, Result, BEU16, }; pub(crate) enum TypedChunk { @@ -35,7 +36,7 @@ pub(crate) enum TypedChunk { WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), - FieldIdFacetExistsDocids(grenad::Reader), + FieldIdFacetExistsDocids(BTreeMap), GeoPoints(grenad::Reader), } @@ -147,16 +148,14 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } - TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids_iter) => { - append_entries_into_database( - facet_id_exists_docids_iter, + TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => { + write_sorted_iterator_into_database( + facet_id_exists_docids.into_iter().map(|(k, v)| (BEU16::new(k), v)), &index.facet_id_exists_docids, + "facet-id-exists-docids", wtxn, - index_is_empty, - |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, - ) - .unwrap(); + )?; is_merged_database = true; } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { @@ -270,6 +269,58 @@ fn merge_cbo_roaring_bitmaps( )?) 
} +fn write_sorted_iterator_into_database( + mut iterator: Iter, + database: &heed::Database, + database_name: &'static str, + wtxn: &mut RwTxn, + merge_values: Merge, +) -> Result<()> +where + for<'a> KeyCodec: BytesEncode<'a, EItem = Key>, + for<'a> ValueCodec: BytesEncode<'a, EItem = Value> + BytesDecode<'a, DItem = Value>, + Iter: Iterator, + Merge: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, +{ + if database.is_empty(wtxn)? { + let mut database = database.iter_mut(wtxn)?.remap_types::(); + + while let Some((key, value)) = iterator.next() { + let key = KeyCodec::bytes_encode(&key) + .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; + if valid_lmdb_key(&key) { + let value = ValueCodec::bytes_encode(&value) + .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; + unsafe { database.append(&key, &value)? }; + } + } + + Ok(()) + } else { + let database = database.remap_types::(); + let mut buffer = Vec::new(); + while let Some((key, value)) = iterator.next() { + let key = KeyCodec::bytes_encode(&key) + .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; + if valid_lmdb_key(&key) { + let value = ValueCodec::bytes_encode(&value) + .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; + let value = match database.get(wtxn, &key)? { + Some(prev_value) => { + merge_values(&value, &prev_value, &mut buffer)?; + &buffer[..] + } + None => &value, + }; + + database.put(wtxn, &key, value)?; + } + } + + Ok(()) + } +} + /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. fn write_entries_into_database( From d0eee5ff7a970b42b1a3723c1e627dc2f2ea8608 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 13:44:36 +0200 Subject: [PATCH 51/55] Fix compiler error --- milli/src/search/facet/filter.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 7f3b928dd..903f9644f 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -287,14 +287,7 @@ impl<'a> Filter<'a> { Condition::NotExists => { let all_ids = index.documents_ids(rtxn)?; - let exist = Self::evaluate_operator( - rtxn, - index, - numbers_db, - strings_db, - field_id, - &Condition::Exists, - )?; + let exist = Self::evaluate_operator(rtxn, index, field_id, &Condition::Exists)?; let notexist = all_ids - exist; return Ok(notexist); From 1506683705b0ee01afdef14d1f07a527d6dadc17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 14:42:35 +0200 Subject: [PATCH 52/55] Avoid using too much memory when indexing facet-exists-docids --- .../extract/extract_fid_docid_facet_values.rs | 19 +++++- .../src/update/index_documents/extract/mod.rs | 29 ++++---- .../src/update/index_documents/typed_chunk.rs | 68 +++---------------- 3 files changed, 40 insertions(+), 76 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 368378792..cf116e6f5 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -5,13 +5,15 @@ use std::io; use std::mem::size_of; use heed::zerocopy::AsBytes; +use heed::BytesEncode; use roaring::RoaringBitmap; use 
serde_json::Value;
 
 use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
 use crate::error::InternalError;
 use crate::facet::value_encoding::f64_into_bytes;
-use crate::{DocumentId, FieldId, Result, BEU32};
+use crate::update::index_documents::{create_writer, writer_into_reader};
+use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32};
 
 /// Extracts the facet values of each faceted field of each document.
 ///
@@ -22,7 +24,7 @@ pub fn extract_fid_docid_facet_values(
     obkv_documents: grenad::Reader,
     indexer: GrenadParameters,
     faceted_fields: &HashSet,
-) -> Result<(grenad::Reader, grenad::Reader, BTreeMap)> {
+) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> {
     let max_memory = indexer.max_memory_by_thread();
 
     let mut fid_docid_facet_numbers_sorter = create_sorter(
@@ -91,10 +93,21 @@ pub fn extract_fid_docid_facet_values(
         }
     }
 
+    let mut facet_exists_docids_writer = create_writer(
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        tempfile::tempfile()?,
+    );
+    for (fid, bitmap) in facet_exists_docids.into_iter() {
+        let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
+        facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
+    }
+    let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
+
     Ok((
         sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?,
         sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?,
-        facet_exists_docids,
+        facet_exists_docids_reader,
     ))
 }
 
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 76d968919..157886e63 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -8,13 +8,12 @@ mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod extract_word_position_docids;
 
-use std::collections::{BTreeMap, HashSet};
+use std::collections::HashSet;
 use std::fs::File;
 
 use crossbeam_channel::Sender;
 use log::debug;
 use rayon::prelude::*;
-use roaring::RoaringBitmap;
 
 use self::extract_docid_word_positions::extract_docid_word_positions;
 use self::extract_facet_number_docids::extract_facet_number_docids;
@@ -73,21 +72,25 @@ pub(crate) fn data_from_obkv_documents(
         let (
             docid_word_positions_chunks,
-            (docid_fid_facet_numbers_chunks, (docid_fid_facet_strings_chunks, facet_exists_docids)),
+            (
+                docid_fid_facet_numbers_chunks,
+                (docid_fid_facet_strings_chunks, facet_exists_docids_chunks),
+            ),
         ) = result?;
 
-        // merge facet_exists_docids hashmaps and send them as a typed chunk
+        // merge facet_exists_docids and send them as a typed chunk
         {
             let lmdb_writer_sx = lmdb_writer_sx.clone();
             rayon::spawn(move || {
-                let mut all = BTreeMap::default();
-                for facet_exists_docids in facet_exists_docids {
-                    for (field_id, docids) in facet_exists_docids {
-                        let docids0 = all.entry(field_id).or_default();
-                        *docids0 |= docids;
+                debug!("merge {} database", "facet-id-exists-docids");
+                match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
+                    Ok(reader) => {
+                        let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
+                    }
+                    Err(e) => {
+                        let _ = lmdb_writer_sx.send(Err(e));
                     }
                 }
-                let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(all)));
             });
         }
 
@@ -228,7 +231,7 @@ fn send_and_extract_flattened_documents_data(
     grenad::Reader,
     (
         grenad::Reader,
-        (grenad::Reader, BTreeMap),
+        (grenad::Reader, grenad::Reader),
     ),
 )> {
     let flattened_documents_chunk =
@@ -273,7 +276,7 @@ fn send_and_extract_flattened_documents_data(
             let (
                 docid_fid_facet_numbers_chunk,
                 docid_fid_facet_strings_chunk,
-                facet_exists_docids,
+                fid_facet_exists_docids_chunk,
             ) = extract_fid_docid_facet_values(
                 flattened_documents_chunk.clone(),
                 indexer.clone(),
@@ -298,7 +301,7 @@ fn send_and_extract_flattened_documents_data(
             Ok((
                 docid_fid_facet_numbers_chunk,
-                (docid_fid_facet_strings_chunk, facet_exists_docids),
+                (docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk),
             ))
         },
     );
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index e1fd8f98d..5b7b00c21 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -1,12 +1,11 @@
 use std::borrow::Cow;
-use std::collections::BTreeMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io;
 
 use grenad::MergerBuilder;
 use heed::types::ByteSlice;
-use heed::{BytesDecode, BytesEncode, RwTxn};
+use heed::{BytesDecode, RwTxn};
 use roaring::RoaringBitmap;
 
 use super::helpers::{
@@ -17,8 +16,8 @@ use super::{ClonableMmap, MergeFn};
 use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
 use crate::update::index_documents::helpers::as_cloneable_grenad;
 use crate::{
-    error, lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId,
-    GeoPoint, Index, Result, BEU16,
+    lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
+    Result,
 };
 
 pub(crate) enum TypedChunk {
@@ -36,7 +35,7 @@ pub(crate) enum TypedChunk {
     WordPairProximityDocids(grenad::Reader),
     FieldIdFacetStringDocids(grenad::Reader),
     FieldIdFacetNumberDocids(grenad::Reader),
-    FieldIdFacetExistsDocids(BTreeMap),
+    FieldIdFacetExistsDocids(grenad::Reader),
     GeoPoints(grenad::Reader),
 }
 
@@ -149,11 +148,12 @@ pub(crate) fn write_typed_chunk_into_index(
             is_merged_database = true;
         }
         TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
-            write_sorted_iterator_into_database(
-                facet_id_exists_docids.into_iter().map(|(k, v)| (BEU16::new(k), v)),
+            append_entries_into_database(
+                facet_id_exists_docids,
                 &index.facet_id_exists_docids,
-                "facet-id-exists-docids",
                 wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
                 merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
@@ -269,58 +269,6 @@ fn merge_cbo_roaring_bitmaps(
     )?)
 }
 
-fn write_sorted_iterator_into_database(
-    mut iterator: Iter,
-    database: &heed::Database,
-    database_name: &'static str,
-    wtxn: &mut RwTxn,
-    merge_values: Merge,
-) -> Result<()>
-where
-    for<'a> KeyCodec: BytesEncode<'a, EItem = Key>,
-    for<'a> ValueCodec: BytesEncode<'a, EItem = Value> + BytesDecode<'a, DItem = Value>,
-    Iter: Iterator,
-    Merge: Fn(&[u8], &[u8], &mut Vec) -> Result<()>,
-{
-    if database.is_empty(wtxn)? {
-        let mut database = database.iter_mut(wtxn)?.remap_types::();
-
-        while let Some((key, value)) = iterator.next() {
-            let key = KeyCodec::bytes_encode(&key)
-                .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-            if valid_lmdb_key(&key) {
-                let value = ValueCodec::bytes_encode(&value)
-                    .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-                unsafe { database.append(&key, &value)? };
-            }
-        }
-
-        Ok(())
-    } else {
-        let database = database.remap_types::();
-        let mut buffer = Vec::new();
-        while let Some((key, value)) = iterator.next() {
-            let key = KeyCodec::bytes_encode(&key)
-                .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-            if valid_lmdb_key(&key) {
-                let value = ValueCodec::bytes_encode(&value)
-                    .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-                let value = match database.get(wtxn, &key)? {
-                    Some(prev_value) => {
-                        merge_values(&value, &prev_value, &mut buffer)?;
-                        &buffer[..]
-                    }
-                    None => &value,
-                };
-
-                database.put(wtxn, &key, value)?;
-            }
-        }
-
-        Ok(())
-    }
-}
-
 /// Write provided entries in database using serialize_value function.
 /// merge_values function is used if an entry already exist in the database.
 fn write_entries_into_database(

From 41a0ce07cb5f07b886df99ddbf1889164486b0a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Wed, 20 Jul 2022 16:20:35 +0200
Subject: [PATCH 53/55] Add a code comment, as suggested in PR review

Co-authored-by: Many the fish
---
 milli/src/documents/builder.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs
index dc027e1b7..1a57db34b 100644
--- a/milli/src/documents/builder.rs
+++ b/milli/src/documents/builder.rs
@@ -97,6 +97,7 @@ impl DocumentsBatchBuilder {
             .map(|(k, t)| (self.fields_index.insert(k), t))
             .enumerate()
             .collect();
+        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
         typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid);
 
         let mut record = csv::StringRecord::new();

From d5e9b7305b0561bdd23aadb39c56e77d132c57bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?=
Date: Thu, 21 Jul 2022 13:19:42 +0400
Subject: [PATCH 54/55] Update version for next release (v0.32.0)

---
 benchmarks/Cargo.toml         | 2 +-
 cli/Cargo.toml                | 2 +-
 filter-parser/Cargo.toml      | 2 +-
 flatten-serde-json/Cargo.toml | 2 +-
 helpers/Cargo.toml            | 2 +-
 http-ui/Cargo.toml            | 2 +-
 infos/Cargo.toml              | 2 +-
 json-depth-checker/Cargo.toml | 2 +-
 milli/Cargo.toml              | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index b40519d99..896ccd739 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "benchmarks"
-version = "0.31.1"
+version = "0.32.0"
 edition = "2018"
 publish = false
diff --git a/cli/Cargo.toml b/cli/Cargo.toml
index e59710b72..e4de70031 100644
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "cli"
-version = "0.31.1"
+version = "0.32.0"
 edition = "2018"
 description = "A CLI to interact with a milli index"
 publish = false
diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml
index 1a2e46929..8f61796b3 100644
--- a/filter-parser/Cargo.toml
+++ b/filter-parser/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "filter-parser"
-version = "0.31.1"
+version = "0.32.0"
 edition = "2021"
 description = "The parser for the Meilisearch filter syntax"
 publish = false
diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml
index d22f7d86d..5cbc35f25 100644
--- a/flatten-serde-json/Cargo.toml
+++ b/flatten-serde-json/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "flatten-serde-json"
-version = "0.31.1"
+version = "0.32.0"
 edition = "2021"
 description = "Flatten serde-json objects like elastic search"
 readme = "README.md"
diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml
index 7e9dd207a..46c50de43 100644
--- a/helpers/Cargo.toml
+++ b/helpers/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "helpers"
-version = "0.31.1"
+version = "0.32.0"
 authors = ["Clément Renault "]
 edition = "2018"
 description = "A small tool to do operations on the database"
diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 9e8781e55..43e046c11 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "http-ui"
 description = "The HTTP user interface of the milli search engine"
-version = "0.31.1"
+version = "0.32.0"
 authors = ["Clément Renault "]
 edition = "2018"
 publish = false
diff --git a/infos/Cargo.toml b/infos/Cargo.toml
index 49be47c1e..ea1ee9193 100644
--- a/infos/Cargo.toml
+++ b/infos/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "infos"
-version = "0.31.1"
+version = "0.32.0"
 authors = ["Clément Renault "]
 edition = "2018"
 publish = false
diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml
index d4ea547fb..a0bb76676 100644
--- a/json-depth-checker/Cargo.toml
+++ b/json-depth-checker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "json-depth-checker"
-version = "0.31.1"
+version = "0.32.0"
 edition = "2021"
 description = "A library that indicates if a JSON must be flattened"
 publish = false
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index d980c6041..37c7b7c84 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "milli"
-version = "0.31.1"
+version = "0.32.0"
 authors = ["Kerollmops "]
 edition = "2018"

From 1fe224f2c643a20922767f909f0bb0517e174189 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Thu, 21 Jul 2022 16:12:01 +0200
Subject: [PATCH 55/55] Update filter-parser/fuzz/.gitignore

Co-authored-by: Many the fish
---
 filter-parser/fuzz/.gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/filter-parser/fuzz/.gitignore b/filter-parser/fuzz/.gitignore
index 9a2e1d58c..084aa18c1 100644
--- a/filter-parser/fuzz/.gitignore
+++ b/filter-parser/fuzz/.gitignore
@@ -1,3 +1,3 @@
 /corpus/
 /artifacts/
-/target/
\ No newline at end of file
+/target/