This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Add EXISTS filter #556

Merged 58 commits on Aug 4, 2022

Commits (changes from all commits)
048e174
Do not allocate when parsing CSV headers
Kerollmops Jun 14, 2022
eb63af1
Update grenad to 0.4.2
Kerollmops Jun 14, 2022
419ce39
Rework the DocumentsBatchBuilder/Reader to use grenad
Kerollmops Jun 14, 2022
e8297ad
Fix the tests for the new DocumentsBatchBuilder/Reader
Kerollmops Jun 14, 2022
6d0498d
Fix the fuzz tests
Kerollmops Jun 14, 2022
a4ceef9
Fix the cli for the new DocumentsBatchBuilder/Reader structs
Kerollmops Jun 14, 2022
f29114f
Fix http-ui to fit with the new DocumentsBatchBuilder/Reader structs
Kerollmops Jun 14, 2022
a97d4d6
Fix the benchmarks
Kerollmops Jun 14, 2022
bdc4263
Introduce the validate_documents_batch function
Kerollmops Jun 14, 2022
cefffde
Improve the .gitignore of the fuzz crate
Kerollmops Jun 14, 2022
0146175
Introduce the validate_documents_batch function
Kerollmops Jun 14, 2022
fcfc4ca
Move the Object type in the lib.rs file and use it everywhere
Kerollmops Jun 15, 2022
399eec5
Fix the indexation tests
Kerollmops Jun 15, 2022
2ceeb51
Support the auto-generated ids when validating documents
Kerollmops Jun 15, 2022
19eb3b4
Make sure that we do not accept floats as documents ids
Kerollmops Jun 15, 2022
8ebf5ee
Make the nested primary key work
Kerollmops Jun 15, 2022
dc3f092
Do not leak an internal grenad Error
Kerollmops Jun 16, 2022
ea85220
Fix the format used for a geo deleting benchmark
Kerollmops Jun 22, 2022
6a0a0ae
Make the Transform read from an EnrichedDocumentsBatchReader
Kerollmops Jun 20, 2022
5f1bfb7
Extract the primary key name and make it accessible
Kerollmops Jun 21, 2022
7425430
Constify the default primary key name
Kerollmops Jun 21, 2022
905af2a
Use the primary key and external id in the transform
Kerollmops Jun 21, 2022
c8ebf0d
Rename the validate function as an enriching function
Kerollmops Jun 21, 2022
d1a4da9
Generate a real UUIDv4 when ids are auto-generated
Kerollmops Jun 21, 2022
0bbcc7b
Expose the `DocumentId` struct to be sure to inject the generated ids
Kerollmops Jun 21, 2022
5d149d6
Remove tests for a function that no more exists
Kerollmops Jun 30, 2022
2eec290
Check the validity of the latitude and longitude numbers
Kerollmops Jul 11, 2022
dc61105
Fix the nested document id fetching function
Kerollmops Jul 11, 2022
a892a4a
Introduce a function to extend from a JSON array of objects
Kerollmops Jul 11, 2022
192793e
Add some tests to check for the nested documents ids
Kerollmops Jul 12, 2022
25e768f
Fix another issue with the nested primary key selector
Kerollmops Jul 12, 2022
448114c
Fix the benchmarks with the new indexation API
Kerollmops Jul 12, 2022
ab1571c
Simplify Transform::read_documents, enabled by enriched documents reader
Jul 18, 2022
fc9f3f3
Change DocumentsBatchReader to access cursor and index at same time
Jul 18, 2022
453d593
Add a database containing the docids where each field exists
Jul 19, 2022
a8641b4
Modify flatten_serde_json to keep dummy value for all object keys
Jun 14, 2022
72452f0
Implements the EXIST filter operator
May 25, 2022
dc64170
Improve syntax of EXISTS filter, allow “value NOT EXISTS”
Jun 14, 2022
0388b2d
Run cargo fmt
Jun 15, 2022
a5c9162
Improve parser for NOT EXISTS filter
Jun 15, 2022
722db7b
Ignore target directory of filter-parser/fuzz crate
Jun 15, 2022
bd15f56
Fix compiler warning
Jun 15, 2022
392472f
Apply suggestions from code review
loiclec Jun 16, 2022
30bd4db
Simplify indexing task for facet_exists_docids database
Jun 16, 2022
c17d616
Refactor index_documents_check_exists_database tests
Jun 16, 2022
ea0642c
Make filter parser more strict regarding spacing around operators
Jun 16, 2022
80b962b
Run cargo fmt
Jun 16, 2022
4f0bd31
Remove custom implementation of BytesEncode/Decode for the FieldId
Jul 4, 2022
1eb1e73
Add integration tests for the EXISTS filter
Jul 4, 2022
aed8c69
Refactor indexation of the "facet-id-exists-docids" database
Jul 19, 2022
d0eee5f
Fix compiler error
Jul 19, 2022
1506683
Avoid using too much memory when indexing facet-exists-docids
Jul 19, 2022
41a0ce0
Add a code comment, as suggested in PR review
loiclec Jul 20, 2022
941af58
Merge #561
bors[bot] Jul 21, 2022
d5e9b73
Update version for next release (v0.32.0)
curquiza Jul 21, 2022
e1bc610
Merge #595
bors[bot] Jul 21, 2022
0700370
Merge branch 'filter/field-exist'
Jul 21, 2022
1fe224f
Update filter-parser/fuzz/.gitignore
loiclec Jul 21, 2022
2 changes: 1 addition & 1 deletion benchmarks/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "benchmarks"
-version = "0.31.1"
+version = "0.32.0"
 edition = "2018"
 publish = false
152 changes: 91 additions & 61 deletions benchmarks/benches/indexing.rs

Large diffs are not rendered by default.

54 changes: 24 additions & 30 deletions benchmarks/benches/utils.rs

@@ -7,12 +7,12 @@ use std::path::Path;

 use criterion::BenchmarkId;
 use heed::EnvOpenOptions;
-use milli::documents::DocumentBatchReader;
+use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use milli::update::{
     IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
 };
-use milli::{Filter, Index};
-use serde_json::{Map, Value};
+use milli::{Filter, Index, Object};
+use serde_json::Value;

 pub struct Conf<'a> {
     /// where we are going to create our database.mmdb directory
@@ -96,12 +96,10 @@ pub fn base_setup(conf: &Conf) -> Index {
         update_method: IndexDocumentsMethod::ReplaceDocuments,
         ..Default::default()
     };
-    let mut builder =
-        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
+    let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
     let documents = documents_from(conf.dataset, conf.dataset_format);

-    builder.add_documents(documents).unwrap();
-
+    let (builder, user_error) = builder.add_documents(documents).unwrap();
+    user_error.unwrap();
     builder.execute().unwrap();
     wtxn.commit().unwrap();

@@ -140,7 +138,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
     }
 }

-pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<impl BufRead + Seek> {
+pub fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
     let reader =
         File::open(filename).expect(&format!("could not find the dataset in: {}", filename));
     let reader = BufReader::new(reader);
@@ -150,39 +148,35 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
         "jsonl" => documents_from_jsonl(reader).unwrap(),
         otherwise => panic!("invalid update format {:?}", otherwise),
     };
-    DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap()
+    DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
 }

-fn documents_from_jsonl(mut reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
-    let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
+fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
+    let mut documents = DocumentsBatchBuilder::new(Vec::new());

-    let mut buf = String::new();
-
-    while reader.read_line(&mut buf)? > 0 {
-        documents.extend_from_json(&mut buf.as_bytes())?;
-        buf.clear();
+    for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
+        let object = result?;
+        documents.append_json_object(&object)?;
     }
-    documents.finish()?;

-    Ok(writer.into_inner())
+    documents.into_inner().map_err(Into::into)
 }

 fn documents_from_json(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
-    let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
+    let mut documents = DocumentsBatchBuilder::new(Vec::new());

-    documents.extend_from_json(reader)?;
-    documents.finish()?;
+    documents.append_json_array(reader)?;

-    Ok(writer.into_inner())
+    documents.into_inner().map_err(Into::into)
 }

 fn documents_from_csv(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
-    let mut writer = Cursor::new(Vec::new());
-    milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
+    let csv = csv::Reader::from_reader(reader);

-    Ok(writer.into_inner())
+    let mut documents = DocumentsBatchBuilder::new(Vec::new());
+    documents.append_csv(csv)?;
+
+    documents.into_inner().map_err(Into::into)
 }

 enum AllowedType {
@@ -222,14 +216,14 @@ impl<R: Read> CSVDocumentDeserializer<R> {
 }

 impl<R: Read> Iterator for CSVDocumentDeserializer<R> {
-    type Item = anyhow::Result<Map<String, Value>>;
+    type Item = anyhow::Result<Object>;

     fn next(&mut self) -> Option<Self::Item> {
         let csv_document = self.documents.next()?;

         match csv_document {
             Ok(csv_document) => {
-                let mut document = Map::new();
+                let mut document = Object::new();

                 for ((field_name, field_type), value) in
                     self.headers.iter().zip(csv_document.into_iter())
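
For readers following the diff above: every input format now funnels through the new DocumentsBatchBuilder/DocumentsBatchReader pair. Here is a minimal sketch of that round trip. It is not part of this PR; it assumes the milli APIs exactly as they appear in this page's diffs (append_json_object, into_inner, documents_count) and uses a made-up document:

use std::io::Cursor;

use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::Object;

fn roundtrip() -> anyhow::Result<()> {
    // Build a batch in memory; the builder writes into any `io::Write`.
    let mut builder = DocumentsBatchBuilder::new(Vec::new());

    // `Object` is milli's alias for a JSON object (`serde_json::Map<String, Value>`).
    let object: Object = serde_json::from_str(r#"{ "id": 1, "title": "Dune" }"#)?;
    builder.append_json_object(&object)?;

    // `into_inner` hands back the underlying writer, here the `Vec<u8>`.
    let documents = builder.into_inner().map_err(anyhow::Error::from)?;

    // Read the batch back and check how many documents it holds.
    let reader = DocumentsBatchReader::from_reader(Cursor::new(documents))?;
    assert_eq!(reader.documents_count(), 1);
    Ok(())
}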
2 changes: 1 addition & 1 deletion cli/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "cli"
-version = "0.31.1"
+version = "0.32.0"
 edition = "2018"
 description = "A CLI to interact with a milli index"
 publish = false
48 changes: 24 additions & 24 deletions cli/src/main.rs

@@ -8,12 +8,12 @@ use std::time::Instant;
 use byte_unit::Byte;
 use eyre::Result;
 use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
+use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use milli::update::UpdateIndexingStep::{
     ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
 };
 use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig};
-use milli::Index;
-use serde_json::{Map, Value};
+use milli::{Index, Object};
 use structopt::StructOpt;

 #[cfg(target_os = "linux")]
@@ -225,9 +225,9 @@ impl Performer for DocumentAddition {
             DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?,
         };

-        let reader = milli::documents::DocumentBatchReader::from_reader(Cursor::new(documents))?;
+        let reader = DocumentsBatchReader::from_reader(Cursor::new(documents))?;

-        println!("Adding {} documents to the index.", reader.len());
+        println!("Adding {} documents to the index.", reader.documents_count());

         let mut txn = index.write_txn()?;
         let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() };
@@ -255,15 +255,18 @@ impl Performer for DocumentAddition {
             let bar = progesses.add(bar);
             bars.push(bar);
         }
-        let mut addition = milli::update::IndexDocuments::new(
+        let addition = milli::update::IndexDocuments::new(
             &mut txn,
             &index,
             &config,
             indexing_config,
             |step| indexing_callback(step, &bars),
         )
         .unwrap();
-        addition.add_documents(reader)?;
+        let (addition, user_error) = addition.add_documents(reader)?;
+        if let Err(error) = user_error {
+            return Err(error.into());
+        }

         std::thread::spawn(move || {
             progesses.join().unwrap();
@@ -321,35 +324,32 @@ fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBar]) {
 }

 fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> {
-    let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
-
-    let mut buf = String::new();
-    let mut reader = BufReader::new(reader);
+    let mut documents = DocumentsBatchBuilder::new(Vec::new());
+    let reader = BufReader::new(reader);

-    while reader.read_line(&mut buf)? > 0 {
-        documents.extend_from_json(&mut buf.as_bytes())?;
+    for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
+        let object = result?;
+        documents.append_json_object(&object)?;
     }
-    documents.finish()?;

-    Ok(writer.into_inner())
+    documents.into_inner().map_err(Into::into)
 }

 fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
-    let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
+    let mut documents = DocumentsBatchBuilder::new(Vec::new());

-    documents.extend_from_json(reader)?;
-    documents.finish()?;
+    documents.append_json_array(reader)?;

-    Ok(writer.into_inner())
+    documents.into_inner().map_err(Into::into)
 }

 fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> {
-    let mut writer = Cursor::new(Vec::new());
-    milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
+    let csv = csv::Reader::from_reader(reader);

-    Ok(writer.into_inner())
+    let mut documents = DocumentsBatchBuilder::new(Vec::new());
+    documents.append_csv(csv)?;
+
+    documents.into_inner().map_err(Into::into)
 }

 #[derive(Debug, StructOpt)]
@@ -423,7 +423,7 @@ impl Search {
         filter: &Option<String>,
         offset: &Option<usize>,
         limit: &Option<usize>,
-    ) -> Result<Vec<Map<String, Value>>> {
+    ) -> Result<Vec<Object>> {
         let txn = index.read_txn()?;
         let mut search = index.search(&txn);
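
The pattern above, where add_documents hands back the builder together with a user-error Result, is the new two-stage error handling: infrastructure failures travel through the outer Result, while per-document problems (an invalid document id, for instance) are reported as data. A sketch of a caller wrapping it, assuming milli's API exactly as used in these diffs; the index_batch helper itself is illustrative:

use std::io::{BufRead, Seek};

use milli::documents::DocumentsBatchReader;
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig};
use milli::Index;

// Hypothetical helper: index one batch, surfacing both stages of errors.
fn index_batch<R: BufRead + Seek>(
    index: &Index,
    config: &IndexerConfig,
    indexing_config: IndexDocumentsConfig,
    documents: DocumentsBatchReader<R>,
) -> anyhow::Result<()> {
    let mut wtxn = index.write_txn()?;
    let builder = IndexDocuments::new(&mut wtxn, index, config, indexing_config, |_| ())?;

    // Stage 1: enrich and queue the documents; user errors come back as data.
    let (builder, user_error) = builder.add_documents(documents)?;
    user_error?;

    // Stage 2: actually write everything into the index.
    builder.execute()?;
    wtxn.commit()?;
    Ok(())
}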
2 changes: 1 addition & 1 deletion filter-parser/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "filter-parser"
-version = "0.31.1"
+version = "0.32.0"
 edition = "2021"
 description = "The parser for the Meilisearch filter syntax"
 publish = false
1 change: 1 addition & 0 deletions filter-parser/fuzz/.gitignore

@@ -1,2 +1,3 @@
 /corpus/
 /artifacts/
+/target/
30 changes: 24 additions & 6 deletions filter-parser/src/condition.rs

@@ -7,8 +7,9 @@

 use nom::branch::alt;
 use nom::bytes::complete::tag;
+use nom::character::complete::multispace1;
 use nom::combinator::cut;
-use nom::sequence::tuple;
+use nom::sequence::{terminated, tuple};
 use Condition::*;

 use crate::{parse_value, FilterCondition, IResult, Span, Token};
@@ -19,6 +20,8 @@ pub enum Condition<'a> {
     GreaterThanOrEqual(Token<'a>),
     Equal(Token<'a>),
     NotEqual(Token<'a>),
+    Exists,
+    NotExists,
     LowerThan(Token<'a>),
     LowerThanOrEqual(Token<'a>),
     Between { from: Token<'a>, to: Token<'a> },
@@ -33,14 +36,15 @@ impl<'a> Condition<'a> {
             GreaterThanOrEqual(n) => (LowerThan(n), None),
             Equal(s) => (NotEqual(s), None),
             NotEqual(s) => (Equal(s), None),
+            Exists => (NotExists, None),
+            NotExists => (Exists, None),
             LowerThan(n) => (GreaterThanOrEqual(n), None),
             LowerThanOrEqual(n) => (GreaterThan(n), None),
             Between { from, to } => (LowerThan(from), Some(GreaterThan(to))),
         }
     }
 }

-/// condition = value ("==" | ">" ...) value
+/// condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value
 pub fn parse_condition(input: Span) -> IResult<FilterCondition> {
     let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("=")));
     let (input, (fid, op, value)) = tuple((parse_value, operator, cut(parse_value)))(input)?;
@@ -58,10 +62,24 @@ pub fn parse_condition(input: Span) -> IResult<FilterCondition> {
     Ok((input, condition))
 }

-/// to = value value TO value
+/// exist = value "EXISTS"
+pub fn parse_exists(input: Span) -> IResult<FilterCondition> {
+    let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?;
+
+    Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists }))
+}
+/// exist = value "NOT" WS+ "EXISTS"
+pub fn parse_not_exists(input: Span) -> IResult<FilterCondition> {
+    let (input, key) = parse_value(input)?;
+
+    let (input, _) = tuple((tag("NOT"), multispace1, tag("EXISTS")))(input)?;
+    Ok((input, FilterCondition::Condition { fid: key.into(), op: NotExists }))
+}
+
+/// to = value value "TO" WS+ value
 pub fn parse_to(input: Span) -> IResult<FilterCondition> {
-    let (input, (key, from, _, to)) =
-        tuple((parse_value, parse_value, tag("TO"), cut(parse_value)))(input)?;
+    let (input, (key, from, _, _, to)) =
+        tuple((parse_value, parse_value, tag("TO"), multispace1, cut(parse_value)))(input)?;

     Ok((input, FilterCondition::Condition { fid: key, op: Between { from, to } }))
 }
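
Taken together, parse_exists and parse_not_exists add two unary conditions to the grammar, and Condition::negate keeps the negated forms equivalent. A quick sketch of what the new syntax accepts; it assumes FilterCondition::parse is the crate's public entry point and that FilterCondition derives Debug, neither of which is shown in this diff:

use filter_parser::FilterCondition;

fn main() {
    // `field EXISTS` matches documents where the field is set,
    // `field NOT EXISTS` matches documents where it is absent.
    for input in ["release_date EXISTS", "release_date NOT EXISTS"] {
        match FilterCondition::parse(input) {
            Ok(condition) => println!("{} -> {:?}", input, condition),
            Err(error) => println!("{} -> {}", input, error),
        }
    }
}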
4 changes: 2 additions & 2 deletions filter-parser/src/error.rs

@@ -128,10 +128,10 @@ impl<'a> Display for Error<'a> {
                 writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)?
             }
             ErrorKind::InvalidPrimary if input.trim().is_empty() => {
-                writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing.")?
+                writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing.")?
             }
             ErrorKind::InvalidPrimary => {
-                writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `{}`.", escaped_input)?
+                writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `{}`.", escaped_input)?
             }
             ErrorKind::ExpectedEof => {
                 writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
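
Both InvalidPrimary branches now advertise the new operators. A small sketch of how that surfaces to users, assuming (not shown here) that a bare field name without any operator trips ErrorKind::InvalidPrimary:

use filter_parser::FilterCondition;

fn main() {
    // A field with no operator should fail to parse, and the rendered
    // message is expected to list `EXISTS` and `NOT EXISTS` among the
    // operations it was expecting.
    let error = FilterCondition::parse("release_date").unwrap_err();
    println!("{}", error);
}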