Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Merge #494
Browse files Browse the repository at this point in the history
494: Only flatten the required objects r=Kerollmops a=irevoire

Instead of flattening every object to write them in the flatenned sorter we now check if we needs to flatten the object or if we can insert it as-is.

```
group                                                             indexing_flatten-what-is-needed_a6031f9a    indexing_main_6b073738
-----                                                             ----------------------------------------    ----------------------
indexing/Indexing geo_point                                       1.00      25.1±0.20s        ? ?/sec         1.00      25.2±0.20s        ? ?/sec
indexing/Indexing movies in three batches                         1.01      18.3±0.12s        ? ?/sec         1.00      18.2±0.10s        ? ?/sec
indexing/Indexing movies with default settings                    1.00      17.6±0.11s        ? ?/sec         1.00      17.5±0.09s        ? ?/sec
indexing/Indexing songs in three batches with default settings    1.00      66.4±0.46s        ? ?/sec         1.03      68.3±1.01s        ? ?/sec
indexing/Indexing songs with default settings                     1.00      55.7±1.15s        ? ?/sec         1.14      63.2±0.78s        ? ?/sec
indexing/Indexing songs without any facets                        1.00      51.6±1.04s        ? ?/sec         1.16      59.6±1.00s        ? ?/sec
indexing/Indexing songs without faceted numbers                   1.00      55.3±1.09s        ? ?/sec         1.13      62.8±0.38s        ? ?/sec
indexing/Indexing wiki                                            1.00   1006.6±26.89s        ? ?/sec         1.00   1009.2±25.25s        ? ?/sec
indexing/Indexing wiki in three batches                           1.00   1140.5±11.97s        ? ?/sec         1.00    1142.0±9.97s        ? ?/sec
```

We now have performance similar to what we had before for the non nested datasets 🎉 

Co-authored-by: Tamo <tamo@meilisearch.com>
  • Loading branch information
bors[bot] and irevoire authored Apr 13, 2022
2 parents 7791ef9 + cff986e commit bca7360
Show file tree
Hide file tree
Showing 8 changed files with 251 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[workspace]
resolver = "2"
members = ["milli", "filter-parser", "flatten-serde-json", "http-ui", "benchmarks", "infos", "helpers", "cli"]
members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", "http-ui", "benchmarks", "infos", "helpers", "cli"]
default-members = ["milli"]

[profile.dev]
Expand Down
16 changes: 16 additions & 0 deletions json-depth-checker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[package]
name = "json-depth-checker"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
serde_json = "1.0"

[dev-dependencies]
criterion = "0.3"

[[bench]]
name = "depth"
harness = false
59 changes: 59 additions & 0 deletions json-depth-checker/benches/depth.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
use criterion::{criterion_group, criterion_main, Criterion};
use json_depth_checker::should_flatten_from_unchecked_slice;
use serde_json::json;

fn criterion_benchmark(c: &mut Criterion) {
let null = serde_json::to_vec(&json!(null)).unwrap();
let bool_true = serde_json::to_vec(&json!(true)).unwrap();
let bool_false = serde_json::to_vec(&json!(false)).unwrap();
let integer = serde_json::to_vec(&json!(42)).unwrap();
let float = serde_json::to_vec(&json!(1456.258)).unwrap();
let string = serde_json::to_vec(&json!("hello world")).unwrap();
let object = serde_json::to_vec(&json!({ "hello": "world",})).unwrap();
let complex_object = serde_json::to_vec(&json!({
"doggos": [
{ "bernard": true },
{ "michel": 42 },
false,
],
"bouvier": true,
"caniche": null,
}))
.unwrap();
let simple_array = serde_json::to_vec(&json!([
1,
2,
3,
"viva",
"l\"algeria",
true,
"[array]",
"escaped string \""
]))
.unwrap();
let array_of_array = serde_json::to_vec(&json!([1, [2, [3]]])).unwrap();
let array_of_object = serde_json::to_vec(&json!([1, [2, [3]], {}])).unwrap();

c.bench_function("null", |b| b.iter(|| should_flatten_from_unchecked_slice(&null)));
c.bench_function("true", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_true)));
c.bench_function("false", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_false)));
c.bench_function("integer", |b| b.iter(|| should_flatten_from_unchecked_slice(&integer)));
c.bench_function("float", |b| b.iter(|| should_flatten_from_unchecked_slice(&float)));
c.bench_function("string", |b| b.iter(|| should_flatten_from_unchecked_slice(&string)));
c.bench_function("object", |b| b.iter(|| should_flatten_from_unchecked_slice(&object)));
c.bench_function("complex object", |b| {
b.iter(|| should_flatten_from_unchecked_slice(&complex_object))
});
c.bench_function("simple array", |b| {
b.iter(|| should_flatten_from_unchecked_slice(&simple_array))
});
c.bench_function("array of array", |b| {
b.iter(|| should_flatten_from_unchecked_slice(&array_of_array))
});
c.bench_function("array of object", |b| {
b.iter(|| should_flatten_from_unchecked_slice(&array_of_object))
});
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
27 changes: 27 additions & 0 deletions json-depth-checker/fuzz/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
[package]
name = "json-depth-checker"
version = "0.0.0"
authors = ["Automatically generated"]
publish = false
edition = "2018"

[package.metadata]
cargo-fuzz = true

[dependencies]
libfuzzer-sys = "0.4"
arbitrary-json = "0.1.1"
serde_json = "1.0.79"

[dependencies.json-depth-checker]
path = ".."

# Prevent this from interfering with workspaces
[workspace]
members = ["."]

[[bin]]
name = "depth"
path = "fuzz_targets/depth.rs"
test = false
doc = false
13 changes: 13 additions & 0 deletions json-depth-checker/fuzz/fuzz_targets/depth.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#![no_main]
use arbitrary_json::ArbitraryValue;
use json_depth_checker::*;
use libfuzzer_sys::fuzz_target;

fuzz_target!(|value: ArbitraryValue| {
let value = serde_json::Value::from(value);
let left = should_flatten_from_value(&value);
let value = serde_json::to_vec(&value).unwrap();
let right = should_flatten_from_unchecked_slice(&value);

assert_eq!(left, right);
});
114 changes: 114 additions & 0 deletions json-depth-checker/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
use serde_json::Value;

/// Your json MUST BE valid and generated by `serde_json::to_vec` before being
/// sent in this function. This function is DUMB and FAST but makes a lot of
/// asumption about the way `serde_json` will generate its input.
///
/// Will return `true` if the JSON contains an object, an array of array
/// or an array containing an object. Returns `false` for everything else.
pub fn should_flatten_from_unchecked_slice(json: &[u8]) -> bool {
if json.is_empty() {
return false;
}

// since the json we receive has been generated by serde_json we know
// it doesn't contains any whitespace at the beginning thus we can check
// directly if we're looking at an object.
if json[0] == b'{' {
return true;
} else if json[0] != b'[' {
// if the json isn't an object or an array it means it's a simple value.
return false;
}

// The array case is a little bit more complex. We are looking for a second
// `[` but we need to ensure that it doesn't appear inside of a string. Thus
// we need to keep track of if we're in a string or not.

// will be used when we met a `\` to skip the next character.
let mut skip_next = false;
let mut in_string = false;

for byte in json.iter().skip(1) {
match byte {
// handle the backlash.
_ if skip_next => skip_next = false,
b'\\' => skip_next = true,

// handle the strings.
byte if in_string => {
if *byte == b'"' {
in_string = false;
}
}
b'"' => in_string = true,

// handle the arrays.
b'[' => return true,
// since we know the json is valid we don't need to ensure the
// array is correctly closed

// handle the objects.
b'{' => return true,

// ignore everything else
_ => (),
}
}

false
}

/// Consider using [`should_flatten_from_unchecked_slice`] when you can.
/// Will returns `true` if the json contains an object, an array of array
/// or an array containing an object.
/// Returns `false` for everything else.
/// This function has been written to test the [`should_flatten_from_unchecked_slice`].
pub fn should_flatten_from_value(json: &Value) -> bool {
match json {
Value::Object(..) => true,
Value::Array(array) => array.iter().any(|value| value.is_array() || value.is_object()),
_ => false,
}
}

#[cfg(test)]
mod tests {
use serde_json::*;

use super::*;

#[test]
fn test_shouldnt_flatten() {
let shouldnt_flatten = vec![
json!(null),
json!(true),
json!(false),
json!("a superb string"),
json!("a string escaping other \"string\""),
json!([null, true, false]),
json!(["hello", "world", "!"]),
json!(["a \"string\" escaping 'an other'", "\"[\"", "\"{\""]),
];
for value in shouldnt_flatten {
assert!(!should_flatten_from_value(&value));
let value = serde_json::to_vec(&value).unwrap();
assert!(!should_flatten_from_unchecked_slice(&value));
}
}

#[test]
fn test_should_flatten() {
let should_flatten = vec![
json!({}),
json!({ "hello": "world" }),
json!(["hello", ["world"]]),
json!([true, true, true, true, true, true, true, true, true, {}]),
];
for value in should_flatten {
assert!(should_flatten_from_value(&value));
let value = serde_json::to_vec(&value).unwrap();
assert!(should_flatten_from_unchecked_slice(&value));
}
}
}
1 change: 1 addition & 0 deletions milli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ flatten-serde-json = { path = "../flatten-serde-json" }
grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
geoutils = "0.4.1"
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" }
memmap2 = "0.5.3"
Expand Down
28 changes: 20 additions & 8 deletions milli/src/update/index_documents/transform.rs
Original file line number Diff line number Diff line change
Expand Up @@ -286,9 +286,10 @@ impl<'a, 'i> Transform<'a, 'i> {
})?;

self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?;
let buffer = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))?;

self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?;
match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? {
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?,
}
} else {
self.new_documents_ids.insert(docid);
}
Expand All @@ -300,8 +301,12 @@ impl<'a, 'i> Transform<'a, 'i> {
if let Some(flatten) = flattened_document {
self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?;
} else {
let buffer = self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))?;
self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?;
match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? {
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
None => {
self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?
}
}
}

progress_callback(UpdateIndexingStep::RemapDocumentAddition {
Expand All @@ -326,8 +331,15 @@ impl<'a, 'i> Transform<'a, 'i> {
}

// Flatten a document from the fields ids map contained in self and insert the new
// created fields.
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Vec<u8>> {
// created fields. Returns `None` if the document doesn't need to be flattened.
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {
if obkv
.iter()
.all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value))
{
return Ok(None);
}

let mut doc = serde_json::Map::new();

for (k, v) in obkv.iter() {
Expand Down Expand Up @@ -357,7 +369,7 @@ impl<'a, 'i> Transform<'a, 'i> {
writer.insert(fid, &value)?;
}

Ok(buffer)
Ok(Some(buffer))
}

// Flatten a document from a field mapping generated by [create_fields_mapping]
Expand Down

0 comments on commit bca7360

Please sign in to comment.