Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Only flatten the required objects #494

Merged
merged 3 commits into from
Apr 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[workspace]
resolver = "2"
members = ["milli", "filter-parser", "flatten-serde-json", "http-ui", "benchmarks", "infos", "helpers", "cli"]
members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", "http-ui", "benchmarks", "infos", "helpers", "cli"]
default-members = ["milli"]

[profile.dev]
Expand Down
16 changes: 16 additions & 0 deletions json-depth-checker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[package]
name = "json-depth-checker"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
serde_json = "1.0"

[dev-dependencies]
criterion = "0.3"

[[bench]]
name = "depth"
harness = false
59 changes: 59 additions & 0 deletions json-depth-checker/benches/depth.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
use criterion::{criterion_group, criterion_main, Criterion};
use json_depth_checker::should_flatten_from_unchecked_slice;
use serde_json::json;

fn criterion_benchmark(c: &mut Criterion) {
let null = serde_json::to_vec(&json!(null)).unwrap();
let bool_true = serde_json::to_vec(&json!(true)).unwrap();
let bool_false = serde_json::to_vec(&json!(false)).unwrap();
let integer = serde_json::to_vec(&json!(42)).unwrap();
let float = serde_json::to_vec(&json!(1456.258)).unwrap();
let string = serde_json::to_vec(&json!("hello world")).unwrap();
let object = serde_json::to_vec(&json!({ "hello": "world",})).unwrap();
let complex_object = serde_json::to_vec(&json!({
"doggos": [
{ "bernard": true },
{ "michel": 42 },
false,
],
"bouvier": true,
"caniche": null,
}))
.unwrap();
let simple_array = serde_json::to_vec(&json!([
1,
2,
3,
"viva",
"l\"algeria",
irevoire marked this conversation as resolved.
Show resolved Hide resolved
true,
"[array]",
"escaped string \""
]))
.unwrap();
let array_of_array = serde_json::to_vec(&json!([1, [2, [3]]])).unwrap();
let array_of_object = serde_json::to_vec(&json!([1, [2, [3]], {}])).unwrap();

c.bench_function("null", |b| b.iter(|| should_flatten_from_unchecked_slice(&null)));
c.bench_function("true", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_true)));
c.bench_function("false", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_false)));
c.bench_function("integer", |b| b.iter(|| should_flatten_from_unchecked_slice(&integer)));
c.bench_function("float", |b| b.iter(|| should_flatten_from_unchecked_slice(&float)));
c.bench_function("string", |b| b.iter(|| should_flatten_from_unchecked_slice(&string)));
c.bench_function("object", |b| b.iter(|| should_flatten_from_unchecked_slice(&object)));
c.bench_function("complex object", |b| {
b.iter(|| should_flatten_from_unchecked_slice(&complex_object))
});
c.bench_function("simple array", |b| {
b.iter(|| should_flatten_from_unchecked_slice(&simple_array))
});
c.bench_function("array of array", |b| {
b.iter(|| should_flatten_from_unchecked_slice(&array_of_array))
});
c.bench_function("array of object", |b| {
b.iter(|| should_flatten_from_unchecked_slice(&array_of_object))
});
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
27 changes: 27 additions & 0 deletions json-depth-checker/fuzz/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
[package]
name = "json-depth-checker"
version = "0.0.0"
authors = ["Automatically generated"]
publish = false
edition = "2018"

[package.metadata]
cargo-fuzz = true

[dependencies]
libfuzzer-sys = "0.4"
arbitrary-json = "0.1.1"
serde_json = "1.0.79"

[dependencies.json-depth-checker]
path = ".."

# Prevent this from interfering with workspaces
[workspace]
members = ["."]

[[bin]]
name = "depth"
path = "fuzz_targets/depth.rs"
test = false
doc = false
13 changes: 13 additions & 0 deletions json-depth-checker/fuzz/fuzz_targets/depth.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#![no_main]
use arbitrary_json::ArbitraryValue;
use json_depth_checker::*;
use libfuzzer_sys::fuzz_target;

fuzz_target!(|value: ArbitraryValue| {
let value = serde_json::Value::from(value);
let left = should_flatten_from_value(&value);
let value = serde_json::to_vec(&value).unwrap();
let right = should_flatten_from_unchecked_slice(&value);

assert_eq!(left, right);
});
114 changes: 114 additions & 0 deletions json-depth-checker/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
use serde_json::Value;

/// Your json MUST BE valid and generated by `serde_json::to_vec` before being
/// sent in this function. This function is DUMB and FAST but makes a lot of
/// asumption about the way `serde_json` will generate its input.
///
/// Will return `true` if the JSON contains an object, an array of array
/// or an array containing an object. Returns `false` for everything else.
pub fn should_flatten_from_unchecked_slice(json: &[u8]) -> bool {
if json.is_empty() {
return false;
}

// since the json we receive has been generated by serde_json we know
// it doesn't contains any whitespace at the beginning thus we can check
// directly if we're looking at an object.
if json[0] == b'{' {
return true;
} else if json[0] != b'[' {
// if the json isn't an object or an array it means it's a simple value.
return false;
}

// The array case is a little bit more complex. We are looking for a second
// `[` but we need to ensure that it doesn't appear inside of a string. Thus
// we need to keep track of if we're in a string or not.

// will be used when we met a `\` to skip the next character.
let mut skip_next = false;
let mut in_string = false;

for byte in json.iter().skip(1) {
match byte {
// handle the backlash.
_ if skip_next => skip_next = false,
b'\\' => skip_next = true,

// handle the strings.
byte if in_string => {
if *byte == b'"' {
in_string = false;
}
}
b'"' => in_string = true,

// handle the arrays.
b'[' => return true,
// since we know the json is valid we don't need to ensure the
// array is correctly closed

// handle the objects.
b'{' => return true,

// ignore everything else
_ => (),
}
}

false
}

/// Consider using [`should_flatten_from_unchecked_slice`] when you can.
/// Will returns `true` if the json contains an object, an array of array
/// or an array containing an object.
/// Returns `false` for everything else.
/// This function has been written to test the [`should_flatten_from_unchecked_slice`].
pub fn should_flatten_from_value(json: &Value) -> bool {
match json {
Value::Object(..) => true,
Value::Array(array) => array.iter().any(|value| value.is_array() || value.is_object()),
_ => false,
}
}

#[cfg(test)]
mod tests {
use serde_json::*;

use super::*;

#[test]
fn test_shouldnt_flatten() {
let shouldnt_flatten = vec![
json!(null),
json!(true),
json!(false),
json!("a superb string"),
json!("a string escaping other \"string\""),
json!([null, true, false]),
json!(["hello", "world", "!"]),
json!(["a \"string\" escaping 'an other'", "\"[\"", "\"{\""]),
];
for value in shouldnt_flatten {
assert!(!should_flatten_from_value(&value));
let value = serde_json::to_vec(&value).unwrap();
assert!(!should_flatten_from_unchecked_slice(&value));
}
}

#[test]
fn test_should_flatten() {
let should_flatten = vec![
json!({}),
json!({ "hello": "world" }),
json!(["hello", ["world"]]),
json!([true, true, true, true, true, true, true, true, true, {}]),
];
for value in should_flatten {
assert!(should_flatten_from_value(&value));
let value = serde_json::to_vec(&value).unwrap();
assert!(should_flatten_from_unchecked_slice(&value));
}
}
}
1 change: 1 addition & 0 deletions milli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ flatten-serde-json = { path = "../flatten-serde-json" }
grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
geoutils = "0.4.1"
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" }
memmap2 = "0.5.3"
Expand Down
28 changes: 20 additions & 8 deletions milli/src/update/index_documents/transform.rs
Original file line number Diff line number Diff line change
Expand Up @@ -286,9 +286,10 @@ impl<'a, 'i> Transform<'a, 'i> {
})?;

self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?;
let buffer = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))?;

self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?;
match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? {
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?,
}
} else {
self.new_documents_ids.insert(docid);
}
Expand All @@ -300,8 +301,12 @@ impl<'a, 'i> Transform<'a, 'i> {
if let Some(flatten) = flattened_document {
self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?;
} else {
let buffer = self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))?;
self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?;
match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? {
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
None => {
self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?
}
}
}

progress_callback(UpdateIndexingStep::RemapDocumentAddition {
Expand All @@ -326,8 +331,15 @@ impl<'a, 'i> Transform<'a, 'i> {
}

// Flatten a document from the fields ids map contained in self and insert the new
// created fields.
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Vec<u8>> {
// created fields. Returns `None` if the document doesn't need to be flattened.
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {
if obkv
.iter()
.all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value))
Kerollmops marked this conversation as resolved.
Show resolved Hide resolved
{
return Ok(None);
}

let mut doc = serde_json::Map::new();

for (k, v) in obkv.iter() {
Expand Down Expand Up @@ -357,7 +369,7 @@ impl<'a, 'i> Transform<'a, 'i> {
writer.insert(fid, &value)?;
}

Ok(buffer)
Ok(Some(buffer))
}

// Flatten a document from a field mapping generated by [create_fields_mapping]
Expand Down