Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
add benchmarks for indexing
Browse files Browse the repository at this point in the history
  • Loading branch information
irevoire committed Jul 7, 2021
1 parent 4c9531b commit 931021f
Show file tree
Hide file tree
Showing 5 changed files with 336 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on:
workflow_dispatch:
inputs:
dataset_name:
description: 'The name of the dataset used to benchmark (songs or wiki)'
description: 'The name of the dataset used to benchmark (songs, wiki or indexing)'
required: false
default: 'songs'

Expand Down
4 changes: 4 additions & 0 deletions benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,7 @@ harness = false
[[bench]]
name = "wiki"
harness = false

[[bench]]
name = "indexing"
harness = false
13 changes: 9 additions & 4 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ _[More about critcmp](https://github.com/BurntSushi/critcmp)._

### On your machine

To run all the benchmarks (~4h):
To run all the benchmarks (~5h):

```bash
cargo bench
```

To run only the `songs` (~1h) or `wiki` (~3h) benchmark:
To run only the `songs` (~1h), `wiki` (~3h) or `indexing` (~4h) benchmark:

```bash
cargo bench --bench <dataset name>
Expand All @@ -47,7 +47,7 @@ If you don't want to download the datasets every time you update something on th

```bash
mkdir ~/datasets
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the three datasets are downloaded
touch build.rs
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
```
Expand Down Expand Up @@ -84,6 +84,7 @@ Run the comparison script:
The benchmarks are available for the following datasets:
- `songs`
- `wiki`
- `movies`

### Songs

Expand All @@ -107,5 +108,9 @@ It was generated with the following command:
xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv
```

_[Download the generated `wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._
### Movies

`movies` is a really small dataset that we use as the example in the [getting started](https://docs.meilisearch.com/learn/getting_started/) guide.

_[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._

314 changes: 314 additions & 0 deletions benchmarks/benches/indexing.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
mod datasets_paths;

use std::fs::{create_dir_all, remove_dir_all, File};
use std::path::Path;

use criterion::{criterion_group, criterion_main, Criterion};
use heed::EnvOpenOptions;
use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
use milli::Index;

// Use jemalloc as the process-wide allocator for the benchmarks.
// The `cfg` attribute restricts this to Linux builds; other targets keep
// the default allocator.
#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

/// Makes sure `path` is an existing, empty directory.
///
/// Whatever is currently stored at `path` is removed recursively, then the
/// directory (and any missing parent) is created again.
///
/// # Panics
///
/// Panics if the removal fails for any reason other than the directory not
/// existing, or if the directory cannot be created.
fn setup_dir(path: impl AsRef<Path>) {
    let dir = path.as_ref();

    if let Err(err) = remove_dir_all(dir) {
        // A missing directory simply means there is nothing to clean up.
        if err.kind() != std::io::ErrorKind::NotFound {
            panic!("{}", err);
        }
    }

    create_dir_all(dir).unwrap();
}

/// Creates a brand new, empty index in the `benches.mmdb` directory.
///
/// The directory is wiped first, so every benchmark starts from a clean
/// database. The environment is opened with a 100 GB map size and at most
/// 10 readers.
///
/// # Panics
///
/// Panics if the directory cannot be reset or if the index cannot be opened.
fn setup_index() -> Index {
    let db_dir = "benches.mmdb";
    setup_dir(db_dir);

    let map_size = 100 * 1024 * 1024 * 1024; // 100 GB
    let mut env_options = EnvOpenOptions::new();
    env_options.map_size(map_size).max_readers(10);

    Index::new(env_options, db_dir).unwrap()
}

/// Benchmarks indexing the songs dataset (`datasets_paths::SMOL_SONGS`, CSV)
/// with the complete set of settings.
///
/// Before measuring, a fresh index is configured with `id` as primary key,
/// seven displayed fields, three searchable fields and five filterable fields
/// (`released-timestamp`, `duration-float`, `genre`, `country`, `artist`).
///
/// For every sample, the setup closure deletes all the documents of the index,
/// then the routine closure opens the dataset, indexes it with
/// `ReplaceDocuments` and commits the write transaction.
///
/// # Panics
///
/// Panics if the index cannot be created or updated, or if the dataset file
/// cannot be opened.
fn indexing_songs_default(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_filterable_fields(faceted_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing songs with default settings", |b| {
        b.iter_with_setup(
            // Setup: empty the index so each sample indexes from scratch.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
            },
            // Routine: index the whole dataset and commit.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                // The message is only built on failure, so no string is
                // allocated inside the measured routine on the happy path.
                let reader = File::open(datasets_paths::SMOL_SONGS).unwrap_or_else(|e| {
                    panic!(
                        "could not find the dataset in: {}: {:?}",
                        datasets_paths::SMOL_SONGS,
                        e
                    )
                });
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}

/// Benchmarks indexing the songs dataset (`datasets_paths::SMOL_SONGS`, CSV)
/// with only `genre`, `country` and `artist` as filterable fields.
///
/// Before measuring, a fresh index is configured with `id` as primary key,
/// seven displayed fields and three searchable fields. Unlike the default
/// songs benchmark, `released-timestamp` and `duration-float` are not
/// filterable.
///
/// For every sample, the setup closure deletes all the documents of the index,
/// then the routine closure opens the dataset, indexes it with
/// `ReplaceDocuments` and commits the write transaction.
///
/// # Panics
///
/// Panics if the index cannot be created or updated, or if the dataset file
/// cannot be opened.
fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    let faceted_fields = ["genre", "country", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_filterable_fields(faceted_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing songs without faceted numbers", |b| {
        b.iter_with_setup(
            // Setup: empty the index so each sample indexes from scratch.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
            },
            // Routine: index the whole dataset and commit.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                // The message is only built on failure, so no string is
                // allocated inside the measured routine on the happy path.
                let reader = File::open(datasets_paths::SMOL_SONGS).unwrap_or_else(|e| {
                    panic!(
                        "could not find the dataset in: {}: {:?}",
                        datasets_paths::SMOL_SONGS,
                        e
                    )
                });
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}

/// Benchmarks indexing the songs dataset (`datasets_paths::SMOL_SONGS`, CSV)
/// without any filterable field.
///
/// Before measuring, a fresh index is configured with `id` as primary key,
/// seven displayed fields and three searchable fields; no filterable fields
/// are set.
///
/// For every sample, the setup closure deletes all the documents of the index,
/// then the routine closure opens the dataset, indexes it with
/// `ReplaceDocuments` and commits the write transaction.
///
/// # Panics
///
/// Panics if the index cannot be created or updated, or if the dataset file
/// cannot be opened.
fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing songs without any facets", |b| {
        b.iter_with_setup(
            // Setup: empty the index so each sample indexes from scratch.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
            },
            // Routine: index the whole dataset and commit.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                // The message is only built on failure, so no string is
                // allocated inside the measured routine on the happy path.
                let reader = File::open(datasets_paths::SMOL_SONGS).unwrap_or_else(|e| {
                    panic!(
                        "could not find the dataset in: {}: {:?}",
                        datasets_paths::SMOL_SONGS,
                        e
                    )
                });
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}

/// Benchmarks indexing the wiki dataset
/// (`datasets_paths::SMOL_WIKI_ARTICLES`, CSV).
///
/// Before measuring, a fresh index is configured with `id` as primary key,
/// `title`, `body` and `url` as displayed fields, and `title` and `body` as
/// searchable fields; no filterable fields are set.
///
/// For every sample, the setup closure deletes all the documents of the index,
/// then the routine closure opens the dataset, indexes it with
/// `ReplaceDocuments` and commits the write transaction.
///
/// # Panics
///
/// Panics if the index cannot be created or updated, or if the dataset file
/// cannot be opened.
fn indexing_wiki(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    // there are NO faceted fields at all

    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing wiki", |b| {
        b.iter_with_setup(
            // Setup: empty the index so each sample indexes from scratch.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
            },
            // Routine: index the whole dataset and commit.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                // Report the path that was actually opened (the wiki dataset,
                // not the songs one), and only build the message on failure.
                let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).unwrap_or_else(|e| {
                    panic!(
                        "could not find the dataset in: {}: {:?}",
                        datasets_paths::SMOL_WIKI_ARTICLES,
                        e
                    )
                });
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}

/// Benchmarks indexing the movies dataset (`datasets_paths::MOVIES`, JSON).
///
/// Before measuring, a fresh index is configured with `id` as primary key,
/// five displayed fields, `title` and `overview` as searchable fields, and
/// `release_date` and `genres` as filterable fields.
///
/// For every sample, the setup closure deletes all the documents of the index,
/// then the routine closure opens the dataset, indexes it with
/// `ReplaceDocuments` and commits the write transaction.
///
/// # Panics
///
/// Panics if the index cannot be created or updated, or if the dataset file
/// cannot be opened.
fn indexing_movies_default(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "poster", "overview", "release_date", "genres"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "overview"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    // Must use the same field name as in the displayed fields above
    // (`release_date`); a misspelled name would leave the date unfaceted.
    let faceted_fields = ["release_date", "genres"].iter().map(|s| s.to_string()).collect();
    builder.set_filterable_fields(faceted_fields);

    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing movies with default settings", |b| {
        b.iter_with_setup(
            // Setup: empty the index so each sample indexes from scratch.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
            },
            // Routine: index the whole dataset and commit.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Json);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                // The message is only built on failure, so no string is
                // allocated inside the measured routine on the happy path.
                let reader = File::open(datasets_paths::MOVIES).unwrap_or_else(|e| {
                    panic!("could not find the dataset in: {}: {:?}", datasets_paths::MOVIES, e)
                });
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}

// Gather the five indexing benchmarks defined in this file into a single
// Criterion group named `benches`, listed in the order they are declared.
criterion_group!(
benches,
indexing_songs_default,
indexing_songs_without_faceted_numbers,
indexing_songs_without_faceted_fields,
indexing_wiki,
indexing_movies_default
);
// Entry point of the bench target: hands the `benches` group to Criterion.
criterion_main!(benches);
Loading

0 comments on commit 931021f

Please sign in to comment.