Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Use smartstring to store the external id in our hashmap
Browse files Browse the repository at this point in the history
We need to store all the external id (primary key) in a hashmap
associated to their internal id during.
The smartstring remove heap allocation / memory usage and should
improve the cache locality.
  • Loading branch information
irevoire committed Apr 11, 2022
1 parent c830661 commit 3f34351
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 7 deletions.
1 change: 1 addition & 0 deletions milli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ serde_json = { version = "1.0.79", features = ["preserve_order"] }
slice-group-by = "0.3.0"
smallstr = { version = "0.3.0", features = ["serde"] }
smallvec = "1.8.0"
smartstring = "1.0.1"
tempfile = "3.3.0"
time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] }
uuid = { version = "0.8.2", features = ["v4"] }
Expand Down
7 changes: 5 additions & 2 deletions milli/src/update/index_documents/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1109,8 +1109,11 @@ mod tests {

let mut big_object = HashMap::new();
big_object.insert(S("id"), "wow");
let content: String =
(0..=u16::MAX).into_iter().map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap();
let content: String = (0..=u16::MAX)
.into_iter()
.map(|p| p.to_string())
.reduce(|a, b| a + " " + b.as_ref())
.unwrap();
big_object.insert("content".to_string(), &content);

let mut cursor = Cursor::new(Vec::new());
Expand Down
9 changes: 4 additions & 5 deletions milli/src/update/index_documents/transform.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use itertools::Itertools;
use obkv::{KvReader, KvWriter};
use roaring::RoaringBitmap;
use serde_json::{Map, Value};
use smartstring::SmartString;

use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
use super::{IndexDocumentsMethod, IndexerConfig};
Expand Down Expand Up @@ -55,7 +56,8 @@ pub struct Transform<'a, 'i> {
flattened_sorter: grenad::Sorter<MergeFn>,
replaced_documents_ids: RoaringBitmap,
new_documents_ids: RoaringBitmap,
new_external_documents_ids_builder: FxHashMap<Vec<u8>, u64>,
// To increase the cache locality and the heap usage we use smartstring.
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
documents_count: usize,
}

Expand Down Expand Up @@ -254,10 +256,7 @@ impl<'a, 'i> Transform<'a, 'i> {
None => {
// if the document has already been inserted in this
// batch we need to get its docid
match self
.new_external_documents_ids_builder
.entry(external_id.as_bytes().to_vec())
{
match self.new_external_documents_ids_builder.entry(external_id.into()) {
Entry::Occupied(entry) => (*entry.get() as u32, false),
// if the document has never been encountered we give it a new docid
// and push this new docid to the external documents ids builder
Expand Down

0 comments on commit 3f34351

Please sign in to comment.