Merge #752

752: Simplify primary key inference r=irevoire a=dureuill

# Pull Request

## Related issue

Related to meilisearch/meilisearch#3233

## What does this PR do?

### User PoV

- Change primary key inference to only consider a value as a candidate when it ends with "id", rather than when it simply contains "id".
- Change primary key inference to always fail when there are multiple candidates.
- Replace `UserError::MissingPrimaryKey` with `UserError::NoPrimaryKeyCandidateFound` and `UserError::MultiplePrimaryKeyCandidatesFound`

### Implementation-wise

- Remove uses of `UserError::MissingPrimaryKey` not pertaining to inference. This introduces a possible panicking path.

## PR checklist

Please check if your PR fulfills the following requirements:

- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
meilisearch · Jan 2, 2023 · 31155dc · 31155dc
2 parents a8defb5 + 4b166be
commit 31155dc
Show file tree

Hide file tree

Showing 4 changed files with 107 additions and 14 deletions.
diff --git a/milli/src/error.rs b/milli/src/error.rs
@@ -130,8 +130,10 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
     MissingDocumentId { primary_key: String, document: Object },
     #[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
     TooManyDocumentIds { primary_key: String, document: Object },
-    #[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")]
-    MissingPrimaryKey,
+    #[error("The primary key inference process failed because the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")]
+    NoPrimaryKeyCandidateFound,
+    #[error("The primary key inference process failed because the engine found {} fields ending with `id` in their name, such as '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())]
+    MultiplePrimaryKeyCandidatesFound { candidates: Vec<String> },
     #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")]
     NoSpaceLeftOnDevice,
     #[error("Index already has a primary key: `{0}`.")]

diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs
@@ -21,6 +21,10 @@ const DEFAULT_PRIMARY_KEY: &str = "id";
 ///  - all the documents id exist and are extracted,
 ///  - the validity of them but also,
 ///  - the validity of the `_geo` field depending on the settings.
+///
+/// # Panics
+///
+/// - if reader.is_empty(), this function may panic in some cases
 pub fn enrich_documents_batch<R: Read + Seek>(
     rtxn: &heed::RoTxn,
     index: &Index,
@@ -49,22 +53,44 @@ pub fn enrich_documents_batch<R: Read + Seek>(
                         primary_key: primary_key.to_string(),
                         document: obkv_to_object(&first_document, &documents_batch_index)?,
                     })),
-                    None => Ok(Err(UserError::MissingPrimaryKey)),
+                    None => unreachable!("Called with reader.is_empty()"),
                 };
             }
         },
         None => {
-            let guessed = documents_batch_index
+            let mut guesses: Vec<(u16, &str)> = documents_batch_index
                 .iter()
-                .filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY))
-                .min_by_key(|(fid, _)| *fid);
-            match guessed {
-                Some((id, name)) => PrimaryKey::flat(name.as_str(), *id),
-                None if autogenerate_docids => PrimaryKey::flat(
+                .filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
+                .map(|(field_id, name)| (*field_id, name.as_str()))
+                .collect();
+
+            // sort the keys in a deterministic, obvious way, so that fields are always in the same order.
+            guesses.sort_by(|(_, left_name), (_, right_name)| {
+                // shortest name first
+                left_name.len().cmp(&right_name.len()).then_with(
+                    // then alphabetical order
+                    || left_name.cmp(right_name),
+                )
+            });
+
+            match guesses.as_slice() {
+                [] if autogenerate_docids => PrimaryKey::flat(
                     DEFAULT_PRIMARY_KEY,
                     documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
                 ),
-                None => return Ok(Err(UserError::MissingPrimaryKey)),
+                [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
+                [(field_id, name)] => {
+                    log::info!("Primary key was not specified in index. Inferred to '{name}'");
+                    PrimaryKey::flat(name, *field_id)
+                }
+                multiple => {
+                    return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
+                        candidates: multiple
+                            .iter()
+                            .map(|(_, candidate)| candidate.to_string())
+                            .collect(),
+                    }));
+                }
             }
         }
     };

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
@@ -1658,6 +1658,12 @@ mod tests {
             "branch_id_number": 0
         }]};
 
+        {
+            let mut wtxn = index.write_txn().unwrap();
+            index.put_primary_key(&mut wtxn, "id").unwrap();
+            wtxn.commit().unwrap();
+        }
+
         index.add_documents(doc1).unwrap();
         index.add_documents(doc2).unwrap();
 
@@ -1814,6 +1820,56 @@ mod tests {
         index.add_documents(doc4).unwrap_err();
     }
 
+    #[test]
+    fn primary_key_inference() {
+        let index = TempIndex::new();
+
+        let doc_no_id = documents! {[{
+            "title": "asdsad",
+            "state": "automated",
+            "priority": "normal",
+            "branch_id_number": 0
+        }]};
+        assert!(matches!(
+            index.add_documents(doc_no_id),
+            Err(Error::UserError(UserError::NoPrimaryKeyCandidateFound))
+        ));
+
+        let doc_multiple_ids = documents! {[{
+            "id": 228143,
+            "title": "something",
+            "state": "automated",
+            "priority": "normal",
+            "public_uid": "39c6499b",
+            "project_id": 78207,
+            "branch_id_number": 0
+        }]};
+
+        let Err(Error::UserError(UserError::MultiplePrimaryKeyCandidatesFound {
+            candidates
+        })) =
+            index.add_documents(doc_multiple_ids) else { panic!("Expected Error::UserError(MultiplePrimaryKeyCandidatesFound)") };
+
+        assert_eq!(candidates, vec![S("id"), S("project_id"), S("public_uid"),]);
+
+        let doc_inferable = documents! {[{
+            "video": "test.mp4",
+            "id": 228143,
+            "title": "something",
+            "state": "automated",
+            "priority": "normal",
+            "public_uid_": "39c6499b",
+            "project_id_": 78207,
+            "branch_id_number": 0
+        }]};
+
+        index.add_documents(doc_inferable).unwrap();
+
+        let txn = index.read_txn().unwrap();
+
+        assert_eq!(index.primary_key(&txn).unwrap().unwrap(), "id");
+    }
+
     #[test]
     fn long_words_must_be_skipped() {
         let index = TempIndex::new();

diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
@@ -16,7 +16,7 @@ use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs
 use super::{IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
-use crate::index::db_name;
+use crate::index::{db_name, main_key};
 use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
 use crate::{
     ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
@@ -459,7 +459,10 @@ impl<'a, 'i> Transform<'a, 'i> {
         let primary_key = self
             .index
             .primary_key(wtxn)?
-            .ok_or(Error::UserError(UserError::MissingPrimaryKey))?
+            .ok_or(Error::InternalError(InternalError::DatabaseMissingEntry {
+                db_name: db_name::MAIN,
+                key: Some(main_key::PRIMARY_KEY_KEY),
+            }))?
             .to_string();
 
         let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
@@ -557,8 +560,14 @@ impl<'a, 'i> Transform<'a, 'i> {
         mut new_fields_ids_map: FieldsIdsMap,
     ) -> Result<TransformOutput> {
         // There already has been a document addition, the primary key should be set by now.
-        let primary_key =
-            self.index.primary_key(wtxn)?.ok_or(UserError::MissingPrimaryKey)?.to_string();
+        let primary_key = self
+            .index
+            .primary_key(wtxn)?
+            .ok_or(InternalError::DatabaseMissingEntry {
+                db_name: db_name::MAIN,
+                key: Some(main_key::PRIMARY_KEY_KEY),
+            })?
+            .to_string();
         let field_distribution = self.index.field_distribution(wtxn)?;
 
         // Delete the soft deleted document ids from the maps inside the external_document_ids structure