From d4c8855ff5d1e6dcb48b3686e3296fc049f80d2d Mon Sep 17 00:00:00 2001 From: Michael Cooper Date: Wed, 10 Nov 2021 11:24:57 -0800 Subject: [PATCH 1/3] fix(adm): Use Firefox's full_keyword algorithm Notably, this algorithm never suggests more than the rest of the current word. Fixes #213 --- merino-adm/src/remote_settings/mod.rs | 127 ++++++++++++++++++++++++-- 1 file changed, 118 insertions(+), 9 deletions(-) diff --git a/merino-adm/src/remote_settings/mod.rs b/merino-adm/src/remote_settings/mod.rs index 3e0a97bbc0..cf7e331955 100644 --- a/merino-adm/src/remote_settings/mod.rs +++ b/merino-adm/src/remote_settings/mod.rs @@ -157,28 +157,33 @@ impl RemoteSettingsSuggester { continue; }; - let full_keyword = adm_suggestion - .keywords - .iter() - .max_by_key(|kw| kw.len()) - .expect("No keywords?") - .clone(); - let merino_suggestion = Suggestion { id: adm_suggestion.id, title: adm_suggestion.title.clone(), url: adm_suggestion.url.clone(), impression_url: adm_suggestion.impression_url.clone(), click_url: adm_suggestion.click_url.clone(), - full_keyword, + full_keyword: String::new(), provider: adm_suggestion.advertiser.clone(), is_sponsored: !NON_SPONSORED_IAB_CATEGORIES .contains(&adm_suggestion.iab_category.as_str()), icon: icon_url, score: Proportion::from(0.2), }; + for keyword in &adm_suggestion.keywords { - new_suggestions.insert(keyword.clone(), merino_suggestion.clone()); + let full_keyword = Self::get_full_keyword(keyword, &adm_suggestion.keywords); + + if keyword == "amaz" { + dbg!(keyword, &adm_suggestion.keywords, &full_keyword); + } + new_suggestions.insert( + keyword.clone(), + Suggestion { + full_keyword, + ..merino_suggestion.clone() + }, + ); } } } @@ -202,6 +207,41 @@ impl RemoteSettingsSuggester { Ok(()) } + + /// Gets the "full keyword" (the suggested completion) for a query. The data + /// from adM doesn't include this data directly, so we make our own based on + /// the available keywords. + /// + /// 1. Find the first keyword phrase that has more words than the query. Use + /// its first `queryWords.length` words as the full keyword. e.g., if the + /// query is "moz" and `result.keywords` is ["moz", "mozi", "mozil", + /// "mozill", "mozilla", "mozilla firefox"], pick "mozilla firefox", pop + /// off the "firefox" and use "mozilla" as the full keyword. + /// 2. If there isn't any keyword phrase with more words, then pick the + /// longest phrase. e.g., pick "mozilla" in the previous example + /// (assuming the "mozilla firefox" phrase isn't there). That might be + /// the query itself. + /// + fn get_full_keyword(partial_query: &str, all_keywords: &[String]) -> String { + let query_num_words = partial_query.split_whitespace().count(); + + // heuristic 1: more words + if let Some(longer_keyword_words) = all_keywords + .iter() + .map(|keyword| keyword.split_whitespace().collect::>()) + .find(|split_words| split_words.len() > query_num_words) + { + longer_keyword_words[..query_num_words].join(" ") + } else { + // heuristic 2 - longest phrase with partial query as a prefix + all_keywords + .iter() + .filter(|keyword| keyword.starts_with(partial_query)) + .cloned() + .max_by_key(String::len) + .unwrap_or_else(|| partial_query.to_string()) + } + } } #[async_trait] @@ -359,4 +399,73 @@ mod tests { Ok(()) } + + #[test] + fn get_full_keyword_matches_doc_heuristic_1() { + assert_eq!( + RemoteSettingsSuggester::get_full_keyword( + "moz", + &[ + "moz".to_string(), + "mozi".to_string(), + "mozil".to_string(), + "mozill".to_string(), + "mozilla".to_string(), + "mozilla firefox".to_string(), + ] + ), + "mozilla" + ); + assert_eq!( + RemoteSettingsSuggester::get_full_keyword( + "one t", + &[ + "one".to_string(), + "one t".to_string(), + "one tw".to_string(), + "one two".to_string(), + "one two t".to_string(), + "one two th".to_string(), + "one two thr".to_string(), + "one two thre".to_string(), + "one two three".to_string(), + ] + ), + "one two" + ); + } + + #[test] + fn get_full_keyword_matches_doc_heuristic_2() { + assert_eq!( + RemoteSettingsSuggester::get_full_keyword( + "moz", + &[ + "moz".to_string(), + "mozi".to_string(), + "mozil".to_string(), + "mozill".to_string(), + "mozilla".to_string(), + ] + ), + "mozilla" + ); + assert_eq!( + RemoteSettingsSuggester::get_full_keyword( + "one two t", + &[ + "one".to_string(), + "one t".to_string(), + "one tw".to_string(), + "one two".to_string(), + "one two t".to_string(), + "one two th".to_string(), + "one two thr".to_string(), + "one two thre".to_string(), + "one two three".to_string(), + ] + ), + "one two tree" + ); + } } From aedaef6b7c554e5a2c4e90dd992b0b4f55508a9d Mon Sep 17 00:00:00 2001 From: Michael Cooper Date: Wed, 10 Nov 2021 14:01:06 -0800 Subject: [PATCH 2/3] f review --- merino-adm/src/remote_settings/mod.rs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/merino-adm/src/remote_settings/mod.rs b/merino-adm/src/remote_settings/mod.rs index cf7e331955..fe04ca2527 100644 --- a/merino-adm/src/remote_settings/mod.rs +++ b/merino-adm/src/remote_settings/mod.rs @@ -174,9 +174,6 @@ impl RemoteSettingsSuggester { for keyword in &adm_suggestion.keywords { let full_keyword = Self::get_full_keyword(keyword, &adm_suggestion.keywords); - if keyword == "amaz" { - dbg!(keyword, &adm_suggestion.keywords, &full_keyword); - } new_suggestions.insert( keyword.clone(), Suggestion { @@ -213,13 +210,13 @@ impl RemoteSettingsSuggester { /// the available keywords. /// /// 1. Find the first keyword phrase that has more words than the query. Use - /// its first `queryWords.length` words as the full keyword. e.g., if the - /// query is "moz" and `result.keywords` is ["moz", "mozi", "mozil", - /// "mozill", "mozilla", "mozilla firefox"], pick "mozilla firefox", pop - /// off the "firefox" and use "mozilla" as the full keyword. + /// its first `query_num_words` words as the full keyword. e.g., if the + /// query is `"moz"` and `all_keywords` is `["moz", "mozi", "mozil", + /// "mozill", "mozilla", "mozilla firefox"]`, pick `"mozilla firefox"`, + /// pop off the `"firefox"` and use `"mozilla"` as the full keyword. /// 2. If there isn't any keyword phrase with more words, then pick the - /// longest phrase. e.g., pick "mozilla" in the previous example - /// (assuming the "mozilla firefox" phrase isn't there). That might be + /// longest phrase. e.g., pick `"`mozilla" in the previous example + /// (assuming the `"mozilla firefox"` phrase isn't there). That might be /// the query itself. /// fn get_full_keyword(partial_query: &str, all_keywords: &[String]) -> String { From b0fdfa961976f066986d933fa64a27c111f01e0b Mon Sep 17 00:00:00 2001 From: Michael Cooper Date: Wed, 10 Nov 2021 14:01:50 -0800 Subject: [PATCH 3/3] f tests --- merino-adm/src/remote_settings/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/merino-adm/src/remote_settings/mod.rs b/merino-adm/src/remote_settings/mod.rs index fe04ca2527..42a8699935 100644 --- a/merino-adm/src/remote_settings/mod.rs +++ b/merino-adm/src/remote_settings/mod.rs @@ -462,7 +462,7 @@ mod tests { "one two three".to_string(), ] ), - "one two tree" + "one two three" ); } }