Enable usage of FuzzyTermQuery for specific fields via QueryParser #1750

Merged 5 commits on Jan 4, 2023
2 changes: 1 addition & 1 deletion src/directory/ram_directory.rs
@@ -232,7 +232,7 @@ impl Directory for RamDirectory {
         let path_buf = PathBuf::from(path);
         self.fs.write().unwrap().write(path_buf, data);
         if path == *META_FILEPATH {
-            let _ = self.fs.write().unwrap().watch_router.broadcast();
+            drop(self.fs.write().unwrap().watch_router.broadcast());
         }
         Ok(())
     }
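The `let _ = …` → `drop(…)` replacements in this PR (here, in `watch_event_router.rs`, and in `segment_updater.rs`) follow clippy's `let_underscore_drop` lint. Both forms discard the value immediately; `drop(…)` just states that intent explicitly. A minimal standalone sketch (the `Guard` type is illustrative, not from tantivy):

```rust
struct Guard;

impl Drop for Guard {
    fn drop(&mut self) {
        println!("guard released");
    }
}

fn make_guard() -> Guard {
    Guard
}

fn main() {
    // `let _ = ...` drops the value at the end of this statement, but it
    // is easy to misread as keeping something alive for the scope;
    // clippy's `let_underscore_drop` flags it for types that implement `Drop`.
    let _ = make_guard();

    // `drop(...)` is unambiguous: the value is consumed here and its
    // destructor runs immediately.
    drop(make_guard());
}
```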
2 changes: 1 addition & 1 deletion src/directory/watch_event_router.rs
@@ -168,7 +168,7 @@ mod tests {
         watch_event_router.broadcast().wait().unwrap();
         assert_eq!(2, counter.load(Ordering::SeqCst));
         mem::drop(handle_a);
-        let _ = watch_event_router.broadcast();
+        drop(watch_event_router.broadcast());
         watch_event_router.broadcast().wait().unwrap();
         assert_eq!(2, counter.load(Ordering::SeqCst));
     }
8 changes: 4 additions & 4 deletions src/indexer/json_term_writer.rs
@@ -89,11 +89,11 @@ pub(crate) fn index_json_values<'a>(
     Ok(())
 }
 
-fn index_json_object<'a>(
+fn index_json_object(
     doc: DocId,
     json_value: &serde_json::Map<String, serde_json::Value>,
     text_analyzer: &TextAnalyzer,
-    json_term_writer: &mut JsonTermWriter<'a>,
+    json_term_writer: &mut JsonTermWriter,
     postings_writer: &mut dyn PostingsWriter,
     ctx: &mut IndexingContext,
     positions_per_path: &mut IndexingPositionsPerPath,
@@ -113,11 +113,11 @@ fn index_json_object<'a>(
     }
 }
 
-fn index_json_value<'a>(
+fn index_json_value(
     doc: DocId,
     json_value: &serde_json::Value,
     text_analyzer: &TextAnalyzer,
-    json_term_writer: &mut JsonTermWriter<'a>,
+    json_term_writer: &mut JsonTermWriter,
     postings_writer: &mut dyn PostingsWriter,
     ctx: &mut IndexingContext,
     positions_per_path: &mut IndexingPositionsPerPath,
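Dropping the `<'a>` here is clippy's `needless_lifetimes` pattern: the lifetime appears in a single argument position, so naming it expresses no constraint the compiler would not infer anyway. A standalone sketch of the same transformation (types are illustrative, not tantivy's):

```rust
struct TermWriter<'a> {
    buf: &'a mut String,
}

// Before: `'a` is declared but used in only one place, so the explicit
// name adds nothing.
fn write_explicit<'a>(writer: &mut TermWriter<'a>, s: &str) {
    writer.buf.push_str(s);
}

// After: elided. `TermWriter<'_>` (or bare `TermWriter`, as in this PR)
// means exactly what the compiler inferred before.
fn write_elided(writer: &mut TermWriter<'_>, s: &str) {
    writer.buf.push_str(s);
}

fn main() {
    let mut buf = String::new();
    let mut writer = TermWriter { buf: &mut buf };
    write_explicit(&mut writer, "a");
    write_elided(&mut writer, "b");
    assert_eq!(buf, "ab");
}
```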
2 changes: 1 addition & 1 deletion src/indexer/segment_updater.rs
@@ -577,7 +577,7 @@ impl SegmentUpdater {
         for merge_operation in merge_candidates {
             // If a merge cannot be started this is not a fatal error.
             // We do log a warning in `start_merge`.
-            let _ = self.start_merge(merge_operation);
+            drop(self.start_merge(merge_operation));
         }
     }
2 changes: 1 addition & 1 deletion src/postings/compression/mod.rs
@@ -291,7 +291,7 @@ pub mod tests {
         const PADDING_VALUE: u32 = 234_234_345u32;
         let expected_length = 154;
         let mut encoder = BlockEncoder::new();
-        let input: Vec<u32> = (0u32..123u32).map(|i| 4 + i * 7 / 2).into_iter().collect();
+        let input: Vec<u32> = (0u32..123u32).map(|i| 4 + i * 7 / 2).collect();
         for offset in &[0u32, 1u32, 2u32] {
             let encoded_data = encoder.compress_vint_sorted(&input, *offset);
             assert!(encoded_data.len() <= expected_length);
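The deleted `.into_iter()` is clippy's `useless_conversion`: `map` already returns an iterator, and calling `into_iter` on an iterator is the identity. A quick self-contained illustration:

```rust
fn main() {
    // The `Map` adapter returned by `.map(...)` is already an `Iterator`,
    // so `.into_iter()` on it is an identity conversion (clippy:
    // `useless_conversion`).
    let with_noop: Vec<u32> = (0u32..5).map(|i| 4 + i * 7 / 2).into_iter().collect();

    // Identical result without the no-op call.
    let without: Vec<u32> = (0u32..5).map(|i| 4 + i * 7 / 2).collect();

    assert_eq!(with_noop, without);
}
```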
2 changes: 1 addition & 1 deletion src/query/boolean_query/boolean_query.rs
@@ -146,7 +146,7 @@ impl Query for BooleanQuery {
         let sub_weights = self
             .subqueries
             .iter()
-            .map(|&(ref occur, ref subquery)| Ok((*occur, subquery.weight(enable_scoring)?)))
+            .map(|(occur, subquery)| Ok((*occur, subquery.weight(enable_scoring)?)))
             .collect::<crate::Result<_>>()?;
         Ok(Box::new(BooleanWeight::new(
             sub_weights,
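This file and the next two replace the pre-2018 `&(ref a, ref b)` patterns with plain `(a, b)`: since Rust 2018's match ergonomics, destructuring a reference to a tuple binds the fields by reference automatically. A standalone illustration (the data is made up):

```rust
fn main() {
    let weights: Vec<(char, String)> =
        vec![('+', "title".to_string()), ('-', "body".to_string())];

    // Old style: match the reference explicitly and bind each field by `ref`.
    for &(ref occur, ref subweight) in &weights {
        println!("{occur}{subweight}");
    }

    // Match ergonomics: the `&` and `ref`s are inferred; `occur` and
    // `subweight` are still `&char` and `&String`, so nothing is moved.
    for (occur, subweight) in &weights {
        println!("{occur}{subweight}");
    }
}
```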
4 changes: 2 additions & 2 deletions src/query/boolean_query/boolean_weight.rs
@@ -91,7 +91,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
         boost: Score,
     ) -> crate::Result<HashMap<Occur, Vec<Box<dyn Scorer>>>> {
         let mut per_occur_scorers: HashMap<Occur, Vec<Box<dyn Scorer>>> = HashMap::new();
-        for &(ref occur, ref subweight) in &self.weights {
+        for (occur, subweight) in &self.weights {
             let sub_scorer: Box<dyn Scorer> = subweight.scorer(reader, boost)?;
             per_occur_scorers
                 .entry(*occur)
@@ -191,7 +191,7 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombiner> {
         }
 
         let mut explanation = Explanation::new("BooleanClause. Sum of ...", scorer.score());
-        for &(ref occur, ref subweight) in &self.weights {
+        for (occur, subweight) in &self.weights {
             if is_positive_occur(*occur) {
                 if let Ok(child_explanation) = subweight.explain(reader, doc) {
                     explanation.add_detail(child_explanation);
4 changes: 2 additions & 2 deletions src/query/query_parser/logical_ast.rs
@@ -54,9 +54,9 @@ impl fmt::Debug for LogicalAst {
                 if clause.is_empty() {
                     write!(formatter, "<emptyclause>")?;
                 } else {
-                    let (ref occur, ref subquery) = clause[0];
+                    let (occur, subquery) = &clause[0];
                     write!(formatter, "({}{:?}", occur_letter(*occur), subquery)?;
-                    for &(ref occur, ref subquery) in &clause[1..] {
+                    for (occur, subquery) in &clause[1..] {
                         write!(formatter, " {}{:?}", occur_letter(*occur), subquery)?;
                     }
                     formatter.write_str(")")?;
120 changes: 109 additions & 11 deletions src/query/query_parser/query_parser.rs
@@ -1,9 +1,9 @@
-use std::collections::HashMap;
 use std::net::{AddrParseError, IpAddr};
 use std::num::{ParseFloatError, ParseIntError};
 use std::ops::Bound;
 use std::str::{FromStr, ParseBoolError};
 
+use rustc_hash::FxHashMap;
 use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
 
 use super::logical_ast::*;
@@ -12,8 +12,8 @@ use crate::indexer::{
     convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
 };
 use crate::query::{
-    AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery,
-    TermQuery, TermSetQuery,
+    AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhraseQuery, Query,
+    RangeQuery, TermQuery, TermSetQuery,
 };
 use crate::schema::{
     Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
@@ -160,6 +160,10 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
 /// word lexicographically between `a` and `c` (inclusive lower bound, exclusive upper bound).
 /// Inclusive bounds are `[]`, exclusive are `{}`.
 ///
+/// * set terms: Using the `IN` operator, a field can be matched against a set of literals, e.g.
+/// `title: IN [a b cd]` will match documents where `title` is either `a`, `b` or `cd`, but does so
+/// more efficiently than the alternative query `title:a OR title:b OR title:cd` does.
+///
 /// * date values: The query parser supports rfc3339 formatted dates. For example
 /// `"2002-10-02T15:00:00.05Z"` or `some_date_field:[2002-10-02T15:00:00Z TO
 /// 2002-10-02T18:00:00Z}`
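As a usage sketch of the documented `IN` operator, against a standard tantivy in-RAM setup (the schema here is illustrative):

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let query_parser = QueryParser::for_index(&index, vec![title]);

    // Parses to a single TermSetQuery over {a, b, cd} rather than a
    // three-clause boolean disjunction.
    let query = query_parser.parse_query("title: IN [a b cd]")?;
    println!("{query:?}");
    Ok(())
}
```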
@@ -174,6 +178,9 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
 /// (See [`set_field_boost(...)`](QueryParser::set_field_boost)). Typically you may want to boost a
 /// title field.
 ///
+/// Additionally, specific fields can be marked to use fuzzy term queries for each literal
+/// via the [`QueryParser::set_field_fuzzy`] method.
+///
 /// Phrase terms support the `~` slop operator which allows to set the phrase's matching
 /// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`.
 #[derive(Clone)]
@@ -182,7 +189,15 @@ pub struct QueryParser {
     default_fields: Vec<Field>,
     conjunction_by_default: bool,
     tokenizer_manager: TokenizerManager,
-    boost: HashMap<Field, Score>,
+    boost: FxHashMap<Field, Score>,
+    fuzzy: FxHashMap<Field, Fuzzy>,
+}
+
+#[derive(Clone)]
+struct Fuzzy {
+    prefix: bool,
+    distance: u8,
+    transpose_cost_one: bool,
 }
 
 fn all_negative(ast: &LogicalAst) -> bool {
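Note the `boost` map's type also changes from `std::collections::HashMap` to `rustc_hash::FxHashMap`, matching the new `fuzzy` map. Presumably this is because the keys are small `Field` ids not controlled by an attacker, so the faster non-SipHash hasher is a safe win; the API is unchanged apart from construction:

```rust
use rustc_hash::FxHashMap;

fn main() {
    // FxHashMap is std's HashMap with the Fx hasher plugged in; since the
    // hasher is a type parameter, `new()` is unavailable and `default()`
    // is the idiomatic constructor.
    let mut boost: FxHashMap<u32, f32> = FxHashMap::default();
    boost.insert(0, 2.0);
    assert_eq!(boost.get(&0), Some(&2.0));
}
```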
@@ -210,6 +225,7 @@ impl QueryParser {
             tokenizer_manager,
             conjunction_by_default: false,
             boost: Default::default(),
+            fuzzy: Default::default(),
         }
     }
 
@@ -247,6 +263,30 @@ impl QueryParser {
         self.boost.insert(field, boost);
     }
 
+    /// Sets the given [field][`Field`] to use [fuzzy term queries][`FuzzyTermQuery`].
+    ///
+    /// If set, the parser will produce queries using fuzzy term queries
+    /// with the given parameters for each literal matched against the given field.
+    ///
+    /// See the [`FuzzyTermQuery::new`] and [`FuzzyTermQuery::new_prefix`] methods
+    /// for the meaning of the individual parameters.
+    pub fn set_field_fuzzy(
+        &mut self,
+        field: Field,
+        prefix: bool,
+        distance: u8,
+        transpose_cost_one: bool,
+    ) {
+        self.fuzzy.insert(
+            field,
+            Fuzzy {
+                prefix,
+                distance,
+                transpose_cost_one,
+            },
+        );
+    }
+
     /// Parse a query
     ///
     /// Note that `parse_query` returns an error if the input
@@ -259,7 +299,7 @@
     /// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
     pub fn parse_query(&self, query: &str) -> Result<Box<dyn Query>, QueryParserError> {
         let logical_ast = self.parse_query_to_logical_ast(query)?;
-        Ok(convert_to_query(logical_ast))
+        Ok(convert_to_query(&self.fuzzy, logical_ast))
     }
 
     /// Parse the user query into an AST.
@@ -660,9 +700,30 @@ impl QueryParser {
     }
 }
 
-fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<dyn Query> {
+fn convert_literal_to_query(
+    fuzzy: &FxHashMap<Field, Fuzzy>,
+    logical_literal: LogicalLiteral,
+) -> Box<dyn Query> {
     match logical_literal {
-        LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
+        LogicalLiteral::Term(term) => {
+            if let Some(fuzzy) = fuzzy.get(&term.field()) {
+                if fuzzy.prefix {
+                    Box::new(FuzzyTermQuery::new_prefix(
+                        term,
+                        fuzzy.distance,
+                        fuzzy.transpose_cost_one,
+                    ))
+                } else {
+                    Box::new(FuzzyTermQuery::new(
+                        term,
+                        fuzzy.distance,
+                        fuzzy.transpose_cost_one,
+                    ))
+                }
+            } else {
+                Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs))
+            }
+        }
         LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new(
             PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop),
         ),
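The two constructor calls mirror `FuzzyTermQuery`'s public API: `distance` is the maximum Levenshtein edit distance, `transpose_cost_one` counts a transposition as a single edit (Damerau-Levenshtein style), and the `new_prefix` variant matches terms that merely start with something within the distance. A direct-use sketch (the field setup is illustrative):

```rust
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::Term;

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();

    let term = Term::from_field_text(title, "wolf");

    // Matches terms within one edit of "wolf" ("wol", "wolfs", ...),
    // counting the transposition "wlof" as a single edit.
    let _whole_word = FuzzyTermQuery::new(term.clone(), 1, true);

    // Prefix variant: any term starting with something within one edit
    // of "wolf", e.g. "wolfhound".
    let _prefix = FuzzyTermQuery::new_prefix(term, 1, true);
}
```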
@@ -755,12 +816,12 @@ fn generate_literals_for_json_object(
     Ok(logical_literals)
 }
 
-fn convert_to_query(logical_ast: LogicalAst) -> Box<dyn Query> {
+fn convert_to_query(fuzzy: &FxHashMap<Field, Fuzzy>, logical_ast: LogicalAst) -> Box<dyn Query> {
     match trim_ast(logical_ast) {
         Some(LogicalAst::Clause(trimmed_clause)) => {
             let occur_subqueries = trimmed_clause
                 .into_iter()
-                .map(|(occur, subquery)| (occur, convert_to_query(subquery)))
+                .map(|(occur, subquery)| (occur, convert_to_query(fuzzy, subquery)))
                 .collect::<Vec<_>>();
             assert!(
                 !occur_subqueries.is_empty(),
@@ -769,10 +830,10 @@ fn convert_to_query(logical_ast: LogicalAst) -> Box<dyn Query> {
             Box::new(BooleanQuery::new(occur_subqueries))
         }
         Some(LogicalAst::Leaf(trimmed_logical_literal)) => {
-            convert_literal_to_query(*trimmed_logical_literal)
+            convert_literal_to_query(fuzzy, *trimmed_logical_literal)
         }
         Some(LogicalAst::Boost(ast, boost)) => {
-            let query = convert_to_query(*ast);
+            let query = convert_to_query(fuzzy, *ast);
             let boosted_query = BoostQuery::new(query, boost);
             Box::new(boosted_query)
         }
@@ -1568,4 +1629,41 @@ mod test {
             false,
         );
     }
+
+    #[test]
+    pub fn test_set_field_fuzzy() {
+        {
+            let mut query_parser = make_query_parser();
+            query_parser.set_field_fuzzy(
+                query_parser.schema.get_field("title").unwrap(),
+                false,
+                1,
+                true,
+            );
+            let query = query_parser.parse_query("abc").unwrap();
+            assert_eq!(
+                format!("{:?}", query),
+                "BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(type=Str, \
+                 field=0, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \
+                 (Should, TermQuery(Term(type=Str, field=1, \"abc\")))] }"
+            );
+        }
+
+        {
+            let mut query_parser = make_query_parser();
+            query_parser.set_field_fuzzy(
+                query_parser.schema.get_field("text").unwrap(),
+                true,
+                2,
+                false,
+            );
+            let query = query_parser.parse_query("abc").unwrap();
+            assert_eq!(
+                format!("{:?}", query),
+                "BooleanQuery { subqueries: [(Should, TermQuery(Term(type=Str, field=0, \
+                 \"abc\"))), (Should, FuzzyTermQuery { term: Term(type=Str, field=1, \"abc\"), \
+                 distance: 2, transposition_cost_one: false, prefix: true })] }"
+            );
+        }
+    }
 }
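End to end, the feature added by this PR is used like this (a sketch against the new API; the two-field schema mirrors the test above):

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut query_parser = QueryParser::for_index(&index, vec![title, body]);
    // Literals matched against `title` now become FuzzyTermQuerys with
    // Levenshtein distance 1 (transpositions cost one edit); `body`
    // keeps producing exact TermQuerys.
    query_parser.set_field_fuzzy(title, false, 1, true);

    let query = query_parser.parse_query("wolf")?;
    println!("{query:?}");
    Ok(())
}
```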
2 changes: 1 addition & 1 deletion src/schema/text_options.rs
@@ -258,7 +258,7 @@ mod tests {
         let field = schema.get_field("body").unwrap();
         let field_entry = schema.get_field_entry(field);
         assert!(matches!(field_entry.field_type(),
-            &FieldType::Str(ref text_options)
+            FieldType::Str(text_options)
             if text_options.get_indexing_options().unwrap().tokenizer() == "default"));
     }
 
4 changes: 2 additions & 2 deletions src/snippet/mod.rs
@@ -124,9 +124,9 @@ impl Snippet {
 ///
 /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
 /// has to be a valid string.
-fn search_fragments<'a>(
+fn search_fragments(
     tokenizer: &TextAnalyzer,
-    text: &'a str,
+    text: &str,
     terms: &BTreeMap<String, Score>,
     max_num_chars: usize,
 ) -> Vec<FragmentCandidate> {