Collapse overlapping highlight ranges before rendering a snippet as HTML.

`Snippet::to_html` iterates over `collapse_overlapped_ranges(&self.highlighted)`
instead of the raw `highlighted` ranges, so overlapping ranges are merged before
the highlight markers are written around them.

Review notes for this revision of the patch:
* `collapse_overlapped_ranges` now loops over `ranges_it` (what is left after the
  first range was taken) rather than over `ranges` again. Re-visiting the first
  range pushed it twice whenever it was empty: `[0..0]` produced `[0..0, 0..0]`.
* `test_collapse_overlapped_ranges` gains a regression assertion for that case.
* Generic parameters stripped from the patch text are restored (`Range<usize>`,
  `Iterator<Item = usize>`).
* The expected HTML in the snippet test is restored to `<b>abc</b>`; this assumes
  the highlight markers are `<b>` / `</b>` -- confirm against `HIGHLIGHTEN_PREFIX`.
* Hunk headers are recomputed for the added lines; the blob ids on the `index`
  line are those of the originally published patch.

diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
index bc21bbc9ad..9a9a9d3717 100644
--- a/src/snippet/mod.rs
+++ b/src/snippet/mod.rs
@@ -79,7 +79,7 @@ impl Snippet {
         let mut html = String::new();
         let mut start_from: usize = 0;
 
-        for item in self.highlighted.iter() {
+        for item in collapse_overlapped_ranges(&self.highlighted) {
             html.push_str(&encode_minimal(&self.fragment[start_from..item.start]));
             html.push_str(HIGHLIGHTEN_PREFIX);
             html.push_str(&encode_minimal(&self.fragment[item.clone()]));
@@ -186,6 +186,55 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
     }
 }
 
+/// Returns ranges that are collapsed into non-overlapped ranges.
+///
+/// ## Examples
+/// - [0..1, 2..3] -> [0..1, 2..3]  # no overlap
+/// - [0..1, 1..2] -> [0..1, 1..2]  # no overlap
+/// - [0..2, 1..2] -> [0..2]        # collapsed
+/// - [0..2, 1..3] -> [0..3]        # collapsed
+/// - [0..3, 1..2] -> [0..3]        # second range's end is also inside of the first range
+///
+/// Note: This function assumes `ranges` is sorted by `Range.start` in ascending order.
+fn collapse_overlapped_ranges(ranges: &[Range<usize>]) -> Vec<Range<usize>> {
+    debug_assert!(is_sorted(ranges.iter().map(|range| range.start)));
+
+    let mut result = Vec::new();
+    let mut ranges_it = ranges.iter();
+
+    let mut current = match ranges_it.next() {
+        Some(range) => range.clone(),
+        None => return result,
+    };
+
+    // `current` already holds the first range, so only the remainder is visited.
+    for range in ranges_it {
+        if current.end > range.start {
+            current = current.start..std::cmp::max(current.end, range.end);
+        } else {
+            result.push(current);
+            current = range.clone();
+        }
+    }
+
+    result.push(current);
+    result
+}
+
+/// Returns `true` if `it` yields its values in non-decreasing order.
+fn is_sorted(mut it: impl Iterator<Item = usize>) -> bool {
+    if let Some(first) = it.next() {
+        let mut prev = first;
+        for item in it {
+            if item < prev {
+                return false;
+            }
+            prev = item;
+        }
+    }
+    true
+}
+
 /// `SnippetGenerator`
 ///
 /// # Example
@@ -320,10 +369,10 @@ mod tests {
 
     use maplit::btreemap;
 
-    use super::{search_fragments, select_best_fragment_combination};
+    use super::{collapse_overlapped_ranges, search_fragments, select_best_fragment_combination};
     use crate::query::QueryParser;
     use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
-    use crate::tokenizer::SimpleTokenizer;
+    use crate::tokenizer::{NgramTokenizer, SimpleTokenizer};
     use crate::{Index, SnippetGenerator};
 
     const TEST_TEXT: &str = r#"Rust is a systems programming language sponsored by
@@ -588,4 +637,49 @@ Survey in 2016, 2017, and 2018."#;
         }
         Ok(())
     }
+
+    #[test]
+    fn test_collapse_overlapped_ranges() {
+        assert_eq!(
+            collapse_overlapped_ranges(&vec![0..1, 2..3,]),
+            vec![0..1, 2..3]
+        );
+        assert_eq!(
+            collapse_overlapped_ranges(&vec![0..1, 1..2,]),
+            vec![0..1, 1..2]
+        );
+        assert_eq!(collapse_overlapped_ranges(&vec![0..2, 1..2,]), vec![0..2]);
+        assert_eq!(collapse_overlapped_ranges(&vec![0..2, 1..3,]), vec![0..3]);
+        assert_eq!(collapse_overlapped_ranges(&vec![0..3, 1..2,]), vec![0..3]);
+        // An empty leading range must not be emitted twice.
+        assert_eq!(collapse_overlapped_ranges(&vec![0..0,]), vec![0..0]);
+    }
+
+    #[test]
+    fn test_snippet_with_overlapped_highlighted_ranges() {
+        let text = "abc";
+
+        let mut terms = BTreeMap::new();
+        terms.insert(String::from("ab"), 0.9);
+        terms.insert(String::from("bc"), 1.0);
+
+        let fragments = search_fragments(
+            &From::from(NgramTokenizer::all_ngrams(2, 2)),
+            text,
+            &terms,
+            3,
+        );
+
+        assert_eq!(fragments.len(), 1);
+        {
+            let first = &fragments[0];
+            assert_eq!(first.score, 1.9);
+            assert_eq!(first.start_offset, 0);
+            assert_eq!(first.stop_offset, 3);
+        }
+
+        let snippet = select_best_fragment_combination(&fragments[..], text);
+        assert_eq!(snippet.fragment, "abc");
+        assert_eq!(snippet.to_html(), "<b>abc</b>");
+    }
 }