From 0f673b168f8841d666115d73e4fa0e1026246673 Mon Sep 17 00:00:00 2001 From: Justin Greene Date: Tue, 16 Nov 2021 20:54:29 +0000 Subject: [PATCH 1/2] Update to tantivy 0.16.1 Add SnippetGenerator for getting highlighted snippets from documents --- Cargo.toml | 4 +-- src/document.rs | 2 +- src/facet.rs | 12 +++++--- src/index.rs | 5 ++-- src/lib.rs | 6 ++++ src/schemabuilder.rs | 64 ++++++++++++++++++++++++++++++++++++--- src/searcher.rs | 12 ++++---- src/snippet.rs | 69 +++++++++++++++++++++++++++++++++++++++++++ tests/tantivy_test.py | 26 +++++++++++++++- 9 files changed, 180 insertions(+), 20 deletions(-) create mode 100644 src/snippet.rs diff --git a/Cargo.toml b/Cargo.toml index 75501b5b..77361d3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ crate-type = ["cdylib"] [dependencies] chrono = "0.4.19" -tantivy = "0.13.2" +tantivy = "0.16.1" itertools = "0.9.0" futures = "0.3.5" @@ -22,4 +22,4 @@ features = ["extension-module"] [package.metadata.maturin] requires-python = ">=3.7" -project-url = { Source = "https://github.com/quickwit-inc/tantivy-py" } +project-url = ["https://github.com/quickwit-inc/tantivy-py"] diff --git a/src/document.rs b/src/document.rs index df537732..e625d997 100644 --- a/src/document.rs +++ b/src/document.rs @@ -340,7 +340,7 @@ impl Document { } impl Document { - fn iter_values_for_field<'a>( + pub(crate) fn iter_values_for_field<'a>( &'a self, field: &str, ) -> impl Iterator + 'a { diff --git a/src/facet.rs b/src/facet.rs index 82144e2f..0ebf04f0 100644 --- a/src/facet.rs +++ b/src/facet.rs @@ -1,5 +1,8 @@ use pyo3::{basic::PyObjectProtocol, prelude::*, types::PyType}; use tantivy::schema; +use crate::{ + to_pyerr, +}; /// A Facet represent a point in a given hierarchy. /// @@ -46,10 +49,11 @@ impl Facet { /// /// Returns the created Facet. #[classmethod] - fn from_string(_cls: &PyType, facet_string: &str) -> Facet { - Facet { - inner: schema::Facet::from_text(facet_string), - } + fn from_string(_cls: &PyType, facet_string: &str) -> PyResult { + let inner = schema::Facet::from_text(facet_string).map_err(to_pyerr)?; + Ok(Facet { + inner: inner, + }) } /// Returns the list of `segments` that forms a facet path. diff --git a/src/index.rs b/src/index.rs index 89347532..f11d8a38 100644 --- a/src/index.rs +++ b/src/index.rs @@ -174,7 +174,7 @@ impl Index { if reuse { tv::Index::open_or_create(directory, schema.inner.clone()) } else { - tv::Index::create(directory, schema.inner.clone()) + tv::Index::create(directory, schema.inner.clone(), tv::IndexSettings::default()) } .map_err(to_pyerr)? } @@ -277,7 +277,8 @@ impl Index { #[staticmethod] fn exists(path: &str) -> PyResult { let directory = MmapDirectory::open(path).map_err(to_pyerr)?; - Ok(tv::Index::exists(&directory)) + let exists = tv::Index::exists(&directory).map_err(to_pyerr)?; + Ok(exists) } /// The schema of the current index. diff --git a/src/lib.rs b/src/lib.rs index 0593715c..e936941e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,13 +8,16 @@ mod query; mod schema; mod schemabuilder; mod searcher; +mod snippet; use document::Document; use facet::Facet; use index::Index; +use query::Query; use schema::Schema; use schemabuilder::SchemaBuilder; use searcher::{DocAddress, Searcher}; +use snippet::{SnippetGenerator, Snippet}; /// Python bindings for the search engine library Tantivy. /// @@ -75,6 +78,9 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/src/schemabuilder.rs b/src/schemabuilder.rs index 58b2a275..3a783218 100644 --- a/src/schemabuilder.rs +++ b/src/schemabuilder.rs @@ -232,11 +232,17 @@ impl SchemaBuilder { /// Add a Facet field to the schema. /// Args: /// name (str): The name of the field. - fn add_facet_field(&mut self, name: &str) -> PyResult { + #[args(stored = false, indexed = false)] + fn add_facet_field(&mut self, + name: &str, + stored: bool, + indexed: bool) -> PyResult { let builder = &mut self.builder; + let opts = SchemaBuilder::build_facet_option(stored, indexed)?; + if let Some(builder) = builder.write().unwrap().as_mut() { - builder.add_facet_field(name); + builder.add_facet_field(name, opts); } else { return Err(exceptions::PyValueError::new_err( "Schema builder object isn't valid anymore.", @@ -253,11 +259,17 @@ impl SchemaBuilder { /// /// Args: /// name (str): The name of the field. - fn add_bytes_field(&mut self, name: &str) -> PyResult { + #[args(stored = false, indexed = false)] + fn add_bytes_field(&mut self, + name: &str, + stored: bool, + indexed: bool, + fast: Option<&str>) -> PyResult { let builder = &mut self.builder; + let opts = SchemaBuilder::build_bytes_option(stored, indexed, fast)?; if let Some(builder) = builder.write().unwrap().as_mut() { - builder.add_bytes_field(name); + builder.add_bytes_field(name, opts); } else { return Err(exceptions::PyValueError::new_err( "Schema builder object isn't valid anymore.", @@ -284,6 +296,50 @@ impl SchemaBuilder { } impl SchemaBuilder { + fn build_facet_option( + stored: bool, + indexed: bool, + ) -> PyResult { + let opts = schema::FacetOptions::default(); + + let opts = if stored { opts.set_stored() } else { opts }; + let opts = if indexed { opts.set_indexed() } else { opts }; + Ok(opts) + } + + fn build_bytes_option( + stored: bool, + indexed: bool, + fast: Option<&str>, + ) -> PyResult { + let opts = schema::BytesOptions::default(); + + let opts = if stored { opts.set_stored() } else { opts }; + let opts = if indexed { opts.set_indexed() } else { opts }; + + let fast = match fast { + Some(f) => { + let f = f.to_lowercase(); + match f.as_ref() { + "single" => Some(schema::Cardinality::SingleValue), + "multi" => Some(schema::Cardinality::MultiValues), + _ => return Err(exceptions::PyValueError::new_err( + "Invalid index option, valid choices are: 'multivalue' and 'singlevalue'" + )), + } + } + None => None, + }; + + let opts = if let Some(_f) = fast { + opts.set_fast() + } else { + opts + }; + + Ok(opts) + } + fn build_int_option( stored: bool, indexed: bool, diff --git a/src/searcher.rs b/src/searcher.rs index 2f0cc1bf..9abccdf4 100644 --- a/src/searcher.rs +++ b/src/searcher.rs @@ -196,8 +196,8 @@ impl Searcher { #[pyclass] #[derive(Clone, Debug)] pub(crate) struct DocAddress { - pub(crate) segment_ord: tv::SegmentLocalId, - pub(crate) doc: tv::DocId, + pub(crate) segment_ord: tv::SegmentOrdinal, + pub(crate) doc_id: tv::DocId, } #[pymethods] @@ -212,22 +212,22 @@ impl DocAddress { /// The segment local DocId #[getter] fn doc(&self) -> u32 { - self.doc + self.doc_id } } impl From<&tv::DocAddress> for DocAddress { fn from(doc_address: &tv::DocAddress) -> Self { DocAddress { - segment_ord: doc_address.segment_ord(), - doc: doc_address.doc(), + segment_ord: doc_address.segment_ord, + doc_id: doc_address.doc_id, } } } impl Into for &DocAddress { fn into(self) -> tv::DocAddress { - tv::DocAddress(self.segment_ord(), self.doc()) + tv::DocAddress { segment_ord: self.segment_ord, doc_id: self.doc_id } } } diff --git a/src/snippet.rs b/src/snippet.rs new file mode 100644 index 00000000..c2f28f09 --- /dev/null +++ b/src/snippet.rs @@ -0,0 +1,69 @@ +use pyo3::prelude::*; +use tantivy as tv; +use crate::{ + to_pyerr, +}; + +/// Tantivy schema. +/// +/// The schema is very strict. To build the schema the `SchemaBuilder` class is +/// provided. +#[pyclass] +pub(crate) struct Snippet { + pub(crate) inner: tv::Snippet, +} + +#[pyclass] +pub(crate) struct Range { + #[pyo3(get)] + start: usize, + #[pyo3(get)] + end: usize +} + +#[pymethods] +impl Snippet { + pub fn to_html(&self) -> PyResult { + Ok(self.inner.to_html()) + } + + pub fn highlighted(&self) -> Vec { + let highlighted = self.inner.highlighted(); + let results = highlighted.iter().map(|r| Range { start: r.start, end: r.end }).collect::>(); + results + } +} + + +#[pyclass] +pub(crate) struct SnippetGenerator { + pub(crate) field_name: String, + pub(crate) inner: tv::SnippetGenerator, +} + +#[pymethods] +impl SnippetGenerator { + #[staticmethod] + pub fn create( + searcher: &crate::Searcher, + query: &crate::Query, + schema: &crate::Schema, + field_name: &str + ) -> PyResult { + let field = schema.inner.get_field(field_name).ok_or("field not found").map_err(to_pyerr)?; + let generator = tv::SnippetGenerator::create(&*searcher.inner, query.get(), field).map_err(to_pyerr)?; + + return Ok(SnippetGenerator { field_name: field_name.to_string(), inner: generator }); + } + + pub fn snippet_from_doc(&self, doc: &crate::Document) -> crate::Snippet { + let text: String = doc + .iter_values_for_field(&self.field_name) + .flat_map(tv::schema::Value::text) + .collect::>() + .join(" "); + + let result = self.inner.snippet(&text); + Snippet { inner: result } + } +} \ No newline at end of file diff --git a/tests/tantivy_test.py b/tests/tantivy_test.py index 8c3b6368..87f9d563 100644 --- a/tests/tantivy_test.py +++ b/tests/tantivy_test.py @@ -1,7 +1,7 @@ import tantivy import pytest -from tantivy import Document, Index, SchemaBuilder, Schema +from tantivy import Document, Index, SchemaBuilder, SnippetGenerator def schema(): @@ -322,3 +322,27 @@ def test_document_with_facet(self): def test_document_error(self): with pytest.raises(ValueError): tantivy.Document(name={}) + + +class TestSnippets(object): + def test_document_snippet(self, dir_index): + index_dir, _ = dir_index + doc_schema = schema() + index = Index(doc_schema, str(index_dir)) + query = index.parse_query("sea whale", ["title", "body"]) + searcher = index.searcher() + result = searcher.search(query) + assert len(result.hits) == 1 + + snippet_generator = SnippetGenerator.create(searcher, query, doc_schema, "title") + + for (score, doc_address) in result.hits: + doc = searcher.doc(doc_address) + snippet = snippet_generator.snippet_from_doc(doc) + highlights = snippet.highlighted() + assert len(highlights) == 1 + first = highlights[0] + assert first.start == 20 + assert first.end == 23 + html_snippet = snippet.to_html() + assert html_snippet == 'The Old Man and the Sea' \ No newline at end of file From 118aacfa8a950d722a9fff454a1414febf2e2e3b Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 3 Jan 2022 22:48:07 +0900 Subject: [PATCH 2/2] Apply suggestions from code review --- src/snippet.rs | 2 +- tests/tantivy_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snippet.rs b/src/snippet.rs index c2f28f09..3900c647 100644 --- a/src/snippet.rs +++ b/src/snippet.rs @@ -66,4 +66,4 @@ impl SnippetGenerator { let result = self.inner.snippet(&text); Snippet { inner: result } } -} \ No newline at end of file +} diff --git a/tests/tantivy_test.py b/tests/tantivy_test.py index 87f9d563..b5fc0272 100644 --- a/tests/tantivy_test.py +++ b/tests/tantivy_test.py @@ -345,4 +345,4 @@ def test_document_snippet(self, dir_index): assert first.start == 20 assert first.end == 23 html_snippet = snippet.to_html() - assert html_snippet == 'The Old Man and the Sea' \ No newline at end of file + assert html_snippet == 'The Old Man and the Sea'