From 7a8fce0ae7185b7b5dea33f673d8ebb7f899098d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 10 Jan 2023 14:12:11 +0900 Subject: [PATCH] Minor mini fixes --- columnar/src/writer/column_operation.rs | 2 +- columnar/src/writer/column_writers.rs | 44 ++++++++++------------ columnar/src/writer/mod.rs | 17 ++++----- columnar/src/writer/serializer.rs | 4 +- src/query/query_parser/query_parser.rs | 10 ++++- src/schema/field_type.rs | 5 ++- src/schema/value.rs | 4 +- src/tokenizer/mod.rs | 3 +- src/tokenizer/regex_tokenizer.rs | 49 +++++++++++++++---------- 9 files changed, 76 insertions(+), 62 deletions(-) diff --git a/columnar/src/writer/column_operation.rs b/columnar/src/writer/column_operation.rs index 9e24d32e19..9cb6bfe632 100644 --- a/columnar/src/writer/column_operation.rs +++ b/columnar/src/writer/column_operation.rs @@ -133,7 +133,7 @@ pub(super) trait SymbolValue: Clone + Copy { impl SymbolValue for bool { fn serialize(self, buffer: &mut [u8]) -> u8 { - buffer[0] = if self { 1u8 } else { 0u8 }; + buffer[0] = u8::from(self); 1u8 } diff --git a/columnar/src/writer/column_writers.rs b/columnar/src/writer/column_writers.rs index d0e398756a..6857138480 100644 --- a/columnar/src/writer/column_writers.rs +++ b/columnar/src/writer/column_writers.rs @@ -9,18 +9,18 @@ use crate::{Cardinality, DocId, NumericalType, NumericalValue}; #[derive(Copy, Clone, Debug, Eq, PartialEq)] #[repr(u8)] enum DocumentStep { - SameDoc = 0, - NextDoc = 1, - SkippedDoc = 2, + Same = 0, + Next = 1, + Skipped = 2, } #[inline(always)] fn delta_with_last_doc(last_doc_opt: Option, doc: u32) -> DocumentStep { let expected_next_doc = last_doc_opt.map(|last_doc| last_doc + 1).unwrap_or(0u32); match doc.cmp(&expected_next_doc) { - Ordering::Less => DocumentStep::SameDoc, - Ordering::Equal => DocumentStep::NextDoc, - Ordering::Greater => DocumentStep::SkippedDoc, + Ordering::Less => DocumentStep::Same, + Ordering::Equal => DocumentStep::Next, + Ordering::Greater => DocumentStep::Skipped, } } @@ -56,15 +56,15 @@ impl ColumnWriter { pub(super) fn record(&mut self, doc: DocId, value: S, arena: &mut MemoryArena) { // Difference between `doc` and the last doc. match delta_with_last_doc(self.last_doc_opt, doc) { - DocumentStep::SameDoc => { + DocumentStep::Same => { // This is the last encounterred document. self.cardinality = Cardinality::Multivalued; } - DocumentStep::NextDoc => { + DocumentStep::Next => { self.last_doc_opt = Some(doc); self.write_symbol::(ColumnOperation::NewDoc(doc), arena); } - DocumentStep::SkippedDoc => { + DocumentStep::Skipped => { self.cardinality = self.cardinality.max(Cardinality::Optional); self.last_doc_opt = Some(doc); self.write_symbol::(ColumnOperation::NewDoc(doc), arena); @@ -79,8 +79,8 @@ impl ColumnWriter { // at the end of the column. 
pub(crate) fn get_cardinality(&self, num_docs: DocId) -> Cardinality { match delta_with_last_doc(self.last_doc_opt, num_docs) { - DocumentStep::SameDoc | DocumentStep::NextDoc => self.cardinality, - DocumentStep::SkippedDoc => self.cardinality.max(Cardinality::Optional), + DocumentStep::Same | DocumentStep::Next => self.cardinality, + DocumentStep::Skipped => self.cardinality.max(Cardinality::Optional), } } @@ -215,20 +215,14 @@ mod tests { #[test] fn test_delta_with_last_doc() { - assert_eq!(delta_with_last_doc(None, 0u32), DocumentStep::NextDoc); - assert_eq!(delta_with_last_doc(None, 1u32), DocumentStep::SkippedDoc); - assert_eq!(delta_with_last_doc(None, 2u32), DocumentStep::SkippedDoc); - assert_eq!(delta_with_last_doc(Some(0u32), 0u32), DocumentStep::SameDoc); - assert_eq!(delta_with_last_doc(Some(1u32), 1u32), DocumentStep::SameDoc); - assert_eq!(delta_with_last_doc(Some(1u32), 2u32), DocumentStep::NextDoc); - assert_eq!( - delta_with_last_doc(Some(1u32), 3u32), - DocumentStep::SkippedDoc - ); - assert_eq!( - delta_with_last_doc(Some(1u32), 4u32), - DocumentStep::SkippedDoc - ); + assert_eq!(delta_with_last_doc(None, 0u32), DocumentStep::Next); + assert_eq!(delta_with_last_doc(None, 1u32), DocumentStep::Skipped); + assert_eq!(delta_with_last_doc(None, 2u32), DocumentStep::Skipped); + assert_eq!(delta_with_last_doc(Some(0u32), 0u32), DocumentStep::Same); + assert_eq!(delta_with_last_doc(Some(1u32), 1u32), DocumentStep::Same); + assert_eq!(delta_with_last_doc(Some(1u32), 2u32), DocumentStep::Next); + assert_eq!(delta_with_last_doc(Some(1u32), 3u32), DocumentStep::Skipped); + assert_eq!(delta_with_last_doc(Some(1u32), 4u32), DocumentStep::Skipped); } #[track_caller] diff --git a/columnar/src/writer/mod.rs b/columnar/src/writer/mod.rs index 8812ac4f01..e53000baec 100644 --- a/columnar/src/writer/mod.rs +++ b/columnar/src/writer/mod.rs @@ -34,15 +34,14 @@ struct SpareBuffers { /// /// ```rust /// use tantivy_columnar::ColumnarWriter; -/// fn main() { -/// let mut columnar_writer = ColumnarWriter::default(); -/// columnar_writer.record_str(0u32 /* doc id */, "product_name", "Red backpack"); -/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10u64); -/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple"); -/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats. -/// let mut wrt: Vec = Vec::new(); -/// columnar_writer.serialize(2u32, &mut wrt).unwrap(); -/// } +/// +/// let mut columnar_writer = ColumnarWriter::default(); +/// columnar_writer.record_str(0u32 /* doc id */, "product_name", "Red backpack"); +/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10u64); +/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple"); +/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats. +/// let mut wrt: Vec = Vec::new(); +/// columnar_writer.serialize(2u32, &mut wrt).unwrap(); /// ``` pub struct ColumnarWriter { numerical_field_hash_map: ArenaHashMap, diff --git a/columnar/src/writer/serializer.rs b/columnar/src/writer/serializer.rs index fa351f4968..e1751da504 100644 --- a/columnar/src/writer/serializer.rs +++ b/columnar/src/writer/serializer.rs @@ -15,10 +15,10 @@ pub struct ColumnarSerializer { /// Returns a key consisting of the concatenation of the key and the column_type_and_cardinality /// code. 
-fn prepare_key<'a>( +fn prepare_key( key: &[u8], column_type_cardinality: ColumnTypeAndCardinality, - buffer: &'a mut Vec, + buffer: &mut Vec, ) { buffer.clear(); buffer.extend_from_slice(key); diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 9421c64785..652ff146da 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -3,6 +3,8 @@ use std::num::{ParseFloatError, ParseIntError}; use std::ops::Bound; use std::str::{FromStr, ParseBoolError}; +use base64::engine::general_purpose::STANDARD as BASE64; +use base64::Engine; use rustc_hash::FxHashMap; use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; @@ -403,7 +405,9 @@ impl QueryParser { Err(e) => Err(QueryParserError::from(e)), }, FieldType::Bytes(_) => { - let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?; + let bytes = BASE64 + .decode(phrase) + .map_err(QueryParserError::ExpectedBase64)?; Ok(Term::from_field_bytes(field, &bytes)) } FieldType::IpAddr(_) => { @@ -498,7 +502,9 @@ impl QueryParser { Err(e) => Err(QueryParserError::from(e)), }, FieldType::Bytes(_) => { - let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?; + let bytes = BASE64 + .decode(phrase) + .map_err(QueryParserError::ExpectedBase64)?; let bytes_term = Term::from_field_bytes(field, &bytes); Ok(vec![LogicalLiteral::Term(bytes_term)]) } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index b92f4448d7..b51fddb890 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -1,6 +1,8 @@ use std::net::IpAddr; use std::str::FromStr; +use base64::engine::general_purpose::STANDARD as BASE64; +use base64::Engine; use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; use thiserror::Error; @@ -358,7 +360,8 @@ impl FieldType { json: JsonValue::String(field_text), }), FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))), - FieldType::Bytes(_) => base64::decode(&field_text) + FieldType::Bytes(_) => BASE64 + .decode(&field_text) .map(Value::Bytes) .map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }), FieldType::JsonObject(_) => Err(ValueParsingError::TypeError { diff --git a/src/schema/value.rs b/src/schema/value.rs index d3df1c46c5..6e101c9a47 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -1,6 +1,8 @@ use std::fmt; use std::net::Ipv6Addr; +use base64::engine::general_purpose::STANDARD as BASE64; +use base64::Engine; use serde::de::Visitor; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_json::Map; @@ -51,7 +53,7 @@ impl Serialize for Value { Value::Bool(b) => serializer.serialize_bool(b), Value::Date(ref date) => time::serde::rfc3339::serialize(&date.into_utc(), serializer), Value::Facet(ref facet) => facet.serialize(serializer), - Value::Bytes(ref bytes) => serializer.serialize_str(&base64::encode(bytes)), + Value::Bytes(ref bytes) => serializer.serialize_str(&BASE64.encode(bytes)), Value::JsonObject(ref obj) => obj.serialize(serializer), Value::IpAddr(ref obj) => { // Ensure IpV4 addresses get serialized as IpV4, but excluding IpV6 loopback. 
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 1e1d921033..e283382e10 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -126,6 +126,7 @@ mod facet_tokenizer; mod lower_caser; mod ngram_tokenizer; mod raw_tokenizer; +mod regex_tokenizer; mod remove_long; mod simple_tokenizer; mod split_compound_words; @@ -135,7 +136,6 @@ mod tokenized_string; mod tokenizer; mod tokenizer_manager; mod whitespace_tokenizer; -mod regex_tokenizer; pub use tokenizer_api::{ BoxTokenFilter, BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer, @@ -147,6 +147,7 @@ pub use self::facet_tokenizer::FacetTokenizer; pub use self::lower_caser::LowerCaser; pub use self::ngram_tokenizer::NgramTokenizer; pub use self::raw_tokenizer::RawTokenizer; +pub use self::regex_tokenizer::RegexTokenizer; pub use self::remove_long::RemoveLongFilter; pub use self::simple_tokenizer::SimpleTokenizer; pub use self::split_compound_words::SplitCompoundWords; diff --git a/src/tokenizer/regex_tokenizer.rs b/src/tokenizer/regex_tokenizer.rs index 45f566ef2d..54bf3e228c 100644 --- a/src/tokenizer/regex_tokenizer.rs +++ b/src/tokenizer/regex_tokenizer.rs @@ -1,14 +1,15 @@ use regex::Regex; -use crate::TantivyError; use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; +use crate::TantivyError; /// Tokenize the text by using a regex pattern to split. /// Each match of the regex emits a distinct token, empty tokens will not be emitted. Anchors such /// as `\A` will match the text from the part where the last token was emitted or the beginning of /// the complete text if no token was emitted yet. /// -/// Example: `` 'aaa' bbb 'ccc' 'ddd' `` with the pattern `` '(?:\w*)' `` will be tokenized as followed: +/// Example: `` 'aaa' bbb 'ccc' 'ddd' `` with the pattern `` '(?:\w*)' `` will be tokenized as +/// followed: /// /// | Term | aaa | ccc | ddd | /// |----------|------|--------|-------| @@ -21,7 +22,7 @@ use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; /// ```rust /// use tantivy::tokenizer::*; /// -/// let tokenizer = RegexTokenizer::new(r"'(?:\w*)'"); +/// let tokenizer = RegexTokenizer::new(r"'(?:\w*)'").unwrap(); /// let mut stream = tokenizer.token_stream("'aaa' bbb 'ccc' 'ddd'"); /// { /// let token = stream.next().unwrap(); @@ -46,10 +47,11 @@ use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; #[derive(Clone)] pub struct RegexTokenizer { - regex: Regex + regex: Regex, } impl RegexTokenizer { + /// Creates a new RegexTokenizer. 
pub fn new(regex_pattern: &str) -> crate::Result { Regex::new(regex_pattern) .map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_owned())) @@ -63,6 +65,7 @@ impl Tokenizer for RegexTokenizer { regex: self.regex.clone(), text, token: Token::default(), + cursor: 0, }) } } @@ -71,25 +74,28 @@ pub struct RegexTokenStream<'a> { regex: Regex, text: &'a str, token: Token, + cursor: usize, } impl<'a> TokenStream for RegexTokenStream<'a> { fn advance(&mut self) -> bool { - if let Some(m) = self.regex.find(self.text) { - if !m.as_str().is_empty() { - self.token.text.clear(); - self.token.text.push_str(&self.text[m.start()..m.end()]); + let Some(regex_match) = self.regex.find(self.text) else { + return false; + }; + if regex_match.as_str().is_empty() { + return false; + } + self.token.text.clear(); + self.token.text.push_str(regex_match.as_str()); - self.token.offset_from = self.token.offset_to + m.start(); - self.token.offset_to = self.token.offset_to + m.end(); + self.token.offset_from = self.cursor + regex_match.start(); + self.cursor += regex_match.end(); + self.token.offset_to = self.cursor; - self.token.position = self.token.position.wrapping_add(1); + self.token.position = self.token.position.wrapping_add(1); - self.text = &self.text[m.end()..]; - return true - } - } - false + self.text = &self.text[regex_match.end()..]; + true } fn token(&self) -> &Token { @@ -103,10 +109,10 @@ impl<'a> TokenStream for RegexTokenStream<'a> { #[cfg(test)] mod tests { + use crate::tokenizer::regex_tokenizer::RegexTokenizer; use crate::tokenizer::tests::assert_token; use crate::tokenizer::{TextAnalyzer, Token}; - use crate::tokenizer::regex_tokenizer::RegexTokenizer; - + #[test] fn test_regex_tokenizer() { let tokens = token_stream_helper("'aaa' bbb 'ccc' 'ddd'", r"'(?:\w*)'"); @@ -132,7 +138,10 @@ mod tests { fn test_regexp_tokenizer_error_on_invalid_regex() { let tokenizer = RegexTokenizer::new(r"\@"); assert_eq!(tokenizer.is_err(), true); - assert_eq!(tokenizer.err().unwrap().to_string(), "An invalid argument was passed: '\\@'"); + assert_eq!( + tokenizer.err().unwrap().to_string(), + "An invalid argument was passed: '\\@'" + ); } fn token_stream_helper(text: &str, pattern: &str) -> Vec { @@ -146,4 +155,4 @@ mod tests { token_stream.process(&mut add_token); tokens } -} \ No newline at end of file +}
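
Beyond the doc-test already included in this patch, a minimal usage sketch of the new `RegexTokenizer` might look like the following. It mirrors the `token_stream_helper` used in the patch's tests (build the tokenizer, wrap it in a `TextAnalyzer`, collect tokens via `process`); chaining the `LowerCaser` filter and the exact error-handling style are illustrative assumptions on my part, not part of this change.

```rust
use tantivy::tokenizer::{LowerCaser, RegexTokenizer, TextAnalyzer, Token};

fn main() -> tantivy::Result<()> {
    // `RegexTokenizer::new` now returns `InvalidArgument` when the pattern
    // does not compile, as exercised by the new error test in this patch.
    let tokenizer = RegexTokenizer::new(r"'(?:\w*)'")?;

    // Wrap the tokenizer in a TextAnalyzer; the LowerCaser filter is an
    // illustrative addition, not something this patch requires.
    let analyzer = TextAnalyzer::from(tokenizer).filter(LowerCaser);

    // Collect the emitted tokens. Offsets are tracked through the new
    // `cursor` field, so `offset_from`/`offset_to` refer to positions in
    // the original input text rather than the shrinking remainder.
    let mut tokens: Vec<Token> = Vec::new();
    let mut add_token = |token: &Token| tokens.push(token.clone());
    let mut stream = analyzer.token_stream("'AAA' bbb 'ccc' 'ddd'");
    stream.process(&mut add_token);

    for token in &tokens {
        println!("{:?} [{}..{}]", token.text, token.offset_from, token.offset_to);
    }
    Ok(())
}
```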