
Minor mini fixes
fulmicoton committed Jan 10, 2023
1 parent 196e42f commit 7a8fce0
Showing 9 changed files with 76 additions and 62 deletions.
2 changes: 1 addition & 1 deletion columnar/src/writer/column_operation.rs
@@ -133,7 +133,7 @@ pub(super) trait SymbolValue: Clone + Copy {

impl SymbolValue for bool {
fn serialize(self, buffer: &mut [u8]) -> u8 {
buffer[0] = if self { 1u8 } else { 0u8 };
buffer[0] = u8::from(self);
1u8
}

44 changes: 19 additions & 25 deletions columnar/src/writer/column_writers.rs
@@ -9,18 +9,18 @@ use crate::{Cardinality, DocId, NumericalType, NumericalValue};
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[repr(u8)]
enum DocumentStep {
SameDoc = 0,
NextDoc = 1,
SkippedDoc = 2,
Same = 0,
Next = 1,
Skipped = 2,
}

#[inline(always)]
fn delta_with_last_doc(last_doc_opt: Option<u32>, doc: u32) -> DocumentStep {
let expected_next_doc = last_doc_opt.map(|last_doc| last_doc + 1).unwrap_or(0u32);
match doc.cmp(&expected_next_doc) {
Ordering::Less => DocumentStep::SameDoc,
Ordering::Equal => DocumentStep::NextDoc,
Ordering::Greater => DocumentStep::SkippedDoc,
Ordering::Less => DocumentStep::Same,
Ordering::Equal => DocumentStep::Next,
Ordering::Greater => DocumentStep::Skipped,
}
}

@@ -56,15 +56,15 @@ impl ColumnWriter {
pub(super) fn record<S: SymbolValue>(&mut self, doc: DocId, value: S, arena: &mut MemoryArena) {
// Difference between `doc` and the last doc.
match delta_with_last_doc(self.last_doc_opt, doc) {
DocumentStep::SameDoc => {
DocumentStep::Same => {
// This is the last encountered document.
self.cardinality = Cardinality::Multivalued;
}
DocumentStep::NextDoc => {
DocumentStep::Next => {
self.last_doc_opt = Some(doc);
self.write_symbol::<S>(ColumnOperation::NewDoc(doc), arena);
}
DocumentStep::SkippedDoc => {
DocumentStep::Skipped => {
self.cardinality = self.cardinality.max(Cardinality::Optional);
self.last_doc_opt = Some(doc);
self.write_symbol::<S>(ColumnOperation::NewDoc(doc), arena);
@@ -79,8 +79,8 @@ impl ColumnWriter {
// at the end of the column.
pub(crate) fn get_cardinality(&self, num_docs: DocId) -> Cardinality {
match delta_with_last_doc(self.last_doc_opt, num_docs) {
DocumentStep::SameDoc | DocumentStep::NextDoc => self.cardinality,
DocumentStep::SkippedDoc => self.cardinality.max(Cardinality::Optional),
DocumentStep::Same | DocumentStep::Next => self.cardinality,
DocumentStep::Skipped => self.cardinality.max(Cardinality::Optional),
}
}

@@ -215,20 +215,14 @@ mod tests {

#[test]
fn test_delta_with_last_doc() {
assert_eq!(delta_with_last_doc(None, 0u32), DocumentStep::NextDoc);
assert_eq!(delta_with_last_doc(None, 1u32), DocumentStep::SkippedDoc);
assert_eq!(delta_with_last_doc(None, 2u32), DocumentStep::SkippedDoc);
assert_eq!(delta_with_last_doc(Some(0u32), 0u32), DocumentStep::SameDoc);
assert_eq!(delta_with_last_doc(Some(1u32), 1u32), DocumentStep::SameDoc);
assert_eq!(delta_with_last_doc(Some(1u32), 2u32), DocumentStep::NextDoc);
assert_eq!(
delta_with_last_doc(Some(1u32), 3u32),
DocumentStep::SkippedDoc
);
assert_eq!(
delta_with_last_doc(Some(1u32), 4u32),
DocumentStep::SkippedDoc
);
assert_eq!(delta_with_last_doc(None, 0u32), DocumentStep::Next);
assert_eq!(delta_with_last_doc(None, 1u32), DocumentStep::Skipped);
assert_eq!(delta_with_last_doc(None, 2u32), DocumentStep::Skipped);
assert_eq!(delta_with_last_doc(Some(0u32), 0u32), DocumentStep::Same);
assert_eq!(delta_with_last_doc(Some(1u32), 1u32), DocumentStep::Same);
assert_eq!(delta_with_last_doc(Some(1u32), 2u32), DocumentStep::Next);
assert_eq!(delta_with_last_doc(Some(1u32), 3u32), DocumentStep::Skipped);
assert_eq!(delta_with_last_doc(Some(1u32), 4u32), DocumentStep::Skipped);
}

#[track_caller]
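
Aside: a minimal standalone sketch of the renamed classification (not the actual tantivy types; `Cardinality` here is a simplified stand-in), showing how each `DocumentStep` variant drives the cardinality upgrade performed in `ColumnWriter::record`:

```rust
use std::cmp::Ordering;

#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
enum Cardinality { Full, Optional, Multivalued }

#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum DocumentStep { Same, Next, Skipped }

// Mirrors `delta_with_last_doc`: classify `doc` against the expected next doc id.
fn classify(last_doc_opt: Option<u32>, doc: u32) -> DocumentStep {
    let expected_next_doc = last_doc_opt.map(|d| d + 1).unwrap_or(0);
    match doc.cmp(&expected_next_doc) {
        Ordering::Less => DocumentStep::Same,       // same doc recorded again
        Ordering::Equal => DocumentStep::Next,      // dense, consecutive doc
        Ordering::Greater => DocumentStep::Skipped, // at least one doc had no value
    }
}

fn main() {
    let mut cardinality = Cardinality::Full;
    let mut last_doc: Option<u32> = None;
    // Recording values for docs 0, 0 and 2 (doc 1 has no value).
    for doc in [0u32, 0, 2] {
        match classify(last_doc, doc) {
            DocumentStep::Same => cardinality = Cardinality::Multivalued,
            DocumentStep::Next => last_doc = Some(doc),
            DocumentStep::Skipped => {
                cardinality = cardinality.max(Cardinality::Optional);
                last_doc = Some(doc);
            }
        }
    }
    // Doc 0 appears twice, so the column ends up Multivalued.
    assert_eq!(cardinality, Cardinality::Multivalued);
}
```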
17 changes: 8 additions & 9 deletions columnar/src/writer/mod.rs
@@ -34,15 +34,14 @@ struct SpareBuffers {
///
/// ```rust
/// use tantivy_columnar::ColumnarWriter;
/// fn main() {
/// let mut columnar_writer = ColumnarWriter::default();
/// columnar_writer.record_str(0u32 /* doc id */, "product_name", "Red backpack");
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10u64);
/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats.
/// let mut wrt: Vec<u8> = Vec::new();
/// columnar_writer.serialize(2u32, &mut wrt).unwrap();
/// }
///
/// let mut columnar_writer = ColumnarWriter::default();
/// columnar_writer.record_str(0u32 /* doc id */, "product_name", "Red backpack");
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10u64);
/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats.
/// let mut wrt: Vec<u8> = Vec::new();
/// columnar_writer.serialize(2u32, &mut wrt).unwrap();
/// ```
pub struct ColumnarWriter {
numerical_field_hash_map: ArenaHashMap,
4 changes: 2 additions & 2 deletions columnar/src/writer/serializer.rs
@@ -15,10 +15,10 @@ pub struct ColumnarSerializer<W: io::Write> {

/// Returns a key consisting of the concatenation of the key and the column_type_and_cardinality
/// code.
fn prepare_key<'a>(
fn prepare_key(
key: &[u8],
column_type_cardinality: ColumnTypeAndCardinality,
buffer: &'a mut Vec<u8>,
buffer: &mut Vec<u8>,
) {
buffer.clear();
buffer.extend_from_slice(key);
10 changes: 8 additions & 2 deletions src/query/query_parser/query_parser.rs
@@ -3,6 +3,8 @@ use std::num::{ParseFloatError, ParseIntError};
use std::ops::Bound;
use std::str::{FromStr, ParseBoolError};

use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use rustc_hash::FxHashMap;
use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};

@@ -403,7 +405,9 @@ impl QueryParser {
Err(e) => Err(QueryParserError::from(e)),
},
FieldType::Bytes(_) => {
let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
let bytes = BASE64
.decode(phrase)
.map_err(QueryParserError::ExpectedBase64)?;
Ok(Term::from_field_bytes(field, &bytes))
}
FieldType::IpAddr(_) => {
@@ -498,7 +502,9 @@ impl QueryParser {
Err(e) => Err(QueryParserError::from(e)),
},
FieldType::Bytes(_) => {
let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
let bytes = BASE64
.decode(phrase)
.map_err(QueryParserError::ExpectedBase64)?;
let bytes_term = Term::from_field_bytes(field, &bytes);
Ok(vec![LogicalLiteral::Term(bytes_term)])
}
5 changes: 4 additions & 1 deletion src/schema/field_type.rs
@@ -1,6 +1,8 @@
use std::net::IpAddr;
use std::str::FromStr;

use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;
use thiserror::Error;
Expand Down Expand Up @@ -358,7 +360,8 @@ impl FieldType {
json: JsonValue::String(field_text),
}),
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
FieldType::Bytes(_) => base64::decode(&field_text)
FieldType::Bytes(_) => BASE64
.decode(&field_text)
.map(Value::Bytes)
.map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
4 changes: 3 additions & 1 deletion src/schema/value.rs
@@ -1,6 +1,8 @@
use std::fmt;
use std::net::Ipv6Addr;

use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_json::Map;
Expand Down Expand Up @@ -51,7 +53,7 @@ impl Serialize for Value {
Value::Bool(b) => serializer.serialize_bool(b),
Value::Date(ref date) => time::serde::rfc3339::serialize(&date.into_utc(), serializer),
Value::Facet(ref facet) => facet.serialize(serializer),
Value::Bytes(ref bytes) => serializer.serialize_str(&base64::encode(bytes)),
Value::Bytes(ref bytes) => serializer.serialize_str(&BASE64.encode(bytes)),
Value::JsonObject(ref obj) => obj.serialize(serializer),
Value::IpAddr(ref obj) => {
// Ensure IpV4 addresses get serialized as IpV4, but excluding IpV6 loopback.
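
Aside: `query_parser.rs`, `field_type.rs` and `value.rs` all move from the deprecated free functions `base64::encode`/`base64::decode` to the `Engine`-based API. A small sketch of that pattern, assuming a `base64 = "0.21"`-style dependency:

```rust
use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;

fn main() {
    // Encoding: replaces the deprecated free function `base64::encode`.
    let encoded = BASE64.encode(b"hello");
    assert_eq!(encoded, "aGVsbG8=");

    // Decoding: replaces `base64::decode`; the error is handled explicitly.
    let decoded = BASE64.decode(&encoded).expect("valid base64");
    assert_eq!(decoded, b"hello".to_vec());
}
```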
3 changes: 2 additions & 1 deletion src/tokenizer/mod.rs
@@ -126,6 +126,7 @@ mod facet_tokenizer;
mod lower_caser;
mod ngram_tokenizer;
mod raw_tokenizer;
mod regex_tokenizer;
mod remove_long;
mod simple_tokenizer;
mod split_compound_words;
@@ -135,7 +136,6 @@ mod tokenized_string;
mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;
mod regex_tokenizer;

pub use tokenizer_api::{
BoxTokenFilter, BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer,
@@ -147,6 +147,7 @@ pub use self::facet_tokenizer::FacetTokenizer;
pub use self::lower_caser::LowerCaser;
pub use self::ngram_tokenizer::NgramTokenizer;
pub use self::raw_tokenizer::RawTokenizer;
pub use self::regex_tokenizer::RegexTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::split_compound_words::SplitCompoundWords;
49 changes: 29 additions & 20 deletions src/tokenizer/regex_tokenizer.rs
@@ -1,14 +1,15 @@
use regex::Regex;
use crate::TantivyError;

use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
use crate::TantivyError;

/// Tokenize the text by using a regex pattern to split.
/// Each match of the regex emits a distinct token; empty tokens are not emitted. Anchors such as
/// `\A` match from the position where the last token was emitted, or from the beginning of the
/// complete text if no token has been emitted yet.
///
/// Example: `` 'aaa' bbb 'ccc' 'ddd' `` with the pattern `` '(?:\w*)' `` will be tokenized as follows:
/// Example: `` 'aaa' bbb 'ccc' 'ddd' `` with the pattern `` '(?:\w*)' `` will be tokenized as
/// follows:
///
/// | Term | aaa | ccc | ddd |
/// |----------|------|--------|-------|
@@ -21,7 +22,7 @@ use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let tokenizer = RegexTokenizer::new(r"'(?:\w*)'");
/// let tokenizer = RegexTokenizer::new(r"'(?:\w*)'").unwrap();
/// let mut stream = tokenizer.token_stream("'aaa' bbb 'ccc' 'ddd'");
/// {
/// let token = stream.next().unwrap();
Expand All @@ -46,10 +47,11 @@ use super::{BoxTokenStream, Token, TokenStream, Tokenizer};

#[derive(Clone)]
pub struct RegexTokenizer {
regex: Regex
regex: Regex,
}

impl RegexTokenizer {
/// Creates a new RegexTokenizer.
pub fn new(regex_pattern: &str) -> crate::Result<RegexTokenizer> {
Regex::new(regex_pattern)
.map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_owned()))
Expand All @@ -63,6 +65,7 @@ impl Tokenizer for RegexTokenizer {
regex: self.regex.clone(),
text,
token: Token::default(),
cursor: 0,
})
}
}
@@ -71,25 +74,28 @@ pub struct RegexTokenStream<'a> {
regex: Regex,
text: &'a str,
token: Token,
cursor: usize,
}

impl<'a> TokenStream for RegexTokenStream<'a> {
fn advance(&mut self) -> bool {
if let Some(m) = self.regex.find(self.text) {
if !m.as_str().is_empty() {
self.token.text.clear();
self.token.text.push_str(&self.text[m.start()..m.end()]);
let Some(regex_match) = self.regex.find(self.text) else {
return false;
};
if regex_match.as_str().is_empty() {
return false;
}
self.token.text.clear();
self.token.text.push_str(regex_match.as_str());

self.token.offset_from = self.token.offset_to + m.start();
self.token.offset_to = self.token.offset_to + m.end();
self.token.offset_from = self.cursor + regex_match.start();
self.cursor += regex_match.end();
self.token.offset_to = self.cursor;

self.token.position = self.token.position.wrapping_add(1);
self.token.position = self.token.position.wrapping_add(1);

self.text = &self.text[m.end()..];
return true
}
}
false
self.text = &self.text[regex_match.end()..];
true
}

fn token(&self) -> &Token {
@@ -103,10 +109,10 @@

#[cfg(test)]
mod tests {
use crate::tokenizer::regex_tokenizer::RegexTokenizer;
use crate::tokenizer::tests::assert_token;
use crate::tokenizer::{TextAnalyzer, Token};
use crate::tokenizer::regex_tokenizer::RegexTokenizer;


#[test]
fn test_regex_tokenizer() {
let tokens = token_stream_helper("'aaa' bbb 'ccc' 'ddd'", r"'(?:\w*)'");
Expand All @@ -132,7 +138,10 @@ mod tests {
fn test_regexp_tokenizer_error_on_invalid_regex() {
let tokenizer = RegexTokenizer::new(r"\@");
assert_eq!(tokenizer.is_err(), true);
assert_eq!(tokenizer.err().unwrap().to_string(), "An invalid argument was passed: '\\@'");
assert_eq!(
tokenizer.err().unwrap().to_string(),
"An invalid argument was passed: '\\@'"
);
}

fn token_stream_helper(text: &str, pattern: &str) -> Vec<Token> {
Expand All @@ -146,4 +155,4 @@ mod tests {
token_stream.process(&mut add_token);
tokens
}
}
}
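
Aside: the rewritten `advance()` threads an explicit `cursor` so token offsets stay absolute while the stream slices away the consumed text. A minimal standalone sketch of that bookkeeping using the `regex` crate directly (not the tantivy types):

```rust
use regex::Regex;

fn main() {
    let re = Regex::new(r"'(?:\w*)'").expect("valid pattern");
    let full_text = "'aaa' bbb 'ccc'";

    let mut text = full_text; // shrinking view over the remaining input
    let mut cursor = 0usize;  // absolute offset of `text` within `full_text`

    while let Some(m) = re.find(text) {
        if m.as_str().is_empty() {
            break;
        }
        // Offsets are reported relative to the original text, as in RegexTokenStream.
        let offset_from = cursor + m.start();
        cursor += m.end();
        let offset_to = cursor;
        println!("{:?} [{}, {})", m.as_str(), offset_from, offset_to);

        text = &text[m.end()..];
    }
}
```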
