Skip to content

Commit

Permalink
Added a expand_dots JsonObjectOptions.
Browse files Browse the repository at this point in the history
Related with quickwit#2345.
  • Loading branch information
fulmicoton committed Nov 19, 2022
1 parent e758080 commit 639fef9
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 53 deletions.
76 changes: 62 additions & 14 deletions src/indexer/json_term_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,12 @@ pub(crate) fn index_json_values<'a>(
doc: DocId,
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
text_analyzer: &TextAnalyzer,
expand_dots_enabled: bool,
term_buffer: &mut Term,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
) -> crate::Result<()> {
let mut json_term_writer = JsonTermWriter::wrap(term_buffer);
let mut json_term_writer = JsonTermWriter::wrap(term_buffer, expand_dots_enabled);
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
for json_value_res in json_values {
let json_value = json_value_res?;
Expand Down Expand Up @@ -259,6 +260,7 @@ pub(crate) fn set_string_and_get_terms(
pub struct JsonTermWriter<'a> {
term_buffer: &'a mut Term,
path_stack: Vec<usize>,
expand_dots_enabled: bool,
}

/// Splits a json path supplied to the query parser in such a way that
Expand Down Expand Up @@ -298,23 +300,25 @@ impl<'a> JsonTermWriter<'a> {
pub fn from_field_and_json_path(
field: Field,
json_path: &str,
expand_dots_enabled: bool,
term_buffer: &'a mut Term,
) -> Self {
term_buffer.set_field_and_type(field, Type::Json);
let mut json_term_writer = Self::wrap(term_buffer);
let mut json_term_writer = Self::wrap(term_buffer, expand_dots_enabled);
for segment in split_json_path(json_path) {
json_term_writer.push_path_segment(&segment);
}
json_term_writer
}

pub fn wrap(term_buffer: &'a mut Term) -> Self {
pub fn wrap(term_buffer: &'a mut Term, expand_dots_enabled: bool) -> Self {
term_buffer.clear_with_type(Type::Json);
let mut path_stack = Vec::with_capacity(10);
path_stack.push(0);
Self {
term_buffer,
path_stack,
expand_dots_enabled,
}
}

Expand All @@ -336,11 +340,24 @@ impl<'a> JsonTermWriter<'a> {
self.trim_to_end_of_path();
let buffer = self.term_buffer.value_bytes_mut();
let buffer_len = buffer.len();

if self.path_stack.len() > 1 {
buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP;
}
self.term_buffer.append_bytes(segment.as_bytes());
self.term_buffer.append_bytes(&[JSON_PATH_SEGMENT_SEP]);
if self.expand_dots_enabled && segment.as_bytes().contains(&b'.') {
// We need to replace `.` by JSON_PATH_SEGMENT_SEP.
self.term_buffer
.append_bytes(segment.as_bytes())
.iter_mut()
.for_each(|byte| {
if *byte == b'.' {
*byte = JSON_PATH_SEGMENT_SEP;
}
});
} else {
self.term_buffer.append_bytes(segment.as_bytes());
}
self.term_buffer.push_byte(JSON_PATH_SEGMENT_SEP);
self.path_stack.push(self.term_buffer.len_bytes());
}

Expand Down Expand Up @@ -391,7 +408,7 @@ mod tests {
fn test_json_writer() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("attributes");
json_writer.push_path_segment("color");
json_writer.set_str("red");
Expand Down Expand Up @@ -425,7 +442,7 @@ mod tests {
fn test_string_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
json_writer.set_str("red");
assert_eq!(
Expand All @@ -438,7 +455,7 @@ mod tests {
fn test_i64_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
json_writer.set_fast_value(-4i64);
assert_eq!(
Expand All @@ -451,7 +468,7 @@ mod tests {
fn test_u64_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
json_writer.set_fast_value(4u64);
assert_eq!(
Expand All @@ -464,7 +481,7 @@ mod tests {
fn test_f64_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
json_writer.set_fast_value(4.0f64);
assert_eq!(
Expand All @@ -477,7 +494,7 @@ mod tests {
fn test_bool_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
json_writer.set_fast_value(true);
assert_eq!(
Expand All @@ -490,7 +507,7 @@ mod tests {
fn test_push_after_set_path_segment() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("attribute");
json_writer.set_str("something");
json_writer.push_path_segment("color");
Expand All @@ -505,7 +522,7 @@ mod tests {
fn test_pop_segment() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
json_writer.push_path_segment("hue");
json_writer.pop_path_segment();
Expand All @@ -520,7 +537,7 @@ mod tests {
fn test_json_writer_path() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
assert_eq!(json_writer.path(), b"color");
json_writer.push_path_segment("hue");
Expand All @@ -529,6 +546,37 @@ mod tests {
assert_eq!(json_writer.path(), b"color\x01hue");
}

#[test]
fn test_json_path_expand_dots_disabled() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color.hue");
assert_eq!(json_writer.path(), b"color.hue");
}

#[test]
fn test_json_path_expand_dots_enabled() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, true);
json_writer.push_path_segment("color.hue");
assert_eq!(json_writer.path(), b"color\x01hue");
}

#[test]
fn test_json_path_expand_dots_enabled_pop_segment() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, true);
json_writer.push_path_segment("hello");
assert_eq!(json_writer.path(), b"hello");
json_writer.push_path_segment("color.hue");
assert_eq!(json_writer.path(), b"hello\x01color\x01hue");
json_writer.pop_path_segment();
assert_eq!(json_writer.path(), b"hello");
}

#[test]
fn test_split_json_path_simple() {
let json_path = split_json_path("titi.toto");
Expand Down
28 changes: 25 additions & 3 deletions src/indexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ type AddBatchReceiver = channel::Receiver<AddBatch>;
mod tests_mmap {
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{Schema, STORED, TEXT};
use crate::schema::{JsonObjectOptions, Schema, TEXT};
use crate::{Index, Term};

#[test]
Expand All @@ -81,9 +81,9 @@ mod tests_mmap {
}

#[test]
fn test_json_field_espace() {
fn test_json_field_expand_dots_disabled_dot_escaped_required() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT | STORED);
let json_field = schema_builder.add_json_field("json", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
Expand All @@ -99,4 +99,26 @@ mod tests_mmap {
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);
}

#[test]
fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
let mut schema_builder = Schema::builder();
let json_options: JsonObjectOptions =
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1);
let parse_query = QueryParser::for_index(&index, Vec::new());
let query = parse_query
.parse_query(r#"json.k8s.container.name:prometheus"#)
.unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);
}
}
13 changes: 7 additions & 6 deletions src/indexer/segment_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ impl SegmentWriter {
self.per_field_postings_writers.get_for_field_mut(field);
term_buffer.clear_with_field_and_type(field_entry.field_type().value_type(), field);

match *field_entry.field_type() {
match field_entry.field_type() {
FieldType::Facet(_) => {
for value in values {
let facet = value.as_facet().ok_or_else(make_schema_error)?;
Expand Down Expand Up @@ -307,14 +307,15 @@ impl SegmentWriter {
self.fieldnorms_writer.record(doc_id, field, num_vals);
}
}
FieldType::JsonObject(_) => {
FieldType::JsonObject(json_options) => {
let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
let json_values_it =
values.map(|value| value.as_json().ok_or_else(make_schema_error));
index_json_values(
doc_id,
json_values_it,
text_analyzer,
json_options.is_expand_dots_enabled(),
term_buffer,
postings_writer,
ctx,
Expand Down Expand Up @@ -557,7 +558,7 @@ mod tests {
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut term_stream = term_dict.stream().unwrap();

let mut json_term_writer = JsonTermWriter::wrap(&mut term);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);

json_term_writer.push_path_segment("bool");
json_term_writer.set_fast_value(true);
Expand Down Expand Up @@ -648,7 +649,7 @@ mod tests {
let segment_reader = searcher.segment_reader(0u32);
let inv_index = segment_reader.inverted_index(json_field).unwrap();
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
json_term_writer.push_path_segment("mykey");
json_term_writer.set_str("token");
let term_info = inv_index
Expand Down Expand Up @@ -692,7 +693,7 @@ mod tests {
let segment_reader = searcher.segment_reader(0u32);
let inv_index = segment_reader.inverted_index(json_field).unwrap();
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
json_term_writer.push_path_segment("mykey");
json_term_writer.set_str("two tokens");
let term_info = inv_index
Expand Down Expand Up @@ -737,7 +738,7 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
json_term_writer.push_path_segment("mykey");
json_term_writer.push_path_segment("field");
json_term_writer.set_str("hello");
Expand Down
58 changes: 30 additions & 28 deletions src/query/query_parser/query_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ use crate::query::{
TermQuery, TermSetQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, Schema, Term, Type,
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
Schema, Term, Type,
};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
Expand Down Expand Up @@ -482,28 +483,14 @@ impl QueryParser {
.into_iter()
.collect())
}
FieldType::JsonObject(ref json_options) => {
let option = json_options.get_text_indexing_options().ok_or_else(|| {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_name.to_string())
})?;
let text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_name.to_string(),
tokenizer: option.tokenizer().to_string(),
})?;
let index_record_option = option.index_option();
generate_literals_for_json_object(
field_name,
field,
json_path,
phrase,
&text_analyzer,
index_record_option,
)
}
FieldType::JsonObject(ref json_options) => generate_literals_for_json_object(
field_name,
field,
json_path,
phrase,
&self.tokenizer_manager,
json_options,
),
FieldType::Facet(_) => match Facet::from_text(phrase) {
Ok(facet) => {
let facet_term = Term::from_facet(field, &facet);
Expand Down Expand Up @@ -767,17 +754,32 @@ fn generate_literals_for_json_object(
field: Field,
json_path: &str,
phrase: &str,
text_analyzer: &TextAnalyzer,
index_record_option: IndexRecordOption,
tokenizer_manager: &TokenizerManager,
json_options: &JsonObjectOptions,
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
let text_options = json_options.get_text_indexing_options().ok_or_else(|| {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_name.to_string())
})?;
let text_analyzer = tokenizer_manager
.get(text_options.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_name.to_string(),
tokenizer: text_options.tokenizer().to_string(),
})?;
let index_record_option = text_options.index_option();
let mut logical_literals = Vec::new();
let mut term = Term::with_capacity(100);
let mut json_term_writer =
JsonTermWriter::from_field_and_json_path(field, json_path, &mut term);
let mut json_term_writer = JsonTermWriter::from_field_and_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
&mut term,
);
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
logical_literals.push(LogicalLiteral::Term(term));
}
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, text_analyzer);
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &text_analyzer);
drop(json_term_writer);
if terms.len() <= 1 {
for (_, term) in terms {
Expand Down
Loading

0 comments on commit 639fef9

Please sign in to comment.