From 178c45b01fd88438bec898567f54f810ebdd5b27 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 21 Sep 2023 16:30:25 +0800 Subject: [PATCH 1/2] Read list of JSON fields encoded in dictionary add method to get list of fields on InvertedIndexReader --- Cargo.toml | 1 + src/core/inverted_index_reader.rs | 23 ++++++++++++++++++++- src/indexer/mod.rs | 33 +++++++++++++++++++++++++++++++ src/schema/field_type.rs | 5 +++-- 4 files changed, 59 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4cc9700fed..0efb79d5fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,6 +63,7 @@ common = { version= "0.6", path = "./common/", package = "tantivy-common" } tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" } sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] } futures-util = { version = "0.3.28", optional = true } +fnv = "1.0.7" [target.'cfg(windows)'.dependencies] winapi = "0.3.9" diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index eacb43711d..b94933aed5 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -1,11 +1,12 @@ use std::io; use common::BinarySerializable; +use fnv::FnvHashSet; use crate::directory::FileSlice; use crate::positions::PositionReader; use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo}; -use crate::schema::{IndexRecordOption, Term}; +use crate::schema::{IndexRecordOption, Term, JSON_END_OF_PATH}; use crate::termdict::TermDictionary; /// The inverted index reader is in charge of accessing @@ -69,6 +70,26 @@ impl InvertedIndexReader { &self.termdict } + /// Return the fields encoded in the dictionary in lexicographic oder. + /// Only valid on JSON fields. + /// + /// Notice: This requires a full scan and therefore **very expensive**. + pub fn list_fields(&self) -> io::Result> { + let mut stream = self.termdict.stream()?; + let mut fields = Vec::new(); + let mut fields_set = FnvHashSet::default(); + while let Some((term, _term_info)) = stream.next() { + if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) { + if !fields_set.contains(&term[..index]) { + fields_set.insert(term[..index].to_vec()); + fields.push(String::from_utf8_lossy(&term[..index]).to_string()); + } + } + } + + Ok(fields) + } + /// Resets the block segment to another position of the postings /// file. /// diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 2e8b5bf538..f90c43a9c0 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -133,4 +133,37 @@ mod tests_mmap { assert_eq!(num_docs, 1); } } + + #[test] + fn test_json_field_list_fields() { + let mut schema_builder = Schema::builder(); + let json_options: JsonObjectOptions = + JsonObjectOptions::from(TEXT).set_expand_dots_enabled(); + let json_field = schema_builder.add_json_field("json", json_options); + let index = Index::create_in_ram(schema_builder.build()); + let mut index_writer = index.writer_for_tests().unwrap(); + let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "sub": {"a": 1, "b": 2}}); + index_writer.add_document(doc!(json_field=>json)).unwrap(); + let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": 1, "b": 2}}); + index_writer.add_document(doc!(json_field=>json)).unwrap(); + index_writer.commit().unwrap(); + let reader = index.reader().unwrap(); + + let searcher = reader.searcher(); + assert_eq!(searcher.num_docs(), 2); + + let reader = &searcher.segment_readers()[0]; + let inverted_index = reader.inverted_index(json_field).unwrap(); + assert_eq!( + inverted_index.list_fields().unwrap(), + [ + "k8s\u{1}container\u{1}name", + "sub\u{1}a", + "sub\u{1}b", + "suber\u{1}a", + "suber\u{1}b", + "val", + ] + ); + } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index a419f68354..166977b38f 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -313,9 +313,10 @@ impl FieldType { /// Parses a field value from json, given the target FieldType. /// - /// Tantivy will not try to cast values. + /// Tantivy will try to cast values only with the coerce option. /// For instance, If the json value is the integer `3` and the - /// target field is a `Str`, this method will return an Error. + /// target field is a `Str`, this method will return an Error if `coerce` + /// is not enabled. pub fn value_from_json(&self, json: JsonValue) -> Result { match json { JsonValue::String(field_text) => { From d439e8af9b02edb3d3ae0d81d2d9cdbcdc012653 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 2 Oct 2023 19:38:07 +0800 Subject: [PATCH 2/2] add field type --- src/core/inverted_index_reader.rs | 14 ++++++++------ src/indexer/mod.rs | 19 +++++++++++-------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index b94933aed5..5326ce7e93 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -6,7 +6,7 @@ use fnv::FnvHashSet; use crate::directory::FileSlice; use crate::positions::PositionReader; use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo}; -use crate::schema::{IndexRecordOption, Term, JSON_END_OF_PATH}; +use crate::schema::{IndexRecordOption, Term, Type, JSON_END_OF_PATH}; use crate::termdict::TermDictionary; /// The inverted index reader is in charge of accessing @@ -70,19 +70,21 @@ impl InvertedIndexReader { &self.termdict } - /// Return the fields encoded in the dictionary in lexicographic oder. + /// Return the fields and types encoded in the dictionary in lexicographic oder. /// Only valid on JSON fields. /// /// Notice: This requires a full scan and therefore **very expensive**. - pub fn list_fields(&self) -> io::Result> { + /// TODO: Move to sstable to use the index. + pub fn list_fields(&self) -> io::Result> { let mut stream = self.termdict.stream()?; let mut fields = Vec::new(); let mut fields_set = FnvHashSet::default(); while let Some((term, _term_info)) = stream.next() { if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) { - if !fields_set.contains(&term[..index]) { - fields_set.insert(term[..index].to_vec()); - fields.push(String::from_utf8_lossy(&term[..index]).to_string()); + if !fields_set.contains(&term[..index + 2]) { + fields_set.insert(term[..index + 2].to_vec()); + let typ = Type::from_code(term[index + 1]).unwrap(); + fields.push((String::from_utf8_lossy(&term[..index]).to_string(), typ)); } } } diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index f90c43a9c0..79b0c826fd 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -54,7 +54,7 @@ mod tests_mmap { use crate::collector::Count; use crate::query::QueryParser; - use crate::schema::{JsonObjectOptions, Schema, TEXT}; + use crate::schema::{JsonObjectOptions, Schema, Type, TEXT}; use crate::{Index, IndexWriter, Term}; #[test] @@ -146,23 +146,26 @@ mod tests_mmap { index_writer.add_document(doc!(json_field=>json)).unwrap(); let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": 1, "b": 2}}); index_writer.add_document(doc!(json_field=>json)).unwrap(); + let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": "mixed", "b": 2}}); + index_writer.add_document(doc!(json_field=>json)).unwrap(); index_writer.commit().unwrap(); let reader = index.reader().unwrap(); let searcher = reader.searcher(); - assert_eq!(searcher.num_docs(), 2); + assert_eq!(searcher.num_docs(), 3); let reader = &searcher.segment_readers()[0]; let inverted_index = reader.inverted_index(json_field).unwrap(); assert_eq!( inverted_index.list_fields().unwrap(), [ - "k8s\u{1}container\u{1}name", - "sub\u{1}a", - "sub\u{1}b", - "suber\u{1}a", - "suber\u{1}b", - "val", + ("k8s\u{1}container\u{1}name".to_string(), Type::Str), + ("sub\u{1}a".to_string(), Type::I64), + ("sub\u{1}b".to_string(), Type::I64), + ("suber\u{1}a".to_string(), Type::I64), + ("suber\u{1}a".to_string(), Type::Str), + ("suber\u{1}b".to_string(), Type::I64), + ("val".to_string(), Type::Str), ] ); }