diff --git a/Cargo.toml b/Cargo.toml index 4cc9700fed..0efb79d5fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,6 +63,7 @@ common = { version= "0.6", path = "./common/", package = "tantivy-common" } tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" } sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] } futures-util = { version = "0.3.28", optional = true } +fnv = "1.0.7" [target.'cfg(windows)'.dependencies] winapi = "0.3.9" diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index eacb43711d..5326ce7e93 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -1,11 +1,12 @@ use std::io; use common::BinarySerializable; +use fnv::FnvHashSet; use crate::directory::FileSlice; use crate::positions::PositionReader; use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo}; -use crate::schema::{IndexRecordOption, Term}; +use crate::schema::{IndexRecordOption, Term, Type, JSON_END_OF_PATH}; use crate::termdict::TermDictionary; /// The inverted index reader is in charge of accessing @@ -69,6 +70,28 @@ impl InvertedIndexReader { &self.termdict } + /// Return the fields and types encoded in the dictionary in lexicographic oder. + /// Only valid on JSON fields. + /// + /// Notice: This requires a full scan and therefore **very expensive**. + /// TODO: Move to sstable to use the index. + pub fn list_fields(&self) -> io::Result> { + let mut stream = self.termdict.stream()?; + let mut fields = Vec::new(); + let mut fields_set = FnvHashSet::default(); + while let Some((term, _term_info)) = stream.next() { + if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) { + if !fields_set.contains(&term[..index + 2]) { + fields_set.insert(term[..index + 2].to_vec()); + let typ = Type::from_code(term[index + 1]).unwrap(); + fields.push((String::from_utf8_lossy(&term[..index]).to_string(), typ)); + } + } + } + + Ok(fields) + } + /// Resets the block segment to another position of the postings /// file. /// diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 2e8b5bf538..79b0c826fd 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -54,7 +54,7 @@ mod tests_mmap { use crate::collector::Count; use crate::query::QueryParser; - use crate::schema::{JsonObjectOptions, Schema, TEXT}; + use crate::schema::{JsonObjectOptions, Schema, Type, TEXT}; use crate::{Index, IndexWriter, Term}; #[test] @@ -133,4 +133,40 @@ mod tests_mmap { assert_eq!(num_docs, 1); } } + + #[test] + fn test_json_field_list_fields() { + let mut schema_builder = Schema::builder(); + let json_options: JsonObjectOptions = + JsonObjectOptions::from(TEXT).set_expand_dots_enabled(); + let json_field = schema_builder.add_json_field("json", json_options); + let index = Index::create_in_ram(schema_builder.build()); + let mut index_writer = index.writer_for_tests().unwrap(); + let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "sub": {"a": 1, "b": 2}}); + index_writer.add_document(doc!(json_field=>json)).unwrap(); + let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": 1, "b": 2}}); + index_writer.add_document(doc!(json_field=>json)).unwrap(); + let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": "mixed", "b": 2}}); + index_writer.add_document(doc!(json_field=>json)).unwrap(); + index_writer.commit().unwrap(); + let reader = index.reader().unwrap(); + + let searcher = reader.searcher(); + assert_eq!(searcher.num_docs(), 3); + + let reader = &searcher.segment_readers()[0]; + let inverted_index = reader.inverted_index(json_field).unwrap(); + assert_eq!( + inverted_index.list_fields().unwrap(), + [ + ("k8s\u{1}container\u{1}name".to_string(), Type::Str), + ("sub\u{1}a".to_string(), Type::I64), + ("sub\u{1}b".to_string(), Type::I64), + ("suber\u{1}a".to_string(), Type::I64), + ("suber\u{1}a".to_string(), Type::Str), + ("suber\u{1}b".to_string(), Type::I64), + ("val".to_string(), Type::Str), + ] + ); + } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index a419f68354..166977b38f 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -313,9 +313,10 @@ impl FieldType { /// Parses a field value from json, given the target FieldType. /// - /// Tantivy will not try to cast values. + /// Tantivy will try to cast values only with the coerce option. /// For instance, If the json value is the integer `3` and the - /// target field is a `Str`, this method will return an Error. + /// target field is a `Str`, this method will return an Error if `coerce` + /// is not enabled. pub fn value_from_json(&self, json: JsonValue) -> Result { match json { JsonValue::String(field_text) => {