quickwit-oss · PSeitz · Oct 9, 2023 · Sep 21, 2023 · Oct 2, 2023
diff --git a/Cargo.toml b/Cargo.toml
@@ -63,6 +63,7 @@ common = { version= "0.6", path = "./common/", package = "tantivy-common" }
 tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" }
 sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
 futures-util = { version = "0.3.28", optional = true }
+fnv = "1.0.7"
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"

diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs
@@ -1,11 +1,12 @@
 use std::io;
 
 use common::BinarySerializable;
+use fnv::FnvHashSet;
 
 use crate::directory::FileSlice;
 use crate::positions::PositionReader;
 use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo};
-use crate::schema::{IndexRecordOption, Term};
+use crate::schema::{IndexRecordOption, Term, Type, JSON_END_OF_PATH};
 use crate::termdict::TermDictionary;
 
 /// The inverted index reader is in charge of accessing
@@ -69,6 +70,28 @@ impl InvertedIndexReader {
         &self.termdict
     }
 
+    /// Return the fields and types encoded in the dictionary in lexicographic order.
+    /// Only valid on JSON fields; a missing or unknown type code is an `InvalidData` error.
+    ///
+    /// Notice: This requires a full scan and is therefore **very expensive**.
+    /// TODO: Move to sstable to use the index.
+    pub fn list_fields(&self) -> io::Result<Vec<(String, Type)>> {
+        let mut stream = self.termdict.stream()?;
+        let mut fields = Vec::new();
+        let mut fields_set = FnvHashSet::default();
+        while let Some((term, _term_info)) = stream.next() {
+            if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) {
+                let key = term.get(..index + 2).ok_or(io::ErrorKind::InvalidData)?;
+                if !fields_set.contains(key) { // key: path, end-of-path byte, type code
+                    let typ = Type::from_code(key[index + 1]).ok_or(io::ErrorKind::InvalidData)?;
+                    fields_set.insert(key.to_vec()); // allocate only for unseen keys
+                    fields.push((String::from_utf8_lossy(&term[..index]).to_string(), typ));
+                }
+            }
+        }
+        Ok(fields)
+    }
+
     /// Resets the block segment to another position of the postings
     /// file.
     ///

diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs
@@ -54,7 +54,7 @@ mod tests_mmap {
 
     use crate::collector::Count;
     use crate::query::QueryParser;
-    use crate::schema::{JsonObjectOptions, Schema, TEXT};
+    use crate::schema::{JsonObjectOptions, Schema, Type, TEXT};
     use crate::{Index, IndexWriter, Term};
 
     #[test]
@@ -133,4 +133,40 @@ mod tests_mmap {
             assert_eq!(num_docs, 1);
         }
     }
+
+    #[test]
+    fn test_json_field_list_fields() {
+        // Index three documents; `suber.a` is an integer in one and a string in another.
+        let mut builder = Schema::builder();
+        let options = JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
+        let json_field = builder.add_json_field("json", options);
+        let index = Index::create_in_ram(builder.build());
+        let mut writer = index.writer_for_tests().unwrap();
+        let docs = vec![
+            serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "sub": {"a": 1, "b": 2}}),
+            serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": 1, "b": 2}}),
+            serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": "mixed", "b": 2}}),
+        ];
+        for json in docs {
+            writer.add_document(doc!(json_field=>json)).unwrap();
+        }
+        writer.commit().unwrap();
+        let index_reader = index.reader().unwrap();
+        let searcher = index_reader.searcher();
+        assert_eq!(searcher.num_docs(), 3);
+        // Each distinct (path, type) pair is expected once, with `\u{1}` between path segments.
+        let entry = |path: &str, typ: Type| (path.to_string(), typ);
+        let expected = [
+            entry("k8s\u{1}container\u{1}name", Type::Str),
+            entry("sub\u{1}a", Type::I64),
+            entry("sub\u{1}b", Type::I64),
+            entry("suber\u{1}a", Type::I64),
+            entry("suber\u{1}a", Type::Str),
+            entry("suber\u{1}b", Type::I64),
+            entry("val", Type::Str),
+        ];
+        let segment_reader = &searcher.segment_readers()[0];
+        let inverted_index = segment_reader.inverted_index(json_field).unwrap();
+        assert_eq!(inverted_index.list_fields().unwrap(), expected);
+    }
 }
diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs
@@ -313,9 +313,10 @@ impl FieldType {
 
     /// Parses a field value from json, given the target FieldType.
     ///
-    /// Tantivy will not try to cast values.
+    /// Tantivy will try to cast values only with the coerce option.
     /// For instance, If the json value is the integer `3` and the
-    /// target field is a `Str`, this method will return an Error.
+    /// target field is a `Str`, this method will return an Error if `coerce`
+    /// is not enabled.
     pub fn value_from_json(&self, json: JsonValue) -> Result<OwnedValue, ValueParsingError> {
         match json {
             JsonValue::String(field_text) => {