From 178c45b01fd88438bec898567f54f810ebdd5b27 Mon Sep 17 00:00:00 2001
From: Pascal Seitz <pascal.seitz@gmail.com>
Date: Thu, 21 Sep 2023 16:30:25 +0800
Subject: [PATCH 1/2] Read list of JSON fields encoded in dictionary

add method to get list of fields on InvertedIndexReader
---
 Cargo.toml                        |  1 +
 src/core/inverted_index_reader.rs | 23 ++++++++++++++++++++-
 src/indexer/mod.rs                | 33 +++++++++++++++++++++++++++++++
 src/schema/field_type.rs          |  5 +++--
 4 files changed, 59 insertions(+), 3 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 4cc9700fed..0efb79d5fd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -63,6 +63,7 @@ common = { version= "0.6", path = "./common/", package = "tantivy-common" }
 tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" }
 sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
 futures-util = { version = "0.3.28", optional = true }
+fnv = "1.0.7"
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"
diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs
index eacb43711d..b94933aed5 100644
--- a/src/core/inverted_index_reader.rs
+++ b/src/core/inverted_index_reader.rs
@@ -1,11 +1,12 @@
 use std::io;
 
 use common::BinarySerializable;
+use fnv::FnvHashSet;
 
 use crate::directory::FileSlice;
 use crate::positions::PositionReader;
 use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo};
-use crate::schema::{IndexRecordOption, Term};
+use crate::schema::{IndexRecordOption, Term, JSON_END_OF_PATH};
 use crate::termdict::TermDictionary;
 
 /// The inverted index reader is in charge of accessing
@@ -69,6 +70,26 @@ impl InvertedIndexReader {
         &self.termdict
     }
 
+    /// Return the fields encoded in the dictionary in lexicographic oder.
+    /// Only valid on JSON fields.
+    ///
+    /// Notice: This requires a full scan and therefore **very expensive**.
+    pub fn list_fields(&self) -> io::Result<Vec<String>> {
+        let mut stream = self.termdict.stream()?;
+        let mut fields = Vec::new();
+        let mut fields_set = FnvHashSet::default();
+        while let Some((term, _term_info)) = stream.next() {
+            if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) {
+                if !fields_set.contains(&term[..index]) {
+                    fields_set.insert(term[..index].to_vec());
+                    fields.push(String::from_utf8_lossy(&term[..index]).to_string());
+                }
+            }
+        }
+
+        Ok(fields)
+    }
+
     /// Resets the block segment to another position of the postings
     /// file.
     ///
diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs
index 2e8b5bf538..f90c43a9c0 100644
--- a/src/indexer/mod.rs
+++ b/src/indexer/mod.rs
@@ -133,4 +133,37 @@ mod tests_mmap {
             assert_eq!(num_docs, 1);
         }
     }
+
+    #[test]
+    fn test_json_field_list_fields() {
+        let mut schema_builder = Schema::builder();
+        let json_options: JsonObjectOptions =
+            JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
+        let json_field = schema_builder.add_json_field("json", json_options);
+        let index = Index::create_in_ram(schema_builder.build());
+        let mut index_writer = index.writer_for_tests().unwrap();
+        let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "sub": {"a": 1, "b": 2}});
+        index_writer.add_document(doc!(json_field=>json)).unwrap();
+        let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": 1, "b": 2}});
+        index_writer.add_document(doc!(json_field=>json)).unwrap();
+        index_writer.commit().unwrap();
+        let reader = index.reader().unwrap();
+
+        let searcher = reader.searcher();
+        assert_eq!(searcher.num_docs(), 2);
+
+        let reader = &searcher.segment_readers()[0];
+        let inverted_index = reader.inverted_index(json_field).unwrap();
+        assert_eq!(
+            inverted_index.list_fields().unwrap(),
+            [
+                "k8s\u{1}container\u{1}name",
+                "sub\u{1}a",
+                "sub\u{1}b",
+                "suber\u{1}a",
+                "suber\u{1}b",
+                "val",
+            ]
+        );
+    }
 }
diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs
index a419f68354..166977b38f 100644
--- a/src/schema/field_type.rs
+++ b/src/schema/field_type.rs
@@ -313,9 +313,10 @@ impl FieldType {
 
     /// Parses a field value from json, given the target FieldType.
     ///
-    /// Tantivy will not try to cast values.
+    /// Tantivy will try to cast values only with the coerce option.
     /// For instance, If the json value is the integer `3` and the
-    /// target field is a `Str`, this method will return an Error.
+    /// target field is a `Str`, this method will return an Error if `coerce`
+    /// is not enabled.
     pub fn value_from_json(&self, json: JsonValue) -> Result<OwnedValue, ValueParsingError> {
         match json {
             JsonValue::String(field_text) => {

From d439e8af9b02edb3d3ae0d81d2d9cdbcdc012653 Mon Sep 17 00:00:00 2001
From: Pascal Seitz <pascal.seitz@gmail.com>
Date: Mon, 2 Oct 2023 19:38:07 +0800
Subject: [PATCH 2/2] add field type

---
 src/core/inverted_index_reader.rs | 14 ++++++++------
 src/indexer/mod.rs                | 19 +++++++++++--------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs
index b94933aed5..5326ce7e93 100644
--- a/src/core/inverted_index_reader.rs
+++ b/src/core/inverted_index_reader.rs
@@ -6,7 +6,7 @@ use fnv::FnvHashSet;
 use crate::directory::FileSlice;
 use crate::positions::PositionReader;
 use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo};
-use crate::schema::{IndexRecordOption, Term, JSON_END_OF_PATH};
+use crate::schema::{IndexRecordOption, Term, Type, JSON_END_OF_PATH};
 use crate::termdict::TermDictionary;
 
 /// The inverted index reader is in charge of accessing
@@ -70,19 +70,21 @@ impl InvertedIndexReader {
         &self.termdict
     }
 
-    /// Return the fields encoded in the dictionary in lexicographic oder.
+    /// Return the fields and types encoded in the dictionary in lexicographic oder.
     /// Only valid on JSON fields.
     ///
     /// Notice: This requires a full scan and therefore **very expensive**.
-    pub fn list_fields(&self) -> io::Result<Vec<String>> {
+    /// TODO: Move to sstable to use the index.
+    pub fn list_fields(&self) -> io::Result<Vec<(String, Type)>> {
         let mut stream = self.termdict.stream()?;
         let mut fields = Vec::new();
         let mut fields_set = FnvHashSet::default();
         while let Some((term, _term_info)) = stream.next() {
             if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) {
-                if !fields_set.contains(&term[..index]) {
-                    fields_set.insert(term[..index].to_vec());
-                    fields.push(String::from_utf8_lossy(&term[..index]).to_string());
+                if !fields_set.contains(&term[..index + 2]) {
+                    fields_set.insert(term[..index + 2].to_vec());
+                    let typ = Type::from_code(term[index + 1]).unwrap();
+                    fields.push((String::from_utf8_lossy(&term[..index]).to_string(), typ));
                 }
             }
         }
diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs
index f90c43a9c0..79b0c826fd 100644
--- a/src/indexer/mod.rs
+++ b/src/indexer/mod.rs
@@ -54,7 +54,7 @@ mod tests_mmap {
 
     use crate::collector::Count;
     use crate::query::QueryParser;
-    use crate::schema::{JsonObjectOptions, Schema, TEXT};
+    use crate::schema::{JsonObjectOptions, Schema, Type, TEXT};
     use crate::{Index, IndexWriter, Term};
 
     #[test]
@@ -146,23 +146,26 @@ mod tests_mmap {
         index_writer.add_document(doc!(json_field=>json)).unwrap();
         let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": 1, "b": 2}});
         index_writer.add_document(doc!(json_field=>json)).unwrap();
+        let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": "mixed", "b": 2}});
+        index_writer.add_document(doc!(json_field=>json)).unwrap();
         index_writer.commit().unwrap();
         let reader = index.reader().unwrap();
 
         let searcher = reader.searcher();
-        assert_eq!(searcher.num_docs(), 2);
+        assert_eq!(searcher.num_docs(), 3);
 
         let reader = &searcher.segment_readers()[0];
         let inverted_index = reader.inverted_index(json_field).unwrap();
         assert_eq!(
             inverted_index.list_fields().unwrap(),
             [
-                "k8s\u{1}container\u{1}name",
-                "sub\u{1}a",
-                "sub\u{1}b",
-                "suber\u{1}a",
-                "suber\u{1}b",
-                "val",
+                ("k8s\u{1}container\u{1}name".to_string(), Type::Str),
+                ("sub\u{1}a".to_string(), Type::I64),
+                ("sub\u{1}b".to_string(), Type::I64),
+                ("suber\u{1}a".to_string(), Type::I64),
+                ("suber\u{1}a".to_string(), Type::Str),
+                ("suber\u{1}b".to_string(), Type::I64),
+                ("val".to_string(), Type::Str),
             ]
         );
     }