quickwit-oss · PSeitz · Aug 28, 2023 · Aug 25, 2023 · Aug 28, 2023
diff --git a/src/aggregation/agg_req_with_accessor.rs b/src/aggregation/agg_req_with_accessor.rs
@@ -112,12 +112,24 @@ impl AggregationWithAccessor {
                     fallback_type,
                 )?
             }
-            Average(AverageAggregation { field: field_name })
-            | Count(CountAggregation { field: field_name })
-            | Max(MaxAggregation { field: field_name })
-            | Min(MinAggregation { field: field_name })
-            | Stats(StatsAggregation { field: field_name })
-            | Sum(SumAggregation { field: field_name }) => {
+            Average(AverageAggregation {
+                field: field_name, ..
+            })
+            | Count(CountAggregation {
+                field: field_name, ..
+            })
+            | Max(MaxAggregation {
+                field: field_name, ..
+            })
+            | Min(MinAggregation {
+                field: field_name, ..
+            })
+            | Stats(StatsAggregation {
+                field: field_name, ..
+            })
+            | Sum(SumAggregation {
+                field: field_name, ..
+            }) => {
                 let (accessor, field_type) =
                     get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
 

diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs
@@ -1455,6 +1455,47 @@ mod tests {
 
         Ok(())
     }
+    #[test]
+    fn terms_empty_json() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let json = schema_builder.add_json_field("json", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests().unwrap();
+        // => Segment with empty json
+        index_writer.add_document(doc!()).unwrap();
+        index_writer.commit().unwrap();
+        // => Segment with json, but no field partially_empty
+        index_writer
+            .add_document(doc!(json => json!({"different_field": "blue"})))
+            .unwrap();
+        index_writer.commit().unwrap();
+        // => Segment where only some documents have the field partially_empty
+        index_writer
+            .add_document(doc!(json => json!({"partially_empty": "blue"})))
+            .unwrap();
+        index_writer.add_document(doc!())?;
+        index_writer.commit().unwrap();
+
+        let agg_req: Aggregations = serde_json::from_value(json!({
+            "my_texts": {
+                "terms": {
+                    "field": "json.partially_empty"
+                },
+            }
+        }))
+        .unwrap();
+
+        let res = exec_request_with_query(agg_req, &index, None)?;
+
+        assert_eq!(res["my_texts"]["buckets"][0]["key"], "blue");
+        assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
+        assert_eq!(res["my_texts"]["buckets"][1], serde_json::Value::Null);
+        assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
+        assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
+
+        Ok(())
+    }
 
     #[test]
     fn terms_aggregation_bytes() -> crate::Result<()> {
@@ -1492,6 +1533,7 @@ mod tests {
 
         Ok(())
     }
+
     #[test]
     fn terms_aggregation_missing_multi_value() -> crate::Result<()> {
         let mut schema_builder = Schema::builder();

diff --git a/src/aggregation/metric/average.rs b/src/aggregation/metric/average.rs
@@ -20,12 +20,21 @@ use super::{IntermediateStats, SegmentStatsCollector};
 pub struct AverageAggregation {
     /// The field name to compute the average on.
     pub field: String,
+    /// The missing parameter defines how documents that are missing a value should be treated.
+    /// By default they will be ignored but it is also possible to treat them as if they had a
+    /// value. Examples in JSON format:
+    /// { "field": "my_numbers", "missing": 10.0 }
+    #[serde(default)]
+    pub missing: Option<f64>,
 }
 
 impl AverageAggregation {
     /// Creates a new [`AverageAggregation`] instance from a field name.
     pub fn from_field_name(field_name: String) -> Self {
-        Self { field: field_name }
+        Self {
+            field: field_name,
+            missing: None,
+        }
     }
     /// Returns the field name the aggregation is computed on.
     pub fn field_name(&self) -> &str {

diff --git a/src/aggregation/metric/count.rs b/src/aggregation/metric/count.rs
@@ -20,12 +20,21 @@ use super::{IntermediateStats, SegmentStatsCollector};
 pub struct CountAggregation {
     /// The field name to compute the count on.
     pub field: String,
+    /// The missing parameter defines how documents that are missing a value should be treated.
+    /// By default they will be ignored but it is also possible to treat them as if they had a
+    /// value. Examples in JSON format:
+    /// { "field": "my_numbers", "missing": 10.0 }
+    #[serde(default)]
+    pub missing: Option<f64>,
 }
 
 impl CountAggregation {
     /// Creates a new [`CountAggregation`] instance from a field name.
     pub fn from_field_name(field_name: String) -> Self {
-        Self { field: field_name }
+        Self {
+            field: field_name,
+            missing: None,
+        }
     }
     /// Returns the field name the aggregation is computed on.
     pub fn field_name(&self) -> &str {

diff --git a/src/aggregation/metric/max.rs b/src/aggregation/metric/max.rs
@@ -20,12 +20,21 @@ use super::{IntermediateStats, SegmentStatsCollector};
 pub struct MaxAggregation {
     /// The field name to compute the maximum on.
     pub field: String,
+    /// The missing parameter defines how documents that are missing a value should be treated.
+    /// By default they will be ignored but it is also possible to treat them as if they had a
+    /// value. Examples in JSON format:
+    /// { "field": "my_numbers", "missing": 10.0 }
+    #[serde(default)]
+    pub missing: Option<f64>,
 }
 
 impl MaxAggregation {
     /// Creates a new [`MaxAggregation`] instance from a field name.
     pub fn from_field_name(field_name: String) -> Self {
-        Self { field: field_name }
+        Self {
+            field: field_name,
+            missing: None,
+        }
     }
     /// Returns the field name the aggregation is computed on.
     pub fn field_name(&self) -> &str {
@@ -56,3 +65,55 @@ impl IntermediateMax {
         self.stats.finalize().max
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::aggregation::agg_req::Aggregations;
+    use crate::aggregation::tests::exec_request_with_query;
+    use crate::schema::{Schema, FAST};
+    use crate::Index;
+
+    #[test]
+    fn test_max_agg_with_missing() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let json = schema_builder.add_json_field("json", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests().unwrap();
+        // => Segment with empty json
+        index_writer.add_document(doc!()).unwrap();
+        index_writer.commit().unwrap();
+        // => Segment with json, but no field partially_empty
+        index_writer
+            .add_document(doc!(json => json!({"different_field": "blue"})))
+            .unwrap();
+        index_writer.commit().unwrap();
+        // => Segment where only some documents have the field partially_empty
+        index_writer
+            .add_document(doc!(json => json!({"partially_empty": 10.0})))
+            .unwrap();
+        index_writer.add_document(doc!())?;
+        index_writer.commit().unwrap();
+
+        let agg_req: Aggregations = serde_json::from_value(json!({
+            "my_stats": {
+                "max": {
+                    "field": "json.partially_empty",
+                    "missing": 100.0,
+                }
+            }
+        }))
+        .unwrap();
+
+        let res = exec_request_with_query(agg_req, &index, None)?;
+
+        assert_eq!(
+            res["my_stats"],
+            json!({
+                "value": 100.0,
+            })
+        );
+
+        Ok(())
+    }
+}
diff --git a/src/aggregation/metric/min.rs b/src/aggregation/metric/min.rs
@@ -20,12 +20,21 @@ use super::{IntermediateStats, SegmentStatsCollector};
 pub struct MinAggregation {
     /// The field name to compute the minimum on.
     pub field: String,
+    /// The missing parameter defines how documents that are missing a value should be treated.
+    /// By default they will be ignored but it is also possible to treat them as if they had a
+    /// value. Examples in JSON format:
+    /// { "field": "my_numbers", "missing": 10.0 }
+    #[serde(default)]
+    pub missing: Option<f64>,
 }
 
 impl MinAggregation {
     /// Creates a new [`MinAggregation`] instance from a field name.
     pub fn from_field_name(field_name: String) -> Self {
-        Self { field: field_name }
+        Self {
+            field: field_name,
+            missing: None,
+        }
     }
     /// Returns the field name the aggregation is computed on.
     pub fn field_name(&self) -> &str {

diff --git a/src/aggregation/metric/percentiles.rs b/src/aggregation/metric/percentiles.rs
@@ -80,6 +80,12 @@
     /// Whether to return the percentiles as a hash map
     #[serde(default = "default_as_true")]
     pub keyed: bool,
+    /// The missing parameter defines how documents that are missing a value should be treated.
+    /// By default they will be ignored but it is also possible to treat them as if they had a
+    /// value. Examples in JSON format:
+    /// { "field": "my_numbers", "missing": 10.0 }
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub missing: Option<f64>,
 }
 fn default_percentiles() -> &'static [f64] {
     &[1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0]
@@ -95,6 +101,7 @@
             field: field_name,
             percents: None,
             keyed: default_as_true(),
+            missing: None,
         }
     }
     /// Returns the field name the aggregation is computed on.
@@ -463,7 +470,7 @@

    fn test_aggregation_percentiles(merge_segments: bool) -> crate::Result<()> {
        use rand_distr::Distribution;
        let num_values_in_segment = vec![100, 30_000, 8000];
        let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
        let mut rng = StdRng::from_seed([1u8; 32]);