Skip partial aggregation based on the cardinality of hash value instead of group values #12697

Closed · wants to merge 18 commits
6 changes: 1 addition & 5 deletions datafusion/common/src/config.rs
@@ -325,11 +325,7 @@ config_namespace! {
/// Aggregation ratio (number of distinct groups / number of input rows)
/// threshold for skipping partial aggregation. If the value is greater
/// then partial aggregation will skip aggregation for further input
pub skip_partial_aggregation_probe_ratio_threshold: f64, default = 0.8

/// Number of input rows partial aggregation partition should process, before
/// aggregation ratio check and trying to switch to skipping aggregation mode
pub skip_partial_aggregation_probe_rows_threshold: usize, default = 100_000
Contributor:
Why has this setting been deleted? Checking whether aggregation should be skipped right at the beginning of execution may lead to a skipping decision based on an insufficient amount of data.

Contributor Author:
The reason is that, in my strategy, I make the decision per batch, since I assume the data is distributed evenly.
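
For illustration, a minimal sketch of what such a per-batch probe could look like (hypothetical helper, not the PR's exact code): the cardinality of the batch's hash values stands in for the number of distinct groups, and its ratio to the batch's row count is compared against the configured threshold.

```rust
use std::collections::HashSet;

/// Hypothetical per-batch probe: decide whether partial aggregation should be
/// skipped, based on the cardinality of the precomputed hash values.
fn should_skip_partial(batch_hashes: &[u64], ratio_threshold: f64) -> bool {
    if batch_hashes.is_empty() {
        return false;
    }
    // Distinct hash values approximate the number of distinct groups.
    let distinct: HashSet<u64> = batch_hashes.iter().copied().collect();
    let ratio = distinct.len() as f64 / batch_hashes.len() as f64;
    // A high ratio means almost every row is its own group, so partial
    // aggregation does little useful work for this batch.
    ratio > ratio_threshold
}

fn main() {
    // 8 rows hashing to 7 distinct values: a high-cardinality batch.
    let hashes = [1u64, 2, 3, 4, 5, 6, 7, 1];
    assert!(should_skip_partial(&hashes, 0.8));
    assert!(!should_skip_partial(&hashes, 0.9));
}
```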

Contributor Author:
Config with ratio threshold 0.1 and rows threshold 100_000:

┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Query        ┃       main ┃  threshold ┃        Change ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ QQuery 0     │     0.41ms │     0.45ms │  1.10x slower │
│ QQuery 1     │    42.61ms │    42.55ms │     no change │
│ QQuery 2     │    75.89ms │    74.88ms │     no change │
│ QQuery 3     │    72.24ms │    61.81ms │ +1.17x faster │
│ QQuery 4     │   399.04ms │   359.82ms │ +1.11x faster │
│ QQuery 5     │   663.20ms │   699.03ms │  1.05x slower │
│ QQuery 6     │    37.91ms │    38.01ms │     no change │
│ QQuery 7     │    42.71ms │    40.05ms │ +1.07x faster │
│ QQuery 8     │   638.09ms │   699.07ms │  1.10x slower │
│ QQuery 9     │   678.82ms │   742.56ms │  1.09x slower │
│ QQuery 10    │   200.65ms │   209.83ms │     no change │
│ QQuery 11    │   233.85ms │   220.56ms │ +1.06x faster │
│ QQuery 12    │   718.91ms │   660.93ms │ +1.09x faster │
│ QQuery 13    │   926.86ms │   937.19ms │     no change │
│ QQuery 14    │   873.60ms │   753.51ms │ +1.16x faster │
│ QQuery 15    │   498.70ms │   483.92ms │     no change │
│ QQuery 16    │  1305.01ms │  1238.38ms │ +1.05x faster │
│ QQuery 17    │  1190.26ms │  1111.67ms │ +1.07x faster │
│ QQuery 18    │  3347.18ms │  2903.17ms │ +1.15x faster │
│ QQuery 19    │    55.77ms │    61.39ms │  1.10x slower │
│ QQuery 20    │   943.13ms │   919.37ms │     no change │
│ QQuery 21    │  1206.45ms │  1230.95ms │     no change │
│ QQuery 22    │  3230.71ms │  3479.20ms │  1.08x slower │
│ QQuery 23    │  8186.73ms │  7989.35ms │     no change │
│ QQuery 24    │   500.91ms │   507.13ms │     no change │
│ QQuery 25    │   502.13ms │   551.57ms │  1.10x slower │
│ QQuery 26    │   561.53ms │   568.74ms │     no change │
│ QQuery 27    │  1338.06ms │  1382.16ms │     no change │
│ QQuery 28    │ 10507.08ms │ 11253.66ms │  1.07x slower │
│ QQuery 29    │   398.16ms │   429.90ms │  1.08x slower │
│ QQuery 30    │   760.72ms │   664.19ms │ +1.15x faster │
│ QQuery 31    │   712.91ms │   772.65ms │  1.08x slower │
│ QQuery 32    │  3567.36ms │  4167.37ms │  1.17x slower │
│ QQuery 33    │  4863.25ms │  4311.27ms │ +1.13x faster │
│ QQuery 34    │  4115.97ms │  3564.32ms │ +1.15x faster │
│ QQuery 35    │   998.40ms │   979.18ms │     no change │
│ QQuery 36    │   147.99ms │   136.97ms │ +1.08x faster │
│ QQuery 37    │   101.82ms │   102.79ms │     no change │
│ QQuery 38    │   108.77ms │   107.66ms │     no change │
│ QQuery 39    │   327.29ms │   272.91ms │ +1.20x faster │
│ QQuery 40    │    33.56ms │    34.88ms │     no change │
│ QQuery 41    │    34.09ms │    32.18ms │ +1.06x faster │
│ QQuery 42    │    39.32ms │    39.75ms │     no change │
└──────────────┴────────────┴────────────┴───────────────┘

Config with ratio threshold 0.1 and rows threshold 0:

┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Query        ┃       main ┃  threshold ┃        Change ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ QQuery 0     │     0.41ms │     0.49ms │  1.21x slower │
│ QQuery 1     │    42.61ms │    42.34ms │     no change │
│ QQuery 2     │    75.89ms │    81.91ms │  1.08x slower │
│ QQuery 3     │    72.24ms │    69.94ms │     no change │
│ QQuery 4     │   399.04ms │   365.30ms │ +1.09x faster │
│ QQuery 5     │   663.20ms │   656.21ms │     no change │
│ QQuery 6     │    37.91ms │    38.24ms │     no change │
│ QQuery 7     │    42.71ms │    40.27ms │ +1.06x faster │
│ QQuery 8     │   638.09ms │   590.64ms │ +1.08x faster │
│ QQuery 9     │   678.82ms │   666.57ms │     no change │
│ QQuery 10    │   200.65ms │   196.94ms │     no change │
│ QQuery 11    │   233.85ms │   220.90ms │ +1.06x faster │
│ QQuery 12    │   718.91ms │   652.58ms │ +1.10x faster │
│ QQuery 13    │   926.86ms │   941.22ms │     no change │
│ QQuery 14    │   873.60ms │   749.50ms │ +1.17x faster │
│ QQuery 15    │   498.70ms │   495.09ms │     no change │
│ QQuery 16    │  1305.01ms │  1247.99ms │     no change │
│ QQuery 17    │  1190.26ms │  1158.71ms │     no change │
│ QQuery 18    │  3347.18ms │  2008.84ms │ +1.67x faster │
│ QQuery 19    │    55.77ms │    54.94ms │     no change │
│ QQuery 20    │   943.13ms │   913.04ms │     no change │
│ QQuery 21    │  1206.45ms │  1183.49ms │     no change │
│ QQuery 22    │  3230.71ms │  3181.47ms │     no change │
│ QQuery 23    │  8186.73ms │  7900.35ms │     no change │
│ QQuery 24    │   500.91ms │   501.30ms │     no change │
│ QQuery 25    │   502.13ms │   481.08ms │     no change │
│ QQuery 26    │   561.53ms │   571.92ms │     no change │
│ QQuery 27    │  1338.06ms │  1451.63ms │  1.08x slower │
│ QQuery 28    │ 10507.08ms │ 11396.72ms │  1.08x slower │
│ QQuery 29    │   398.16ms │   392.46ms │     no change │
│ QQuery 30    │   760.72ms │   658.78ms │ +1.15x faster │
│ QQuery 31    │   712.91ms │   702.45ms │     no change │
│ QQuery 32    │  3567.36ms │  3520.95ms │     no change │
│ QQuery 33    │  4863.25ms │  3527.36ms │ +1.38x faster │
│ QQuery 34    │  4115.97ms │  3731.86ms │ +1.10x faster │
│ QQuery 35    │   998.40ms │   964.65ms │     no change │
│ QQuery 36    │   147.99ms │   136.33ms │ +1.09x faster │
│ QQuery 37    │   101.82ms │   102.70ms │     no change │
│ QQuery 38    │   108.77ms │   105.97ms │     no change │
│ QQuery 39    │   327.29ms │   267.66ms │ +1.22x faster │
│ QQuery 40    │    33.56ms │    34.77ms │     no change │
│ QQuery 41    │    34.09ms │    31.76ms │ +1.07x faster │
│ QQuery 42    │    39.32ms │    40.56ms │     no change │
└──────────────┴────────────┴────────────┴───────────────┘

pub skip_partial_aggregation_probe_ratio_threshold: f64, default = 0.1
Contributor:
It's interesting that the 0.1 value makes things faster 🤔

Is aggregation for simple cases (e.g. single integer) slower than repartitioning x10 rows?

Contributor Author:
> Is aggregation for simple cases (e.g. single integer) slower than repartitioning x10 rows?

Given the benchmark results, I think so, especially if we have millions of distinct integers.


/// Should DataFusion use row number estimates at the input to decide
/// whether increasing parallelism is beneficial or not. By default,
17 changes: 17 additions & 0 deletions datafusion/core/tests/data/aggregate_mixed_type.csv
@@ -0,0 +1,17 @@
c1,c2
1,'a'
2,'b'
3,'c'
4,'d'
1,'a'
2,'b'
3,'c'
4,'d'
4,'d'
3,'c'
3,'c'
5,'e'
6,'f'
7,'g'
8,'a'
9,'b'
23 changes: 16 additions & 7 deletions datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
@@ -44,6 +44,8 @@ use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tokio::task::JoinSet;

const BATCH_SIZE: usize = 50;

/// Tests that streaming aggregate and batch (non streaming) aggregate produce
/// same results
#[tokio::test(flavor = "multi_thread")]
@@ -60,13 +62,14 @@ async fn streaming_aggregate_test() {
];
let n = 300;
let distincts = vec![10, 20];
let len = 1000;
for distinct in distincts {
let mut join_set = JoinSet::new();
for i in 0..n {
let test_idx = i % test_cases.len();
let group_by_columns = test_cases[test_idx].clone();
join_set.spawn(run_aggregate_test(
make_staggered_batches::<true>(1000, distinct, i as u64),
make_staggered_batches::<true>(len, distinct, i as u64),
group_by_columns,
));
}
@@ -77,13 +80,19 @@ async fn streaming_aggregate_test() {
}
}

fn new_ctx() -> SessionContext {
let session_config = SessionConfig::new()
.with_batch_size(BATCH_SIZE)
// Ensure most of the fuzzing test cases don't skip the partial aggregation
.with_skip_partial_aggregation_probe_ratio_threshold(1.0);
SessionContext::new_with_config(session_config)
}

/// Perform batch and streaming aggregation with same input
/// and verify outputs of `AggregateExec` with pipeline breaking stream `GroupedHashAggregateStream`
/// and non-pipeline breaking stream `BoundedAggregateStream` produces same result.
async fn run_aggregate_test(input1: Vec<RecordBatch>, group_by_columns: Vec<&str>) {
let schema = input1[0].schema();
let session_config = SessionConfig::new().with_batch_size(50);
let ctx = SessionContext::new_with_config(session_config);
let mut sort_keys = vec![];
for ordering_col in ["a", "b", "c"] {
sort_keys.push(PhysicalSortExpr {
@@ -141,14 +150,15 @@ async fn run_aggregate_test(input1: Vec<RecordBatch>, group_by_columns: Vec<&str
.unwrap(),
) as Arc<dyn ExecutionPlan>;

let task_ctx = ctx.task_ctx();
let task_ctx = new_ctx().task_ctx();
let collected_usual = collect(aggregate_exec_usual.clone(), task_ctx.clone())
.await
.unwrap();

let collected_running = collect(aggregate_exec_running.clone(), task_ctx.clone())
.await
.unwrap();

assert!(collected_running.len() > 2);
// Running should produce more chunks than the usual AggregateExec.
// Otherwise it means that we cannot generate results in running mode.
@@ -232,7 +242,7 @@ pub(crate) fn make_staggered_batches<const STREAM: bool>(
let mut batches = vec![];
if STREAM {
while remainder.num_rows() > 0 {
let batch_size = rng.gen_range(0..50);
let batch_size = rng.gen_range(0..BATCH_SIZE);
if remainder.num_rows() < batch_size {
break;
}
@@ -287,8 +297,7 @@ async fn group_by_string_test(
let expected = compute_counts(&input, column_name);

let schema = input[0].schema();
let session_config = SessionConfig::new().with_batch_size(50);
let ctx = SessionContext::new_with_config(session_config);
let ctx = new_ctx();

let provider = MemTable::try_new(schema.clone(), vec![input]).unwrap();
let provider = if sorted {
11 changes: 11 additions & 0 deletions datafusion/execution/src/config.rs
@@ -388,6 +388,17 @@ impl SessionConfig {
self
}

/// Set the threshold for skip partial aggregation ratio
pub fn with_skip_partial_aggregation_probe_ratio_threshold(
mut self,
threshold: f64,
) -> Self {
self.options
.execution
.skip_partial_aggregation_probe_ratio_threshold = threshold;
self
}

/// Returns true if record batches will be examined between each operator
/// and small batches will be coalesced into larger batches.
pub fn coalesce_batches(&self) -> bool {
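
For reference, a hedged usage sketch of the setter added above (the `datafusion::prelude` import path is assumed; the fuzz tests in this PR use the same builder calls): a ratio threshold of 1.0 effectively keeps the probe from ever switching to skip mode.

```rust
use datafusion::prelude::{SessionConfig, SessionContext};

/// Build a context whose partial aggregation (almost) never switches to skip
/// mode: with a ratio threshold of 1.0 the observed distinct-hash ratio
/// cannot exceed the threshold.
fn ctx_without_skip_partial() -> SessionContext {
    let config = SessionConfig::new()
        .with_batch_size(50)
        .with_skip_partial_aggregation_probe_ratio_threshold(1.0);
    SessionContext::new_with_config(config)
}
```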
@@ -45,6 +45,7 @@ impl<O: OffsetSizeTrait> GroupValues for GroupValuesByes<O> {
&mut self,
cols: &[ArrayRef],
groups: &mut Vec<usize>,
_batch_hashes: &[u64],
) -> datafusion_common::Result<()> {
assert_eq!(cols.len(), 1);

@@ -108,7 +109,7 @@ impl<O: OffsetSizeTrait> GroupValues for GroupValuesByes<O> {

self.num_groups = 0;
let mut group_indexes = vec![];
self.intern(&[remaining_group_values], &mut group_indexes)?;
self.intern(&[remaining_group_values], &mut group_indexes, &[])?;

// Verify that the group indexes were assigned in the correct order
assert_eq!(0, group_indexes[0]);
@@ -46,6 +46,7 @@ impl GroupValues for GroupValuesBytesView {
&mut self,
cols: &[ArrayRef],
groups: &mut Vec<usize>,
_batch_hashes: &[u64],
) -> datafusion_common::Result<()> {
assert_eq!(cols.len(), 1);

@@ -109,7 +110,7 @@ impl GroupValues for GroupValuesBytesView {

self.num_groups = 0;
let mut group_indexes = vec![];
self.intern(&[remaining_group_values], &mut group_indexes)?;
self.intern(&[remaining_group_values], &mut group_indexes, &[])?;

// Verify that the group indexes were assigned in the correct order
assert_eq!(0, group_indexes[0]);
21 changes: 6 additions & 15 deletions datafusion/physical-plan/src/aggregates/group_values/column.rs
@@ -19,7 +19,6 @@ use crate::aggregates::group_values::group_column::{
ByteGroupValueBuilder, GroupColumn, PrimitiveGroupValueBuilder,
};
use crate::aggregates::group_values::GroupValues;
use ahash::RandomState;
use arrow::compute::cast;
use arrow::datatypes::{
Date32Type, Date64Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type,
@@ -28,7 +27,6 @@ use arrow::datatypes::{
use arrow::record_batch::RecordBatch;
use arrow_array::{Array, ArrayRef};
use arrow_schema::{DataType, Schema, SchemaRef};
use datafusion_common::hash_utils::create_hashes;
use datafusion_common::{not_impl_err, DataFusionError, Result};
use datafusion_execution::memory_pool::proxy::{RawTableAllocExt, VecAllocExt};
use datafusion_expr::EmitTo;
@@ -68,9 +66,6 @@ pub struct GroupValuesColumn {

/// reused buffer to store hashes
hashes_buffer: Vec<u64>,

/// Random state for creating hashes
random_state: RandomState,
}

impl GroupValuesColumn {
@@ -83,7 +78,6 @@ impl GroupValuesColumn {
map_size: 0,
group_values: vec![],
hashes_buffer: Default::default(),
random_state: Default::default(),
})
}

@@ -143,9 +137,12 @@ macro_rules! instantiate_primitive {
}

impl GroupValues for GroupValuesColumn {
fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> {
let n_rows = cols[0].len();

fn intern(
&mut self,
cols: &[ArrayRef],
groups: &mut Vec<usize>,
batch_hashes: &[u64],
) -> Result<()> {
if self.group_values.is_empty() {
let mut v = Vec::with_capacity(cols.len());

@@ -195,12 +192,6 @@ impl GroupValues for GroupValuesColumn {
// tracks to which group each of the input rows belongs
groups.clear();

// 1.1 Calculate the group keys for the group values
let batch_hashes = &mut self.hashes_buffer;
batch_hashes.clear();
batch_hashes.resize(n_rows, 0);
create_hashes(cols, &self.random_state, batch_hashes)?;

for (row, &target_hash) in batch_hashes.iter().enumerate() {
let entry = self.map.get_mut(target_hash, |(exist_hash, group_idx)| {
// Somewhat surprisingly, this closure can be called even if the
7 changes: 6 additions & 1 deletion datafusion/physical-plan/src/aggregates/group_values/mod.rs
@@ -86,7 +86,12 @@ pub trait GroupValues: Send {
/// If a row has the same value as a previous row, the same group id is
/// assigned. If a row has a new value, the next available group id is
/// assigned.
fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()>;
fn intern(
&mut self,
cols: &[ArrayRef],
groups: &mut Vec<usize>,
batch_hashes: &[u64],
) -> Result<()>;

/// Returns the number of bytes of memory used by this [`GroupValues`]
fn size(&self) -> usize;
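
To illustrate the new contract, here is a hypothetical caller, written as if inside the `datafusion-physical-plan` crate (the real call site is the grouped hash aggregate stream): the hashes are computed once with the existing `create_hashes` utility, previously done inside `GroupValuesColumn::intern`, and are then passed to `intern`, leaving the same buffer available for the hash-cardinality skip probe.

```rust
use ahash::RandomState;
use arrow_array::ArrayRef;
use datafusion_common::hash_utils::create_hashes;
use datafusion_common::Result;

use crate::aggregates::group_values::GroupValues;

/// Hypothetical helper: hash the group-by columns once and reuse the buffer.
fn intern_with_precomputed_hashes(
    group_values: &mut dyn GroupValues,
    cols: &[ArrayRef],
    random_state: &RandomState,
) -> Result<(Vec<usize>, Vec<u64>)> {
    let n_rows = cols[0].len();

    // Hash the group-by columns once, outside of `intern`.
    let mut batch_hashes = vec![0; n_rows];
    create_hashes(cols, random_state, &mut batch_hashes)?;

    // Map each row to its group index, reusing the precomputed hashes.
    let mut group_indices = Vec::with_capacity(n_rows);
    group_values.intern(cols, &mut group_indices, &batch_hashes)?;

    // The hashes stay available to the caller, e.g. for the cardinality probe.
    Ok((group_indices, batch_hashes))
}
```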
@@ -111,7 +111,12 @@ impl<T: ArrowPrimitiveType> GroupValues for GroupValuesPrimitive<T>
where
T::Native: HashValue,
{
fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> {
fn intern(
&mut self,
cols: &[ArrayRef],
groups: &mut Vec<usize>,
_batch_hashes: &[u64],
) -> Result<()> {
assert_eq!(cols.len(), 1);
groups.clear();

7 changes: 6 additions & 1 deletion datafusion/physical-plan/src/aggregates/group_values/row.rs
@@ -108,7 +108,12 @@ impl GroupValuesRows {
}

impl GroupValues for GroupValuesRows {
fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> {
fn intern(
&mut self,
cols: &[ArrayRef],
groups: &mut Vec<usize>,
_batch_hashes: &[u64],
) -> Result<()> {
// Convert the group keys into the row format
let group_rows = &mut self.rows_buffer;
group_rows.clear();