From b9a1bf9b0659632fc7ecce1a519c7a69bd56b61e Mon Sep 17 00:00:00 2001 From: yangjiang Date: Wed, 10 Jan 2024 18:12:03 +0800 Subject: [PATCH 1/9] [Minor] extract const and add doc for in_list pruning --- .../core/src/physical_optimizer/pruning.rs | 93 ++++++++++++++++++- 1 file changed, 91 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index 2e372547053b..8f92582b01c3 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -926,11 +926,15 @@ fn build_is_null_column_expr( } } +const MAX_LIST_VALUE_SIZE_REWRITE: usize = 20; + /// Translate logical filter expression into pruning predicate /// expression that will evaluate to FALSE if it can be determined no /// rows between the min/max values could pass the predicates. /// /// Returns the pruning predicate as an [`PhysicalExpr`] +/// +/// Notice: For [`InListExpr`] if in list values more than 20, it will be rewritten to TRUE fn build_predicate_expression( expr: &Arc, schema: &Schema, @@ -960,7 +964,9 @@ fn build_predicate_expression( } } if let Some(in_list) = expr_any.downcast_ref::() { - if !in_list.list().is_empty() && in_list.list().len() < 20 { + if !in_list.list().is_empty() + && in_list.list().len() <= MAX_LIST_VALUE_SIZE_REWRITE + { let eq_op = if in_list.negated() { Operator::NotEq } else { @@ -1137,7 +1143,7 @@ mod tests { }; use datafusion_common::{ScalarValue, ToDFSchema}; use datafusion_expr::expr::InList; - use datafusion_expr::{cast, is_null, try_cast, Expr}; + use datafusion_expr::{cast, is_null, try_cast, Between, BinaryExpr, Expr}; use datafusion_physical_expr::create_physical_expr; use datafusion_physical_expr::execution_props::ExecutionProps; use std::collections::HashMap; @@ -1958,6 +1964,89 @@ mod tests { Ok(()) } + #[test] + fn row_group_predicate_between() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ]); + + // test c1 BETWEEN 1 AND 5 + let expr1 = Expr::Between(Between { + expr: Box::new(col("c1")), + negated: false, + low: Box::new(lit(1)), + high: Box::new(lit(5)), + }); + + let expr2 = col("c1").gt_eq(lit(1)).and(col("c1").lt_eq(lit(5))); + + let predicate_expr1 = + test_build_predicate_expression(&expr1, &schema, &mut RequiredColumns::new()); + + let predicate_expr2 = + test_build_predicate_expression(&expr2, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr1.to_string(), predicate_expr2.to_string()); + + Ok(()) + } + + #[test] + fn row_group_predicate_between_with_in_list() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ]); + // test c1 in(1, 2) + let expr1 = Expr::InList(InList::new( + Box::new(col("c1")), + vec![lit(1), lit(2)], + false, + )); + + // test c2 BETWEEN 4 AND 5 + let expr2 = Expr::Between(Between { + expr: Box::new(col("c2")), + negated: false, + low: Box::new(lit(4)), + high: Box::new(lit(5)), + }); + + // test c1 in(1, 2) and c2 BETWEEN 4 AND 5 + let expr3 = Expr::BinaryExpr(BinaryExpr { + left: Box::new(expr1), + op: Operator::And, + right: Box::new(expr2), + }); + + let expected_expr = "(c1_min@0 <= 1 AND 1 <= c1_max@1 OR c1_min@0 <= 2 AND 2 <= c1_max@1) AND c2_max@2 >= 4 AND c2_min@3 <= 5"; + let predicate_expr = + test_build_predicate_expression(&expr3, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_in_list_to_many_values() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + // test c1 in(1..21) + // in pruning.rs has MAX_LIST_VALUE_SIZE_REWRITE = 20, more than this value will be rewrite + // always true + let expr = Expr::InList(InList::new( + Box::new(col("c1")), + (1..=21).map(|i| lit(i)).collect(), + false, + )); + + let expected_expr = "true"; + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + #[test] fn row_group_predicate_cast() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); From 8b019be56ddffc34375d6573f5a845fdc28fb1a5 Mon Sep 17 00:00:00 2001 From: yangjiang Date: Wed, 10 Jan 2024 18:16:59 +0800 Subject: [PATCH 2/9] add explain --- datafusion/core/src/physical_optimizer/pruning.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index 8f92582b01c3..93611662ba3e 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -1979,6 +1979,7 @@ mod tests { high: Box::new(lit(5)), }); + // test 1 <= c1 <= 5 let expr2 = col("c1").gt_eq(lit(1)).and(col("c1").lt_eq(lit(5))); let predicate_expr1 = From 2cd21e6a2595196531732c1210a5eba4234a9753 Mon Sep 17 00:00:00 2001 From: yangjiang Date: Wed, 10 Jan 2024 18:25:27 +0800 Subject: [PATCH 3/9] fix doc --- datafusion/core/src/physical_optimizer/pruning.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index 93611662ba3e..6ba488da97fa 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -934,7 +934,7 @@ const MAX_LIST_VALUE_SIZE_REWRITE: usize = 20; /// /// Returns the pruning predicate as an [`PhysicalExpr`] /// -/// Notice: For [`InListExpr`] if in list values more than 20, it will be rewritten to TRUE +/// Notice: For [`phys_expr::InListExpr`] if in list values more than 20, it will be rewritten to TRUE fn build_predicate_expression( expr: &Arc, schema: &Schema, From 7b7d4bb5c3f0c654c9577ab456996b6a368034d4 Mon Sep 17 00:00:00 2001 From: yangjiang Date: Wed, 10 Jan 2024 18:50:17 +0800 Subject: [PATCH 4/9] fix clippy --- datafusion/core/src/physical_optimizer/pruning.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index 6ba488da97fa..a33305a20df5 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -2036,7 +2036,7 @@ mod tests { // always true let expr = Expr::InList(InList::new( Box::new(col("c1")), - (1..=21).map(|i| lit(i)).collect(), + (1..=21).map(lit).collect(), false, )); From ae74ffa2feda2ae622c43195ef252476e8705271 Mon Sep 17 00:00:00 2001 From: Yang Jiang Date: Thu, 11 Jan 2024 10:16:13 +0800 Subject: [PATCH 5/9] Update datafusion/core/src/physical_optimizer/pruning.rs Co-authored-by: Andrew Lamb --- datafusion/core/src/physical_optimizer/pruning.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index a33305a20df5..cc84353c4296 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -926,6 +926,8 @@ fn build_is_null_column_expr( } } +/// The maximum number of entries in an `InList` that might be rewritten into +/// an OR chain const MAX_LIST_VALUE_SIZE_REWRITE: usize = 20; /// Translate logical filter expression into pruning predicate From 5cee1b17a3a6994cda0c35ab1947b2f00c28c8e2 Mon Sep 17 00:00:00 2001 From: Yang Jiang Date: Thu, 11 Jan 2024 10:19:37 +0800 Subject: [PATCH 6/9] Update datafusion/core/src/physical_optimizer/pruning.rs Co-authored-by: Andrew Lamb --- datafusion/core/src/physical_optimizer/pruning.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index cc84353c4296..1102cbd3b4a4 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -936,7 +936,7 @@ const MAX_LIST_VALUE_SIZE_REWRITE: usize = 20; /// /// Returns the pruning predicate as an [`PhysicalExpr`] /// -/// Notice: For [`phys_expr::InListExpr`] if in list values more than 20, it will be rewritten to TRUE +/// Notice: Does not handle [`phys_expr::InListExpr`] greater than 20, which will be rewritten to TRUE fn build_predicate_expression( expr: &Arc, schema: &Schema, From 8a8a8e4939d86d3bd6a8f250d1a8cdff8123453f Mon Sep 17 00:00:00 2001 From: yangjiang Date: Thu, 11 Jan 2024 10:21:07 +0800 Subject: [PATCH 7/9] fix comment --- datafusion/core/src/physical_optimizer/pruning.rs | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index 1102cbd3b4a4..dd02704344d5 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -1974,12 +1974,7 @@ mod tests { ]); // test c1 BETWEEN 1 AND 5 - let expr1 = Expr::Between(Between { - expr: Box::new(col("c1")), - negated: false, - low: Box::new(lit(1)), - high: Box::new(lit(5)), - }); + let expr1 = col("c1").between(lit(1), lit(5)); // test 1 <= c1 <= 5 let expr2 = col("c1").gt_eq(lit(1)).and(col("c1").lt_eq(lit(5))); @@ -2008,12 +2003,7 @@ mod tests { )); // test c2 BETWEEN 4 AND 5 - let expr2 = Expr::Between(Between { - expr: Box::new(col("c2")), - negated: false, - low: Box::new(lit(4)), - high: Box::new(lit(5)), - }); + let expr1 = col("c2").between(lit(4), lit(5)); // test c1 in(1, 2) and c2 BETWEEN 4 AND 5 let expr3 = Expr::BinaryExpr(BinaryExpr { From 054176071c7cc6d3aefa00f7cb302829550cd29e Mon Sep 17 00:00:00 2001 From: yangjiang Date: Thu, 11 Jan 2024 10:24:35 +0800 Subject: [PATCH 8/9] fix comment2 --- .../core/src/physical_optimizer/pruning.rs | 20 ++++--------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index dd02704344d5..520790ff15a8 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -1996,21 +1996,13 @@ mod tests { Field::new("c2", DataType::Int32, false), ]); // test c1 in(1, 2) - let expr1 = Expr::InList(InList::new( - Box::new(col("c1")), - vec![lit(1), lit(2)], - false, - )); + let expr1 = col("cl").in_list(vec![lit(1), lit(2)], false); // test c2 BETWEEN 4 AND 5 - let expr1 = col("c2").between(lit(4), lit(5)); + let expr2 = col("c2").between(lit(4), lit(5)); // test c1 in(1, 2) and c2 BETWEEN 4 AND 5 - let expr3 = Expr::BinaryExpr(BinaryExpr { - left: Box::new(expr1), - op: Operator::And, - right: Box::new(expr2), - }); + let expr3 = expr1.and(expr2); let expected_expr = "(c1_min@0 <= 1 AND 1 <= c1_max@1 OR c1_min@0 <= 2 AND 2 <= c1_max@1) AND c2_max@2 >= 4 AND c2_min@3 <= 5"; let predicate_expr = @@ -2026,11 +2018,7 @@ mod tests { // test c1 in(1..21) // in pruning.rs has MAX_LIST_VALUE_SIZE_REWRITE = 20, more than this value will be rewrite // always true - let expr = Expr::InList(InList::new( - Box::new(col("c1")), - (1..=21).map(lit).collect(), - false, - )); + let expr = col("cl").in_list((1..=21).map(lit).collect(), false); let expected_expr = "true"; let predicate_expr = From 7094d19695720806c42af155b629b0649014a248 Mon Sep 17 00:00:00 2001 From: yangjiang Date: Thu, 11 Jan 2024 10:41:18 +0800 Subject: [PATCH 9/9] fix --- datafusion/core/src/physical_optimizer/pruning.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index 520790ff15a8..b68dbabc0437 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -1145,7 +1145,7 @@ mod tests { }; use datafusion_common::{ScalarValue, ToDFSchema}; use datafusion_expr::expr::InList; - use datafusion_expr::{cast, is_null, try_cast, Between, BinaryExpr, Expr}; + use datafusion_expr::{cast, is_null, try_cast, Expr}; use datafusion_physical_expr::create_physical_expr; use datafusion_physical_expr::execution_props::ExecutionProps; use std::collections::HashMap; @@ -1996,7 +1996,7 @@ mod tests { Field::new("c2", DataType::Int32, false), ]); // test c1 in(1, 2) - let expr1 = col("cl").in_list(vec![lit(1), lit(2)], false); + let expr1 = col("c1").in_list(vec![lit(1), lit(2)], false); // test c2 BETWEEN 4 AND 5 let expr2 = col("c2").between(lit(4), lit(5)); @@ -2018,7 +2018,7 @@ mod tests { // test c1 in(1..21) // in pruning.rs has MAX_LIST_VALUE_SIZE_REWRITE = 20, more than this value will be rewrite // always true - let expr = col("cl").in_list((1..=21).map(lit).collect(), false); + let expr = col("c1").in_list((1..=21).map(lit).collect(), false); let expected_expr = "true"; let predicate_expr =