diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index a7e1fb00c230..9e8d9fa77858 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -553,12 +553,14 @@ fn build_predicate_expression( let corrected_op = expr_builder.correct_operator(op); let statistics_expr = match corrected_op { Operator::NotEq => { - // column != literal => (min, max) = literal => min > literal || literal > max + // column != literal => (min, max) = literal => + // !(min != literal && max != literal) ==> + // min != literal || literal != max let min_column_expr = expr_builder.min_column_expr()?; let max_column_expr = expr_builder.max_column_expr()?; min_column_expr - .gt(expr_builder.scalar_expr().clone()) - .or(expr_builder.scalar_expr().clone().gt(max_column_expr)) + .not_eq(expr_builder.scalar_expr().clone()) + .or(expr_builder.scalar_expr().clone().not_eq(max_column_expr)) } Operator::Eq => { // column = literal => (min, max) = literal => min <= literal && literal <= max @@ -940,7 +942,7 @@ mod tests { #[test] fn row_group_predicate_not_eq() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "#c1_min Gt Int32(1) Or Int32(1) Gt #c1_max"; + let expected_expr = "#c1_min NotEq Int32(1) Or Int32(1) NotEq #c1_max"; // test column on the left let expr = col("c1").not_eq(lit(1)); @@ -1190,6 +1192,34 @@ mod tests { assert_eq!(result, expected); } + #[test] + fn prune_not_eq_data() { + let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)])); + + // Prune using s2 != 'M' + let expr = col("s1").not_eq(lit("M")); + + let statistics = TestStatistics::new().with( + "s1", + ContainerStats::new_utf8( + vec![Some("A"), Some("A"), Some("N"), Some("M"), None, Some("A")], // min + vec![Some("Z"), Some("L"), Some("Z"), Some("M"), None, None], // max + ), + ); + + // s1 [A, Z] ==> might have values that pass predicate + // s1 [A, L] ==> all rows pass the predicate + // s1 [N, Z] ==> all rows pass the predicate + // s1 [M, M] ==> all rows do not pass the predicate + // No stats for s2 ==> some rows could pass + // s2 [3, None] (null max) ==> some rows could pass + + let p = PruningPredicate::try_new(&expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + let expected = vec![true, true, true, false, true, true]; + assert_eq!(result, expected); + } + /// Creates setup for boolean chunk pruning /// /// For predicate "b1" (boolean expr)