From 070576bfb47a6a7205026c9dbfb3397a06cc2986 Mon Sep 17 00:00:00 2001 From: Shabana Baig <43451943+s-akhtar-baig@users.noreply.github.com> Date: Thu, 13 Jun 2024 15:01:34 -0400 Subject: [PATCH] Implement `BoundPredicateVisitor` for `ManifestFilterVisitor` (#367) * Implement all functions of BoundPredicateVisitor for ManifestFilterVisitor * Fix code comments * Refactor code and fixpredicate for is_some_and * Refactor code * Handle review comments * Handle review comments * Handle review comments * Refactor code --- .../src/expr/visitors/manifest_evaluator.rs | 1144 +++++++++++++++-- 1 file changed, 1014 insertions(+), 130 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs index c1b6dbed9..30ae58f3b 100644 --- a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs @@ -17,8 +17,9 @@ use crate::expr::visitors::bound_predicate_visitor::{visit, BoundPredicateVisitor}; use crate::expr::{BoundPredicate, BoundReference}; -use crate::spec::{Datum, FieldSummary, ManifestFile}; +use crate::spec::{Datum, FieldSummary, ManifestFile, PrimitiveLiteral, Type}; use crate::Result; +use crate::{Error, ErrorKind}; use fnv::FnvHashSet; /// Evaluates a [`ManifestFile`] to see if the partition summaries @@ -65,17 +66,19 @@ impl<'a> ManifestFilterVisitor<'a> { } } -// Remove this annotation once all todos have been removed -#[allow(unused_variables)] +const ROWS_MIGHT_MATCH: Result = Ok(true); +const ROWS_CANNOT_MATCH: Result = Ok(false); +const IN_PREDICATE_LIMIT: usize = 200; + impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { type T = bool; fn always_true(&mut self) -> crate::Result { - Ok(true) + ROWS_MIGHT_MATCH } fn always_false(&mut self) -> crate::Result { - Ok(false) + ROWS_CANNOT_MATCH } fn and(&mut self, lhs: bool, rhs: bool) -> crate::Result { @@ -103,7 +106,15 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { reference: &BoundReference, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field = self.field_summary_for_reference(reference); + + // contains_null encodes whether at least one partition value is null, + // lowerBound is null if all partition values are null + if ManifestFilterVisitor::are_all_null(field, &reference.field().field_type) { + ROWS_CANNOT_MATCH + } else { + ROWS_MIGHT_MATCH + } } fn is_nan( @@ -111,10 +122,18 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { reference: &BoundReference, _predicate: &BoundPredicate, ) -> crate::Result { - Ok(self - .field_summary_for_reference(reference) - .contains_nan - .is_some()) + let field = self.field_summary_for_reference(reference); + if let Some(contains_nan) = field.contains_nan { + if !contains_nan { + return ROWS_CANNOT_MATCH; + } + } + + if ManifestFilterVisitor::are_all_null(field, &reference.field().field_type) { + return ROWS_CANNOT_MATCH; + } + + ROWS_MIGHT_MATCH } fn not_nan( @@ -122,79 +141,210 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { reference: &BoundReference, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field = self.field_summary_for_reference(reference); + if let Some(contains_nan) = field.contains_nan { + // check if all values are nan + if contains_nan && !field.contains_null && field.lower_bound.is_none() { + return ROWS_CANNOT_MATCH; + } + } + ROWS_MIGHT_MATCH } fn less_than( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field = self.field_summary_for_reference(reference); + match &field.lower_bound { + Some(bound) if datum <= bound => ROWS_CANNOT_MATCH, + Some(_) => ROWS_MIGHT_MATCH, + None => ROWS_CANNOT_MATCH, + } } fn less_than_or_eq( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field = self.field_summary_for_reference(reference); + match &field.lower_bound { + Some(bound) if datum < bound => ROWS_CANNOT_MATCH, + Some(_) => ROWS_MIGHT_MATCH, + None => ROWS_CANNOT_MATCH, + } } fn greater_than( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field = self.field_summary_for_reference(reference); + match &field.upper_bound { + Some(bound) if datum >= bound => ROWS_CANNOT_MATCH, + Some(_) => ROWS_MIGHT_MATCH, + None => ROWS_CANNOT_MATCH, + } } fn greater_than_or_eq( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field = self.field_summary_for_reference(reference); + match &field.upper_bound { + Some(bound) if datum > bound => ROWS_CANNOT_MATCH, + Some(_) => ROWS_MIGHT_MATCH, + None => ROWS_CANNOT_MATCH, + } } fn eq( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field = self.field_summary_for_reference(reference); + + if field.lower_bound.is_none() || field.upper_bound.is_none() { + return ROWS_CANNOT_MATCH; + } + + if let Some(lower_bound) = &field.lower_bound { + if lower_bound > datum { + return ROWS_CANNOT_MATCH; + } + } + + if let Some(upper_bound) = &field.upper_bound { + if upper_bound < datum { + return ROWS_CANNOT_MATCH; + } + } + + ROWS_MIGHT_MATCH } fn not_eq( &mut self, - reference: &BoundReference, - literal: &Datum, + _reference: &BoundReference, + _datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + // because the bounds are not necessarily a min or max value, this cannot be answered using + // them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col. + ROWS_MIGHT_MATCH } fn starts_with( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field = self.field_summary_for_reference(reference); + + if field.lower_bound.is_none() || field.upper_bound.is_none() { + return ROWS_CANNOT_MATCH; + } + + let prefix = ManifestFilterVisitor::datum_as_str( + datum, + "Cannot perform starts_with on non-string value", + )?; + let prefix_len = prefix.len(); + + if let Some(lower_bound) = &field.lower_bound { + let lower_bound_str = ManifestFilterVisitor::datum_as_str( + lower_bound, + "Cannot perform starts_with on non-string lower bound", + )?; + let min_len = lower_bound_str.len().min(prefix_len); + if prefix.as_bytes().lt(&lower_bound_str.as_bytes()[..min_len]) { + return ROWS_CANNOT_MATCH; + } + } + + if let Some(upper_bound) = &field.upper_bound { + let upper_bound_str = ManifestFilterVisitor::datum_as_str( + upper_bound, + "Cannot perform starts_with on non-string upper bound", + )?; + let min_len = upper_bound_str.len().min(prefix_len); + if prefix.as_bytes().gt(&upper_bound_str.as_bytes()[..min_len]) { + return ROWS_CANNOT_MATCH; + } + } + + ROWS_MIGHT_MATCH } fn not_starts_with( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field = self.field_summary_for_reference(reference); + + if field.contains_null || field.lower_bound.is_none() || field.upper_bound.is_none() { + return ROWS_MIGHT_MATCH; + } + + let prefix = ManifestFilterVisitor::datum_as_str( + datum, + "Cannot perform not_starts_with on non-string value", + )?; + let prefix_len = prefix.len(); + + // not_starts_with will match unless all values must start with the prefix. This happens when + // the lower and upper bounds both start with the prefix. + if let Some(lower_bound) = &field.lower_bound { + let lower_bound_str = ManifestFilterVisitor::datum_as_str( + lower_bound, + "Cannot perform not_starts_with on non-string lower bound", + )?; + + // if lower is shorter than the prefix then lower doesn't start with the prefix + if prefix_len > lower_bound_str.len() { + return ROWS_MIGHT_MATCH; + } + + if prefix + .as_bytes() + .eq(&lower_bound_str.as_bytes()[..prefix_len]) + { + if let Some(upper_bound) = &field.upper_bound { + let upper_bound_str = ManifestFilterVisitor::datum_as_str( + upper_bound, + "Cannot perform not_starts_with on non-string upper bound", + )?; + + // if upper is shorter than the prefix then upper can't start with the prefix + if prefix_len > upper_bound_str.len() { + return ROWS_MIGHT_MATCH; + } + + if prefix + .as_bytes() + .eq(&upper_bound_str.as_bytes()[..prefix_len]) + { + return ROWS_CANNOT_MATCH; + } + } + } + } + + ROWS_MIGHT_MATCH } fn r#in( @@ -203,16 +353,39 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { literals: &FnvHashSet, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field = self.field_summary_for_reference(reference); + if field.lower_bound.is_none() { + return ROWS_CANNOT_MATCH; + } + + if literals.len() > IN_PREDICATE_LIMIT { + return ROWS_MIGHT_MATCH; + } + + if let Some(lower_bound) = &field.lower_bound { + if literals.iter().all(|datum| lower_bound > datum) { + return ROWS_CANNOT_MATCH; + } + } + + if let Some(upper_bound) = &field.upper_bound { + if literals.iter().all(|datum| upper_bound < datum) { + return ROWS_CANNOT_MATCH; + } + } + + ROWS_MIGHT_MATCH } fn not_in( &mut self, - reference: &BoundReference, - literals: &FnvHashSet, + _reference: &BoundReference, + _literals: &FnvHashSet, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + // because the bounds are not necessarily a min or max value, this cannot be answered using + // them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in col. + ROWS_MIGHT_MATCH } } @@ -221,43 +394,222 @@ impl ManifestFilterVisitor<'_> { let pos = reference.accessor().position(); &self.partitions[pos] } + + fn are_all_null(field: &FieldSummary, r#type: &Type) -> bool { + // contains_null encodes whether at least one partition value is null, + // lowerBound is null if all partition values are null + let mut all_null: bool = field.contains_null && field.lower_bound.is_none(); + + if all_null && r#type.is_floating_type() { + // floating point types may include NaN values, which we check separately. + // In case bounds don't include NaN value, contains_nan needs to be checked against. + all_null = match field.contains_nan { + Some(val) => !val, + None => false, + } + } + + all_null + } + + fn datum_as_str<'a>(bound: &'a Datum, err_msg: &str) -> crate::Result<&'a String> { + let PrimitiveLiteral::String(bound) = bound.literal() else { + return Err(Error::new(ErrorKind::Unexpected, err_msg)); + }; + Ok(bound) + } } #[cfg(test)] mod test { - use crate::expr::visitors::inclusive_projection::InclusiveProjection; use crate::expr::visitors::manifest_evaluator::ManifestEvaluator; use crate::expr::{ - Bind, BoundPredicate, Predicate, PredicateOperator, Reference, UnaryExpression, + BinaryExpression, Bind, Predicate, PredicateOperator, Reference, SetExpression, + UnaryExpression, }; use crate::spec::{ - FieldSummary, ManifestContentType, ManifestFile, NestedField, PartitionField, - PartitionSpec, PartitionSpecRef, PrimitiveType, Schema, SchemaRef, Transform, Type, + Datum, FieldSummary, ManifestContentType, ManifestFile, NestedField, PrimitiveType, Schema, + SchemaRef, Type, }; use crate::Result; + use fnv::FnvHashSet; + use std::ops::Not; use std::sync::Arc; - fn create_schema_and_partition_spec() -> Result<(SchemaRef, PartitionSpecRef)> { + const INT_MIN_VALUE: i32 = 30; + const INT_MAX_VALUE: i32 = 79; + + const STRING_MIN_VALUE: &str = "a"; + const STRING_MAX_VALUE: &str = "z"; + + fn create_schema() -> Result { let schema = Schema::builder() - .with_fields(vec![Arc::new(NestedField::optional( - 1, - "a", - Type::Primitive(PrimitiveType::Float), - ))]) + .with_fields(vec![ + Arc::new(NestedField::required( + 1, + "id", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 2, + "all_nulls_missing_nan", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 3, + "some_nulls", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 4, + "no_nulls", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 5, + "float", + Type::Primitive(PrimitiveType::Float), + )), + Arc::new(NestedField::optional( + 6, + "all_nulls_double", + Type::Primitive(PrimitiveType::Double), + )), + Arc::new(NestedField::optional( + 7, + "all_nulls_no_nans", + Type::Primitive(PrimitiveType::Float), + )), + Arc::new(NestedField::optional( + 8, + "all_nans", + Type::Primitive(PrimitiveType::Double), + )), + Arc::new(NestedField::optional( + 9, + "both_nan_and_null", + Type::Primitive(PrimitiveType::Float), + )), + Arc::new(NestedField::optional( + 10, + "no_nan_or_null", + Type::Primitive(PrimitiveType::Double), + )), + Arc::new(NestedField::optional( + 11, + "all_nulls_missing_nan_float", + Type::Primitive(PrimitiveType::Float), + )), + Arc::new(NestedField::optional( + 12, + "all_same_value_or_null", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 13, + "no_nulls_same_value_a", + Type::Primitive(PrimitiveType::String), + )), + ]) .build()?; - let spec = PartitionSpec::builder() - .with_spec_id(1) - .with_fields(vec![PartitionField::builder() - .source_id(1) - .name("a".to_string()) - .field_id(1) - .transform(Transform::Identity) - .build()]) - .build() - .unwrap(); + Ok(Arc::new(schema)) + } - Ok((Arc::new(schema), Arc::new(spec))) + fn create_partitions() -> Vec { + vec![ + // id + FieldSummary { + contains_null: false, + contains_nan: None, + lower_bound: Some(Datum::int(INT_MIN_VALUE)), + upper_bound: Some(Datum::int(INT_MAX_VALUE)), + }, + // all_nulls_missing_nan + FieldSummary { + contains_null: true, + contains_nan: None, + lower_bound: None, + upper_bound: None, + }, + // some_nulls + FieldSummary { + contains_null: true, + contains_nan: None, + lower_bound: Some(Datum::string(STRING_MIN_VALUE)), + upper_bound: Some(Datum::string(STRING_MAX_VALUE)), + }, + // no_nulls + FieldSummary { + contains_null: false, + contains_nan: None, + lower_bound: Some(Datum::string(STRING_MIN_VALUE)), + upper_bound: Some(Datum::string(STRING_MAX_VALUE)), + }, + // float + FieldSummary { + contains_null: true, + contains_nan: None, + lower_bound: Some(Datum::float(0.0)), + upper_bound: Some(Datum::float(20.0)), + }, + // all_nulls_double + FieldSummary { + contains_null: true, + contains_nan: None, + lower_bound: None, + upper_bound: None, + }, + // all_nulls_no_nans + FieldSummary { + contains_null: true, + contains_nan: Some(false), + lower_bound: None, + upper_bound: None, + }, + // all_nans + FieldSummary { + contains_null: false, + contains_nan: Some(true), + lower_bound: None, + upper_bound: None, + }, + // both_nan_and_null + FieldSummary { + contains_null: true, + contains_nan: Some(true), + lower_bound: None, + upper_bound: None, + }, + // no_nan_or_null + FieldSummary { + contains_null: false, + contains_nan: Some(false), + lower_bound: Some(Datum::float(0.0)), + upper_bound: Some(Datum::float(20.0)), + }, + // all_nulls_missing_nan_float + FieldSummary { + contains_null: true, + contains_nan: None, + lower_bound: None, + upper_bound: None, + }, + // all_same_value_or_null + FieldSummary { + contains_null: true, + contains_nan: None, + lower_bound: Some(Datum::string(STRING_MIN_VALUE)), + upper_bound: Some(Datum::string(STRING_MIN_VALUE)), + }, + // no_nulls_same_value_a + FieldSummary { + contains_null: false, + contains_nan: None, + lower_bound: Some(Datum::string(STRING_MIN_VALUE)), + upper_bound: Some(Datum::string(STRING_MIN_VALUE)), + }, + ] } fn create_manifest_file(partitions: Vec) -> ManifestFile { @@ -280,131 +632,663 @@ mod test { } } - fn create_partition_schema( - partition_spec: &PartitionSpecRef, - schema: &Schema, - ) -> Result { - let partition_type = partition_spec.partition_type(schema)?; + #[test] + fn test_always_true() -> Result<()> { + let case_sensitive = false; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); - let partition_fields: Vec<_> = partition_type.fields().iter().map(Arc::clone).collect(); + let filter = Predicate::AlwaysTrue.bind(schema.clone(), case_sensitive)?; + + assert!(ManifestEvaluator::new(filter).eval(&manifest_file)?); + + Ok(()) + } + + #[test] + fn test_always_false() -> Result<()> { + let case_sensitive = false; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::AlwaysFalse.bind(schema.clone(), case_sensitive)?; - let partition_schema = Arc::new( - Schema::builder() - .with_schema_id(partition_spec.spec_id) - .with_fields(partition_fields) - .build()?, + assert!(!ManifestEvaluator::new(filter).eval(&manifest_file)?); + + Ok(()) + } + + #[test] + fn test_all_nulls() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + // all_nulls_missing_nan + let all_nulls_missing_nan_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNull, + Reference::new("all_nulls_missing_nan"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(all_nulls_missing_nan_filter).eval(&manifest_file)?, + "Should skip: all nulls column with non-floating type contains all null" ); - Ok(partition_schema) + // all_nulls_missing_nan_float + let all_nulls_missing_nan_float_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNull, + Reference::new("all_nulls_missing_nan_float"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nulls_missing_nan_float_filter).eval(&manifest_file)?, + "Should read: no NaN information may indicate presence of NaN value" + ); + + // some_nulls + let some_nulls_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNull, + Reference::new("some_nulls"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(some_nulls_filter).eval(&manifest_file)?, + "Should read: column with some nulls contains a non-null value" + ); + + // no_nulls + let no_nulls_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNull, + Reference::new("no_nulls"), + )) + .bind(schema.clone(), case_sensitive)?; + + assert!( + ManifestEvaluator::new(no_nulls_filter).eval(&manifest_file)?, + "Should read: non-null column contains a non-null value" + ); + + Ok(()) } - fn create_partition_filter( - partition_spec: PartitionSpecRef, - partition_schema: SchemaRef, - filter: &BoundPredicate, - case_sensitive: bool, - ) -> Result { - let mut inclusive_projection = InclusiveProjection::new(partition_spec); + #[test] + fn test_no_nulls() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); - let partition_filter = inclusive_projection - .project(filter)? - .rewrite_not() - .bind(partition_schema, case_sensitive)?; + // all_nulls_missing_nan + let all_nulls_missing_nan_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNull, + Reference::new("all_nulls_missing_nan"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nulls_missing_nan_filter).eval(&manifest_file)?, + "Should read: at least one null value in all null column" + ); + + // some_nulls + let some_nulls_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNull, + Reference::new("some_nulls"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(some_nulls_filter).eval(&manifest_file)?, + "Should read: column with some nulls contains a null value" + ); + + // no_nulls + let no_nulls_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNull, + Reference::new("no_nulls"), + )) + .bind(schema.clone(), case_sensitive)?; + + assert!( + !ManifestEvaluator::new(no_nulls_filter).eval(&manifest_file)?, + "Should skip: non-null column contains no null values" + ); + + // both_nan_and_null + let both_nan_and_null_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNull, + Reference::new("both_nan_and_null"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(both_nan_and_null_filter).eval(&manifest_file)?, + "Should read: both_nan_and_null column contains no null values" + ); - Ok(partition_filter) + Ok(()) } - fn create_manifest_evaluator( - schema: SchemaRef, - partition_spec: PartitionSpecRef, - filter: &BoundPredicate, - case_sensitive: bool, - ) -> Result { - let partition_schema = create_partition_schema(&partition_spec, &schema)?; - let partition_filter = create_partition_filter( - partition_spec, - partition_schema.clone(), - filter, - case_sensitive, - )?; + #[test] + fn test_is_nan() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + // float + let float_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("float"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(float_filter).eval(&manifest_file)?, + "Should read: no information on if there are nan value in float column" + ); + + // all_nulls_double + let all_nulls_double_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("all_nulls_double"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nulls_double_filter).eval(&manifest_file)?, + "Should read: no NaN information may indicate presence of NaN value" + ); + + // all_nulls_missing_nan_float + let all_nulls_missing_nan_float_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("all_nulls_missing_nan_float"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nulls_missing_nan_float_filter).eval(&manifest_file)?, + "Should read: no NaN information may indicate presence of NaN value" + ); + + // all_nulls_no_nans + let all_nulls_no_nans_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("all_nulls_no_nans"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(all_nulls_no_nans_filter).eval(&manifest_file)?, + "Should skip: no nan column doesn't contain nan value" + ); + + // all_nans + let all_nans_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("all_nans"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nans_filter).eval(&manifest_file)?, + "Should read: all_nans column contains nan value" + ); + + // both_nan_and_null + let both_nan_and_null_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("both_nan_and_null"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(both_nan_and_null_filter).eval(&manifest_file)?, + "Should read: both_nan_and_null column contains nan value" + ); - Ok(ManifestEvaluator::new(partition_filter)) + // no_nan_or_null + let no_nan_or_null_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("no_nan_or_null"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(no_nan_or_null_filter).eval(&manifest_file)?, + "Should skip: no_nan_or_null column doesn't contain nan value" + ); + + Ok(()) } #[test] - fn test_manifest_file_empty_partitions() -> Result<()> { - let case_sensitive = false; + fn test_not_nan() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + // float + let float_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNan, + Reference::new("float"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(float_filter).eval(&manifest_file)?, + "Should read: no information on if there are nan value in float column" + ); - let (schema, partition_spec) = create_schema_and_partition_spec()?; + // all_nulls_double + let all_nulls_double_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNan, + Reference::new("all_nulls_double"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nulls_double_filter).eval(&manifest_file)?, + "Should read: all null column contains non nan value" + ); - let filter = Predicate::AlwaysTrue.bind(schema.clone(), case_sensitive)?; + // all_nulls_no_nans + let all_nulls_no_nans_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNan, + Reference::new("all_nulls_no_nans"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nulls_no_nans_filter).eval(&manifest_file)?, + "Should read: no_nans column contains non nan value" + ); - let manifest_file = create_manifest_file(vec![]); + // all_nans + let all_nans_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNan, + Reference::new("all_nans"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(all_nans_filter).eval(&manifest_file)?, + "Should skip: all nans column doesn't contain non nan value" + ); - let manifest_evaluator = - create_manifest_evaluator(schema, partition_spec, &filter, case_sensitive)?; + // both_nan_and_null + let both_nan_and_null_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNan, + Reference::new("both_nan_and_null"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(both_nan_and_null_filter).eval(&manifest_file)?, + "Should read: both_nan_and_null nans column contains non nan value" + ); + + // no_nan_or_null + let no_nan_or_null_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNan, + Reference::new("no_nan_or_null"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(no_nan_or_null_filter).eval(&manifest_file)?, + "Should read: no_nan_or_null column contains non nan value" + ); - let result = manifest_evaluator.eval(&manifest_file)?; + Ok(()) + } - assert!(result); + #[test] + fn test_and() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::LessThan, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .and(Predicate::Binary(BinaryExpression::new( + PredicateOperator::GreaterThanOrEq, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 30), + ))) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should read: no information on if there are nan value in float column" + ); Ok(()) } #[test] - fn test_manifest_file_trivial_partition_passing_filter() -> Result<()> { + fn test_or() -> Result<()> { let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::LessThan, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .or(Predicate::Binary(BinaryExpression::new( + PredicateOperator::GreaterThanOrEq, + Reference::new("id"), + Datum::int(INT_MAX_VALUE + 1), + ))) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should skip: or(false, false)" + ); - let (schema, partition_spec) = create_schema_and_partition_spec()?; + Ok(()) + } - let filter = Predicate::Unary(UnaryExpression::new( - PredicateOperator::IsNull, - Reference::new("a"), + #[test] + fn test_not() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::LessThan, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), )) + .not() .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should read: not(false)" + ); - let manifest_file = create_manifest_file(vec![FieldSummary { - contains_null: true, - contains_nan: None, - lower_bound: None, - upper_bound: None, - }]); + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::GreaterThan, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .not() + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should skip: not(true)" + ); - let manifest_evaluator = - create_manifest_evaluator(schema, partition_spec, &filter, case_sensitive)?; + Ok(()) + } - let result = manifest_evaluator.eval(&manifest_file)?; + #[test] + fn test_less_than() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::LessThan, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should not read: id range below lower bound (5 < 30)" + ); - assert!(result); + Ok(()) + } + + #[test] + fn test_less_than_or_eq() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::LessThanOrEq, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should not read: id range below lower bound (5 < 30)" + ); Ok(()) } #[test] - fn test_manifest_file_trivial_partition_rejected_filter() -> Result<()> { + fn test_greater_than() -> Result<()> { let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::GreaterThan, + Reference::new("id"), + Datum::int(INT_MAX_VALUE + 6), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should not read: id range above upper bound (85 < 79)" + ); - let (schema, partition_spec) = create_schema_and_partition_spec()?; + Ok(()) + } - let filter = Predicate::Unary(UnaryExpression::new( - PredicateOperator::IsNan, - Reference::new("a"), + #[test] + fn test_greater_than_or_eq() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::GreaterThanOrEq, + Reference::new("id"), + Datum::int(INT_MAX_VALUE + 6), )) .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should not read: id range above upper bound (85 < 79)" + ); - let manifest_file = create_manifest_file(vec![FieldSummary { - contains_null: false, - contains_nan: None, - lower_bound: None, - upper_bound: None, - }]); + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::GreaterThanOrEq, + Reference::new("id"), + Datum::int(INT_MAX_VALUE), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should read: one possible id" + ); + + Ok(()) + } + + #[test] + fn test_eq() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::Eq, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should not read: id below lower bound" + ); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::Eq, + Reference::new("id"), + Datum::int(INT_MIN_VALUE), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should read: id equal to lower bound" + ); + + Ok(()) + } + + #[test] + fn test_not_eq() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::NotEq, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should read: id below lower bound" + ); + + Ok(()) + } + + #[test] + fn test_in() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Set(SetExpression::new( + PredicateOperator::In, + Reference::new("id"), + FnvHashSet::from_iter(vec![ + Datum::int(INT_MIN_VALUE - 25), + Datum::int(INT_MIN_VALUE - 24), + ]), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should not read: id below lower bound (5 < 30, 6 < 30)" + ); + + let filter = Predicate::Set(SetExpression::new( + PredicateOperator::In, + Reference::new("id"), + FnvHashSet::from_iter(vec![ + Datum::int(INT_MIN_VALUE - 1), + Datum::int(INT_MIN_VALUE), + ]), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should read: id equal to lower bound (30 == 30)" + ); - let manifest_evaluator = - create_manifest_evaluator(schema, partition_spec, &filter, case_sensitive)?; + Ok(()) + } + + #[test] + fn test_not_in() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Set(SetExpression::new( + PredicateOperator::NotIn, + Reference::new("id"), + FnvHashSet::from_iter(vec![ + Datum::int(INT_MIN_VALUE - 25), + Datum::int(INT_MIN_VALUE - 24), + ]), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should read: id below lower bound (5 < 30, 6 < 30)" + ); - let result = manifest_evaluator.eval(&manifest_file).unwrap(); + Ok(()) + } - assert!(!result); + #[test] + fn test_starts_with() -> Result<()> { + let case_sensitive = false; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::StartsWith, + Reference::new("some_nulls"), + Datum::string("a"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should read: range matches" + ); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::StartsWith, + Reference::new("some_nulls"), + Datum::string("zzzz"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should skip: range doesn't match" + ); + + Ok(()) + } + + #[test] + fn test_not_starts_with() -> Result<()> { + let case_sensitive = false; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::NotStartsWith, + Reference::new("some_nulls"), + Datum::string("a"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should read: range matches" + ); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::NotStartsWith, + Reference::new("no_nulls_same_value_a"), + Datum::string("a"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter).eval(&manifest_file)?, + "Should not read: all values start with the prefix" + ); Ok(()) }