From aee9f523f2fdef5e104419e5d88603884aaa3536 Mon Sep 17 00:00:00 2001 From: s-akhtar-baig Date: Thu, 9 May 2024 12:19:48 -0700 Subject: [PATCH 1/8] Implement all functions of BoundPredicateVisitor for ManifestFilterVisitor --- .../src/expr/visitors/manifest_evaluator.rs | 1149 ++++++++++++++--- 1 file changed, 997 insertions(+), 152 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs index bcb596718..498f8e0a0 100644 --- a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs @@ -17,8 +17,9 @@ use crate::expr::visitors::bound_predicate_visitor::{visit, BoundPredicateVisitor}; use crate::expr::{BoundPredicate, BoundReference}; -use crate::spec::{Datum, FieldSummary, ManifestFile}; +use crate::spec::{Datum, FieldSummary, Literal, ManifestFile, PrimitiveLiteral}; use crate::Result; +use crate::{Error, ErrorKind}; use fnv::FnvHashSet; #[derive(Debug)] @@ -69,17 +70,19 @@ impl<'a> ManifestFilterVisitor<'a> { } } -// Remove this annotation once all todos have been removed -#[allow(unused_variables)] +const ROWS_MIGHT_MATCH: Result = Ok(true); +const ROWS_CANNOT_MATCH: Result = Ok(false); +const IN_PREDICATE_LIMIT: usize = 200; + impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { type T = bool; fn always_true(&mut self) -> crate::Result { - Ok(true) + ROWS_MIGHT_MATCH } fn always_false(&mut self) -> crate::Result { - Ok(false) + ROWS_CANNOT_MATCH } fn and(&mut self, lhs: bool, rhs: bool) -> crate::Result { @@ -107,7 +110,19 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { reference: &BoundReference, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field: &FieldSummary = self.field_summary_for_reference(reference); + + // contains_null encodes whether at least one partition value is null, + // lowerBound is null if all partition values are null + let mut all_null: bool = field.contains_null && field.lower_bound.is_none(); + + if all_null && reference.field().field_type.is_floating_type() { + // floating point types may include NaN values, which we check separately. + // In case bounds don't include NaN value, contains_nan needs to be checked against. + all_null = field.contains_nan.is_some_and(|x| !x); + } + + Ok(!all_null) } fn is_nan( @@ -115,10 +130,12 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { reference: &BoundReference, _predicate: &BoundPredicate, ) -> crate::Result { - Ok(self - .field_summary_for_reference(reference) - .contains_nan - .is_some()) + let field: &FieldSummary = self.field_summary_for_reference(reference); + if let Some(contains_nan) = field.contains_nan { + Ok(contains_nan) + } else { + ROWS_MIGHT_MATCH + } } fn not_nan( @@ -126,79 +143,209 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { reference: &BoundReference, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field: &FieldSummary = self.field_summary_for_reference(reference); + if field.contains_nan.is_some_and(|x| x) + && !field.contains_null + && field.lower_bound.is_none() + { + ROWS_CANNOT_MATCH + } else { + ROWS_MIGHT_MATCH + } } fn less_than( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field: &FieldSummary = self.field_summary_for_reference(reference); + if let Some(lower_bound) = &field.lower_bound { + Ok(lower_bound < &Literal::from(datum.clone())) + } else { + ROWS_CANNOT_MATCH + } } fn less_than_or_eq( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field: &FieldSummary = self.field_summary_for_reference(reference); + if let Some(lower_bound) = &field.lower_bound { + Ok(lower_bound <= &Literal::from(datum.clone())) + } else { + ROWS_CANNOT_MATCH + } } fn greater_than( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field: &FieldSummary = self.field_summary_for_reference(reference); + if let Some(upper_bound) = &field.upper_bound { + Ok(upper_bound > &Literal::from(datum.clone())) + } else { + ROWS_CANNOT_MATCH + } } fn greater_than_or_eq( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field: &FieldSummary = self.field_summary_for_reference(reference); + if let Some(upper_bound) = &field.upper_bound { + Ok(upper_bound >= &Literal::from(datum.clone())) + } else { + ROWS_CANNOT_MATCH + } } fn eq( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + Ok(self.less_than_or_eq(reference, datum, _predicate)? + && self.greater_than_or_eq(reference, datum, _predicate)?) } fn not_eq( &mut self, - reference: &BoundReference, - literal: &Datum, + _reference: &BoundReference, + _datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + // because the bounds are not necessarily a min or max value, this cannot be answered using + // them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col. + ROWS_MIGHT_MATCH } fn starts_with( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field: &FieldSummary = self.field_summary_for_reference(reference); + + if field.lower_bound.is_none() || field.upper_bound.is_none() { + return ROWS_CANNOT_MATCH; + } + + let PrimitiveLiteral::String(prefix) = datum.literal() else { + return Err(Error::new( + ErrorKind::Unexpected, + "Cannot perfom starts_with on non-string value", + )); + }; + + let prefix_len = prefix.chars().count(); + + if let Some(lower_bound) = field.lower_bound.clone() { + let Literal::Primitive(PrimitiveLiteral::String(lower_bound)) = lower_bound else { + return Err(Error::new( + ErrorKind::Unexpected, + "Cannot perfom starts_with on non-string lower bound", + )); + }; + + let min_len = lower_bound.chars().count().min(prefix_len); + let truncated_lower_bound = String::from(&lower_bound[..min_len]); + if prefix < &truncated_lower_bound { + return ROWS_CANNOT_MATCH; + } + } + + if let Some(upper_bound) = field.upper_bound.clone() { + let Literal::Primitive(PrimitiveLiteral::String(upper_bound)) = upper_bound else { + return Err(Error::new( + ErrorKind::Unexpected, + "Cannot perfom starts_with on non-string upper bound", + )); + }; + + let min_len = upper_bound.chars().count().min(prefix_len); + let truncated_upper_bound = String::from(&upper_bound[..min_len]); + if prefix > &truncated_upper_bound { + return ROWS_CANNOT_MATCH; + } + } + + ROWS_MIGHT_MATCH } fn not_starts_with( &mut self, reference: &BoundReference, - literal: &Datum, + datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field: &FieldSummary = self.field_summary_for_reference(reference); + + if field.contains_null || field.lower_bound.is_none() || field.upper_bound.is_none() { + return ROWS_MIGHT_MATCH; + } + + let PrimitiveLiteral::String(prefix) = datum.literal() else { + return Err(Error::new( + ErrorKind::Unexpected, + "Cannot perfom not_starts_with on non-string value", + )); + }; + + let prefix_len = prefix.chars().count(); + + // not_starts_with will match unless all values must start with the prefix. This happens when + // the lower and upper bounds both start with the prefix. + if let Some(lower_bound) = field.lower_bound.clone() { + let Literal::Primitive(PrimitiveLiteral::String(lower_bound)) = lower_bound else { + return Err(Error::new( + ErrorKind::Unexpected, + "Cannot perfom not_starts_with on non-string lower bound", + )); + }; + + // if lower is shorter than the prefix then lower doesn't start with the prefix + if prefix_len > lower_bound.chars().count() { + return ROWS_MIGHT_MATCH; + } + + let truncated_lower_bound = String::from(&lower_bound[..prefix_len]); + if prefix == &truncated_lower_bound { + if let Some(upper_bound) = field.upper_bound.clone() { + let Literal::Primitive(PrimitiveLiteral::String(upper_bound)) = upper_bound + else { + return Err(Error::new( + ErrorKind::Unexpected, + "Cannot perfom not_starts_with on non-string upper bound", + )); + }; + + // if upper is shorter than the prefix then upper can't start with the prefix + if prefix_len > upper_bound.chars().count() { + return ROWS_MIGHT_MATCH; + } + + let truncated_upper_bound = String::from(&upper_bound[..prefix_len]); + if prefix == &truncated_upper_bound { + return ROWS_CANNOT_MATCH; + } + } + } + } + + ROWS_MIGHT_MATCH } fn r#in( @@ -207,16 +354,42 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { literals: &FnvHashSet, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + let field: &FieldSummary = self.field_summary_for_reference(reference); + if field.lower_bound.is_none() { + return ROWS_CANNOT_MATCH; + } + + if literals.len() > IN_PREDICATE_LIMIT { + return ROWS_MIGHT_MATCH; + } + + if literals + .iter() + .all(|datum| field.lower_bound.as_ref().unwrap() > &Literal::from(datum.clone())) + { + return ROWS_CANNOT_MATCH; + } + + if field.upper_bound.is_some() + && literals + .iter() + .all(|datum| field.upper_bound.as_ref().unwrap() < &Literal::from(datum.clone())) + { + return ROWS_CANNOT_MATCH; + } + + ROWS_MIGHT_MATCH } fn not_in( &mut self, - reference: &BoundReference, - literals: &FnvHashSet, + _reference: &BoundReference, + _literals: &FnvHashSet, _predicate: &BoundPredicate, ) -> crate::Result { - todo!() + // because the bounds are not necessarily a min or max value, this cannot be answered using + // them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in col. + ROWS_MIGHT_MATCH } } @@ -229,63 +402,194 @@ impl ManifestFilterVisitor<'_> { #[cfg(test)] mod test { - use crate::expr::visitors::inclusive_projection::InclusiveProjection; use crate::expr::visitors::manifest_evaluator::ManifestEvaluator; use crate::expr::{ - Bind, BoundPredicate, Predicate, PredicateOperator, Reference, UnaryExpression, + BinaryExpression, Bind, Predicate, PredicateOperator, Reference, SetExpression, + UnaryExpression, }; use crate::spec::{ - FieldSummary, ManifestContentType, ManifestFile, NestedField, PartitionField, - PartitionSpec, PartitionSpecRef, PrimitiveType, Schema, SchemaRef, Transform, Type, + Datum, FieldSummary, Literal, ManifestContentType, ManifestFile, NestedField, + PrimitiveType, Schema, SchemaRef, Type, }; use crate::Result; + use fnv::FnvHashSet; + use std::ops::Not; use std::sync::Arc; - fn create_schema_and_partition_spec() -> Result<(SchemaRef, PartitionSpecRef)> { - let schema = Schema::builder() - .with_fields(vec![Arc::new(NestedField::optional( - 1, - "a", - Type::Primitive(PrimitiveType::Float), - ))]) - .build()?; + const INT_MIN_VALUE: i32 = 30; + const INT_MAX_VALUE: i32 = 79; - let spec = PartitionSpec::builder() - .with_spec_id(1) - .with_fields(vec![PartitionField::builder() - .source_id(1) - .name("a".to_string()) - .field_id(1) - .transform(Transform::Identity) - .build()]) - .build() - .unwrap(); + const STRING_MIN_VALUE: &str = "a"; + const STRING_MAX_VALUE: &str = "z"; - Ok((Arc::new(schema), Arc::new(spec))) - } - - fn create_schema_and_partition_spec_with_id_mismatch() -> Result<(SchemaRef, PartitionSpecRef)> - { + fn create_schema() -> Result { let schema = Schema::builder() - .with_fields(vec![Arc::new(NestedField::optional( - 1, - "a", - Type::Primitive(PrimitiveType::Float), - ))]) + .with_fields(vec![ + Arc::new(NestedField::required( + 1, + "id", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 2, + "all_nulls_missing_nan", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 3, + "some_nulls", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 4, + "no_nulls", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 5, + "float", + Type::Primitive(PrimitiveType::Float), + )), + Arc::new(NestedField::optional( + 6, + "all_nulls_double", + Type::Primitive(PrimitiveType::Double), + )), + Arc::new(NestedField::optional( + 7, + "all_nulls_no_nans", + Type::Primitive(PrimitiveType::Float), + )), + Arc::new(NestedField::optional( + 8, + "all_nans", + Type::Primitive(PrimitiveType::Double), + )), + Arc::new(NestedField::optional( + 9, + "both_nan_and_null", + Type::Primitive(PrimitiveType::Float), + )), + Arc::new(NestedField::optional( + 10, + "no_nan_or_null", + Type::Primitive(PrimitiveType::Double), + )), + Arc::new(NestedField::optional( + 11, + "all_nulls_missing_nan_float", + Type::Primitive(PrimitiveType::Float), + )), + Arc::new(NestedField::optional( + 12, + "all_same_value_or_null", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 13, + "no_nulls_same_value_a", + Type::Primitive(PrimitiveType::String), + )), + ]) .build()?; - let spec = PartitionSpec::builder() - .with_spec_id(999) - .with_fields(vec![PartitionField::builder() - .source_id(1) - .name("a".to_string()) - .field_id(1) - .transform(Transform::Identity) - .build()]) - .build() - .unwrap(); + Ok(Arc::new(schema)) + } - Ok((Arc::new(schema), Arc::new(spec))) + fn create_partitions() -> Vec { + vec![ + // id + FieldSummary { + contains_null: false, + contains_nan: None, + lower_bound: Some(Literal::int(INT_MIN_VALUE)), + upper_bound: Some(Literal::int(INT_MAX_VALUE)), + }, + // all_nulls_missing_nan + FieldSummary { + contains_null: true, + contains_nan: None, + lower_bound: None, + upper_bound: None, + }, + // some_nulls + FieldSummary { + contains_null: true, + contains_nan: None, + lower_bound: Some(Literal::string(STRING_MIN_VALUE)), + upper_bound: Some(Literal::string(STRING_MAX_VALUE)), + }, + // no_nulls + FieldSummary { + contains_null: false, + contains_nan: None, + lower_bound: Some(Literal::string(STRING_MIN_VALUE)), + upper_bound: Some(Literal::string(STRING_MAX_VALUE)), + }, + // float + FieldSummary { + contains_null: true, + contains_nan: None, + lower_bound: Some(Literal::float(0.0)), + upper_bound: Some(Literal::float(20.0)), + }, + // all_nulls_double + FieldSummary { + contains_null: true, + contains_nan: None, + lower_bound: None, + upper_bound: None, + }, + // all_nulls_no_nans + FieldSummary { + contains_null: true, + contains_nan: Some(false), + lower_bound: None, + upper_bound: None, + }, + // all_nans + FieldSummary { + contains_null: false, + contains_nan: Some(true), + lower_bound: None, + upper_bound: None, + }, + // both_nan_and_null + FieldSummary { + contains_null: true, + contains_nan: Some(true), + lower_bound: None, + upper_bound: None, + }, + // no_nan_or_null + FieldSummary { + contains_null: false, + contains_nan: Some(false), + lower_bound: Some(Literal::float(0.0)), + upper_bound: Some(Literal::float(20.0)), + }, + // all_nulls_missing_nan_float + FieldSummary { + contains_null: true, + contains_nan: None, + lower_bound: None, + upper_bound: None, + }, + // all_same_value_or_null + FieldSummary { + contains_null: true, + contains_nan: None, + lower_bound: Some(Literal::string(STRING_MIN_VALUE)), + upper_bound: Some(Literal::string(STRING_MIN_VALUE)), + }, + // no_nulls_same_value_a + FieldSummary { + contains_null: false, + contains_nan: None, + lower_bound: Some(Literal::string(STRING_MIN_VALUE)), + upper_bound: Some(Literal::string(STRING_MIN_VALUE)), + }, + ] } fn create_manifest_file(partitions: Vec) -> ManifestFile { @@ -308,131 +612,672 @@ mod test { } } - fn create_partition_schema( - partition_spec: &PartitionSpecRef, - schema: &SchemaRef, - ) -> Result { - let partition_type = partition_spec.partition_type(schema)?; + #[test] + fn test_always_true() -> Result<()> { + let case_sensitive = false; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::AlwaysTrue.bind(schema.clone(), case_sensitive)?; + + assert!(ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?); + + Ok(()) + } + + #[test] + fn test_always_false() -> Result<()> { + let case_sensitive = false; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::AlwaysFalse.bind(schema.clone(), case_sensitive)?; + + assert!(!ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?); + + Ok(()) + } + + #[test] + fn test_all_nulls() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + // all_nulls_missing_nan + let all_nulls_missing_nan_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNull, + Reference::new("all_nulls_missing_nan"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(all_nulls_missing_nan_filter, case_sensitive) + .eval(&manifest_file)?, + "Should skip: all nulls column with non-floating type contains all null" + ); - let partition_fields: Vec<_> = partition_type.fields().iter().map(Arc::clone).collect(); + // all_nulls_missing_nan_float + let all_nulls_missing_nan_float_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNull, + Reference::new("all_nulls_missing_nan_float"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nulls_missing_nan_float_filter, case_sensitive) + .eval(&manifest_file)?, + "Should read: no NaN information may indicate presence of NaN value" + ); - let partition_schema = Arc::new( - Schema::builder() - .with_schema_id(partition_spec.spec_id) - .with_fields(partition_fields) - .build()?, + // some_nulls + let some_nulls_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNull, + Reference::new("some_nulls"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(some_nulls_filter, case_sensitive).eval(&manifest_file)?, + "Should read: column with some nulls contains a non-null value" ); - Ok(partition_schema) + // no_nulls + let no_nulls_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNull, + Reference::new("no_nulls"), + )) + .bind(schema.clone(), case_sensitive)?; + + assert!( + ManifestEvaluator::new(no_nulls_filter, case_sensitive).eval(&manifest_file)?, + "Should read: non-null column contains a non-null value" + ); + + Ok(()) } - fn create_partition_filter( - partition_spec: PartitionSpecRef, - partition_schema: SchemaRef, - filter: &BoundPredicate, - case_sensitive: bool, - ) -> Result { - let mut inclusive_projection = InclusiveProjection::new(partition_spec); + #[test] + fn test_no_nulls() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + // all_nulls_missing_nan + let all_nulls_missing_nan_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNull, + Reference::new("all_nulls_missing_nan"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nulls_missing_nan_filter, case_sensitive) + .eval(&manifest_file)?, + "Should read: at least one null value in all null column" + ); + + // some_nulls + let some_nulls_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNull, + Reference::new("some_nulls"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(some_nulls_filter, case_sensitive).eval(&manifest_file)?, + "Should read: column with some nulls contains a null value" + ); + + // no_nulls + let no_nulls_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNull, + Reference::new("no_nulls"), + )) + .bind(schema.clone(), case_sensitive)?; + + assert!( + !ManifestEvaluator::new(no_nulls_filter, case_sensitive).eval(&manifest_file)?, + "Should skip: non-null column contains no null values" + ); - let partition_filter = inclusive_projection - .project(filter)? - .rewrite_not() - .bind(partition_schema, case_sensitive)?; + // both_nan_and_null + let both_nan_and_null_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNull, + Reference::new("both_nan_and_null"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(both_nan_and_null_filter, case_sensitive) + .eval(&manifest_file)?, + "Should read: both_nan_and_null column contains no null values" + ); - Ok(partition_filter) + Ok(()) } - fn create_manifest_evaluator( - schema: SchemaRef, - partition_spec: PartitionSpecRef, - filter: &BoundPredicate, - case_sensitive: bool, - ) -> Result { - let partition_schema = create_partition_schema(&partition_spec, &schema)?; - let partition_filter = create_partition_filter( - partition_spec, - partition_schema.clone(), - filter, - case_sensitive, - )?; + #[test] + fn test_is_nan() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); - Ok(ManifestEvaluator::new(partition_filter, case_sensitive)) + // float + let float_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("float"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(float_filter, case_sensitive).eval(&manifest_file)?, + "Should read: no information on if there are nan value in float column" + ); + + // all_nulls_double + let all_nulls_double_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("all_nulls_double"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nulls_double_filter, case_sensitive).eval(&manifest_file)?, + "Should read: no NaN information may indicate presence of NaN value" + ); + + // all_nulls_missing_nan_float + let all_nulls_missing_nan_float_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("all_nulls_missing_nan_float"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nulls_missing_nan_float_filter, case_sensitive) + .eval(&manifest_file)?, + "Should read: no NaN information may indicate presence of NaN value" + ); + + // all_nulls_no_nans + let all_nulls_no_nans_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("all_nulls_no_nans"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(all_nulls_no_nans_filter, case_sensitive) + .eval(&manifest_file)?, + "Should skip: no nan column doesn't contain nan value" + ); + + // all_nans + let all_nans_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("all_nans"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nans_filter, case_sensitive).eval(&manifest_file)?, + "Should read: all_nans column contains nan value" + ); + + // both_nan_and_null + let both_nan_and_null_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("both_nan_and_null"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(both_nan_and_null_filter, case_sensitive) + .eval(&manifest_file)?, + "Should read: both_nan_and_null column contains nan value" + ); + + // no_nan_or_null + let no_nan_or_null_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + Reference::new("no_nan_or_null"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(no_nan_or_null_filter, case_sensitive).eval(&manifest_file)?, + "Should skip: no_nan_or_null column doesn't contain nan value" + ); + + Ok(()) } #[test] - fn test_manifest_file_empty_partitions() -> Result<()> { - let case_sensitive = false; + fn test_not_nan() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + // float + let float_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNan, + Reference::new("float"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(float_filter, case_sensitive).eval(&manifest_file)?, + "Should read: no information on if there are nan value in float column" + ); - let (schema, partition_spec) = create_schema_and_partition_spec()?; + // all_nulls_double + let all_nulls_double_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNan, + Reference::new("all_nulls_double"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nulls_double_filter, case_sensitive).eval(&manifest_file)?, + "Should read: all null column contains non nan value" + ); - let filter = Predicate::AlwaysTrue.bind(schema.clone(), case_sensitive)?; + // all_nulls_no_nans + let all_nulls_no_nans_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNan, + Reference::new("all_nulls_no_nans"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(all_nulls_no_nans_filter, case_sensitive) + .eval(&manifest_file)?, + "Should read: no_nans column contains non nan value" + ); + + // all_nans + let all_nans_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNan, + Reference::new("all_nans"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(all_nans_filter, case_sensitive).eval(&manifest_file)?, + "Should skip: all nans column doesn't contain non nan value" + ); - let manifest_file = create_manifest_file(vec![]); + // both_nan_and_null + let both_nan_and_null_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNan, + Reference::new("both_nan_and_null"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(both_nan_and_null_filter, case_sensitive) + .eval(&manifest_file)?, + "Should read: both_nan_and_null nans column contains non nan value" + ); - let manifest_evaluator = - create_manifest_evaluator(schema, partition_spec, &filter, case_sensitive)?; + // no_nan_or_null + let no_nan_or_null_filter = Predicate::Unary(UnaryExpression::new( + PredicateOperator::NotNan, + Reference::new("no_nan_or_null"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(no_nan_or_null_filter, case_sensitive).eval(&manifest_file)?, + "Should read: no_nan_or_null column contains non nan value" + ); - let result = manifest_evaluator.eval(&manifest_file)?; + Ok(()) + } - assert!(result); + #[test] + fn test_and() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::LessThan, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .and(Predicate::Binary(BinaryExpression::new( + PredicateOperator::GreaterThanOrEq, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 30), + ))) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should read: no information on if there are nan value in float column" + ); Ok(()) } #[test] - fn test_manifest_file_trivial_partition_passing_filter() -> Result<()> { + fn test_or() -> Result<()> { let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::LessThan, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .or(Predicate::Binary(BinaryExpression::new( + PredicateOperator::GreaterThanOrEq, + Reference::new("id"), + Datum::int(INT_MAX_VALUE + 1), + ))) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should skip: or(false, false)" + ); - let (schema, partition_spec) = create_schema_and_partition_spec()?; + Ok(()) + } - let filter = Predicate::Unary(UnaryExpression::new( - PredicateOperator::IsNull, - Reference::new("a"), + #[test] + fn test_not() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::LessThan, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), )) + .not() .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should read: not(false)" + ); - let manifest_file = create_manifest_file(vec![FieldSummary { - contains_null: true, - contains_nan: None, - lower_bound: None, - upper_bound: None, - }]); + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::GreaterThan, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .not() + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should skip: not(true)" + ); - let manifest_evaluator = - create_manifest_evaluator(schema, partition_spec, &filter, case_sensitive)?; + Ok(()) + } - let result = manifest_evaluator.eval(&manifest_file)?; + #[test] + fn test_less_than() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::LessThan, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should not read: id range below lower bound (5 < 30)" + ); - assert!(result); + Ok(()) + } + + #[test] + fn test_less_than_or_eq() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::LessThanOrEq, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should not read: id range below lower bound (5 < 30)" + ); Ok(()) } #[test] - fn test_manifest_file_trivial_partition_rejected_filter() -> Result<()> { + fn test_greater_than() -> Result<()> { let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::GreaterThan, + Reference::new("id"), + Datum::int(INT_MAX_VALUE + 6), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should not read: id range above upper bound (85 < 79)" + ); + + Ok(()) + } - let (schema, partition_spec) = create_schema_and_partition_spec()?; + #[test] + fn test_greater_than_or_eq() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::GreaterThanOrEq, + Reference::new("id"), + Datum::int(INT_MAX_VALUE + 6), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should not read: id range above upper bound (85 < 79)" + ); - let filter = Predicate::Unary(UnaryExpression::new( - PredicateOperator::IsNan, - Reference::new("a"), + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::GreaterThanOrEq, + Reference::new("id"), + Datum::int(INT_MAX_VALUE), )) .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should read: one possible id" + ); + + Ok(()) + } + + #[test] + fn test_eq() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::Eq, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should not read: id below lower bound" + ); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::Eq, + Reference::new("id"), + Datum::int(INT_MIN_VALUE), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should read: id equal to lower bound" + ); + + Ok(()) + } - let manifest_file = create_manifest_file(vec![FieldSummary { - contains_null: false, - contains_nan: None, - lower_bound: None, - upper_bound: None, - }]); + #[test] + fn test_not_eq() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::NotEq, + Reference::new("id"), + Datum::int(INT_MIN_VALUE - 25), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should read: id below lower bound" + ); - let manifest_evaluator = - create_manifest_evaluator(schema, partition_spec, &filter, case_sensitive)?; + Ok(()) + } - let result = manifest_evaluator.eval(&manifest_file).unwrap(); + #[test] + fn test_in() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Set(SetExpression::new( + PredicateOperator::In, + Reference::new("id"), + FnvHashSet::from_iter(vec![ + Datum::int(INT_MIN_VALUE - 25), + Datum::int(INT_MIN_VALUE - 24), + ]), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should not read: id below lower bound (5 < 30, 6 < 30)" + ); + + let filter = Predicate::Set(SetExpression::new( + PredicateOperator::In, + Reference::new("id"), + FnvHashSet::from_iter(vec![ + Datum::int(INT_MIN_VALUE - 1), + Datum::int(INT_MIN_VALUE), + ]), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should read: id equal to lower bound (30 == 30)" + ); - assert!(!result); + Ok(()) + } + + #[test] + fn test_not_in() -> Result<()> { + let case_sensitive = true; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Set(SetExpression::new( + PredicateOperator::NotIn, + Reference::new("id"), + FnvHashSet::from_iter(vec![ + Datum::int(INT_MIN_VALUE - 25), + Datum::int(INT_MIN_VALUE - 24), + ]), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should read: id below lower bound (5 < 30, 6 < 30)" + ); + + Ok(()) + } + + #[test] + fn test_starts_with() -> Result<()> { + let case_sensitive = false; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::StartsWith, + Reference::new("some_nulls"), + Datum::string("a"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should read: range matches" + ); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::StartsWith, + Reference::new("some_nulls"), + Datum::string("zzzz"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should skip: range doesn't match" + ); + + Ok(()) + } + + #[test] + fn test_not_starts_with() -> Result<()> { + let case_sensitive = false; + let schema = create_schema()?; + let partitions = create_partitions(); + let manifest_file = create_manifest_file(partitions); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::NotStartsWith, + Reference::new("some_nulls"), + Datum::string("a"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should read: range matches" + ); + + let filter = Predicate::Binary(BinaryExpression::new( + PredicateOperator::NotStartsWith, + Reference::new("no_nulls_same_value_a"), + Datum::string("a"), + )) + .bind(schema.clone(), case_sensitive)?; + assert!( + !ManifestEvaluator::new(filter, case_sensitive).eval(&manifest_file)?, + "Should not read: all values start with the prefix" + ); Ok(()) } From 0371fd41ca6b0931eef098fdca55aec5945ed7cb Mon Sep 17 00:00:00 2001 From: s-akhtar-baig Date: Thu, 9 May 2024 12:48:14 -0700 Subject: [PATCH 2/8] Fix code comments --- .../iceberg/src/expr/visitors/manifest_evaluator.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs index dd5295ea4..c312de69c 100644 --- a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs @@ -242,7 +242,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { let PrimitiveLiteral::String(prefix) = datum.literal() else { return Err(Error::new( ErrorKind::Unexpected, - "Cannot perfom starts_with on non-string value", + "Cannot perform starts_with on non-string value", )); }; @@ -252,7 +252,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { let Literal::Primitive(PrimitiveLiteral::String(lower_bound)) = lower_bound else { return Err(Error::new( ErrorKind::Unexpected, - "Cannot perfom starts_with on non-string lower bound", + "Cannot perform starts_with on non-string lower bound", )); }; @@ -267,7 +267,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { let Literal::Primitive(PrimitiveLiteral::String(upper_bound)) = upper_bound else { return Err(Error::new( ErrorKind::Unexpected, - "Cannot perfom starts_with on non-string upper bound", + "Cannot perform starts_with on non-string upper bound", )); }; @@ -296,7 +296,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { let PrimitiveLiteral::String(prefix) = datum.literal() else { return Err(Error::new( ErrorKind::Unexpected, - "Cannot perfom not_starts_with on non-string value", + "Cannot perform not_starts_with on non-string value", )); }; @@ -308,7 +308,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { let Literal::Primitive(PrimitiveLiteral::String(lower_bound)) = lower_bound else { return Err(Error::new( ErrorKind::Unexpected, - "Cannot perfom not_starts_with on non-string lower bound", + "Cannot perform not_starts_with on non-string lower bound", )); }; @@ -324,7 +324,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { else { return Err(Error::new( ErrorKind::Unexpected, - "Cannot perfom not_starts_with on non-string upper bound", + "Cannot perform not_starts_with on non-string upper bound", )); }; From 965b6b2a2e1ad45cc602aa9bf555bf7f1fc300d7 Mon Sep 17 00:00:00 2001 From: s-akhtar-baig Date: Thu, 9 May 2024 16:09:45 -0700 Subject: [PATCH 3/8] Refactor code and fixpredicate for is_some_and --- crates/iceberg/src/expr/visitors/manifest_evaluator.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs index c312de69c..8c5bd382e 100644 --- a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs @@ -101,6 +101,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { Ok(self.field_summary_for_reference(reference).contains_null) } + #[allow(clippy::bool_comparison)] fn not_null( &mut self, reference: &BoundReference, @@ -115,7 +116,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { if all_null && reference.field().field_type.is_floating_type() { // floating point types may include NaN values, which we check separately. // In case bounds don't include NaN value, contains_nan needs to be checked against. - all_null = field.contains_nan.is_some_and(|x| !x); + all_null = field.contains_nan.is_some_and(|x| x == false); } Ok(!all_null) @@ -134,13 +135,14 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { } } + #[allow(clippy::bool_comparison)] fn not_nan( &mut self, reference: &BoundReference, _predicate: &BoundPredicate, ) -> crate::Result { let field: &FieldSummary = self.field_summary_for_reference(reference); - if field.contains_nan.is_some_and(|x| x) + if field.contains_nan.is_some_and(|x| x == true) && !field.contains_null && field.lower_bound.is_none() { @@ -248,7 +250,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { let prefix_len = prefix.chars().count(); - if let Some(lower_bound) = field.lower_bound.clone() { + if let Some(lower_bound) = &field.lower_bound { let Literal::Primitive(PrimitiveLiteral::String(lower_bound)) = lower_bound else { return Err(Error::new( ErrorKind::Unexpected, @@ -263,7 +265,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { } } - if let Some(upper_bound) = field.upper_bound.clone() { + if let Some(upper_bound) = &field.upper_bound { let Literal::Primitive(PrimitiveLiteral::String(upper_bound)) = upper_bound else { return Err(Error::new( ErrorKind::Unexpected, From 766a4c4ef906490b65348e2836ee31e9ed8f3fee Mon Sep 17 00:00:00 2001 From: s-akhtar-baig Date: Thu, 9 May 2024 16:14:18 -0700 Subject: [PATCH 4/8] Refactor code --- crates/iceberg/src/expr/visitors/manifest_evaluator.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs index 8c5bd382e..9934a7cde 100644 --- a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs @@ -306,7 +306,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { // not_starts_with will match unless all values must start with the prefix. This happens when // the lower and upper bounds both start with the prefix. - if let Some(lower_bound) = field.lower_bound.clone() { + if let Some(lower_bound) = &field.lower_bound { let Literal::Primitive(PrimitiveLiteral::String(lower_bound)) = lower_bound else { return Err(Error::new( ErrorKind::Unexpected, @@ -321,7 +321,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { let truncated_lower_bound = String::from(&lower_bound[..prefix_len]); if prefix == &truncated_lower_bound { - if let Some(upper_bound) = field.upper_bound.clone() { + if let Some(upper_bound) = &field.upper_bound { let Literal::Primitive(PrimitiveLiteral::String(upper_bound)) = upper_bound else { return Err(Error::new( From 00448933dae5cbc45cec0dd88d7ec5dcf97e5ad0 Mon Sep 17 00:00:00 2001 From: s-akhtar-baig Date: Fri, 10 May 2024 13:36:07 -0700 Subject: [PATCH 5/8] Handle review comments --- .../src/expr/visitors/manifest_evaluator.rs | 87 +++++++++++-------- 1 file changed, 50 insertions(+), 37 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs index 9934a7cde..6425e55bd 100644 --- a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs @@ -17,7 +17,7 @@ use crate::expr::visitors::bound_predicate_visitor::{visit, BoundPredicateVisitor}; use crate::expr::{BoundPredicate, BoundReference}; -use crate::spec::{Datum, FieldSummary, Literal, ManifestFile, PrimitiveLiteral}; +use crate::spec::{Datum, FieldSummary, Literal, ManifestFile, PrimitiveLiteral, Type}; use crate::Result; use crate::{Error, ErrorKind}; use fnv::FnvHashSet; @@ -101,7 +101,6 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { Ok(self.field_summary_for_reference(reference).contains_null) } - #[allow(clippy::bool_comparison)] fn not_null( &mut self, reference: &BoundReference, @@ -111,28 +110,29 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { // contains_null encodes whether at least one partition value is null, // lowerBound is null if all partition values are null - let mut all_null: bool = field.contains_null && field.lower_bound.is_none(); - - if all_null && reference.field().field_type.is_floating_type() { - // floating point types may include NaN values, which we check separately. - // In case bounds don't include NaN value, contains_nan needs to be checked against. - all_null = field.contains_nan.is_some_and(|x| x == false); + if self.are_all_null(field, &reference.field().field_type) { + ROWS_CANNOT_MATCH + } else { + ROWS_MIGHT_MATCH } - - Ok(!all_null) } + #[allow(clippy::bool_comparison)] fn is_nan( &mut self, reference: &BoundReference, _predicate: &BoundPredicate, ) -> crate::Result { let field: &FieldSummary = self.field_summary_for_reference(reference); - if let Some(contains_nan) = field.contains_nan { - Ok(contains_nan) - } else { - ROWS_MIGHT_MATCH + if field.contains_nan.is_some_and(|x| x == false) { + return ROWS_CANNOT_MATCH; + } + + if self.are_all_null(field, &reference.field().field_type) { + return ROWS_CANNOT_MATCH; } + + ROWS_MIGHT_MATCH } #[allow(clippy::bool_comparison)] @@ -159,8 +159,8 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { _predicate: &BoundPredicate, ) -> crate::Result { let field: &FieldSummary = self.field_summary_for_reference(reference); - if let Some(lower_bound) = &field.lower_bound { - Ok(lower_bound < &Literal::from(datum.clone())) + if let Some(Literal::Primitive(lower_bound)) = &field.lower_bound { + Ok(lower_bound < datum.literal()) } else { ROWS_CANNOT_MATCH } @@ -173,8 +173,8 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { _predicate: &BoundPredicate, ) -> crate::Result { let field: &FieldSummary = self.field_summary_for_reference(reference); - if let Some(lower_bound) = &field.lower_bound { - Ok(lower_bound <= &Literal::from(datum.clone())) + if let Some(Literal::Primitive(lower_bound)) = &field.lower_bound { + Ok(lower_bound <= datum.literal()) } else { ROWS_CANNOT_MATCH } @@ -187,8 +187,8 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { _predicate: &BoundPredicate, ) -> crate::Result { let field: &FieldSummary = self.field_summary_for_reference(reference); - if let Some(upper_bound) = &field.upper_bound { - Ok(upper_bound > &Literal::from(datum.clone())) + if let Some(Literal::Primitive(upper_bound)) = &field.upper_bound { + Ok(upper_bound > datum.literal()) } else { ROWS_CANNOT_MATCH } @@ -201,8 +201,8 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { _predicate: &BoundPredicate, ) -> crate::Result { let field: &FieldSummary = self.field_summary_for_reference(reference); - if let Some(upper_bound) = &field.upper_bound { - Ok(upper_bound >= &Literal::from(datum.clone())) + if let Some(Literal::Primitive(upper_bound)) = &field.upper_bound { + Ok(upper_bound >= datum.literal()) } else { ROWS_CANNOT_MATCH } @@ -259,7 +259,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { }; let min_len = lower_bound.chars().count().min(prefix_len); - let truncated_lower_bound = String::from(&lower_bound[..min_len]); + let truncated_lower_bound = lower_bound.chars().take(min_len).collect::(); if prefix < &truncated_lower_bound { return ROWS_CANNOT_MATCH; } @@ -274,7 +274,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { }; let min_len = upper_bound.chars().count().min(prefix_len); - let truncated_upper_bound = String::from(&upper_bound[..min_len]); + let truncated_upper_bound = upper_bound.chars().take(min_len).collect::(); if prefix > &truncated_upper_bound { return ROWS_CANNOT_MATCH; } @@ -319,7 +319,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { return ROWS_MIGHT_MATCH; } - let truncated_lower_bound = String::from(&lower_bound[..prefix_len]); + let truncated_lower_bound = lower_bound.chars().take(prefix_len).collect::(); if prefix == &truncated_lower_bound { if let Some(upper_bound) = &field.upper_bound { let Literal::Primitive(PrimitiveLiteral::String(upper_bound)) = upper_bound @@ -335,7 +335,8 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { return ROWS_MIGHT_MATCH; } - let truncated_upper_bound = String::from(&upper_bound[..prefix_len]); + let truncated_upper_bound = + upper_bound.chars().take(prefix_len).collect::(); if prefix == &truncated_upper_bound { return ROWS_CANNOT_MATCH; } @@ -361,19 +362,16 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { return ROWS_MIGHT_MATCH; } - if literals - .iter() - .all(|datum| field.lower_bound.as_ref().unwrap() > &Literal::from(datum.clone())) - { - return ROWS_CANNOT_MATCH; + if let Some(Literal::Primitive(lower_bound)) = &field.lower_bound { + if literals.iter().all(|datum| lower_bound > datum.literal()) { + return ROWS_CANNOT_MATCH; + } } - if field.upper_bound.is_some() - && literals - .iter() - .all(|datum| field.upper_bound.as_ref().unwrap() < &Literal::from(datum.clone())) - { - return ROWS_CANNOT_MATCH; + if let Some(Literal::Primitive(upper_bound)) = &field.upper_bound { + if literals.iter().all(|datum| upper_bound < datum.literal()) { + return ROWS_CANNOT_MATCH; + } } ROWS_MIGHT_MATCH @@ -396,6 +394,21 @@ impl ManifestFilterVisitor<'_> { let pos = reference.accessor().position(); &self.partitions[pos] } + + #[allow(clippy::bool_comparison)] + fn are_all_null(&self, field: &FieldSummary, r#type: &Type) -> bool { + // contains_null encodes whether at least one partition value is null, + // lowerBound is null if all partition values are null + let mut all_null: bool = field.contains_null && field.lower_bound.is_none(); + + if all_null && r#type.is_floating_type() { + // floating point types may include NaN values, which we check separately. + // In case bounds don't include NaN value, contains_nan needs to be checked against. + all_null = field.contains_nan.is_some_and(|x| x == false); + } + + all_null + } } #[cfg(test)] From 552f59b6bbf2099089f91e1d58b26a268e6fd678 Mon Sep 17 00:00:00 2001 From: s-akhtar-baig Date: Wed, 15 May 2024 15:15:33 -0700 Subject: [PATCH 6/8] Handle review comments --- .../src/expr/visitors/manifest_evaluator.rs | 95 +++++++++++-------- 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs index 6425e55bd..3a61f84c1 100644 --- a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs @@ -106,7 +106,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { reference: &BoundReference, _predicate: &BoundPredicate, ) -> crate::Result { - let field: &FieldSummary = self.field_summary_for_reference(reference); + let field = self.field_summary_for_reference(reference); // contains_null encodes whether at least one partition value is null, // lowerBound is null if all partition values are null @@ -117,15 +117,16 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { } } - #[allow(clippy::bool_comparison)] fn is_nan( &mut self, reference: &BoundReference, _predicate: &BoundPredicate, ) -> crate::Result { - let field: &FieldSummary = self.field_summary_for_reference(reference); - if field.contains_nan.is_some_and(|x| x == false) { - return ROWS_CANNOT_MATCH; + let field = self.field_summary_for_reference(reference); + if let Some(contains_nan) = field.contains_nan { + if !contains_nan { + return ROWS_CANNOT_MATCH; + } } if self.are_all_null(field, &reference.field().field_type) { @@ -135,21 +136,18 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { ROWS_MIGHT_MATCH } - #[allow(clippy::bool_comparison)] fn not_nan( &mut self, reference: &BoundReference, _predicate: &BoundPredicate, ) -> crate::Result { - let field: &FieldSummary = self.field_summary_for_reference(reference); - if field.contains_nan.is_some_and(|x| x == true) - && !field.contains_null - && field.lower_bound.is_none() - { - ROWS_CANNOT_MATCH - } else { - ROWS_MIGHT_MATCH + let field = self.field_summary_for_reference(reference); + if let Some(contains_nan) = field.contains_nan { + if contains_nan && !field.contains_null && field.lower_bound.is_none() { + return ROWS_CANNOT_MATCH; + } } + ROWS_MIGHT_MATCH } fn less_than( @@ -158,12 +156,13 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - let field: &FieldSummary = self.field_summary_for_reference(reference); + let field = self.field_summary_for_reference(reference); if let Some(Literal::Primitive(lower_bound)) = &field.lower_bound { - Ok(lower_bound < datum.literal()) - } else { - ROWS_CANNOT_MATCH + if datum.literal() <= lower_bound { + return ROWS_CANNOT_MATCH; + } } + ROWS_MIGHT_MATCH } fn less_than_or_eq( @@ -172,12 +171,13 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - let field: &FieldSummary = self.field_summary_for_reference(reference); + let field = self.field_summary_for_reference(reference); if let Some(Literal::Primitive(lower_bound)) = &field.lower_bound { - Ok(lower_bound <= datum.literal()) - } else { - ROWS_CANNOT_MATCH + if datum.literal() < lower_bound { + return ROWS_CANNOT_MATCH; + } } + ROWS_MIGHT_MATCH } fn greater_than( @@ -186,12 +186,13 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - let field: &FieldSummary = self.field_summary_for_reference(reference); + let field = self.field_summary_for_reference(reference); if let Some(Literal::Primitive(upper_bound)) = &field.upper_bound { - Ok(upper_bound > datum.literal()) - } else { - ROWS_CANNOT_MATCH + if datum.literal() >= upper_bound { + return ROWS_CANNOT_MATCH; + } } + ROWS_MIGHT_MATCH } fn greater_than_or_eq( @@ -200,12 +201,13 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - let field: &FieldSummary = self.field_summary_for_reference(reference); + let field = self.field_summary_for_reference(reference); if let Some(Literal::Primitive(upper_bound)) = &field.upper_bound { - Ok(upper_bound >= datum.literal()) - } else { - ROWS_CANNOT_MATCH + if datum.literal() > upper_bound { + return ROWS_CANNOT_MATCH; + } } + ROWS_MIGHT_MATCH } fn eq( @@ -214,8 +216,25 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - Ok(self.less_than_or_eq(reference, datum, _predicate)? - && self.greater_than_or_eq(reference, datum, _predicate)?) + let field = self.field_summary_for_reference(reference); + + if field.lower_bound.is_none() || field.upper_bound.is_none() { + return ROWS_CANNOT_MATCH; + } + + if let Some(Literal::Primitive(lower_bound)) = &field.lower_bound { + if lower_bound > datum.literal() { + return ROWS_CANNOT_MATCH; + } + } + + if let Some(Literal::Primitive(upper_bound)) = &field.upper_bound { + if upper_bound < datum.literal() { + return ROWS_CANNOT_MATCH; + } + } + + ROWS_MIGHT_MATCH } fn not_eq( @@ -235,7 +254,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - let field: &FieldSummary = self.field_summary_for_reference(reference); + let field = self.field_summary_for_reference(reference); if field.lower_bound.is_none() || field.upper_bound.is_none() { return ROWS_CANNOT_MATCH; @@ -289,7 +308,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { datum: &Datum, _predicate: &BoundPredicate, ) -> crate::Result { - let field: &FieldSummary = self.field_summary_for_reference(reference); + let field = self.field_summary_for_reference(reference); if field.contains_null || field.lower_bound.is_none() || field.upper_bound.is_none() { return ROWS_MIGHT_MATCH; @@ -353,7 +372,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { literals: &FnvHashSet, _predicate: &BoundPredicate, ) -> crate::Result { - let field: &FieldSummary = self.field_summary_for_reference(reference); + let field = self.field_summary_for_reference(reference); if field.lower_bound.is_none() { return ROWS_CANNOT_MATCH; } @@ -395,7 +414,6 @@ impl ManifestFilterVisitor<'_> { &self.partitions[pos] } - #[allow(clippy::bool_comparison)] fn are_all_null(&self, field: &FieldSummary, r#type: &Type) -> bool { // contains_null encodes whether at least one partition value is null, // lowerBound is null if all partition values are null @@ -404,7 +422,10 @@ impl ManifestFilterVisitor<'_> { if all_null && r#type.is_floating_type() { // floating point types may include NaN values, which we check separately. // In case bounds don't include NaN value, contains_nan needs to be checked against. - all_null = field.contains_nan.is_some_and(|x| x == false); + all_null = match field.contains_nan { + Some(val) => !val, + None => false, + } } all_null From 4bbd9f6bbe4b8082fadf3ab1bba1c8ef763d66e3 Mon Sep 17 00:00:00 2001 From: s-akhtar-baig Date: Mon, 10 Jun 2024 10:13:01 -0700 Subject: [PATCH 7/8] Handle review comments --- .../src/expr/visitors/manifest_evaluator.rs | 127 ++++++++---------- 1 file changed, 59 insertions(+), 68 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs index 69d053928..4f1c416c9 100644 --- a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs @@ -17,7 +17,7 @@ use crate::expr::visitors::bound_predicate_visitor::{visit, BoundPredicateVisitor}; use crate::expr::{BoundPredicate, BoundReference}; -use crate::spec::{Datum, FieldSummary, Literal, ManifestFile, PrimitiveLiteral, Type}; +use crate::spec::{Datum, FieldSummary, ManifestFile, PrimitiveLiteral, Type}; use crate::Result; use crate::{Error, ErrorKind}; use fnv::FnvHashSet; @@ -110,7 +110,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { // contains_null encodes whether at least one partition value is null, // lowerBound is null if all partition values are null - if self.are_all_null(field, &reference.field().field_type) { + if ManifestFilterVisitor::are_all_null(field, &reference.field().field_type) { ROWS_CANNOT_MATCH } else { ROWS_MIGHT_MATCH @@ -129,7 +129,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { } } - if self.are_all_null(field, &reference.field().field_type) { + if ManifestFilterVisitor::are_all_null(field, &reference.field().field_type) { return ROWS_CANNOT_MATCH; } @@ -143,6 +143,7 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { ) -> crate::Result { let field = self.field_summary_for_reference(reference); if let Some(contains_nan) = field.contains_nan { + // check if all values are nan if contains_nan && !field.contains_null && field.lower_bound.is_none() { return ROWS_CANNOT_MATCH; } @@ -157,12 +158,11 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { _predicate: &BoundPredicate, ) -> crate::Result { let field = self.field_summary_for_reference(reference); - if let Some(Literal::Primitive(lower_bound)) = &field.lower_bound { - if datum.literal() <= lower_bound { - return ROWS_CANNOT_MATCH; - } + match &field.lower_bound { + Some(bound) if datum <= bound => ROWS_CANNOT_MATCH, + Some(_) => ROWS_MIGHT_MATCH, + None => ROWS_CANNOT_MATCH, } - ROWS_MIGHT_MATCH } fn less_than_or_eq( @@ -172,12 +172,11 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { _predicate: &BoundPredicate, ) -> crate::Result { let field = self.field_summary_for_reference(reference); - if let Some(Literal::Primitive(lower_bound)) = &field.lower_bound { - if datum.literal() < lower_bound { - return ROWS_CANNOT_MATCH; - } + match &field.lower_bound { + Some(bound) if datum < bound => ROWS_CANNOT_MATCH, + Some(_) => ROWS_MIGHT_MATCH, + None => ROWS_CANNOT_MATCH, } - ROWS_MIGHT_MATCH } fn greater_than( @@ -187,12 +186,11 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { _predicate: &BoundPredicate, ) -> crate::Result { let field = self.field_summary_for_reference(reference); - if let Some(Literal::Primitive(upper_bound)) = &field.upper_bound { - if datum.literal() >= upper_bound { - return ROWS_CANNOT_MATCH; - } + match &field.upper_bound { + Some(bound) if datum >= bound => ROWS_CANNOT_MATCH, + Some(_) => ROWS_MIGHT_MATCH, + None => ROWS_CANNOT_MATCH, } - ROWS_MIGHT_MATCH } fn greater_than_or_eq( @@ -202,12 +200,11 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { _predicate: &BoundPredicate, ) -> crate::Result { let field = self.field_summary_for_reference(reference); - if let Some(Literal::Primitive(upper_bound)) = &field.upper_bound { - if datum.literal() > upper_bound { - return ROWS_CANNOT_MATCH; - } + match &field.upper_bound { + Some(bound) if datum > bound => ROWS_CANNOT_MATCH, + Some(_) => ROWS_MIGHT_MATCH, + None => ROWS_CANNOT_MATCH, } - ROWS_MIGHT_MATCH } fn eq( @@ -222,14 +219,14 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { return ROWS_CANNOT_MATCH; } - if let Some(Literal::Primitive(lower_bound)) = &field.lower_bound { - if lower_bound > datum.literal() { + if let Some(lower_bound) = &field.lower_bound { + if lower_bound > datum { return ROWS_CANNOT_MATCH; } } - if let Some(Literal::Primitive(upper_bound)) = &field.upper_bound { - if upper_bound < datum.literal() { + if let Some(upper_bound) = &field.upper_bound { + if upper_bound < datum { return ROWS_CANNOT_MATCH; } } @@ -267,34 +264,32 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { )); }; - let prefix_len = prefix.chars().count(); + let prefix_len = prefix.len(); if let Some(lower_bound) = &field.lower_bound { - let Literal::Primitive(PrimitiveLiteral::String(lower_bound)) = lower_bound else { + let PrimitiveLiteral::String(lower_bound) = lower_bound.literal() else { return Err(Error::new( ErrorKind::Unexpected, "Cannot perform starts_with on non-string lower bound", )); }; - let min_len = lower_bound.chars().count().min(prefix_len); - let truncated_lower_bound = lower_bound.chars().take(min_len).collect::(); - if prefix < &truncated_lower_bound { + let min_len = lower_bound.len().min(prefix_len); + if prefix.as_bytes().lt(&lower_bound.as_bytes()[..min_len]) { return ROWS_CANNOT_MATCH; } } if let Some(upper_bound) = &field.upper_bound { - let Literal::Primitive(PrimitiveLiteral::String(upper_bound)) = upper_bound else { + let PrimitiveLiteral::String(upper_bound) = upper_bound.literal() else { return Err(Error::new( ErrorKind::Unexpected, "Cannot perform starts_with on non-string upper bound", )); }; - let min_len = upper_bound.chars().count().min(prefix_len); - let truncated_upper_bound = upper_bound.chars().take(min_len).collect::(); - if prefix > &truncated_upper_bound { + let min_len = upper_bound.len().min(prefix_len); + if prefix.as_bytes().gt(&upper_bound.as_bytes()[..min_len]) { return ROWS_CANNOT_MATCH; } } @@ -321,12 +316,12 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { )); }; - let prefix_len = prefix.chars().count(); + let prefix_len = prefix.len(); // not_starts_with will match unless all values must start with the prefix. This happens when // the lower and upper bounds both start with the prefix. if let Some(lower_bound) = &field.lower_bound { - let Literal::Primitive(PrimitiveLiteral::String(lower_bound)) = lower_bound else { + let PrimitiveLiteral::String(lower_bound) = lower_bound.literal() else { return Err(Error::new( ErrorKind::Unexpected, "Cannot perform not_starts_with on non-string lower bound", @@ -334,15 +329,13 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { }; // if lower is shorter than the prefix then lower doesn't start with the prefix - if prefix_len > lower_bound.chars().count() { + if prefix_len > lower_bound.len() { return ROWS_MIGHT_MATCH; } - let truncated_lower_bound = lower_bound.chars().take(prefix_len).collect::(); - if prefix == &truncated_lower_bound { + if prefix.as_bytes().eq(&lower_bound.as_bytes()[..prefix_len]) { if let Some(upper_bound) = &field.upper_bound { - let Literal::Primitive(PrimitiveLiteral::String(upper_bound)) = upper_bound - else { + let PrimitiveLiteral::String(upper_bound) = upper_bound.literal() else { return Err(Error::new( ErrorKind::Unexpected, "Cannot perform not_starts_with on non-string upper bound", @@ -350,13 +343,11 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { }; // if upper is shorter than the prefix then upper can't start with the prefix - if prefix_len > upper_bound.chars().count() { + if prefix_len > upper_bound.len() { return ROWS_MIGHT_MATCH; } - let truncated_upper_bound = - upper_bound.chars().take(prefix_len).collect::(); - if prefix == &truncated_upper_bound { + if prefix.as_bytes().eq(&upper_bound.as_bytes()[..prefix_len]) { return ROWS_CANNOT_MATCH; } } @@ -381,14 +372,14 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { return ROWS_MIGHT_MATCH; } - if let Some(Literal::Primitive(lower_bound)) = &field.lower_bound { - if literals.iter().all(|datum| lower_bound > datum.literal()) { + if let Some(lower_bound) = &field.lower_bound { + if literals.iter().all(|datum| lower_bound > datum) { return ROWS_CANNOT_MATCH; } } - if let Some(Literal::Primitive(upper_bound)) = &field.upper_bound { - if literals.iter().all(|datum| upper_bound < datum.literal()) { + if let Some(upper_bound) = &field.upper_bound { + if literals.iter().all(|datum| upper_bound < datum) { return ROWS_CANNOT_MATCH; } } @@ -414,7 +405,7 @@ impl ManifestFilterVisitor<'_> { &self.partitions[pos] } - fn are_all_null(&self, field: &FieldSummary, r#type: &Type) -> bool { + fn are_all_null(field: &FieldSummary, r#type: &Type) -> bool { // contains_null encodes whether at least one partition value is null, // lowerBound is null if all partition values are null let mut all_null: bool = field.contains_null && field.lower_bound.is_none(); @@ -440,8 +431,8 @@ mod test { UnaryExpression, }; use crate::spec::{ - Datum, FieldSummary, Literal, ManifestContentType, ManifestFile, NestedField, - PrimitiveType, Schema, SchemaRef, Type, + Datum, FieldSummary, ManifestContentType, ManifestFile, NestedField, PrimitiveType, Schema, + SchemaRef, Type, }; use crate::Result; use fnv::FnvHashSet; @@ -534,8 +525,8 @@ mod test { FieldSummary { contains_null: false, contains_nan: None, - lower_bound: Some(Literal::int(INT_MIN_VALUE)), - upper_bound: Some(Literal::int(INT_MAX_VALUE)), + lower_bound: Some(Datum::int(INT_MIN_VALUE)), + upper_bound: Some(Datum::int(INT_MAX_VALUE)), }, // all_nulls_missing_nan FieldSummary { @@ -548,22 +539,22 @@ mod test { FieldSummary { contains_null: true, contains_nan: None, - lower_bound: Some(Literal::string(STRING_MIN_VALUE)), - upper_bound: Some(Literal::string(STRING_MAX_VALUE)), + lower_bound: Some(Datum::string(STRING_MIN_VALUE)), + upper_bound: Some(Datum::string(STRING_MAX_VALUE)), }, // no_nulls FieldSummary { contains_null: false, contains_nan: None, - lower_bound: Some(Literal::string(STRING_MIN_VALUE)), - upper_bound: Some(Literal::string(STRING_MAX_VALUE)), + lower_bound: Some(Datum::string(STRING_MIN_VALUE)), + upper_bound: Some(Datum::string(STRING_MAX_VALUE)), }, // float FieldSummary { contains_null: true, contains_nan: None, - lower_bound: Some(Literal::float(0.0)), - upper_bound: Some(Literal::float(20.0)), + lower_bound: Some(Datum::float(0.0)), + upper_bound: Some(Datum::float(20.0)), }, // all_nulls_double FieldSummary { @@ -597,8 +588,8 @@ mod test { FieldSummary { contains_null: false, contains_nan: Some(false), - lower_bound: Some(Literal::float(0.0)), - upper_bound: Some(Literal::float(20.0)), + lower_bound: Some(Datum::float(0.0)), + upper_bound: Some(Datum::float(20.0)), }, // all_nulls_missing_nan_float FieldSummary { @@ -611,15 +602,15 @@ mod test { FieldSummary { contains_null: true, contains_nan: None, - lower_bound: Some(Literal::string(STRING_MIN_VALUE)), - upper_bound: Some(Literal::string(STRING_MIN_VALUE)), + lower_bound: Some(Datum::string(STRING_MIN_VALUE)), + upper_bound: Some(Datum::string(STRING_MIN_VALUE)), }, // no_nulls_same_value_a FieldSummary { contains_null: false, contains_nan: None, - lower_bound: Some(Literal::string(STRING_MIN_VALUE)), - upper_bound: Some(Literal::string(STRING_MIN_VALUE)), + lower_bound: Some(Datum::string(STRING_MIN_VALUE)), + upper_bound: Some(Datum::string(STRING_MIN_VALUE)), }, ] } From bf73b169788b3a2a4a427f8d7e9be7862f6129e0 Mon Sep 17 00:00:00 2001 From: s-akhtar-baig Date: Wed, 12 Jun 2024 14:16:38 -0700 Subject: [PATCH 8/8] Refactor code --- .../src/expr/visitors/manifest_evaluator.rs | 93 +++++++++---------- 1 file changed, 45 insertions(+), 48 deletions(-) diff --git a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs index 4f1c416c9..30ae58f3b 100644 --- a/crates/iceberg/src/expr/visitors/manifest_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/manifest_evaluator.rs @@ -257,39 +257,30 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { return ROWS_CANNOT_MATCH; } - let PrimitiveLiteral::String(prefix) = datum.literal() else { - return Err(Error::new( - ErrorKind::Unexpected, - "Cannot perform starts_with on non-string value", - )); - }; - + let prefix = ManifestFilterVisitor::datum_as_str( + datum, + "Cannot perform starts_with on non-string value", + )?; let prefix_len = prefix.len(); if let Some(lower_bound) = &field.lower_bound { - let PrimitiveLiteral::String(lower_bound) = lower_bound.literal() else { - return Err(Error::new( - ErrorKind::Unexpected, - "Cannot perform starts_with on non-string lower bound", - )); - }; - - let min_len = lower_bound.len().min(prefix_len); - if prefix.as_bytes().lt(&lower_bound.as_bytes()[..min_len]) { + let lower_bound_str = ManifestFilterVisitor::datum_as_str( + lower_bound, + "Cannot perform starts_with on non-string lower bound", + )?; + let min_len = lower_bound_str.len().min(prefix_len); + if prefix.as_bytes().lt(&lower_bound_str.as_bytes()[..min_len]) { return ROWS_CANNOT_MATCH; } } if let Some(upper_bound) = &field.upper_bound { - let PrimitiveLiteral::String(upper_bound) = upper_bound.literal() else { - return Err(Error::new( - ErrorKind::Unexpected, - "Cannot perform starts_with on non-string upper bound", - )); - }; - - let min_len = upper_bound.len().min(prefix_len); - if prefix.as_bytes().gt(&upper_bound.as_bytes()[..min_len]) { + let upper_bound_str = ManifestFilterVisitor::datum_as_str( + upper_bound, + "Cannot perform starts_with on non-string upper bound", + )?; + let min_len = upper_bound_str.len().min(prefix_len); + if prefix.as_bytes().gt(&upper_bound_str.as_bytes()[..min_len]) { return ROWS_CANNOT_MATCH; } } @@ -309,45 +300,44 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> { return ROWS_MIGHT_MATCH; } - let PrimitiveLiteral::String(prefix) = datum.literal() else { - return Err(Error::new( - ErrorKind::Unexpected, - "Cannot perform not_starts_with on non-string value", - )); - }; - + let prefix = ManifestFilterVisitor::datum_as_str( + datum, + "Cannot perform not_starts_with on non-string value", + )?; let prefix_len = prefix.len(); // not_starts_with will match unless all values must start with the prefix. This happens when // the lower and upper bounds both start with the prefix. if let Some(lower_bound) = &field.lower_bound { - let PrimitiveLiteral::String(lower_bound) = lower_bound.literal() else { - return Err(Error::new( - ErrorKind::Unexpected, - "Cannot perform not_starts_with on non-string lower bound", - )); - }; + let lower_bound_str = ManifestFilterVisitor::datum_as_str( + lower_bound, + "Cannot perform not_starts_with on non-string lower bound", + )?; // if lower is shorter than the prefix then lower doesn't start with the prefix - if prefix_len > lower_bound.len() { + if prefix_len > lower_bound_str.len() { return ROWS_MIGHT_MATCH; } - if prefix.as_bytes().eq(&lower_bound.as_bytes()[..prefix_len]) { + if prefix + .as_bytes() + .eq(&lower_bound_str.as_bytes()[..prefix_len]) + { if let Some(upper_bound) = &field.upper_bound { - let PrimitiveLiteral::String(upper_bound) = upper_bound.literal() else { - return Err(Error::new( - ErrorKind::Unexpected, - "Cannot perform not_starts_with on non-string upper bound", - )); - }; + let upper_bound_str = ManifestFilterVisitor::datum_as_str( + upper_bound, + "Cannot perform not_starts_with on non-string upper bound", + )?; // if upper is shorter than the prefix then upper can't start with the prefix - if prefix_len > upper_bound.len() { + if prefix_len > upper_bound_str.len() { return ROWS_MIGHT_MATCH; } - if prefix.as_bytes().eq(&upper_bound.as_bytes()[..prefix_len]) { + if prefix + .as_bytes() + .eq(&upper_bound_str.as_bytes()[..prefix_len]) + { return ROWS_CANNOT_MATCH; } } @@ -421,6 +411,13 @@ impl ManifestFilterVisitor<'_> { all_null } + + fn datum_as_str<'a>(bound: &'a Datum, err_msg: &str) -> crate::Result<&'a String> { + let PrimitiveLiteral::String(bound) = bound.literal() else { + return Err(Error::new(ErrorKind::Unexpected, err_msg)); + }; + Ok(bound) + } } #[cfg(test)]