From 3b962d35623acf66bc8ab6da7315c13c55a03f41 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 3 Apr 2024 07:40:14 -0400 Subject: [PATCH 1/5] Minor: Improve documentation for FilterPushdown and remove deprecated method --- datafusion/core/src/datasource/provider.rs | 43 +++++++++++++--------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/datafusion/core/src/datasource/provider.rs b/datafusion/core/src/datasource/provider.rs index f2e3e907e5ce..1c556bb95446 100644 --- a/datafusion/core/src/datasource/provider.rs +++ b/datafusion/core/src/datasource/provider.rs @@ -96,6 +96,10 @@ pub trait TableProvider: Sync + Send { /// which *all* of the `Expr`s evaluate to `true` must be returned (aka the /// expressions are `AND`ed together). /// + /// To enable filter pushdown you must override of + /// [`Self::supports_filters_pushdown`] as the default implementation does + /// not and `filters` will be empty. + /// /// DataFusion pushes filtering into the scans whenever possible /// ("Filter Pushdown"), and depending on the format and the /// implementation of the format, evaluating the predicate during the scan @@ -154,28 +158,31 @@ pub trait TableProvider: Sync + Send { limit: Option, ) -> Result>; - /// Tests whether the table provider can make use of a filter expression - /// to optimise data retrieval. - #[deprecated(since = "20.0.0", note = "use supports_filters_pushdown instead")] - fn supports_filter_pushdown( - &self, - _filter: &Expr, - ) -> Result { - Ok(TableProviderFilterPushDown::Unsupported) - } - - /// Tests whether the table provider can make use of any or all filter expressions - /// to optimise data retrieval. - /// Note: the returned vector much have the same size as the filters argument. - #[allow(deprecated)] + /// Specify if DataFusion should provide filter expressions to the + /// TableProvider to apply *during* the scan. + /// + /// The return value must have one element for each filter expression passed + /// in. The value of each element indicates if the TableProvider can apply + /// that particular filter during the scan. + /// + /// Some TableProviders can evaluate filters more efficiently than the + /// `Filter` operator in DataFusion, for example by using an index. + /// + /// By default, returns [`Unsupported`] for all filters, meaning no filters + /// will be provided to [`Self::scan`]. If the TableProvider can implement + /// filter pushdown, it should return either [`Exact`] or [`Inexact`]. + /// + /// [`Unsupported`]: TableProviderFilterPushDown::Unsupported + /// [`Exact`]: TableProviderFilterPushDown::Exact + /// [`Inexact`]: TableProviderFilterPushDown::Inexact fn supports_filters_pushdown( &self, filters: &[&Expr], ) -> Result> { - filters - .iter() - .map(|f| self.supports_filter_pushdown(f)) - .collect() + Ok(vec![ + TableProviderFilterPushDown::Unsupported; + filters.len() + ]) } /// Get statistics for this table, if available From 61b4193996b2f61b15b54ae2241d200c10b440c3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 3 Apr 2024 07:49:25 -0400 Subject: [PATCH 2/5] Remove deprecated method --- datafusion/core/src/dataframe/mod.rs | 8 ++-- .../core/src/datasource/cte_worktable.rs | 11 +++-- .../core/src/datasource/listing/table.rs | 44 +++++++++++-------- datafusion/core/src/datasource/view.rs | 9 ++-- .../provider_filter_pushdown.rs | 7 ++- 5 files changed, 45 insertions(+), 34 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 1db4f8ede692..683cb809a5b1 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -1430,12 +1430,12 @@ impl TableProvider for DataFrameTableProvider { Some(&self.plan) } - fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - _filter: &Expr, - ) -> Result { + filters: &[&Expr], + ) -> Result> { // A filter is added on the DataFrame when given - Ok(TableProviderFilterPushDown::Exact) + Ok(vec![TableProviderFilterPushDown::Exact; filters.len()]) } fn schema(&self) -> SchemaRef { diff --git a/datafusion/core/src/datasource/cte_worktable.rs b/datafusion/core/src/datasource/cte_worktable.rs index 71075839b9a0..f8fd94d4d3fd 100644 --- a/datafusion/core/src/datasource/cte_worktable.rs +++ b/datafusion/core/src/datasource/cte_worktable.rs @@ -89,11 +89,14 @@ impl TableProvider for CteWorkTable { ))) } - fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - _filter: &Expr, - ) -> Result { + filters: &[&Expr], + ) -> Result> { // TODO: should we support filter pushdown? - Ok(TableProviderFilterPushDown::Unsupported) + Ok(vec![ + TableProviderFilterPushDown::Unsupported; + filters.len() + ]) } } diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index c1e337b5c44a..5ef7b6241b60 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -685,26 +685,32 @@ impl TableProvider for ListingTable { .await } - fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - filter: &Expr, - ) -> Result { - if expr_applicable_for_cols( - &self - .options - .table_partition_cols - .iter() - .map(|x| x.0.clone()) - .collect::>(), - filter, - ) { - // if filter can be handled by partiton pruning, it is exact - Ok(TableProviderFilterPushDown::Exact) - } else { - // otherwise, we still might be able to handle the filter with file - // level mechanisms such as Parquet row group pruning. - Ok(TableProviderFilterPushDown::Inexact) - } + filters: &[&Expr], + ) -> Result> { + let support: Vec<_> = filters + .iter() + .map(|filter| { + if expr_applicable_for_cols( + &self + .options + .table_partition_cols + .iter() + .map(|x| x.0.clone()) + .collect::>(), + filter, + ) { + // if filter can be handled by partition pruning, it is exact + TableProviderFilterPushDown::Exact + } else { + // otherwise, we still might be able to handle the filter with file + // level mechanisms such as Parquet row group pruning. + TableProviderFilterPushDown::Inexact + } + }) + .collect(); + Ok(support) } fn get_table_definition(&self) -> Option<&str> { diff --git a/datafusion/core/src/datasource/view.rs b/datafusion/core/src/datasource/view.rs index d1b7dad15225..31e812332c94 100644 --- a/datafusion/core/src/datasource/view.rs +++ b/datafusion/core/src/datasource/view.rs @@ -93,13 +93,12 @@ impl TableProvider for ViewTable { fn get_table_definition(&self) -> Option<&str> { self.definition.as_deref() } - - fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - _filter: &Expr, - ) -> Result { + filters: &[&Expr], + ) -> Result> { // A filter is added on the View when given - Ok(TableProviderFilterPushDown::Exact) + Ok(vec![TableProviderFilterPushDown::Exact; filters.len()]) } async fn scan( diff --git a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs index bc6d85a74a51..2ae41391f42d 100644 --- a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs +++ b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs @@ -214,8 +214,11 @@ impl TableProvider for CustomProvider { } } - fn supports_filter_pushdown(&self, _: &Expr) -> Result { - Ok(TableProviderFilterPushDown::Exact) + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + Ok(vec![TableProviderFilterPushDown::Exact; filters.len()]) } } From a48f93f38b48168b609c034683d447aa8f475d1b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 3 Apr 2024 08:42:23 -0400 Subject: [PATCH 3/5] Improve docs --- datafusion/expr/src/table_source.rs | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/datafusion/expr/src/table_source.rs b/datafusion/expr/src/table_source.rs index 565f48c1c5a9..3f734f04d4ad 100644 --- a/datafusion/expr/src/table_source.rs +++ b/datafusion/expr/src/table_source.rs @@ -24,20 +24,28 @@ use datafusion_common::{Constraints, Result}; use std::any::Any; -/// Indicates whether and how a filter expression can be handled by a -/// TableProvider for table scans. +/// Indicates how a filter expression is handled by +/// [`TableProvider::scan`]. +/// +/// Filter expressions are boolean expressions used to reduce the number of +/// rows that are read from a table. Only rows that evaluate to `true` ("pass +/// the filter") are returned. +/// +/// [`TableProvider::scan`]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html#tymethod.scan #[derive(Debug, Clone, PartialEq, Eq)] pub enum TableProviderFilterPushDown { - /// The expression cannot be used by the provider. + /// The filter cannot be used by the provider and will not be pushed down. Unsupported, - /// The expression can be used to reduce the data retrieved, - /// but the provider cannot guarantee it will omit all tuples that - /// may be filtered. In this case, DataFusion will apply an additional - /// `Filter` operation after the scan to ensure all rows are filtered correctly. + /// The filter can be used, but the provider might still return some tuples + /// that do not pass the filter. + /// + /// In this case, DataFusion applies an additional `Filter` operation + /// after the scan to ensure all rows are filtered correctly. Inexact, - /// The provider **guarantees** that it will omit **all** tuples that are - /// filtered by the filter expression. This is the fastest option, if available - /// as DataFusion will not apply additional filtering. + /// The provider **guarantees** that it will omit **only** tuples which + /// pass the filter. + /// + /// In this case, DataFusion will not apply additional filtering. Exact, } From 111dd95534a7139533338c0ccb59e3dd82a2bc91 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 3 Apr 2024 08:42:56 -0400 Subject: [PATCH 4/5] more --- datafusion/expr/src/table_source.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/table_source.rs b/datafusion/expr/src/table_source.rs index 3f734f04d4ad..9d46b289a4b4 100644 --- a/datafusion/expr/src/table_source.rs +++ b/datafusion/expr/src/table_source.rs @@ -27,9 +27,10 @@ use std::any::Any; /// Indicates how a filter expression is handled by /// [`TableProvider::scan`]. /// -/// Filter expressions are boolean expressions used to reduce the number of +/// Filter expressions are boolean expressions used to reduce the number of /// rows that are read from a table. Only rows that evaluate to `true` ("pass -/// the filter") are returned. +/// the filter") are returned. Rows that evaluate to `false` or `NULL` are +/// omitted. /// /// [`TableProvider::scan`]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html#tymethod.scan #[derive(Debug, Clone, PartialEq, Eq)] From 5eec7843fdc48cb5de44ca96f00411eb3553f4c7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 3 Apr 2024 12:12:38 -0400 Subject: [PATCH 5/5] Apply suggestions from code review Co-authored-by: Phillip LeBlanc --- datafusion/core/src/datasource/provider.rs | 4 ++-- datafusion/expr/src/table_source.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/datasource/provider.rs b/datafusion/core/src/datasource/provider.rs index 1c556bb95446..9aac072ed4e2 100644 --- a/datafusion/core/src/datasource/provider.rs +++ b/datafusion/core/src/datasource/provider.rs @@ -96,7 +96,7 @@ pub trait TableProvider: Sync + Send { /// which *all* of the `Expr`s evaluate to `true` must be returned (aka the /// expressions are `AND`ed together). /// - /// To enable filter pushdown you must override of + /// To enable filter pushdown you must override /// [`Self::supports_filters_pushdown`] as the default implementation does /// not and `filters` will be empty. /// @@ -166,7 +166,7 @@ pub trait TableProvider: Sync + Send { /// that particular filter during the scan. /// /// Some TableProviders can evaluate filters more efficiently than the - /// `Filter` operator in DataFusion, for example by using an index. + /// `Filter` operator in DataFusion, for example by using an index. /// /// By default, returns [`Unsupported`] for all filters, meaning no filters /// will be provided to [`Self::scan`]. If the TableProvider can implement diff --git a/datafusion/expr/src/table_source.rs b/datafusion/expr/src/table_source.rs index 9d46b289a4b4..f662f4d9f77d 100644 --- a/datafusion/expr/src/table_source.rs +++ b/datafusion/expr/src/table_source.rs @@ -43,7 +43,7 @@ pub enum TableProviderFilterPushDown { /// In this case, DataFusion applies an additional `Filter` operation /// after the scan to ensure all rows are filtered correctly. Inexact, - /// The provider **guarantees** that it will omit **only** tuples which + /// The provider **guarantees** that it will omit **only** tuples which /// pass the filter. /// /// In this case, DataFusion will not apply additional filtering.