Skip to content

Commit

Permalink
Improve docs for TableProvider::supports_filters_pushdown and remov…
Browse files Browse the repository at this point in the history
…e deprecated function (#9923)

* Minor: Improve documentation for FilterPushdown and remove deprecated method

* Remove deprecated method

* Improve docs

* more

* Apply suggestions from code review

Co-authored-by: Phillip LeBlanc <phillip@leblanc.tech>

---------

Co-authored-by: Phillip LeBlanc <phillip@leblanc.tech>
  • Loading branch information
alamb and phillipleblanc authored Apr 5, 2024
1 parent 1553a36 commit 308ebc5
Show file tree
Hide file tree
Showing 7 changed files with 89 additions and 62 deletions.
8 changes: 4 additions & 4 deletions datafusion/core/src/dataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1430,12 +1430,12 @@ impl TableProvider for DataFrameTableProvider {
Some(&self.plan)
}

fn supports_filter_pushdown(
fn supports_filters_pushdown(
&self,
_filter: &Expr,
) -> Result<TableProviderFilterPushDown> {
filters: &[&Expr],
) -> Result<Vec<TableProviderFilterPushDown>> {
// A filter is added on the DataFrame when given
Ok(TableProviderFilterPushDown::Exact)
Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
}

fn schema(&self) -> SchemaRef {
Expand Down
11 changes: 7 additions & 4 deletions datafusion/core/src/datasource/cte_worktable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,14 @@ impl TableProvider for CteWorkTable {
)))
}

fn supports_filter_pushdown(
fn supports_filters_pushdown(
&self,
_filter: &Expr,
) -> Result<TableProviderFilterPushDown> {
filters: &[&Expr],
) -> Result<Vec<TableProviderFilterPushDown>> {
// TODO: should we support filter pushdown?
Ok(TableProviderFilterPushDown::Unsupported)
Ok(vec![
TableProviderFilterPushDown::Unsupported;
filters.len()
])
}
}
44 changes: 25 additions & 19 deletions datafusion/core/src/datasource/listing/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -685,26 +685,32 @@ impl TableProvider for ListingTable {
.await
}

fn supports_filter_pushdown(
fn supports_filters_pushdown(
&self,
filter: &Expr,
) -> Result<TableProviderFilterPushDown> {
if expr_applicable_for_cols(
&self
.options
.table_partition_cols
.iter()
.map(|x| x.0.clone())
.collect::<Vec<_>>(),
filter,
) {
// if filter can be handled by partiton pruning, it is exact
Ok(TableProviderFilterPushDown::Exact)
} else {
// otherwise, we still might be able to handle the filter with file
// level mechanisms such as Parquet row group pruning.
Ok(TableProviderFilterPushDown::Inexact)
}
filters: &[&Expr],
) -> Result<Vec<TableProviderFilterPushDown>> {
let support: Vec<_> = filters
.iter()
.map(|filter| {
if expr_applicable_for_cols(
&self
.options
.table_partition_cols
.iter()
.map(|x| x.0.clone())
.collect::<Vec<_>>(),
filter,
) {
// if filter can be handled by partition pruning, it is exact
TableProviderFilterPushDown::Exact
} else {
// otherwise, we still might be able to handle the filter with file
// level mechanisms such as Parquet row group pruning.
TableProviderFilterPushDown::Inexact
}
})
.collect();
Ok(support)
}

fn get_table_definition(&self) -> Option<&str> {
Expand Down
43 changes: 25 additions & 18 deletions datafusion/core/src/datasource/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ pub trait TableProvider: Sync + Send {
/// which *all* of the `Expr`s evaluate to `true` must be returned (aka the
/// expressions are `AND`ed together).
///
/// To enable filter pushdown you must override
/// [`Self::supports_filters_pushdown`] as the default implementation does
/// not and `filters` will be empty.
///
/// DataFusion pushes filtering into the scans whenever possible
/// ("Filter Pushdown"), and depending on the format and the
/// implementation of the format, evaluating the predicate during the scan
Expand Down Expand Up @@ -154,28 +158,31 @@ pub trait TableProvider: Sync + Send {
limit: Option<usize>,
) -> Result<Arc<dyn ExecutionPlan>>;

/// Tests whether the table provider can make use of a filter expression
/// to optimise data retrieval.
#[deprecated(since = "20.0.0", note = "use supports_filters_pushdown instead")]
fn supports_filter_pushdown(
&self,
_filter: &Expr,
) -> Result<TableProviderFilterPushDown> {
Ok(TableProviderFilterPushDown::Unsupported)
}

/// Tests whether the table provider can make use of any or all filter expressions
/// to optimise data retrieval.
/// Note: the returned vector much have the same size as the filters argument.
#[allow(deprecated)]
/// Specify if DataFusion should provide filter expressions to the
/// TableProvider to apply *during* the scan.
///
/// The return value must have one element for each filter expression passed
/// in. The value of each element indicates if the TableProvider can apply
/// that particular filter during the scan.
///
/// Some TableProviders can evaluate filters more efficiently than the
/// `Filter` operator in DataFusion, for example by using an index.
///
/// By default, returns [`Unsupported`] for all filters, meaning no filters
/// will be provided to [`Self::scan`]. If the TableProvider can implement
/// filter pushdown, it should return either [`Exact`] or [`Inexact`].
///
/// [`Unsupported`]: TableProviderFilterPushDown::Unsupported
/// [`Exact`]: TableProviderFilterPushDown::Exact
/// [`Inexact`]: TableProviderFilterPushDown::Inexact
fn supports_filters_pushdown(
&self,
filters: &[&Expr],
) -> Result<Vec<TableProviderFilterPushDown>> {
filters
.iter()
.map(|f| self.supports_filter_pushdown(f))
.collect()
Ok(vec![
TableProviderFilterPushDown::Unsupported;
filters.len()
])
}

/// Get statistics for this table, if available
Expand Down
9 changes: 4 additions & 5 deletions datafusion/core/src/datasource/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,12 @@ impl TableProvider for ViewTable {
fn get_table_definition(&self) -> Option<&str> {
self.definition.as_deref()
}

fn supports_filter_pushdown(
fn supports_filters_pushdown(
&self,
_filter: &Expr,
) -> Result<TableProviderFilterPushDown> {
filters: &[&Expr],
) -> Result<Vec<TableProviderFilterPushDown>> {
// A filter is added on the View when given
Ok(TableProviderFilterPushDown::Exact)
Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
}

async fn scan(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,11 @@ impl TableProvider for CustomProvider {
}
}

fn supports_filter_pushdown(&self, _: &Expr) -> Result<TableProviderFilterPushDown> {
Ok(TableProviderFilterPushDown::Exact)
fn supports_filters_pushdown(
&self,
filters: &[&Expr],
) -> Result<Vec<TableProviderFilterPushDown>> {
Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
}
}

Expand Down
29 changes: 19 additions & 10 deletions datafusion/expr/src/table_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,29 @@ use datafusion_common::{Constraints, Result};

use std::any::Any;

/// Indicates whether and how a filter expression can be handled by a
/// TableProvider for table scans.
/// Indicates how a filter expression is handled by
/// [`TableProvider::scan`].
///
/// Filter expressions are boolean expressions used to reduce the number of
/// rows that are read from a table. Only rows that evaluate to `true` ("pass
/// the filter") are returned. Rows that evaluate to `false` or `NULL` are
/// omitted.
///
/// [`TableProvider::scan`]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html#tymethod.scan
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TableProviderFilterPushDown {
/// The expression cannot be used by the provider.
/// The filter cannot be used by the provider and will not be pushed down.
Unsupported,
/// The expression can be used to reduce the data retrieved,
/// but the provider cannot guarantee it will omit all tuples that
/// may be filtered. In this case, DataFusion will apply an additional
/// `Filter` operation after the scan to ensure all rows are filtered correctly.
/// The filter can be used, but the provider might still return some tuples
/// that do not pass the filter.
///
/// In this case, DataFusion applies an additional `Filter` operation
/// after the scan to ensure all rows are filtered correctly.
Inexact,
/// The provider **guarantees** that it will omit **all** tuples that are
/// filtered by the filter expression. This is the fastest option, if available
/// as DataFusion will not apply additional filtering.
/// The provider **guarantees** that it will omit **only** tuples which
/// pass the filter.
///
/// In this case, DataFusion will not apply additional filtering.
Exact,
}

Expand Down

0 comments on commit 308ebc5

Please sign in to comment.