From f73dbc30bb80a87d5b57c1cdb0232d15f0a5a75a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 25 Aug 2024 07:06:37 -0400 Subject: [PATCH] Minor: improve `RowFilter` and `ArrowPredicate` docs (#6301) * Minor: improve `RowFilter` and `ArrowPredicate` docs * tweak --- parquet/src/arrow/arrow_reader/filter.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs index 4686e1512727..2e22f7e01cf0 100644 --- a/parquet/src/arrow/arrow_reader/filter.rs +++ b/parquet/src/arrow/arrow_reader/filter.rs @@ -20,6 +20,8 @@ use arrow_array::{BooleanArray, RecordBatch}; use arrow_schema::ArrowError; /// A predicate operating on [`RecordBatch`] +/// +/// See [`RowFilter`] for more information on the use of this trait. pub trait ArrowPredicate: Send + 'static { /// Returns the [`ProjectionMask`] that describes the columns required /// to evaluate this predicate. All projected columns will be provided in the `batch` @@ -29,7 +31,7 @@ pub trait ArrowPredicate: Send + 'static { /// Evaluate this predicate for the given [`RecordBatch`] containing the columns /// identified by [`Self::projection`] /// - /// Must return a [`BooleanArray`] that has the same length as the input + /// Must return a [`BooleanArray`] that has the same length as the input /// `batch` where each row indicates whether the row should be returned: /// * `true`:the row should be returned /// * `false` or `null`: the row should not be returned @@ -68,12 +70,17 @@ where } } -/// A [`RowFilter`] allows pushing down a filter predicate to skip IO and decode +/// Filter applied *during* the parquet read process +/// +/// [`RowFilter`] applies predicates in order, after decoding only the columns +/// required. As predicates eliminate rows, fewer rows from subsequent columns +/// may be required, thus potentially reducing IO and decode. /// -/// This consists of a list of [`ArrowPredicate`] where only the rows that satisfy all -/// of the predicates will be returned. Any [`RowSelection`] will be applied prior +/// A `RowFilter` consists of a list of [`ArrowPredicate`]s. Only the rows for which +/// all the predicates evaluate to `true` will be returned. +/// Any [`RowSelection`] provided to the reader will be applied prior /// to the first predicate, and each predicate in turn will then be used to compute -/// a more refined [`RowSelection`] to use when evaluating the subsequent predicates. +/// a more refined [`RowSelection`] used when evaluating the subsequent predicates. /// /// Once all predicates have been evaluated, the final [`RowSelection`] is applied /// to the top-level [`ProjectionMask`] to produce the final output [`RecordBatch`].