Skip to content

Commit

Permalink
parquet: Add tests for page pruning on unsigned integers (apache#9888)
Browse files Browse the repository at this point in the history
  • Loading branch information
progval authored Mar 31, 2024
1 parent ef601d2 commit a23f507
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 2 deletions.
43 changes: 41 additions & 2 deletions datafusion/core/tests/parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use arrow::{
Array, ArrayRef, BinaryArray, Date32Array, Date64Array, FixedSizeBinaryArray,
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray,
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
TimestampSecondArray,
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
},
datatypes::{DataType, Field, Schema},
record_batch::RecordBatch,
Expand Down Expand Up @@ -65,6 +65,7 @@ enum Scenario {
Dates,
Int,
Int32Range,
UInt,
Float64,
Decimal,
DecimalBloomFilterInt32,
Expand Down Expand Up @@ -387,7 +388,7 @@ fn make_timestamp_batch(offset: Duration) -> RecordBatch {
.unwrap()
}

/// Return record batch with i32 sequence
/// Return record batch with i8, i16, i32, and i64 sequences
///
/// Columns are named
/// "i8" -> Int8Array
Expand Down Expand Up @@ -417,6 +418,36 @@ fn make_int_batches(start: i8, end: i8) -> RecordBatch {
.unwrap()
}

/// Return record batch with u8, u16, u32, and u64 sequences
///
/// Every column holds the half-open range `start..end` (`end` itself is
/// excluded), widened losslessly to that column's integer type. All four
/// fields are declared nullable, but no nulls are produced here.
///
/// Columns are named
/// "u8" -> UInt8Array
/// "u16" -> UInt16Array
/// "u32" -> UInt32Array
/// "u64" -> UInt64Array
///
/// # Panics
///
/// Panics if `RecordBatch::try_new` returns an error for the built columns.
fn make_uint_batches(start: u8, end: u8) -> RecordBatch {
    let schema = Arc::new(Schema::new(vec![
        Field::new("u8", DataType::UInt8, true),
        Field::new("u16", DataType::UInt16, true),
        Field::new("u32", DataType::UInt32, true),
        Field::new("u64", DataType::UInt64, true),
    ]));
    // `From` makes the widening explicit and guarantees it can never truncate,
    // unlike an inferred `as _` cast.
    let v8: Vec<u8> = (start..end).collect();
    let v16: Vec<u16> = (u16::from(start)..u16::from(end)).collect();
    let v32: Vec<u32> = (u32::from(start)..u32::from(end)).collect();
    let v64: Vec<u64> = (u64::from(start)..u64::from(end)).collect();
    RecordBatch::try_new(
        schema,
        vec![
            Arc::new(UInt8Array::from(v8)) as ArrayRef,
            Arc::new(UInt16Array::from(v16)) as ArrayRef,
            Arc::new(UInt32Array::from(v32)) as ArrayRef,
            Arc::new(UInt64Array::from(v64)) as ArrayRef,
        ],
    )
    .unwrap()
}

fn make_int32_range(start: i32, end: i32) -> RecordBatch {
let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)]));
let v = vec![start, end];
Expand Down Expand Up @@ -620,6 +651,14 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
Scenario::Int32Range => {
vec![make_int32_range(0, 10), make_int32_range(200000, 300000)]
}
Scenario::UInt => {
vec![
make_uint_batches(0, 5),
make_uint_batches(1, 6),
make_uint_batches(5, 10),
make_uint_batches(250, 255),
]
}
Scenario::Float64 => {
vec![
make_f64_batch(vec![-5.0, -4.0, -3.0, -2.0, -1.0]),
Expand Down
114 changes: 114 additions & 0 deletions datafusion/core/tests/parquet/page_pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,120 @@ int_tests!(16);
int_tests!(32);
int_tests!(64);

// Generates the page-pruning test suite for one unsigned integer column.
//
// `$bits` is the column width (8, 16, 32 or 64); it is spliced into both the
// generated test names (`prune_uint<bits>_...`) and the column name `u<bits>`
// used in each SQL predicate. Every test runs against `Scenario::UInt`.
//
// NOTE(review): the three trailing arguments passed to `test_prune` look like
// (expected errors, expected rows pruned, expected result rows) -- confirm
// against the definition of `test_prune`, which is not visible here.
macro_rules! uint_tests {
($bits:expr) => {
paste::item! {
#[tokio::test]
// null count min max
// page-0 0 0 4
// page-1 0 1 5
// page-2 0 5 9
// page-3 0 250 254
//
// The min/max values above follow from the four `make_uint_batches`
// ranges used for `Scenario::UInt`: 0..5, 1..6, 5..10 and 250..255.
//
// `u < 6` matches 5 + 5 + 1 = 11 values; only the 250..=254 range (5 values)
// cannot match.
async fn [<prune_uint $bits _lt>]() {
test_prune(
Scenario::UInt,
&format!("SELECT * FROM t where u{} < 6", $bits),
Some(0),
Some(5),
11,
)
.await;
}

#[tokio::test]
async fn [<prune_uint $bits _gt >]() {
// Only 254 is greater than 253; the three low ranges (15 values) cannot match.
test_prune(
Scenario::UInt,
&format!("SELECT * FROM t where u{} > 253", $bits),
Some(0),
Some(15),
1,
)
.await;
}

#[tokio::test]
async fn [<prune_uint $bits _eq >]() {
// 6 occurs once, in the 5..=9 range; the other three ranges (15 values) exclude it.
test_prune(
Scenario::UInt,
&format!("SELECT * FROM t where u{} = 6", $bits),
Some(0),
Some(15),
1,
)
.await;
}

#[tokio::test]
async fn [<prune_uint $bits _scalar_fun_and_eq >]() {
// Same expectations as the plain `= 6` test: the `u = 6` conjunct is still
// a direct column comparison.
test_prune(
Scenario::UInt,
&format!("SELECT * FROM t where power(u{}, 2) = 36 and u{} = 6", $bits, $bits),
Some(0),
Some(15),
1,
)
.await;
}

#[tokio::test]
async fn [<prune_uint $bits _scalar_fun >]() {
// The column only appears inside `power(...)`; nothing is expected to be
// pruned. The value 5 occurs twice (ranges 1..=5 and 5..=9) -> 2 rows.
test_prune(
Scenario::UInt,
&format!("SELECT * FROM t where power(u{}, 2) = 25", $bits),
Some(0),
Some(0),
2,
)
.await;
}

#[tokio::test]
async fn [<prune_uint $bits _complex_expr>]() {
// The column only appears inside `u + 1`; nothing is expected to be pruned.
// `u + 1 = 6` holds for the two occurrences of 5 -> 2 rows.
test_prune(
Scenario::UInt,
&format!("SELECT * FROM t where u{}+1 = 6", $bits),
Some(0),
Some(0),
2,
)
.await;
}

#[tokio::test]
async fn [<prune_uint $bits _eq_in_list >]() {
// result of sql "SELECT * FROM t where u{bits} in (6)"
// Expectations match the `= 6` test: 15 values pruned, 1 row returned.
test_prune(
Scenario::UInt,
&format!("SELECT * FROM t where u{} in (6)", $bits),
Some(0),
Some(15),
1,
)
.await;
}

#[tokio::test]
async fn [<prune_uint $bits _eq_in_list_negated >]() {
// result of sql "SELECT * FROM t where u{bits} not in (6)" prune nothing
// 20 values in total, minus the single 6 -> 19 rows.
test_prune(
Scenario::UInt,
&format!("SELECT * FROM t where u{} not in (6)", $bits),
Some(0),
Some(0),
19,
)
.await;
}
}
}
}

// Instantiate the suite once per unsigned column width in `Scenario::UInt`.
uint_tests!(8);
uint_tests!(16);
uint_tests!(32);
uint_tests!(64);

#[tokio::test]
// null count min max
// page-0 0 -5.0 -1.0
Expand Down

0 comments on commit a23f507

Please sign in to comment.