Skip to content

Commit

Permalink
Make Precision<usize> copy to make it clear clones are not expensive (
Browse files Browse the repository at this point in the history
#11828)

* Minor: make it clearer that clone() is not slow

* Make Precision<T> Copy when T is Copy
  • Loading branch information
alamb authored Aug 13, 2024
1 parent 7a1a23d commit 5d3cda5
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 26 deletions.
23 changes: 19 additions & 4 deletions datafusion/common/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use arrow_schema::{Schema, SchemaRef};

/// Represents a value with a degree of certainty. `Precision` is used to
/// propagate information the precision of statistical values.
#[derive(Clone, PartialEq, Eq, Default)]
#[derive(Clone, PartialEq, Eq, Default, Copy)]
pub enum Precision<T: Debug + Clone + PartialEq + Eq + PartialOrd> {
/// The exact value is known
Exact(T),
Expand Down Expand Up @@ -503,9 +503,9 @@ mod tests {
let inexact_precision = Precision::Inexact(42);
let absent_precision = Precision::<i32>::Absent;

assert_eq!(exact_precision.clone().to_inexact(), inexact_precision);
assert_eq!(inexact_precision.clone().to_inexact(), inexact_precision);
assert_eq!(absent_precision.clone().to_inexact(), absent_precision);
assert_eq!(exact_precision.to_inexact(), inexact_precision);
assert_eq!(inexact_precision.to_inexact(), inexact_precision);
assert_eq!(absent_precision.to_inexact(), absent_precision);
}

#[test]
Expand Down Expand Up @@ -545,4 +545,19 @@ mod tests {
assert_eq!(precision2.multiply(&precision3), Precision::Inexact(15));
assert_eq!(precision1.multiply(&absent_precision), Precision::Absent);
}

#[test]
fn test_precision_cloning() {
// Precision<usize> is copy
let precision: Precision<usize> = Precision::Exact(42);
let p2 = precision;
assert_eq!(precision, p2);

// Precision<ScalarValue> is not copy (requires .clone())
let precision: Precision<ScalarValue> =
Precision::Exact(ScalarValue::Int64(Some(42)));
// Clippy would complain about this if it were Copy
let p2 = precision.clone();
assert_eq!(precision, p2);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ impl FileScanConfig {
}

let table_stats = Statistics {
num_rows: self.statistics.num_rows.clone(),
num_rows: self.statistics.num_rows,
// TODO correct byte size?
total_byte_size: Precision::Absent,
column_statistics: table_cols_stats,
Expand Down
26 changes: 13 additions & 13 deletions datafusion/core/src/datasource/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,18 @@
use std::mem;
use std::sync::Arc;

use super::listing::PartitionedFile;
use crate::arrow::datatypes::{Schema, SchemaRef};
use crate::error::Result;
use crate::functions_aggregate::min_max::{MaxAccumulator, MinAccumulator};
use crate::physical_plan::{Accumulator, ColumnStatistics, Statistics};
use arrow_schema::DataType;
use futures::{Stream, StreamExt};

use datafusion_common::stats::Precision;
use datafusion_common::ScalarValue;

use futures::{Stream, StreamExt};
use crate::arrow::datatypes::{Schema, SchemaRef};
use crate::error::Result;
use crate::functions_aggregate::min_max::{MaxAccumulator, MinAccumulator};
use crate::physical_plan::{Accumulator, ColumnStatistics, Statistics};

use super::listing::PartitionedFile;

/// Get all files as well as the file level summary statistics (no statistic for partition columns).
/// If the optional `limit` is provided, includes only sufficient files. Needed to read up to
Expand Down Expand Up @@ -62,8 +63,8 @@ pub async fn get_statistics_with_limit(
result_files.push(file);

// First file, we set them directly from the file statistics.
num_rows = file_stats.num_rows.clone();
total_byte_size = file_stats.total_byte_size.clone();
num_rows = file_stats.num_rows;
total_byte_size = file_stats.total_byte_size;
for (index, file_column) in
file_stats.column_statistics.clone().into_iter().enumerate()
{
Expand Down Expand Up @@ -93,10 +94,10 @@ pub async fn get_statistics_with_limit(
// counts across all the files in question. If any file does not
// provide any information or provides an inexact value, we demote
// the statistic precision to inexact.
num_rows = add_row_stats(file_stats.num_rows.clone(), num_rows);
num_rows = add_row_stats(file_stats.num_rows, num_rows);

total_byte_size =
add_row_stats(file_stats.total_byte_size.clone(), total_byte_size);
add_row_stats(file_stats.total_byte_size, total_byte_size);

for (file_col_stats, col_stats) in file_stats
.column_statistics
Expand All @@ -110,8 +111,7 @@ pub async fn get_statistics_with_limit(
distinct_count: _,
} = file_col_stats;

col_stats.null_count =
add_row_stats(file_nc.clone(), col_stats.null_count.clone());
col_stats.null_count = add_row_stats(*file_nc, col_stats.null_count);
set_max_if_greater(file_max, &mut col_stats.max_value);
set_min_if_lesser(file_min, &mut col_stats.min_value)
}
Expand Down Expand Up @@ -192,7 +192,7 @@ pub(crate) fn get_col_stats(
None => None,
};
ColumnStatistics {
null_count: null_counts[i].clone(),
null_count: null_counts[i],
max_value: max_value.map(Precision::Exact).unwrap_or(Precision::Absent),
min_value: min_value.map(Precision::Exact).unwrap_or(Precision::Absent),
distinct_count: Precision::Absent,
Expand Down
2 changes: 1 addition & 1 deletion datafusion/physical-expr/src/analysis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ impl ExprBoundaries {
Ok(ExprBoundaries {
column,
interval,
distinct_count: col_stats.distinct_count.clone(),
distinct_count: col_stats.distinct_count,
})
}

Expand Down
2 changes: 1 addition & 1 deletion datafusion/physical-plan/src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ fn collect_new_statistics(
(Precision::Inexact(lower), Precision::Inexact(upper))
};
ColumnStatistics {
null_count: input_column_stats[idx].null_count.clone().to_inexact(),
null_count: input_column_stats[idx].null_count.to_inexact(),
max_value,
min_value,
distinct_count: distinct_count.to_inexact(),
Expand Down
10 changes: 4 additions & 6 deletions datafusion/physical-plan/src/joins/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -827,12 +827,12 @@ fn estimate_join_cardinality(
JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => {
let ij_cardinality = estimate_inner_join_cardinality(
Statistics {
num_rows: left_stats.num_rows.clone(),
num_rows: left_stats.num_rows,
total_byte_size: Precision::Absent,
column_statistics: left_col_stats,
},
Statistics {
num_rows: right_stats.num_rows.clone(),
num_rows: right_stats.num_rows,
total_byte_size: Precision::Absent,
column_statistics: right_col_stats,
},
Expand Down Expand Up @@ -1024,7 +1024,7 @@ fn max_distinct_count(
stats: &ColumnStatistics,
) -> Precision<usize> {
match &stats.distinct_count {
dc @ (Precision::Exact(_) | Precision::Inexact(_)) => dc.clone(),
&dc @ (Precision::Exact(_) | Precision::Inexact(_)) => dc,
_ => {
// The number can never be greater than the number of rows we have
// minus the nulls (since they don't count as distinct values).
Expand Down Expand Up @@ -2054,9 +2054,7 @@ mod tests {
);
assert_eq!(
partial_join_stats.map(|s| s.column_statistics),
expected_cardinality
.clone()
.map(|_| [left_col_stats, right_col_stats].concat())
expected_cardinality.map(|_| [left_col_stats, right_col_stats].concat())
);
}
Ok(())
Expand Down

0 comments on commit 5d3cda5

Please sign in to comment.