-
Notifications
You must be signed in to change notification settings - Fork 796
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Write null counts in parquet files when they are present #6257
base: master
Are you sure you want to change the base?
Changes from 1 commit
e00e160
09c4047
6c05e60
60aec4e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -125,22 +125,35 @@ pub fn from_thrift( | |
) -> Result<Option<Statistics>> { | ||
Ok(match thrift_stats { | ||
Some(stats) => { | ||
// Number of nulls recorded, when it is not available, we just mark it as 0. | ||
// TODO this should be `None` if there is no information about NULLS. | ||
// see https://github.com/apache/arrow-rs/pull/6216/files | ||
let null_count = stats.null_count.unwrap_or(0); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the removal of this |
||
|
||
if null_count < 0 { | ||
return Err(ParquetError::General(format!( | ||
"Statistics null count is negative {}", | ||
null_count | ||
))); | ||
} | ||
// transform null count to u64 | ||
let null_count = stats | ||
.null_count | ||
.map(|null_count| { | ||
if null_count < 0 { | ||
return Err(ParquetError::General(format!( | ||
"Statistics null count is negative {}", | ||
null_count | ||
))); | ||
} | ||
Ok(null_count as u64) | ||
}) | ||
.transpose()?; | ||
|
||
// Generic null count. | ||
let null_count = Some(null_count as u64); | ||
// Generic distinct count (count of distinct values occurring) | ||
let distinct_count = stats.distinct_count.map(|value| value as u64); | ||
let distinct_count = stats | ||
.distinct_count | ||
.map(|distinct_count| { | ||
if distinct_count < 0 { | ||
return Err(ParquetError::General(format!( | ||
"Statistics distinct count is negative {}", | ||
distinct_count | ||
))); | ||
} | ||
|
||
Ok(distinct_count as u64) | ||
}) | ||
.transpose()?; | ||
|
||
// Whether or not statistics use deprecated min/max fields. | ||
let old_format = stats.min_value.is_none() && stats.max_value.is_none(); | ||
// Generic min value as bytes. | ||
|
@@ -244,20 +257,21 @@ pub fn from_thrift( | |
pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> { | ||
let stats = stats?; | ||
|
||
// record null counts if greater than zero. | ||
// | ||
// TODO: This should be Some(0) if there are no nulls. | ||
// see https://github.com/apache/arrow-rs/pull/6216/files | ||
// record null count if it can fit in i64 | ||
let null_count = stats | ||
.null_count_opt() | ||
.map(|value| value as i64) | ||
.filter(|&x| x > 0); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The removal of this filter is what fixes the statistics while writing Maybe it was intended to be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
.and_then(|value| i64::try_from(value).ok()); | ||
|
||
// record distinct count if it can fit in i64 | ||
let distinct_count = stats | ||
.distinct_count() | ||
.and_then(|value| i64::try_from(value).ok()); | ||
|
||
let mut thrift_stats = TStatistics { | ||
max: None, | ||
min: None, | ||
null_count, | ||
distinct_count: stats.distinct_count().map(|value| value as i64), | ||
distinct_count, | ||
max_value: None, | ||
min_value: None, | ||
is_max_value_exact: None, | ||
|
@@ -404,9 +418,20 @@ impl Statistics { | |
/// Returns number of null values for the column, if known. | ||
/// Note that this includes all nulls when column is part of the complex type. | ||
/// | ||
/// Note this API returns Some(0) even if the null count was not present | ||
/// in the statistics. | ||
/// See <https://github.com/apache/arrow-rs/pull/6216/files> | ||
/// Note: Versions of this library prior to `53.0.0` returned 0 if the null count was | ||
/// not available. This method returns `None` in that case. | ||
/// | ||
/// Also, versions of this library prior to `53.0.0` did not store the null count in the | ||
/// statistics if the null count was `0`. | ||
/// | ||
/// To preserve the prior behavior and read null counts properly from older files | ||
/// you should default to zero: | ||
/// | ||
/// ```no_run | ||
/// # use parquet::file::statistics::Statistics; | ||
/// # let statistics: Statistics = todo!(); | ||
/// let null_count = statistics.null_count_opt().unwrap_or(0); | ||
/// ``` | ||
pub fn null_count_opt(&self) -> Option<u64> { | ||
statistics_enum_func![self, null_count_opt] | ||
} | ||
|
@@ -1041,4 +1066,91 @@ mod tests { | |
true, | ||
)); | ||
} | ||
|
||
#[test] | ||
fn test_count_encoding() { | ||
statistics_count_test(None, None); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test fails like this without the changes in this PR:
|
||
statistics_count_test(Some(0), Some(0)); | ||
statistics_count_test(Some(100), Some(2000)); | ||
statistics_count_test(Some(1), None); | ||
statistics_count_test(None, Some(1)); | ||
} | ||
|
||
#[test] | ||
fn test_count_encoding_distinct_too_large() { | ||
// statistics are stored using i64, so test trying to store larger values | ||
let statistics = make_bool_stats(Some(u64::MAX), Some(100)); | ||
let thrift_stats = to_thrift(Some(&statistics)).unwrap(); | ||
assert_eq!(thrift_stats.distinct_count, None); // can't store u64 max --> null | ||
assert_eq!(thrift_stats.null_count, Some(100)); | ||
} | ||
|
||
#[test] | ||
fn test_count_encoding_null_too_large() { | ||
// statistics are stored using i64, so test trying to store larger values | ||
let statistics = make_bool_stats(Some(100), Some(u64::MAX)); | ||
let thrift_stats = to_thrift(Some(&statistics)).unwrap(); | ||
assert_eq!(thrift_stats.distinct_count, Some(100)); | ||
assert_eq!(thrift_stats.null_count, None); // can' store u64 max --> null | ||
} | ||
|
||
#[test] | ||
fn test_count_decoding_distinct_invalid() { | ||
let tstatistics = TStatistics { | ||
distinct_count: Some(-42), | ||
..Default::default() | ||
}; | ||
let err = from_thrift(Type::BOOLEAN, Some(tstatistics)).unwrap_err(); | ||
assert_eq!( | ||
err.to_string(), | ||
"Parquet error: Statistics distinct count is negative -42" | ||
); | ||
} | ||
|
||
#[test] | ||
fn test_count_decoding_null_invalid() { | ||
let tstatistics = TStatistics { | ||
null_count: Some(-42), | ||
..Default::default() | ||
}; | ||
let err = from_thrift(Type::BOOLEAN, Some(tstatistics)).unwrap_err(); | ||
assert_eq!( | ||
err.to_string(), | ||
"Parquet error: Statistics null count is negative -42" | ||
); | ||
} | ||
|
||
/// Writes statistics to thrift and reads them back and ensures: | ||
/// - The statistics are the same | ||
/// - The statistics written to thrift are the same as the original statistics | ||
fn statistics_count_test(distinct_count: Option<u64>, null_count: Option<u64>) { | ||
let statistics = make_bool_stats(distinct_count, null_count); | ||
|
||
let thrift_stats = to_thrift(Some(&statistics)).unwrap(); | ||
assert_eq!(thrift_stats.null_count.map(|c| c as u64), null_count); | ||
assert_eq!( | ||
thrift_stats.distinct_count.map(|c| c as u64), | ||
distinct_count | ||
); | ||
|
||
let round_tripped = from_thrift(Type::BOOLEAN, Some(thrift_stats)) | ||
.unwrap() | ||
.unwrap(); | ||
assert_eq!(round_tripped, statistics); | ||
} | ||
|
||
fn make_bool_stats(distinct_count: Option<u64>, null_count: Option<u64>) -> Statistics { | ||
let min = Some(true); | ||
let max = Some(false); | ||
let is_min_max_deprecated = false; | ||
|
||
// test is about the counts, so we aren't really testing the min/max values | ||
Statistics::Boolean(ValueStatistics::new( | ||
min, | ||
max, | ||
distinct_count, | ||
null_count, | ||
is_min_max_deprecated, | ||
)) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By default reading null counts will work with files written with older versions of parquet-rs
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For breaking / major release, is it acceptable to include an upgrade instruction of "add this config to maintain to old behavior"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Absolutely
It is important to note that the default behavior in this PR is the old behavior (in other words there should be changes needed in downstream consumers of this code)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The default in this PR is
missing_null_counts_as_zero = true
, which maintains the old behavior, right?If "add this config to maintain old behavior" is acceptable for a breaking release, then I would expect the default to be the new behavior.
IOW, I'd expect what you said on the parquet mailing list
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes
My thinking was
Since there are two different apis:
Statistics::null_count
would now returnOption<..>
so users of the library will ned to update their code anyways and thus can choose at that time which behavior they wantedStatisticsConverter
's API didn't change and thus it keeps the previous behavior. This is what I would persoanlly want for a system -- no change for reading parquet files that were written with previous versions of the rust writer.