From ee5694078c86c8201549654246900a4232d531a9 Mon Sep 17 00:00:00 2001 From: Fischer <89784872+Fischer0522@users.noreply.github.com> Date: Sat, 20 Jul 2024 19:01:01 +0800 Subject: [PATCH] fix panic in `ParquetMetadata::memory_size`: check has_min_max_set before invoking min()/max() (#6092) * fix: check has_min_max_set before invoking min()/max() * chore: add unit test for statistics heap size * Fixup test --------- Co-authored-by: Andrew Lamb --- parquet/src/file/metadata/memory.rs | 9 +++++++- parquet/src/file/metadata/mod.rs | 32 +++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 57b2f7eec0c2..57d5aaa2dd2f 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -173,7 +173,14 @@ impl HeapSize for PageIndex { impl HeapSize for ValueStatistics { fn heap_size(&self) -> usize { - self.min().heap_size() + self.max().heap_size() + if self.has_min_max_set() { + return self.min().heap_size() + self.max().heap_size(); + } else if self.min_is_exact() { + return self.min().heap_size(); + } else if self.max_is_exact() { + return self.max().heap_size(); + } + 0 } } impl HeapSize for bool { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 1d338bb8eed8..9f2084a35d7c 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1291,7 +1291,11 @@ mod tests { let columns = schema_descr .columns() .iter() - .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build()) + .map(|column_descr| { + ColumnChunkMetaData::builder(column_descr.clone()) + .set_statistics(Statistics::new::(None, None, None, 0, false)) + .build() + }) .collect::>>() .unwrap(); let row_group_meta = RowGroupMetaData::builder(schema_descr.clone()) @@ -1317,11 +1321,31 @@ mod tests { num_rows, created_by, key_value_metadata, - schema_descr, + schema_descr.clone(), column_orders, ); - let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta.clone()); - let base_expected_size = 1320; + + // Now, add in Exact Statistics + let columns_with_stats = schema_descr + .columns() + .iter() + .map(|column_descr| { + ColumnChunkMetaData::builder(column_descr.clone()) + .set_statistics(Statistics::new::(Some(0), Some(100), None, 0, false)) + .build() + }) + .collect::>>() + .unwrap(); + + let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr) + .set_num_rows(1000) + .set_column_metadata(columns_with_stats) + .build() + .unwrap(); + let row_group_meta_with_stats = vec![row_group_meta_with_stats]; + + let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta_with_stats); + let base_expected_size = 2024; assert_eq!(parquet_meta.memory_size(), base_expected_size); let mut column_index = ColumnIndexBuilder::new();