Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parquet: write column_orders in FileMetaData #5158

Merged
merged 1 commit into from
Dec 5, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 89 additions & 2 deletions parquet/src/file/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -323,14 +323,27 @@ impl<W: Write + Send> SerializedFileWriter<W> {
None => Some(self.kv_metadatas.clone()),
};

// We only include ColumnOrder for leaf nodes.
// Currently the only supported ColumnOrder is TypeDefinedOrder, so we set it
// for all leaf nodes.
Comment on lines +327 to +328
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the future there may be additional supported orders; see apache/parquet-format#221

// Even if the column has an undefined sort order, such as INTERVAL, this
// is still technically the defined TYPEORDER so it should still be set.
let column_orders = (0..self.schema_descr().num_columns())
.map(|_| parquet::ColumnOrder::TYPEORDER(parquet::TypeDefinedOrder {}))
.collect();
// This field is optional, perhaps in cases where no min/max fields are set
// in any Statistics or ColumnIndex object in the whole file.
// But for simplicity we always set this field.
let column_orders = Some(column_orders);
Comment on lines +334 to +337
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


let file_metadata = parquet::FileMetaData {
num_rows,
row_groups,
key_value_metadata,
version: self.props.writer_version().as_num(),
schema: types::to_thrift(self.schema.as_ref())?,
created_by: Some(self.props.created_by().to_owned()),
column_orders: None,
column_orders,
encryption_algorithm: None,
footer_signing_key_metadata: None,
};
Expand Down Expand Up @@ -738,7 +751,9 @@ mod tests {
use bytes::Bytes;
use std::fs::File;

use crate::basic::{Compression, Encoding, LogicalType, Repetition, Type};
use crate::basic::{
ColumnOrder, Compression, ConvertedType, Encoding, LogicalType, Repetition, SortOrder, Type,
};
use crate::column::page::{Page, PageReader};
use crate::column::reader::get_typed_column_reader;
use crate::compression::{create_codec, Codec, CodecOptionsBuilder};
Expand Down Expand Up @@ -851,6 +866,78 @@ mod tests {
assert_eq!(reader.get_row_iter(None).unwrap().count(), 0);
}

/// Writes an empty file whose schema has four leaf columns (two at the top
/// level, two inside a required group) and checks that the metadata read back
/// carries one `ColumnOrder` entry per leaf, in schema order.
#[test]
fn test_file_writer_column_orders_populated() {
    let file = tempfile::tempfile().unwrap();

    // Top-level leaf: plain INT32.
    let int32_leaf = types::Type::primitive_type_builder("col1", Type::INT32)
        .build()
        .unwrap();

    // Top-level leaf: 12-byte fixed-length array annotated as INTERVAL.
    let interval_leaf =
        types::Type::primitive_type_builder("col2", Type::FIXED_LEN_BYTE_ARRAY)
            .with_converted_type(ConvertedType::INTERVAL)
            .with_length(12)
            .build()
            .unwrap();

    // Nested leaf: 2-byte fixed-length array with the Float16 logical type.
    let float16_leaf =
        types::Type::primitive_type_builder("col3", Type::FIXED_LEN_BYTE_ARRAY)
            .with_logical_type(Some(LogicalType::Float16))
            .with_length(2)
            .build()
            .unwrap();

    // Nested leaf: BYTE_ARRAY with the String logical type.
    let string_leaf = types::Type::primitive_type_builder("col4", Type::BYTE_ARRAY)
        .with_logical_type(Some(LogicalType::String))
        .build()
        .unwrap();

    // Required group wrapping the last two leaves; the expected list below has
    // no entry for the group itself.
    let nested_group = types::Type::group_type_builder("nested")
        .with_repetition(Repetition::REQUIRED)
        .with_fields(vec![Arc::new(float16_leaf), Arc::new(string_leaf)])
        .build()
        .unwrap();

    let root = types::Type::group_type_builder("schema")
        .with_fields(vec![
            Arc::new(int32_leaf),
            Arc::new(interval_leaf),
            Arc::new(nested_group),
        ])
        .build()
        .unwrap();
    let schema = Arc::new(root);

    // Write a file with no row groups using default writer properties, then
    // reopen it through the original handle.
    let writer =
        SerializedFileWriter::new(file.try_clone().unwrap(), schema, Default::default())
            .unwrap();
    writer.close().unwrap();
    let reader = SerializedFileReader::new(file).unwrap();

    // One expected entry per leaf column, in schema order.
    let expected = vec![
        // col1: INT32
        ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED),
        // col2: INTERVAL
        ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED),
        // col3: Float16
        ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED),
        // col4: String
        ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
    ];

    let actual = reader.metadata().file_metadata().column_orders();
    assert!(actual.is_some());
    let actual = actual.unwrap();
    assert_eq!(*actual, expected);
}

#[test]
fn test_file_writer_with_metadata() {
let file = tempfile::tempfile().unwrap();
Expand Down
Loading