diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 2b9f261d9f42..f0b75f302552 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -323,6 +323,19 @@ impl<W: Write + Send> SerializedFileWriter<W> {
             None => Some(self.kv_metadatas.clone()),
         };
 
+        // We only include ColumnOrder for leaf nodes.
+        // Currently only supported ColumnOrder is TypeDefinedOrder so we set this
+        // for all leaf nodes.
+        // Even if the column has an undefined sort order, such as INTERVAL, this
+        // is still technically the defined TYPEORDER so it should still be set.
+        let column_orders = (0..self.schema_descr().num_columns())
+            .map(|_| parquet::ColumnOrder::TYPEORDER(parquet::TypeDefinedOrder {}))
+            .collect();
+        // This field is optional, perhaps in cases where no min/max fields are set
+        // in any Statistics or ColumnIndex object in the whole file.
+        // But for simplicity we always set this field.
+        let column_orders = Some(column_orders);
+
         let file_metadata = parquet::FileMetaData {
             num_rows,
             row_groups,
@@ -330,7 +343,7 @@ impl<W: Write + Send> SerializedFileWriter<W> {
             version: self.props.writer_version().as_num(),
             schema: types::to_thrift(self.schema.as_ref())?,
             created_by: Some(self.props.created_by().to_owned()),
-            column_orders: None,
+            column_orders,
             encryption_algorithm: None,
             footer_signing_key_metadata: None,
         };
@@ -738,7 +751,9 @@ mod tests {
     use bytes::Bytes;
     use std::fs::File;
 
-    use crate::basic::{Compression, Encoding, LogicalType, Repetition, Type};
+    use crate::basic::{
+        ColumnOrder, Compression, ConvertedType, Encoding, LogicalType, Repetition, SortOrder, Type,
+    };
     use crate::column::page::{Page, PageReader};
     use crate::column::reader::get_typed_column_reader;
     use crate::compression::{create_codec, Codec, CodecOptionsBuilder};
@@ -851,6 +866,78 @@ mod tests {
         assert_eq!(reader.get_row_iter(None).unwrap().count(), 0);
     }
 
+    #[test]
+    fn test_file_writer_column_orders_populated() {
+        let file = tempfile::tempfile().unwrap();
+
+        let schema = Arc::new(
+            types::Type::group_type_builder("schema")
+                .with_fields(vec![
+                    Arc::new(
+                        types::Type::primitive_type_builder("col1", Type::INT32)
+                            .build()
+                            .unwrap(),
+                    ),
+                    Arc::new(
+                        types::Type::primitive_type_builder("col2", Type::FIXED_LEN_BYTE_ARRAY)
+                            .with_converted_type(ConvertedType::INTERVAL)
+                            .with_length(12)
+                            .build()
+                            .unwrap(),
+                    ),
+                    Arc::new(
+                        types::Type::group_type_builder("nested")
+                            .with_repetition(Repetition::REQUIRED)
+                            .with_fields(vec![
+                                Arc::new(
+                                    types::Type::primitive_type_builder(
+                                        "col3",
+                                        Type::FIXED_LEN_BYTE_ARRAY,
+                                    )
+                                    .with_logical_type(Some(LogicalType::Float16))
+                                    .with_length(2)
+                                    .build()
+                                    .unwrap(),
+                                ),
+                                Arc::new(
+                                    types::Type::primitive_type_builder("col4", Type::BYTE_ARRAY)
+                                        .with_logical_type(Some(LogicalType::String))
+                                        .build()
+                                        .unwrap(),
+                                ),
+                            ])
+                            .build()
+                            .unwrap(),
+                    ),
+                ])
+                .build()
+                .unwrap(),
+        );
+
+        let props = Default::default();
+        let writer = SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap();
+        writer.close().unwrap();
+
+        let reader = SerializedFileReader::new(file).unwrap();
+
+        // only leaves
+        let expected = vec![
+            // INT32
+            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED),
+            // INTERVAL
+            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNDEFINED),
+            // Float16
+            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED),
+            // String
+            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
+        ];
+        let actual = reader.metadata().file_metadata().column_orders();
+
+        assert!(actual.is_some());
+        let actual = actual.unwrap();
+        assert_eq!(*actual, expected);
+    }
+
     #[test]
     fn test_file_writer_with_metadata() {
         let file = tempfile::tempfile().unwrap();