Write Bloom filters between row groups instead of the end #5860

Merged: 16 commits, Jun 21, 2024
Changes from 9 commits
10 changes: 10 additions & 0 deletions parquet/Cargo.toml
@@ -68,6 +68,9 @@ twox-hash = { version = "1.6", default-features = false }
paste = { version = "1.0" }
half = { version = "2.1", default-features = false, features = ["num-traits"] }

dsi-progress-logger = { version = "0.2.4", optional = true }
Contributor:

Could you please remove these new dependencies (even though I do realize they are optional and won't be activated very often)?

I think they will add some ongoing maintenance cost (keeping the dependencies updated), which I would prefer to avoid if possible.

Contributor Author (@progval), Jun 13, 2024:

How do you feel about depending only on sysinfo to display the RAM usage? It has a small set of dependencies.

Contributor:

I think it would be ok

Contributor Author (@progval), Jun 13, 2024:

Done. It now looks like this:

$ cargo run --release --features="cli sysinfo" --example write_parquet -- /tmp/test.parquet
2024-06-13 21:45:40 Writing 1000 batches of 1000000 rows. RSS = 1MB
2024-06-13 21:45:50 Iteration 260/1000. RSS = 50MB
2024-06-13 21:46:00 Iteration 518/1000. RSS = 50MB
2024-06-13 21:46:10 Iteration 772/1000. RSS = 50MB
2024-06-13 21:46:19 Done. RSS = 17MB

$ cargo run --release --features="cli sysinfo" --example write_parquet -- /tmp/test.parquet --bloom-filter-position end
2024-06-13 21:46:29 Writing 1000 batches of 1000000 rows. RSS = 1MB
2024-06-13 21:46:39 Iteration 267/1000. RSS = 451MB
2024-06-13 21:46:49 Iteration 533/1000. RSS = 791MB
2024-06-13 21:46:59 Iteration 799/1000. RSS = 1151MB
2024-06-13 21:47:07 Done. RSS = 1055MB
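For reference, a minimal sketch of how the RSS figure above can be sampled with sysinfo. This assumes a recent sysinfo (roughly 0.30), where Process::memory() returns bytes; rss_mb is an illustrative helper name, not necessarily what the example ends up using.

use sysinfo::System;

// Illustrative helper: resident set size of the current process, in MB.
fn rss_mb() -> u64 {
    let mut sys = System::new();
    sys.refresh_all(); // populate process information before reading it
    let pid = sysinfo::get_current_pid().expect("unsupported platform");
    // Process::memory() returns bytes in recent sysinfo versions
    sys.process(pid).map(|p| p.memory() / 1_000_000).unwrap_or(0)
}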

simplelog = { version = "0.12.2", optional = true }

[dev-dependencies]
base64 = { version = "0.22", default-features = false, features = ["std"] }
criterion = { version = "0.5", default-features = false }
@@ -114,12 +117,19 @@ async = ["futures", "tokio"]
object_store = ["dep:object_store", "async"]
# Group Zstd dependencies
zstd = ["dep:zstd", "zstd-sys"]
# Enable progress logging
log = ["dep:simplelog", "dep:dsi-progress-logger"]

[[example]]
name = "read_parquet"
required-features = ["arrow"]
path = "./examples/read_parquet.rs"

[[example]]
name = "write_parquet"
required-features = ["log"]
path = "./examples/write_parquet.rs"

[[example]]
name = "async_read_parquet"
required-features = ["arrow", "async"]
71 changes: 71 additions & 0 deletions parquet/examples/write_parquet.rs
@@ -0,0 +1,71 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::fs::File;
use std::sync::Arc;

use dsi_progress_logger::prelude::*;

use arrow::array::{StructArray, UInt64Builder};
use arrow::datatypes::DataType::UInt64;
use arrow::datatypes::{Field, Schema};
use parquet::arrow::ArrowWriter as ParquetWriter;
use parquet::basic::Encoding;
use parquet::errors::Result;
use parquet::file::properties::WriterProperties;

Contributor:

Perhaps we could add some comments here explaining what this example is trying to show.

Contributor Author:

Done, along with a Clap argument parser:

$ cargo run --release --features="cli sysinfo" --example write_parquet -- -h
Writes sequences of integers, with a Bloom Filter, while logging timing and memory usage

Usage: write_parquet [OPTIONS] <PATH>

Arguments:
  <PATH>  Path to the file to write

Options:
      --iterations <ITERATIONS>                        Number of batches to write [default: 1000]
      --batch <BATCH>                                  Number of rows in each batch [default: 1000000]
      --bloom-filter-position <BLOOM_FILTER_POSITION>  Where to write Bloom Filters [default: after-row-group] [possible values: end, after-row-group]
  -h, --help                                           Print help
  -V, --version                                        Print version

fn main() -> Result<()> {
    let _ = simplelog::SimpleLogger::init(simplelog::LevelFilter::Info, Default::default());

    let properties = WriterProperties::builder()
        .set_column_bloom_filter_enabled("id".into(), true)
        .set_column_encoding("id".into(), Encoding::DELTA_BINARY_PACKED)
        .build();
    let schema = Arc::new(Schema::new(vec![Field::new("id", UInt64, false)]));
    // Create the parquet file that will be written to.
    let path = "/tmp/test.parquet";
    let file = File::create(path).unwrap();
    let mut writer = ParquetWriter::try_new(file, schema.clone(), Some(properties))?;

    let num_iterations = 3000;
    let mut pl = progress_logger!(
        item_name = "iterations",
        display_memory = true,
        expected_updates = Some(num_iterations as usize)
    );
    pl.start("Writing batches");
    let mut array_builder = UInt64Builder::new();
    for i in 0..num_iterations {
        pl.update();
        for j in 0..1_000_000 {
            array_builder.append_value(i + j);
        }
        writer.write(
            &StructArray::new(
                schema.fields().clone(),
                vec![Arc::new(array_builder.finish())],
                None,
            )
            .into(),
        )?;
    }
    writer.flush()?;
    writer.close()?;
    pl.done();

    Ok(())
}
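At this commit the example always writes with the new default, BloomFilterPosition::AfterRowGroup. For comparison, a minimal sketch of writer properties that opt back into the old end-of-file placement, using the set_bloom_filter_position option this PR adds (the helper name is illustrative):

use parquet::file::properties::{BloomFilterPosition, WriterProperties};

// Illustrative: keep the pre-PR behavior of writing all Bloom filters at the
// end of the file, trading writer memory for reader-side data locality.
fn end_of_file_properties() -> WriterProperties {
    WriterProperties::builder()
        .set_column_bloom_filter_enabled("id".into(), true)
        .set_bloom_filter_position(BloomFilterPosition::End)
        .build()
}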
28 changes: 25 additions & 3 deletions parquet/src/arrow/arrow_writer/mod.rs
@@ -43,7 +43,7 @@ use crate::column::writer::{
};
use crate::data_type::{ByteArray, FixedLenByteArray};
use crate::errors::{ParquetError, Result};
-use crate::file::metadata::{ColumnChunkMetaData, KeyValue, RowGroupMetaDataPtr};
+use crate::file::metadata::{ColumnChunkMetaData, KeyValue, RowGroupMetaData};
use crate::file::properties::{WriterProperties, WriterPropertiesPtr};
use crate::file::reader::{ChunkReader, Length};
use crate::file::writer::{SerializedFileWriter, SerializedRowGroupWriter};
@@ -185,7 +185,7 @@ impl<W: Write + Send> ArrowWriter<W> {
}

/// Returns metadata for any flushed row groups
-pub fn flushed_row_groups(&self) -> &[RowGroupMetaDataPtr] {
+pub fn flushed_row_groups(&self) -> &[RowGroupMetaData] {
self.writer.flushed_row_groups()
}

@@ -1039,7 +1039,9 @@ mod tests {
use crate::file::metadata::ParquetMetaData;
use crate::file::page_index::index::Index;
use crate::file::page_index::index_reader::read_pages_locations;
-use crate::file::properties::{EnabledStatistics, ReaderProperties, WriterVersion};
+use crate::file::properties::{
+    BloomFilterPosition, EnabledStatistics, ReaderProperties, WriterVersion,
+};
use crate::file::serialized_reader::ReadOptionsBuilder;
use crate::file::{
reader::{FileReader, SerializedFileReader},
@@ -1687,6 +1689,7 @@
values: ArrayRef,
schema: SchemaRef,
bloom_filter: bool,
bloom_filter_position: BloomFilterPosition,
}

impl RoundTripOptions {
@@ -1697,6 +1700,7 @@
values,
schema: Arc::new(schema),
bloom_filter: false,
bloom_filter_position: BloomFilterPosition::AfterRowGroup,
}
}
}
@@ -1716,6 +1720,7 @@
values,
schema,
bloom_filter,
bloom_filter_position,
} = options;

let encodings = match values.data_type() {
@@ -1756,6 +1761,7 @@
.set_dictionary_page_size_limit(dictionary_size.max(1))
.set_encoding(*encoding)
.set_bloom_filter_enabled(bloom_filter)
.set_bloom_filter_position(bloom_filter_position)
.build();

files.push(roundtrip_opts(&expected_batch, props))
@@ -2103,6 +2109,22 @@
        values_required::<BinaryArray, _>(many_vecs_iter);
    }

    #[test]
    fn i32_column_bloom_filter_at_end() {
        let array = Arc::new(Int32Array::from_iter(0..SMALL_SIZE as i32));
        let mut options = RoundTripOptions::new(array, false);
        options.bloom_filter = true;
        options.bloom_filter_position = BloomFilterPosition::End;

        let files = one_column_roundtrip_with_options(options);
        check_bloom_filter(
            files,
            "col".to_string(),
            (0..SMALL_SIZE as i32).collect(),
            (SMALL_SIZE as i32 + 1..SMALL_SIZE as i32 + 10).collect(),
        );
    }

    #[test]
    fn i32_column_bloom_filter() {
        let array = Arc::new(Int32Array::from_iter(0..SMALL_SIZE as i32));
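check_bloom_filter is a pre-existing helper in this test module. Below is a hedged sketch of the reader-side flow it wraps, using the crate's set_read_bloom_filter reader property and the get_column_bloom_filter accessor (the standalone helper and its exact shape are illustrative):

use std::fs::File;

use parquet::file::properties::ReaderProperties;
use parquet::file::reader::{FileReader, SerializedFileReader};
use parquet::file::serialized_reader::ReadOptionsBuilder;

// Illustrative: true if any row group's Bloom filter for column 0 possibly
// contains `value`. Bloom filters may return false positives, never false
// negatives.
fn bloom_filter_might_contain(file: File, value: i32) -> bool {
    let options = ReadOptionsBuilder::new()
        .with_reader_properties(
            // Bloom filters are not read by default; opt in explicitly
            ReaderProperties::builder()
                .set_read_bloom_filter(true)
                .build(),
        )
        .build();
    let reader = SerializedFileReader::new_with_options(file, options).unwrap();
    (0..reader.metadata().num_row_groups()).any(|i| {
        let row_group = reader.get_row_group(i).unwrap();
        row_group
            .get_column_bloom_filter(0)
            .map(|sbbf| sbbf.check(&value))
            .unwrap_or(false)
    })
}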
4 changes: 2 additions & 2 deletions parquet/src/arrow/async_writer/mod.rs
@@ -54,7 +54,7 @@ use crate::{
arrow::arrow_writer::ArrowWriterOptions,
arrow::ArrowWriter,
errors::{ParquetError, Result},
-file::{metadata::RowGroupMetaDataPtr, properties::WriterProperties},
+file::{metadata::RowGroupMetaData, properties::WriterProperties},
format::{FileMetaData, KeyValue},
};
use arrow_array::RecordBatch;
@@ -172,7 +172,7 @@ impl<W: AsyncFileWriter> AsyncArrowWriter<W> {
}

/// Returns metadata for any flushed row groups
-pub fn flushed_row_groups(&self) -> &[RowGroupMetaDataPtr] {
+pub fn flushed_row_groups(&self) -> &[RowGroupMetaData] {
self.sync_writer.flushed_row_groups()
}

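Both writers expose the same signature change: flushed_row_groups() now returns plain RowGroupMetaData values instead of Arc-wrapped RowGroupMetaDataPtr. A small sketch of consuming the slice under the new signature (the helper is illustrative; callers that still need owned values can clone, since RowGroupMetaData implements Clone):

use std::io::Write;

use parquet::arrow::ArrowWriter;

// Illustrative: total rows flushed so far, summed from row-group metadata.
fn flushed_rows<W: Write + Send>(writer: &ArrowWriter<W>) -> i64 {
    writer
        .flushed_row_groups()
        .iter()
        .map(|rg| rg.num_rows())
        .sum()
}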
5 changes: 5 additions & 0 deletions parquet/src/file/metadata.rs
@@ -324,6 +324,11 @@ impl RowGroupMetaData {
        &self.columns
    }

    /// Returns mutable slice of column chunk metadata.
    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
        &mut self.columns
    }

    /// Number of rows in this row group.
    pub fn num_rows(&self) -> i64 {
        self.num_rows
36 changes: 36 additions & 0 deletions parquet/src/file/properties.rs
@@ -43,6 +43,8 @@ pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
/// Default value for [`WriterProperties::max_row_group_size`]
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
@@ -86,6 +88,24 @@ impl FromStr for WriterVersion {
}
}

/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing each filter as soon as it is computed, at
    /// the cost of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}

/// Reference counted writer properties.
pub type WriterPropertiesPtr = Arc<WriterProperties>;

@@ -131,6 +151,7 @@ pub struct WriterProperties {
data_page_row_count_limit: usize,
write_batch_size: usize,
max_row_group_size: usize,
bloom_filter_position: BloomFilterPosition,
writer_version: WriterVersion,
created_by: String,
pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
@@ -217,6 +238,11 @@ impl WriterProperties {
        self.max_row_group_size
    }

    /// Returns where in the file the writer places Bloom filters.
    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
        self.bloom_filter_position
    }

    /// Returns configured writer version.
    pub fn writer_version(&self) -> WriterVersion {
        self.writer_version
@@ -337,6 +363,7 @@ pub struct WriterPropertiesBuilder {
data_page_row_count_limit: usize,
write_batch_size: usize,
max_row_group_size: usize,
bloom_filter_position: BloomFilterPosition,
writer_version: WriterVersion,
created_by: String,
key_value_metadata: Option<Vec<KeyValue>>,
@@ -356,6 +383,7 @@
data_page_row_count_limit: usize::MAX,
write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
writer_version: DEFAULT_WRITER_VERSION,
created_by: DEFAULT_CREATED_BY.to_string(),
key_value_metadata: None,
@@ -375,6 +403,7 @@
data_page_row_count_limit: self.data_page_row_count_limit,
write_batch_size: self.write_batch_size,
max_row_group_size: self.max_row_group_size,
bloom_filter_position: self.bloom_filter_position,
writer_version: self.writer_version,
created_by: self.created_by,
key_value_metadata: self.key_value_metadata,
@@ -479,6 +508,12 @@
        self
    }

    /// Sets where in the final file Bloom Filters are written
    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
        self.bloom_filter_position = value;
        self
    }

    /// Sets "created by" property.
    pub fn set_created_by(mut self, value: String) -> Self {
        self.created_by = value;
@@ -991,6 +1026,7 @@
);
assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
assert_eq!(props.key_value_metadata(), None);