From eb6dd87c446595b3631e03a5d7687f438b61aabb Mon Sep 17 00:00:00 2001 From: sadikovi Date: Thu, 5 Apr 2018 20:19:32 +1200 Subject: [PATCH 01/32] update public modules --- src/encodings/mod.rs | 2 +- src/lib.rs | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/encodings/mod.rs b/src/encodings/mod.rs index 7c6586a..d0172a6 100644 --- a/src/encodings/mod.rs +++ b/src/encodings/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. +mod rle; pub mod encoding; pub mod decoding; -pub mod rle; pub mod levels; diff --git a/src/lib.rs b/src/lib.rs index a7a52f9..26a2b61 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,7 +37,6 @@ extern crate flate2; extern crate rand; extern crate x86intrin; -// TODO: don't expose everything! #[macro_use] pub mod errors; pub mod basic; @@ -45,11 +44,10 @@ pub mod data_type; mod parquet_thrift; #[macro_use] -pub mod util; +mod util; +mod compression; +mod encodings; pub mod column; pub mod record; -pub mod compression; - pub mod schema; pub mod file; -pub mod encodings; From 331fe7d3fc32a39a8922286f12663f185d1ee40e Mon Sep 17 00:00:00 2001 From: sadikovi Date: Thu, 5 Apr 2018 21:38:51 +1200 Subject: [PATCH 02/32] add docs --- src/bin/parquet-read.rs | 54 +++++++++++++++++++++++++++++++++++++++ src/bin/parquet-schema.rs | 35 +++++++++++++++++++++++++ src/lib.rs | 45 ++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+) diff --git a/src/bin/parquet-read.rs b/src/bin/parquet-read.rs index 70530b3..e655e05 100644 --- a/src/bin/parquet-read.rs +++ b/src/bin/parquet-read.rs @@ -1,3 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Binary file to read data from a Parquet file. +//! +//! # Install +//! +//! `parquet-read` can be installed using `cargo`: +//! ``` +//! cargo install parquet +//! ``` +//! After this `parquet-read` should be globally available: +//! ``` +//! parquet-read XYZ.parquet +//! ``` +//! +//! The binary can also be built from the source code and run as follows: +//! ``` +//! cargo run --bin parquet-read XYZ.parquet +//! ``` +//! +//! # Usage +//! +//! ``` +//! parquet-read [num-records] +//! ``` +//! where `file-path` is the path to a Parquet file and `num-records` is the optional +//! numeric option that allows to specify number of records to read from a file. +//! When not provided, all records are read. +//! +//! Note that `parquet-read` reads full file schema, no projection or filtering is +//! applied. +//! +//! For example, +//! ``` +//! parquet-read data/alltypes_plain.snappy.parquet +//! +//! parquet-read data/alltypes_plain.snappy.parquet 4 +//! 
``` + extern crate parquet; use std::env; diff --git a/src/bin/parquet-schema.rs b/src/bin/parquet-schema.rs index 40e3aef..1eb6277 100644 --- a/src/bin/parquet-schema.rs +++ b/src/bin/parquet-schema.rs @@ -15,6 +15,41 @@ // specific language governing permissions and limitations // under the License. +//! Binary file to print the schema and metadata of a Parquet file. +//! +//! # Install +//! +//! `parquet-schema` can be installed using `cargo`: +//! ``` +//! cargo install parquet +//! ``` +//! After this `parquet-schema` should be globally available: +//! ``` +//! parquet-schema XYZ.parquet +//! ``` +//! +//! The binary can also be built from the source code and run as follows: +//! ``` +//! cargo run --bin parquet-schema XYZ.parquet +//! ``` +//! +//! # Usage +//! +//! ``` +//! parquet-schema [verbose] +//! ``` +//! where `file-path` is the path to a Parquet file and `verbose` is the optional boolean +//! flag that allows to print schema only, when set to `false` (default behaviour when +//! not provided), or print full file metadata, when set to `true`. +//! For example, +//! ``` +//! parquet-schema data/alltypes_plain.snappy.parquet +//! +//! parquet-schema data/alltypes_plain.snappy.parquet false +//! +//! parquet-schema data/alltypes_plain.snappy.parquet true +//! ``` + extern crate parquet; use std::env; diff --git a/src/lib.rs b/src/lib.rs index 26a2b61..0cb9414 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,6 +15,51 @@ // specific language governing permissions and limitations // under the License. +//! [Apache Parquet](http://parquet.apache.org) is a columnar storage format that +//! provides efficient data compression and encoding schemes to improve performance of +//! handling complex nested data structures. Parquet implements record-shredding and +//! assembly algorithm described in the Dremel paper. +//! +//! Crate provides API to access file schema and metadata from a Parquet file, extract +//! row groups or column chunks from a file, or read records/values. +//! +//! # Example +//! ```rust +//! use std::fs::File; +//! use std::path::Path; +//! use parquet::file::reader::{FileReader, SerializedFileReader}; +//! +//! // Opening a file +//! let path = Path::new("data/alltypes_plain.parquet"); +//! let file = File::open(&path).expect("File should exist"); +//! let reader = SerializedFileReader::new(file).expect("Reader should open the file"); +//! +//! // Accessing file metadata or row group metadata +//! let metadata = reader.metadata(); +//! let file_metadata = metadata.file_metadata(); +//! for i in 0..metadata.num_row_groups() { +//! let row_group_metadata = metadata.row_group(i); +//! } +//! +//! // Accessing row group readers in a file +//! for i in 0..reader.num_row_groups() { +//! let row_group_reader = reader.get_row_group(i).expect("Should be okay"); +//! } +//! +//! // Reading data using record API +//! let mut iter = reader.get_row_iter(None).expect("Should be okay"); +//! while let Some(record) = iter.next() { +//! // do something with the record... +//! } +//! ``` +//! +//! # Schema and metadata +//! +//! # File and row group API +//! +//! # Read API +//! 
+ #![feature(type_ascription)] #![feature(rustc_private)] #![feature(specialization)] From cc88b2f8ea6aabf8cb023004f9074cf2102cee74 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Fri, 6 Apr 2018 09:21:03 +1200 Subject: [PATCH 03/32] add doc build in readme --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index d38a181..e134585 100644 --- a/README.md +++ b/README.md @@ -43,5 +43,8 @@ be printed). ## Benchmarks Run `cargo bench` for benchmarks. +## Docs +To build documentation, run `cargo doc --no-deps`. To compile and view in the browser, run `cargo doc --no-deps --open`. + ## License Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0. From 5801b6ed4874d7c4fc221a5f326701c708254399 Mon Sep 17 00:00:00 2001 From: sadikovi Date: Fri, 6 Apr 2018 10:14:26 +1200 Subject: [PATCH 04/32] update lib doc --- src/lib.rs | 53 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0cb9414..b09962f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,33 +32,62 @@ //! // Opening a file //! let path = Path::new("data/alltypes_plain.parquet"); //! let file = File::open(&path).expect("File should exist"); -//! let reader = SerializedFileReader::new(file).expect("Reader should open the file"); +//! let reader = SerializedFileReader::new(file).expect("Valid Parquet file"); //! -//! // Accessing file metadata or row group metadata -//! let metadata = reader.metadata(); -//! let file_metadata = metadata.file_metadata(); -//! for i in 0..metadata.num_row_groups() { -//! let row_group_metadata = metadata.row_group(i); -//! } -//! -//! // Accessing row group readers in a file -//! for i in 0..reader.num_row_groups() { -//! let row_group_reader = reader.get_row_group(i).expect("Should be okay"); +//! // Accessing Parquet metadata +//! let parquet_metadata = reader.metadata(); +//! let file_metadata = parquet_metadata.file_metadata(); +//! for i in 0..parquet_metadata.num_row_groups() { +//! // Accessing row group metadata +//! let row_group_metadata = parquet_metadata.row_group(i); +//! // Accessing column chunk metadata +//! for j in 0..row_group_metadata.num_columns() { +//! let column_chunk_metadata = row_group_metadata.column(j); +//! } //! } //! //! // Reading data using record API //! let mut iter = reader.get_row_iter(None).expect("Should be okay"); //! while let Some(record) = iter.next() { //! // do something with the record... +//! println!("{}", record); //! } +//! +//! // Accessing row group readers in a file +//! for i in 0..reader.num_row_groups() { +//! let row_group_reader = reader.get_row_group(i).expect("Should be okay"); +//! } +//! //! ``` //! -//! # Schema and metadata +//! # Metadata +//! +//! Module [`file::metadata`] contains Parquet metadata structs, including file metadata, +//! that has information about file schema, version, and number of rows, row group +//! metadata with a set of column chunks that contain column type and encodings, number +//! of values and compressed/uncompressed size in bytes. +//! +//! # Schema and type +//! +//! Parquet schema can be extracted from [`file::metadata::FileMetaData`] and is +//! represented by Parquet type. +//! +//! Parquet type is described by [`schema::types::Type`], including top level message +//! type or schema. Refer to the [`schema`] module for the detailed information. //! //! # File and row group API //! +//! 
File reader [`file::reader::FileReader`] is a starting point for working with Parquet +//! files. Provides set of methods to get file metadata, row group readers +//! [`file::reader::RowGroupReader`] to get access to column readers and record iterator. +//! //! # Read API //! +//! Crate has several methods to read data from a Parquet file: +//! - Low level column reader API (see [`column`] module) +//! - Arrow API (_TODO_) +//! - High level record API (see [`record`] module) +//! #![feature(type_ascription)] #![feature(rustc_private)] From b3cc916617b0378db1c0df1890d5265b88dbed74 Mon Sep 17 00:00:00 2001 From: sadikovi Date: Fri, 6 Apr 2018 16:49:13 +1200 Subject: [PATCH 05/32] update docs for basic.rs --- src/basic.rs | 96 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 92 insertions(+), 4 deletions(-) diff --git a/src/basic.rs b/src/basic.rs index 2ab53ed..10bfa77 100644 --- a/src/basic.rs +++ b/src/basic.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +//! Contains Rust mappings for Thrift definition. +//! See `parquet.thrift` file to see raw definitions of enums listed below. + use std::convert; use std::fmt; use std::result; @@ -27,7 +30,13 @@ use parquet_thrift::parquet; // ---------------------------------------------------------------------- // Types from the Thrift definition -/// Mirrors `parquet::Type` +// Mirrors `parquet::Type` + +/// Types supported by Parquet. +/// These types are intended to be used in combination with the encodings to control +/// the on disk storage format. +/// For example INT16 is not included as a type since a good encoding of INT32 +/// would handle this. #[derive(Debug, Clone, Copy, PartialEq)] pub enum Type { BOOLEAN, @@ -40,43 +49,122 @@ pub enum Type { FIXED_LEN_BYTE_ARRAY } -/// Mirrors `parquet::ConvertedType` +// Mirrors `parquet::ConvertedType` + +/// Common types (logical types) used by frameworks when using Parquet. +/// This helps map between types in those frameworks to the base types in Parquet. +/// This is only metadata and not needed to read or write the data. #[derive(Debug, Clone, Copy, PartialEq)] pub enum LogicalType { NONE, + /// A BYTE_ARRAY actually contains UTF8 encoded chars. UTF8, + + /// A map is converted as an optional field containing a repeated key/value pair. MAP, + + /// A key/value pair is converted into a group of two fields. MAP_KEY_VALUE, + + /// A list is converted into an optional field containing a repeated field for its + /// values. LIST, + + /// An enum is converted into a binary field ENUM, + + /// A decimal value. + /// This may be used to annotate binary or fixed primitive types. The + /// underlying byte array stores the unscaled value encoded as two's + /// complement using big-endian byte order (the most significant byte is the + /// zeroth element). The value of the decimal is the value /// 10^{-scale}. + /// + /// This must be accompanied by a (maximum) precision and a scale in the + /// SchemaElement. The precision specifies the number of digits in the decimal + /// and the scale stores the location of the decimal point. For example 1.23 + /// would have precision 3 (3 total digits) and scale 2 (the decimal point is + /// 2 digits over). DECIMAL, + + /// A date stored as days since Unix epoch, encoded as the INT32 physical type. DATE, + + /// The total number of milliseconds since midnight. The value is stored as an INT32 + /// physical type. TIME_MILLIS, + + /// The total number of microseconds since midnight. 
The value is stored as an INT64 + /// physical type. TIME_MICROS, + + /// Date and time recorded as milliseconds since the Unix epoch. + /// Recorded as a physical type of INT64. TIMESTAMP_MILLIS, + + /// Date and time recorded as microseconds since the Unix epoch. + /// The value is stored as an INT64 physical type. TIMESTAMP_MICROS, + + /// An unsigned 8 bit integer value stored as INT32 physical type. UINT_8, + + /// An unsigned 16 bit integer value stored as INT32 physical type. UINT_16, + + /// An unsigned 32 bit integer value stored as INT32 physical type. UINT_32, + + /// An unsigned 64 bit integer value stored as INT64 physical type. UINT_64, + + /// A signed 8 bit integer value stored as INT32 physical type. INT_8, + + /// A signed 16 bit integer value stored as INT32 physical type. INT_16, + + /// A signed 32 bit integer value stored as INT32 physical type. INT_32, + + /// A signed 64 bit integer value stored as INT64 physical type. INT_64, + + /// A JSON document embedded within a single UTF8 column. JSON, + + /// A BSON document embedded within a single BINARY column. BSON, + + /// An interval of time. + /// + /// This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12. + /// This data is composed of three separate little endian unsigned integers. + /// Each stores a component of a duration of time. The first integer identifies + /// the number of months associated with the duration, the second identifies + /// the number of days associated with the duration and the third identifies + /// the number of milliseconds associated with the provided duration. + /// This duration of time is independent of any particular timezone or date. INTERVAL } -/// Mirrors `parquet::FieldRepetitionType` +// Mirrors `parquet::FieldRepetitionType` + +/// Representation of field types in schema. #[derive(Debug, Clone, Copy, PartialEq)] pub enum Repetition { + /// Field is required (can not be null) and each record has exactly 1 value. REQUIRED, + /// Field is optional (can be null) and each record has 0 or 1 values. OPTIONAL, + /// Field is repeated and can contain 0 or more values. REPEATED } -/// Mirrors `parquet::Encoding` +// Mirrors `parquet::Encoding` + +/// Encodings supported by Parquet. +/// Not all encodings are valid for all types. These enums are also used to specify the +/// encoding of definition and repetition levels. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Encoding { PLAIN, From efc7c371cbf4daf86e9b601353d4cc03f48ae70d Mon Sep 17 00:00:00 2001 From: sadikovi Date: Sat, 7 Apr 2018 11:49:14 +1200 Subject: [PATCH 06/32] add license header --- build.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/build.rs b/build.rs index 2d39e7e..10f92c9 100644 --- a/build.rs +++ b/build.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + use std::env; use std::fs; use std::process::Command; From ecc1bc9f36d9c521ef55dd2add67f726895b482b Mon Sep 17 00:00:00 2001 From: sadikovi Date: Sat, 7 Apr 2018 12:18:46 +1200 Subject: [PATCH 07/32] add docs for basic and errors --- src/basic.rs | 51 +++++++++++++++++++++++++++++++++++++++++++++++---- src/errors.rs | 17 +++++++++++++++-- src/lib.rs | 2 +- 3 files changed, 63 insertions(+), 7 deletions(-) diff --git a/src/basic.rs b/src/basic.rs index 10bfa77..a7ede9a 100644 --- a/src/basic.rs +++ b/src/basic.rs @@ -26,10 +26,10 @@ use std::str; use errors::ParquetError; use parquet_thrift::parquet; - // ---------------------------------------------------------------------- // Types from the Thrift definition +// ---------------------------------------------------------------------- // Mirrors `parquet::Type` /// Types supported by Parquet. @@ -49,6 +49,7 @@ pub enum Type { FIXED_LEN_BYTE_ARRAY } +// ---------------------------------------------------------------------- // Mirrors `parquet::ConvertedType` /// Common types (logical types) used by frameworks when using Parquet. @@ -77,7 +78,7 @@ pub enum LogicalType { /// This may be used to annotate binary or fixed primitive types. The /// underlying byte array stores the unscaled value encoded as two's /// complement using big-endian byte order (the most significant byte is the - /// zeroth element). The value of the decimal is the value /// 10^{-scale}. + /// zeroth element). /// /// This must be accompanied by a (maximum) precision and a scale in the /// SchemaElement. The precision specifies the number of digits in the decimal @@ -147,6 +148,7 @@ pub enum LogicalType { INTERVAL } +// ---------------------------------------------------------------------- // Mirrors `parquet::FieldRepetitionType` /// Representation of field types in schema. @@ -160,6 +162,7 @@ pub enum Repetition { REPEATED } +// ---------------------------------------------------------------------- // Mirrors `parquet::Encoding` /// Encodings supported by Parquet. @@ -167,17 +170,53 @@ pub enum Repetition { /// encoding of definition and repetition levels. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Encoding { + /// Default byte encoding. + /// - BOOLEAN - 1 bit per value, 0 is false; 1 is true. + /// - INT32 - 4 bytes per value, stored as little-endian. + /// - INT64 - 8 bytes per value, stored as little-endian. + /// - FLOAT - 4 bytes per value, stored as little-endian. + /// - DOUBLE - 8 bytes per value, stored as little-endian. + /// - BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. + /// FIXED_LEN_BYTE_ARRAY - just the bytes are stored. PLAIN, + + /// **Deprecated** dictionary encoding. + /// The values in the dictionary are encoded using PLAIN encoding. + /// Since it is deprecated, RLE_DICTIONARY encoding is used for a data page, and PLAIN + /// encoding is used for dictionary page. PLAIN_DICTIONARY, + + /// Group packed run length encoding. + /// Usable for definition/repetition levels encoding and boolean values. RLE, + + /// Bit packed encoding. + /// This can only be used if the data has a known max width. + /// Usable for definition/repetition levels encoding. BIT_PACKED, + + /// Delta encoding for integers, either INT32 or INT64. + /// Works best on sorted data. DELTA_BINARY_PACKED, + + /// Encoding for byte arrays to separate the length values and the data. 
+ /// The lengths are encoded using DELTA_BINARY_PACKED encoding. DELTA_LENGTH_BYTE_ARRAY, + + /// Incremental encoding for byte arrays. + /// Prefix lengths are encoded using DELTA_BINARY_PACKED encoding. + /// Suffixes are stored using DELTA_LENGTH_BYTE_ARRAY encoding. DELTA_BYTE_ARRAY, + + /// Dictionary encoding. + /// The ids are encoded using the RLE encoding. RLE_DICTIONARY } -/// Mirrors `parquet::CompressionCodec` +// ---------------------------------------------------------------------- +// Mirrors `parquet::CompressionCodec` + +/// Supported compression algorithms. #[derive(Debug, Clone, Copy, PartialEq)] pub enum Compression { UNCOMPRESSED, @@ -189,7 +228,11 @@ pub enum Compression { ZSTD } -/// Mirrors `parquet::PageType` +// ---------------------------------------------------------------------- +// Mirrors `parquet::PageType` + +/// Available data pages for Parquet file format. +/// Note that some of the page types may not be supported. #[derive(Debug, Clone, Copy, PartialEq)] pub enum PageType { DATA_PAGE, diff --git a/src/errors.rs b/src/errors.rs index 01348bb..3833d57 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Common Parquet errors and macros. + use std::cell; use std::convert; use std::io; @@ -24,8 +26,11 @@ use snap; use thrift; quick_error! { + /// Set of errors that can be produced during different operations in Parquet. #[derive(Debug, PartialEq)] pub enum ParquetError { + /// General Parquet error. + /// Returned when code violates normal workflow of working with Parquet files. General(message: String) { display("Parquet error: {}", message) description(message) @@ -34,10 +39,15 @@ quick_error! { from(e: thrift::Error) -> (format!("underlying Thrift error: {}", e)) from(e: cell::BorrowMutError) -> (format!("underlying borrow error: {}", e)) } + /// "Not yet implemented" Parquet error. + /// Returned when functionality is not yet available. NYI(message: String) { display("NYI: {}", message) description(message) } + /// "End of file" Parquet error. + /// Returned when IO related failures occur, e.g. when there are not enough bytes to + /// decode. EOF(message: String) { display("EOF: {}", message) description(message) @@ -45,9 +55,11 @@ quick_error! { } } +/// A specialized `Result` for Parquet errors. pub type Result = result::Result; -/// Conversion from `ParquetError` TO other types of `Error`s +// ---------------------------------------------------------------------- +// Conversion from `ParquetError` to other types of `Error`s impl convert::From for io::Error { fn from(e: ParquetError) -> Self { @@ -55,7 +67,8 @@ impl convert::From for io::Error { } } -/// Convenient macros for different errors +// ---------------------------------------------------------------------- +// Convenient macros for different errors macro_rules! general_err { ($fmt:expr) => (ParquetError::General($fmt.to_owned())); diff --git a/src/lib.rs b/src/lib.rs index b09962f..85ccb7e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,7 +21,7 @@ //! assembly algorithm described in the Dremel paper. //! //! Crate provides API to access file schema and metadata from a Parquet file, extract -//! row groups or column chunks from a file, or read records/values. +//! row groups or column chunks from a file, and read records/values. //! //! # Example //! 
```rust From 9d3cee088c95b79380cdeef22faaff2062cd726a Mon Sep 17 00:00:00 2001 From: sadikovi Date: Sat, 7 Apr 2018 14:37:57 +1200 Subject: [PATCH 08/32] add doc for data_type --- src/data_type.rs | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/data_type.rs b/src/data_type.rs index a095ab2..492a55f 100644 --- a/src/data_type.rs +++ b/src/data_type.rs @@ -15,33 +15,37 @@ // specific language governing permissions and limitations // under the License. +//! Data types that connect Parquet physical types with their Rust-specific +//! representations. + use std::mem; use basic::Type; use rand::{Rand, Rng}; use util::memory::{ByteBuffer, ByteBufferPtr}; -// ---------------------------------------------------------------------- -// Types connect Parquet physical types with Rust-specific types - // TODO: alignment? // TODO: we could also use [u32; 3], however it seems there is no easy way // to convert [u32] to [u32; 3] in decoding. +/// Rust representation for logical type INT96, value backed by a vector of `u32`. #[derive(Clone, Debug)] pub struct Int96 { value: Option> } impl Int96 { + /// Creates new INT96 type struct with no data set. pub fn new() -> Self { Int96 { value: None } } + /// Returns underlying data as slice of [`u32`]. pub fn data(&self) -> &[u32] { assert!(self.value.is_some()); &self.value.as_ref().unwrap() } + /// Sets data for this INT96 type. pub fn set_data(&mut self, v: Vec) { assert_eq!(v.len(), 3); self.value = Some(v); @@ -79,31 +83,37 @@ impl Rand for Int96 { } } - +/// Rust representation for BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY Parquet physical types. +/// It is backed by a byte buffer. #[derive(Clone, Debug)] pub struct ByteArray { data: Option } impl ByteArray { + /// Creates new byte array with no data set. pub fn new() -> Self { ByteArray { data: None } } + /// Gets length of the underlying byte buffer. pub fn len(&self) -> usize { assert!(self.data.is_some()); self.data.as_ref().unwrap().len() } + /// Returns slice of data. pub fn data(&self) -> &[u8] { assert!(self.data.is_some()); self.data.as_ref().unwrap().as_ref() } + /// Set data from another byte buffer. pub fn set_data(&mut self, data: ByteBufferPtr) { self.data = Some(data); } + /// Returns `ByteArray` instance with slice of values for a data. pub fn slice(&self, start: usize, len: usize) -> Self { assert!(self.data.is_some()); Self::from(self.data.as_ref().unwrap().range(start, len)) @@ -162,11 +172,9 @@ impl Rand for ByteArray { } } - -// ---------------------------------------------------------------------- -// AsBytes converts an instance of data type to a slice of u8 - +/// Converts an instance of data type to a slice of bytes as `u8`. pub trait AsBytes { + /// Returns slice of bytes for this data type. fn as_bytes(&self) -> &[u8]; } @@ -225,15 +233,16 @@ impl AsBytes for str { } } - -// ---------------------------------------------------------------------- -// DataType trait, which contains the Parquet physical type info as well as -// the Rust primitive type presentation. - +/// Contains the Parquet physical type information as well as the Rust primitive type +/// presentation. pub trait DataType { type T: ::std::cmp::PartialEq + ::std::fmt::Debug + ::std::default::Default + ::std::clone::Clone + Rand + AsBytes; + + /// Returns Parquet physical type. fn get_physical_type() -> Type; + + /// Returns size in bytes for Rust representation of the physical type. 
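+  ///
+  /// A minimal sketch of the mapping, assuming the `Int32Type` implementation
+  /// provided by this module:
+  /// ```ignore
+  /// use parquet::data_type::{DataType, Int32Type};
+  ///
+  /// // INT32 is backed by `i32`, so its type size is 4 bytes.
+  /// assert_eq!(Int32Type::get_type_size(), 4);
+  /// ```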
fn get_type_size() -> usize; } From 9da396e205e6a49aafe0930d777083fe5a0dfb0e Mon Sep 17 00:00:00 2001 From: sadikovi Date: Sun, 8 Apr 2018 20:30:19 +1200 Subject: [PATCH 09/32] add schema doc --- src/basic.rs | 4 ++-- src/data_type.rs | 4 ++-- src/lib.rs | 3 ++- src/schema/mod.rs | 45 +++++++++++++++++++++++++++++++++++++++++++ src/schema/parser.rs | 44 ++++++++++++++++++++++++++++++++++-------- src/schema/printer.rs | 37 ++++++++++++++++++++++++++++++++--- src/schema/types.rs | 45 ++++++++++++++++++++++++++++++++----------- 7 files changed, 155 insertions(+), 27 deletions(-) diff --git a/src/basic.rs b/src/basic.rs index a7ede9a..c416465 100644 --- a/src/basic.rs +++ b/src/basic.rs @@ -33,8 +33,8 @@ use parquet_thrift::parquet; // Mirrors `parquet::Type` /// Types supported by Parquet. -/// These types are intended to be used in combination with the encodings to control -/// the on disk storage format. +/// These physical types are intended to be used in combination with the encodings to +/// control the on disk storage format. /// For example INT16 is not included as a type since a good encoding of INT32 /// would handle this. #[derive(Debug, Clone, Copy, PartialEq)] diff --git a/src/data_type.rs b/src/data_type.rs index 492a55f..9db0a64 100644 --- a/src/data_type.rs +++ b/src/data_type.rs @@ -27,7 +27,7 @@ use util::memory::{ByteBuffer, ByteBufferPtr}; // TODO: alignment? // TODO: we could also use [u32; 3], however it seems there is no easy way // to convert [u32] to [u32; 3] in decoding. -/// Rust representation for logical type INT96, value backed by a vector of `u32`. +/// Rust representation for logical type INT96, value is backed by a vector of `u32`. #[derive(Clone, Debug)] pub struct Int96 { value: Option> @@ -84,7 +84,7 @@ impl Rand for Int96 { } /// Rust representation for BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY Parquet physical types. -/// It is backed by a byte buffer. +/// Value is backed by a byte buffer. #[derive(Clone, Debug)] pub struct ByteArray { data: Option diff --git a/src/lib.rs b/src/lib.rs index 85ccb7e..4bc0a48 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -73,7 +73,8 @@ //! represented by Parquet type. //! //! Parquet type is described by [`schema::types::Type`], including top level message -//! type or schema. Refer to the [`schema`] module for the detailed information. +//! type or schema. Refer to the [`schema`] module for the detailed information on Type +//! API, printing and parsing of message types. //! //! # File and row group API //! diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 747d0da..c64c045 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -15,6 +15,51 @@ // specific language governing permissions and limitations // under the License. +//! Contains definitions of Parquet schema and methods to print and parse schema. +//! +//! # Example +//! +//! ```rust +//! use std::rc::Rc; +//! use parquet::schema::types::Type; +//! use parquet::basic::{Type as PhysicalType, LogicalType, Repetition}; +//! use parquet::schema::printer; +//! use parquet::schema::parser; +//! +//! // Create the following schema: +//! // message schema { +//! // OPTIONAL BYTE_ARRAY a (UTF8); +//! // REQUIRED INT32 b; +//! // } +//! +//! let field_a = Type::primitive_type_builder("a", PhysicalType::BYTE_ARRAY) +//! .with_logical_type(LogicalType::UTF8) +//! .with_repetition(Repetition::OPTIONAL) +//! .build() +//! .unwrap(); +//! +//! let field_b = Type::primitive_type_builder("b", PhysicalType::INT32) +//! .with_repetition(Repetition::REQUIRED) +//! .build() +//! 
.unwrap(); +//! +//! let schema = Type::group_type_builder("schema") +//! .with_fields(&mut vec![Rc::new(field_a), Rc::new(field_b)]) +//! .build() +//! .unwrap(); +//! +//! let mut buf = Vec::new(); +//! +//! // Print schema into buffer +//! printer::print_schema(&mut buf, &schema); +//! +//! // Parse schema from the string. +//! let string_schema = String::from_utf8(buf).unwrap(); +//! let parsed_schema = parser::parse_message_type(&string_schema).unwrap(); +//! +//! assert_eq!(schema, parsed_schema); +//! ``` + pub mod types; pub mod printer; pub mod parser; diff --git a/src/schema/parser.rs b/src/schema/parser.rs index 8cc671b..7f704bc 100644 --- a/src/schema/parser.rs +++ b/src/schema/parser.rs @@ -15,13 +15,41 @@ // specific language governing permissions and limitations // under the License. +//! Parquet schema parser. +//! Provides methods to parse and validate string message type into Parquet [`Type`]. +//! +//! # Example +//! +//! ```rust +//! use parquet::schema::parser::parse_message_type; +//! +//! let message_type = " +//! message spark_schema { +//! OPTIONAL BYTE_ARRAY a (UTF8); +//! REQUIRED INT32 b; +//! REQUIRED DOUBLE c; +//! REQUIRED BOOLEAN d; +//! OPTIONAL group e (LIST) { +//! REPEATED group list { +//! REQUIRED INT32 element; +//! } +//! } +//! } +//! "; +//! +//! let schema = parse_message_type(message_type).expect("Expected valid schema"); +//! println!("{:?}", schema); +//! ``` + use std::rc::Rc; use basic::{LogicalType, Repetition, Type as PhysicalType}; use errors::{ParquetError, Result}; use schema::types::{Type, TypePtr}; -/// Parses message type into `Type` class that can be used to extract individual columns +/// Parses message type as string into a Parquet `Type` which, for example, could be +/// used to extract individual columns. Returns Parquet general error when parsing or +/// validation fails. pub fn parse_message_type<'a>(message_type: &'a str) -> Result { let mut parser = Parser { tokenizer: &mut Tokenizer::from_str(message_type) }; parser.parse_message_type() @@ -92,14 +120,14 @@ impl<'a> Iterator for Tokenizer<'a> { } } -/// Internal Schema parser +/// Internal Schema parser. /// Traverses message type using tokenizer and parses each group/primitive type /// recursively. struct Parser<'a> { tokenizer: &'a mut Tokenizer<'a> } -// Utility function to assert token on validity +// Utility function to assert token on validity. fn assert_token(token: Option<&str>, expected: &str) -> Result<()> { match token { Some(value) if value == expected => Ok(()), @@ -108,7 +136,7 @@ fn assert_token(token: Option<&str>, expected: &str) -> Result<()> { } } -// Utility function to parse i32 or return general error +// Utility function to parse i32 or return general error. fn parse_i32( value: Option<&str>, not_found_msg: &str, @@ -121,9 +149,9 @@ fn parse_i32( } impl<'a> Parser<'a> { - // Entry function to parse message type, uses internal tokenizer + // Entry function to parse message type, uses internal tokenizer. fn parse_message_type(&mut self) -> Result { - // Check that message type starts with "message" + // Check that message type starts with "message". match self.tokenizer.next() { Some("message") => { let name = self.tokenizer.next() @@ -139,8 +167,8 @@ impl<'a> Parser<'a> { } } - // Parse child types for a current group type. - // This is only invoked on root and group types + // Parses child types for a current group type. + // This is only invoked on root and group types. 
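+  // Expects the child fields to be enclosed in curly braces, i.e. a block of the
+  // form `{ <fields> }`, which is asserted via the tokenizer below.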
fn parse_child_types(&mut self) -> Result> { assert_token(self.tokenizer.next(), "{")?; let mut vec = Vec::new(); diff --git a/src/schema/printer.rs b/src/schema/printer.rs index 7ceb49f..f17b302 100644 --- a/src/schema/printer.rs +++ b/src/schema/printer.rs @@ -15,6 +15,35 @@ // specific language governing permissions and limitations // under the License. +//! Parquet schema printer. +//! Provides methods to print Parquet file schema and list file metadata. +//! +//! # Example +//! +//! ```rust +//! use std::fs::File; +//! use std::path::Path; +//! use parquet::file::reader::{FileReader, SerializedFileReader}; +//! use parquet::schema::printer::{ +//! print_parquet_metadata, +//! print_file_metadata, +//! print_schema +//! }; +//! +//! // Open a file +//! let path = Path::new("data/alltypes_plain.parquet"); +//! let file = File::open(&path).expect("File should exist"); +//! let reader = SerializedFileReader::new(file).expect("Valid Parquet file"); +//! +//! let parquet_metadata = reader.metadata(); +//! +//! print_parquet_metadata(&mut std::io::stdout(), &parquet_metadata); +//! +//! print_file_metadata(&mut std::io::stdout(), &parquet_metadata.file_metadata()); +//! +//! print_schema(&mut std::io::stdout(), &parquet_metadata.file_metadata().schema()); +//! ``` + use std::fmt; use std::io; @@ -27,7 +56,7 @@ use file::metadata::{ }; use schema::types::Type; -/// Prints Parquet metadata +/// Prints Parquet metadata [`ParquetMetaData`] information. #[allow(unused_must_use)] pub fn print_parquet_metadata(out: &mut io::Write, metadata: &ParquetMetaData) { print_file_metadata(out, &metadata.file_metadata()); @@ -43,7 +72,7 @@ pub fn print_parquet_metadata(out: &mut io::Write, metadata: &ParquetMetaData) { } } -/// Prints metadata information from `file_metadata`. +/// Prints file metadata [`FileMetaData`] information. #[allow(unused_must_use)] pub fn print_file_metadata(out: &mut io::Write, file_metadata: &FileMetaData) { writeln!(out, "version: {}", file_metadata.version()); @@ -55,6 +84,7 @@ pub fn print_file_metadata(out: &mut io::Write, file_metadata: &FileMetaData) { print_schema(out, schema); } +/// Prints Parquet [`Type`] information. #[allow(unused_must_use)] pub fn print_schema(out: &mut io::Write, tp: &Type) { // TODO: better if we can pass fmt::Write to Printer. @@ -123,7 +153,8 @@ fn print_dashes(out: &mut io::Write, num: i32) { const INDENT_WIDTH: i32 = 2; -pub struct Printer<'a> { +/// Struct for printing Parquet message type. +struct Printer<'a> { output: &'a mut fmt::Write, indent: i32 } diff --git a/src/schema/types.rs b/src/schema/types.rs index 37fab9d..cb5d0f7 100644 --- a/src/schema/types.rs +++ b/src/schema/types.rs @@ -50,6 +50,7 @@ pub enum Type { } impl Type { + /// Creates primitive type builder with provided field name and physical type. pub fn primitive_type_builder( name: &str, physical_type: PhysicalType @@ -57,10 +58,12 @@ impl Type { PrimitiveTypeBuilder::new(name, physical_type) } + /// Creates group type builder with provided column name. pub fn group_type_builder(name: &str) -> GroupTypeBuilder { GroupTypeBuilder::new(name) } + /// Returns [`BasicTypeInfo`] information about the type. pub fn get_basic_info(&self) -> &BasicTypeInfo { match *self { Type::PrimitiveType { ref basic_info, .. } => &basic_info, @@ -68,21 +71,23 @@ impl Type { } } + /// Returns this type's field name. pub fn name(&self) -> &str { self.get_basic_info().name() } /// Gets the fields from this group type. - /// NOTE: this will panic if called on a non-group type. 
+ /// Note that this will panic if called on a non-group type. // TODO: should we return `&[&Type]` here? pub fn get_fields(&self) -> &[TypePtr] { match *self { - Type::GroupType{ ref fields, .. } => &fields[..], + Type::GroupType { ref fields, .. } => &fields[..], _ => panic!("Cannot call get_fields() on a non-group type") } } - /// Get physical type or panic if current type is not primitive + /// Gets physical type of this primitive type. + /// Note that this will panic if called on a non-primitive type. pub fn get_physical_type(&self) -> PhysicalType { match *self { Type::PrimitiveType { basic_info: _, physical_type, .. } => physical_type, @@ -90,7 +95,8 @@ impl Type { } } - /// Check if `sub_type` schema is part of current schema, e.g. projected columns + /// Checks if `sub_type` schema is part of current schema. + /// This method can be used to check if projected columns are part of the root schema. pub fn check_contains(&self, sub_type: &Type) -> bool { // Names match, and repetitions match or not set for both let basic_match = self.get_basic_info().name() == sub_type.get_basic_info().name() && @@ -122,7 +128,7 @@ impl Type { } } - /// Whether this is a primitive type. + /// Returns `true` if this type is a primitive type, `false` otherwise. pub fn is_primitive(&self) -> bool { match *self { Type::PrimitiveType { .. } => true, @@ -130,7 +136,7 @@ impl Type { } } - /// Whether this is a group type. + /// Returns `true` if this type is a group type, `false` otherwise. pub fn is_group(&self) -> bool { match *self { Type::GroupType { .. } => true, @@ -138,7 +144,7 @@ impl Type { } } - /// Whether this is the top-level schema type (message type). + /// Returns `true` if this type is the top-level schema type (message type). pub fn is_schema(&self) -> bool { match *self { Type::GroupType { ref basic_info, .. } => !basic_info.has_repetition(), @@ -162,7 +168,8 @@ pub struct PrimitiveTypeBuilder<'a> { } impl<'a> PrimitiveTypeBuilder<'a> { - fn new(name: &'a str, physical_type: PhysicalType) -> Self { + /// Creates new primitive type builder with provided field name and physical type. + pub fn new(name: &'a str, physical_type: PhysicalType) -> Self { Self { name: name, repetition: Repetition::OPTIONAL, @@ -175,38 +182,49 @@ impl<'a> PrimitiveTypeBuilder<'a> { } } + /// Sets [`Repetition`] for this field and returns itself. pub fn with_repetition(mut self, repetition: Repetition) -> Self { self.repetition = repetition; self } + /// Sets [`LogicalType`] for this field and returns itself. pub fn with_logical_type(mut self, logical_type: LogicalType) -> Self { self.logical_type = logical_type; self } + /// Sets type length. + /// This is only applied to FIXED_LEN_BYTE_ARRAY and INT96 (INTERVAL) types, because + /// they maintain fixed size underlying byte array. + /// By default, value is `0`. pub fn with_length(mut self, length: i32) -> Self { self.length = length; self } + /// Sets precision for Parquet DECIMAL physical type. + /// By default, it equals to `0` and used only for decimal context. pub fn with_precision(mut self, precision: i32) -> Self { self.precision = precision; self } + /// Sets scale for Parquet DECIMAL physical type. + /// By default, it equals to `0` and used only for decimal context. pub fn with_scale(mut self, scale: i32) -> Self { self.scale = scale; self } + /// Sets optional field id. pub fn with_id(mut self, id: i32) -> Self { self.id = Some(id); self } - // Creates a new `PrimitiveType` instance from the gathered attributes. 
- // This also checks various illegal conditions and returns `Err` if that that happen. + /// Creates a new `PrimitiveType` instance from the gathered attributes. + /// Also checks various illegal conditions and returns `Err` if that that happen. pub fn build(self) -> Result { let basic_info = BasicTypeInfo { name: String::from(self.name), @@ -304,6 +322,7 @@ pub struct GroupTypeBuilder<'a> { } impl<'a> GroupTypeBuilder<'a> { + /// Creates new group type builder with provided field name. pub fn new(name: &'a str) -> Self { Self { name: name, @@ -314,27 +333,31 @@ impl<'a> GroupTypeBuilder<'a> { } } + /// Sets [`Repetition`] for this field and returns itself. pub fn with_repetition(mut self, repetition: Repetition) -> Self { self.repetition = Some(repetition); self } + /// Sets [`LogicalType`] for this field and returns itself. pub fn with_logical_type(mut self, logical_type: LogicalType) -> Self { self.logical_type = logical_type; self } + /// Sets a list of fields that should be child nodes of this field. pub fn with_fields(mut self, fields: &mut Vec) -> Self { self.fields.append(fields); self } + /// Sets optional field id. pub fn with_id(mut self, id: i32) -> Self { self.id = Some(id); self } - // Create a new `GroupType` instance from the gathered attributes. + /// Creates a new `GroupType` instance from the gathered attributes. pub fn build(self) -> Result { let basic_info = BasicTypeInfo { name: String::from(self.name), From 0092fcab872cb62f1ab2e9f028d63ae3113f1e4b Mon Sep 17 00:00:00 2001 From: sadikovi Date: Sun, 8 Apr 2018 22:07:42 +1200 Subject: [PATCH 10/32] update schema docs --- src/schema/mod.rs | 8 ++--- src/schema/types.rs | 82 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 68 insertions(+), 22 deletions(-) diff --git a/src/schema/mod.rs b/src/schema/mod.rs index c64c045..5580ec1 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -21,12 +21,12 @@ //! //! ```rust //! use std::rc::Rc; -//! use parquet::schema::types::Type; //! use parquet::basic::{Type as PhysicalType, LogicalType, Repetition}; -//! use parquet::schema::printer; -//! use parquet::schema::parser; +//! use parquet::schema::{parser, printer}; +//! use parquet::schema::types::Type; //! //! // Create the following schema: +//! // //! // message schema { //! // OPTIONAL BYTE_ARRAY a (UTF8); //! // REQUIRED INT32 b; @@ -53,7 +53,7 @@ //! // Print schema into buffer //! printer::print_schema(&mut buf, &schema); //! -//! // Parse schema from the string. +//! // Parse schema from the string //! let string_schema = String::from_utf8(buf).unwrap(); //! let parsed_schema = parser::parse_message_type(&string_schema).unwrap(); //! diff --git a/src/schema/types.rs b/src/schema/types.rs index cb5d0f7..1277590 100644 --- a/src/schema/types.rs +++ b/src/schema/types.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Contains structs and methods to build Parquet schema and schema descriptors. + use std::collections::HashMap; use std::convert::From; use std::fmt; @@ -27,11 +29,15 @@ use parquet_thrift::parquet::SchemaElement; // ---------------------------------------------------------------------- // Parquet Type definitions +/// Type alias for `Rc`. pub type TypePtr = Rc; +/// Type alias for `Rc`. pub type SchemaDescPtr = Rc; +/// Type alias for `Rc`. pub type ColumnDescPtr = Rc; /// Representation of a Parquet type. +/// Used to describe primitive leaf fields and structs, including top-level schema. 
/// Note that the top-level schema type is represented using `GroupType` whose /// repetition is `None`. #[derive(Debug, PartialEq)] @@ -194,7 +200,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { self } - /// Sets type length. + /// Sets type length and returns itself. /// This is only applied to FIXED_LEN_BYTE_ARRAY and INT96 (INTERVAL) types, because /// they maintain fixed size underlying byte array. /// By default, value is `0`. @@ -203,21 +209,21 @@ impl<'a> PrimitiveTypeBuilder<'a> { self } - /// Sets precision for Parquet DECIMAL physical type. + /// Sets precision for Parquet DECIMAL physical type and returns itself. /// By default, it equals to `0` and used only for decimal context. pub fn with_precision(mut self, precision: i32) -> Self { self.precision = precision; self } - /// Sets scale for Parquet DECIMAL physical type. + /// Sets scale for Parquet DECIMAL physical type and returns itself. /// By default, it equals to `0` and used only for decimal context. pub fn with_scale(mut self, scale: i32) -> Self { self.scale = scale; self } - /// Sets optional field id. + /// Sets optional field id and returns itself. pub fn with_id(mut self, id: i32) -> Self { self.id = Some(id); self @@ -346,12 +352,13 @@ impl<'a> GroupTypeBuilder<'a> { } /// Sets a list of fields that should be child nodes of this field. + /// Returns updated self. pub fn with_fields(mut self, fields: &mut Vec) -> Self { self.fields.append(fields); self } - /// Sets optional field id. + /// Sets optional field id and returns itself. pub fn with_id(mut self, id: i32) -> Self { self.id = Some(id); self @@ -383,27 +390,35 @@ pub struct BasicTypeInfo { } impl BasicTypeInfo { + /// Returns field name. pub fn name(&self) -> &str { &self.name } + /// Returns `true` if type has repetition field set, `false` otherwise. + /// This is mostly applied to group type, because primitive type always has + /// repetition set. pub fn has_repetition(&self) -> bool { self.repetition.is_some() } + /// Returns [`Repetition`] value for the type. pub fn repetition(&self) -> Repetition { assert!(self.repetition.is_some()); self.repetition.unwrap() } + /// Returns [`LogicalType`] value for the type. pub fn logical_type(&self) -> LogicalType { self.logical_type } + /// Returns `true` if id is set, `false` otherwise. pub fn has_id(&self) -> bool { self.id.is_some() } + /// Returns id value for the type. pub fn id(&self) -> i32 { assert!(self.id.is_some()); self.id.unwrap() @@ -421,10 +436,22 @@ pub struct ColumnPath { } impl ColumnPath { + /// Creates new column path from vector of field names. pub fn new(parts: Vec) -> Self { ColumnPath { parts: parts } } + /// Returns string representation of this column path. + /// ```rust + /// use parquet::schema::types::ColumnPath; + /// + /// let path = ColumnPath::new(vec![ + /// "a".to_string(), + /// "b".to_string(), + /// "c".to_string() + /// ]); + /// assert_eq!(&path.string(), "a.b.c"); + /// ``` pub fn string(&self) -> String { self.parts.join(".") } @@ -457,7 +484,6 @@ impl From for ColumnPath { } } - /// A descriptor for leaf-level primitive columns. /// This encapsulates information such as definition and repetition levels and is used to /// re-assemble nested data. @@ -483,6 +509,7 @@ pub struct ColumnDescriptor { } impl ColumnDescriptor { + /// Creates new descriptor for leaf-level column. pub fn new( primitive_type: TypePtr, root_type: Option, @@ -499,63 +526,76 @@ impl ColumnDescriptor { } } + /// Returns maximum definition level for this column. 
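+  /// This equals the number of optional and repeated fields in the path from the
+  /// root of the schema to this leaf column.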
pub fn max_def_level(&self) -> i16 { self.max_def_level } + /// Returns maximum repetition level for this column. pub fn max_rep_level(&self) -> i16 { self.max_rep_level } + /// Returns [`ColumnPath`] for this column. pub fn path(&self) -> &ColumnPath { &self.path } + /// Returns root [`Type`] (most top-level parent field) for this leaf column. pub fn root_type(&self) -> &Type { assert!(self.root_type.is_some()); self.root_type.as_ref().unwrap() } + /// Returns column name. pub fn name(&self) -> &str { self.primitive_type.name() } + /// Returns [`LogicalType`] for this column. pub fn logical_type(&self) -> LogicalType { self.primitive_type.get_basic_info().logical_type() } + /// Returns physical type for this column. + /// Note that it will panic if called on a non-primitive type. pub fn physical_type(&self) -> PhysicalType { match self.primitive_type.as_ref() { - &Type::PrimitiveType{ physical_type, .. } => physical_type, + &Type::PrimitiveType { physical_type, .. } => physical_type, _ => panic!("Expected primitive type!") } } + /// Returns type length for this column. + /// Note that it will panic if called on a non-primitive type. pub fn type_length(&self) -> i32 { match self.primitive_type.as_ref() { - &Type::PrimitiveType{ type_length, .. } => type_length, + &Type::PrimitiveType { type_length, .. } => type_length, _ => panic!("Expected primitive type!") } } + /// Returns type precision for this column. + /// Note that it will panic if called on a non-primitive type. pub fn type_precision(&self) -> i32 { match self.primitive_type.as_ref() { - &Type::PrimitiveType{ precision, .. } => precision, + &Type::PrimitiveType { precision, .. } => precision, _ => panic!("Expected primitive type!") } } + /// Returns type scale for this column. + /// Note that it will panic if called on a non-primitive type. pub fn type_scale(&self) -> i32 { match self.primitive_type.as_ref() { - &Type::PrimitiveType{ scale, .. } => scale, + &Type::PrimitiveType { scale, .. } => scale, _ => panic!("Expected primitive type!") } } - } -/// A schema descriptor. This encapsulates the top-level schemas for all the columns, as -/// well as all descriptors for all the primitive columns. +/// A schema descriptor. This encapsulates the top-level schemas for all the columns, +/// as well as all descriptors for all the primitive columns. pub struct SchemaDescriptor { // The top-level schema (the "message" type). // This must be a `GroupType` where each field is a root column type in the schema. @@ -567,7 +607,7 @@ pub struct SchemaDescriptor { // Mapping from a leaf column's index to the root column type that it // comes from. For instance: the leaf `a.b.c.d` would have a link back to `a`: - // -- a <------ + // -- a <-----+ // -- -- b | // -- -- -- c | // -- -- -- -- d @@ -575,6 +615,7 @@ pub struct SchemaDescriptor { } impl SchemaDescriptor { + /// Creates new schema descriptor from Parquet schema. pub fn new(tp: TypePtr) -> Self { assert!(tp.is_group(), "SchemaDescriptor should take a GroupType"); let mut leaves = vec![]; @@ -596,11 +637,11 @@ impl SchemaDescriptor { Self { schema: tp, leaves: leaves, - leaf_to_base: - leaf_to_base + leaf_to_base: leaf_to_base } } + /// Returns [`ColumnDescriptor`] for a field position. pub fn column(&self, i: usize) -> ColumnDescPtr { assert!( i < self.leaves.len(), @@ -611,14 +652,17 @@ impl SchemaDescriptor { self.leaves[i].clone() } + /// Returns slice of [`ColumnDescriptor`]. pub fn columns(&self) -> &[ColumnDescPtr] { &self.leaves } + /// Returns number of leaf-level columns. 
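+  /// Equivalent to the length of the slice returned by `columns()`.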
pub fn num_columns(&self) -> usize { self.leaves.len() } + /// Returns column root [`Type`] for a field position. pub fn get_column_root(&self, i: usize) -> &Type { assert!( i < self.leaves.len(), @@ -631,10 +675,12 @@ impl SchemaDescriptor { result.unwrap().as_ref() } + /// Returns schema as [`Type`]. pub fn root_schema(&self) -> &Type { self.schema.as_ref() } + /// Returns schema name. pub fn name(&self) -> &str { self.schema.name() } @@ -692,8 +738,8 @@ fn build_tree( } } -/// Conversion from Thrift equivalents - +/// Conversion from Thrift equivalents. +/// Should be considered private. pub fn from_thrift(elements: &mut [SchemaElement]) -> Result { let mut index = 0; let mut schema_nodes = Vec::new(); From 1c2420e2a8db6d70cf71fd68731e8ba25486e4a3 Mon Sep 17 00:00:00 2001 From: sadikovi Date: Mon, 9 Apr 2018 21:38:29 +1200 Subject: [PATCH 11/32] add record docs --- src/record/api.rs | 21 ++++++++++++--------- src/record/mod.rs | 2 ++ src/record/reader.rs | 13 ++++++++----- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/record/api.rs b/src/record/api.rs index ba43578..1431d3b 100644 --- a/src/record/api.rs +++ b/src/record/api.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Contains Row enum that is used to represent record in Rust. + use std::fmt; use basic::{LogicalType, Type as PhysicalType}; @@ -32,8 +34,7 @@ macro_rules! nyi { }); } -/// Row API to represent nested Parquet record. -/// Use implementation methods to update how Parquet types are mapped to Row. +/// Row API to represent a nested Parquet record. #[derive(Clone, Debug, PartialEq)] pub enum Row { // Primitive types @@ -55,7 +56,7 @@ pub enum Row { } impl Row { - /// Converts BOOLEAN into boolean value. + /// Converts Parquet BOOLEAN type with logical type into `bool` value. pub fn convert_bool( _physical_type: PhysicalType, _logical_type: LogicalType, @@ -64,7 +65,7 @@ impl Row { Row::Bool(value) } - // Converts INT32 into integer value. + /// Converts Parquet INT32 type with logical type into `i32` value. pub fn convert_int32( physical_type: PhysicalType, logical_type: LogicalType, @@ -78,7 +79,7 @@ impl Row { } } - /// Converts INT64 into long value. + /// Converts Parquet INT64 type with logical type into `i64` value. pub fn convert_int64( physical_type: PhysicalType, logical_type: LogicalType, @@ -90,7 +91,8 @@ impl Row { } } - /// Converts nanosecond timestamps stored as INT96 into milliseconds + /// Converts Parquet INT96 (nanosecond timestamps) type and logical type into + /// `Timestamp` value. pub fn convert_int96( _physical_type: PhysicalType, _logical_type: LogicalType, @@ -108,7 +110,7 @@ impl Row { Row::Timestamp(millis) } - /// Converts FLOAT into float value. + /// Converts Parquet FLOAT type with logical type into `f32` value. pub fn convert_float( _physical_type: PhysicalType, _logical_type: LogicalType, @@ -117,7 +119,7 @@ impl Row { Row::Float(value) } - /// Converts DOUBLE into double value. + /// Converts Parquet DOUBLE type with logical type into `f64` value. pub fn convert_double( _physical_type: PhysicalType, _logical_type: LogicalType, @@ -126,7 +128,8 @@ impl Row { Row::Double(value) } - /// Converts BYTE_ARRAY into either UTF8 string or array of bytes. + /// Converts Parquet BYTE_ARRAY type with logical type into either UTF8 string or + /// array of bytes. 
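+  /// For example, a BYTE_ARRAY with UTF8 logical type is converted into a string
+  /// value, while a BYTE_ARRAY with no logical type is returned as raw bytes.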
pub fn convert_byte_array( physical_type: PhysicalType, logical_type: LogicalType, diff --git a/src/record/mod.rs b/src/record/mod.rs index c59c51f..1760242 100644 --- a/src/record/mod.rs +++ b/src/record/mod.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Contains record-based API for reading Parquet files. + pub mod api; pub mod reader; mod triplet; diff --git a/src/record/reader.rs b/src/record/reader.rs index 1c79495..c8b4eaf 100644 --- a/src/record/reader.rs +++ b/src/record/reader.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Contains implementation of record assembly and converting Parquet types into `Row`s. + use std::collections::HashMap; use std::rc::Rc; @@ -29,7 +31,8 @@ use record::triplet::TripletIter; const DEFAULT_BATCH_SIZE: usize = 256; /// Tree builder for `Reader` enum. -/// Serves as a container of options for building a reader tree and a builder. +/// Serves as a container of options for building a reader tree and a builder, and +/// accessing a records iterator [`RowIter`]. pub struct TreeBuilder { // Batch size (>= 1) for triplet iterators batch_size: usize @@ -476,7 +479,7 @@ impl Reader { // ---------------------------------------------------------------------- // Row iterators -/// Iterator of `Row`s. +/// Iterator of [`Row`]s. /// It is used either for a single row group to iterate over data in that row group, or /// an entire file with auto buffering of all row groups. pub struct RowIter<'a> { @@ -489,7 +492,7 @@ pub struct RowIter<'a> { } impl<'a> RowIter<'a> { - /// Creates iterator of `Row`s for all row groups in a file. + /// Creates iterator of [`Row`]s for all row groups in a file. pub fn from_file(proj: Option, reader: &'a FileReader) -> Result { let descr = Self::get_proj_descr(proj, reader.metadata().file_metadata().schema_descr_ptr())?; @@ -505,7 +508,7 @@ impl<'a> RowIter<'a> { }) } - /// Creates iterator of `Row`s for a specific row group. + /// Creates iterator of [`Row`]s for a specific row group. pub fn from_row_group(proj: Option, reader: &'a RowGroupReader) -> Result { let descr = Self::get_proj_descr(proj, reader.metadata().schema_descr_ptr())?; let tree_builder = Self::tree_builder(); @@ -579,7 +582,7 @@ impl<'a> Iterator for RowIter<'a> { } } -/// Internal iterator of `Row`s for a reader. +/// Internal iterator of [`Row`]s for a reader. pub struct ReaderIter { root_reader: Reader, records_left: usize From f37efd8c002bacbf7917c6356cd0f2f198540e81 Mon Sep 17 00:00:00 2001 From: sadikovi Date: Mon, 9 Apr 2018 22:40:11 +1200 Subject: [PATCH 12/32] add column docs --- src/column/mod.rs | 78 ++++++++++++++++++++++++++++++++++++++++++++ src/column/page.rs | 37 ++++++++++++--------- src/column/reader.rs | 9 +++-- src/lib.rs | 2 +- 4 files changed, 108 insertions(+), 18 deletions(-) diff --git a/src/column/mod.rs b/src/column/mod.rs index e6ab26d..2683fc2 100644 --- a/src/column/mod.rs +++ b/src/column/mod.rs @@ -15,5 +15,83 @@ // specific language governing permissions and limitations // under the License. +//! Low level column reader API. +//! +//! This API is designed for the direct mapping with subsequent manual handling of +//! definition and repetition values and spacing. This allows to create column vectors +//! and batches and map them directly to Parquet data. +//! +//! See below an example of using the API. +//! +//! # Example +//! ```rust +//! use std::fs::File; +//! use std::path::Path; +//! +//! 
use parquet::basic::Type; +//! use parquet::data_type::Int32Type; +//! use parquet::column::reader::get_typed_column_reader; +//! use parquet::file::reader::{FileReader, SerializedFileReader}; +//! +//! // Open Parquet file and initialize reader +//! let path = Path::new("data/alltypes_plain.parquet"); +//! let file = File::open(&path).unwrap(); +//! let parquet_reader = SerializedFileReader::new(file).unwrap(); +//! let metadata = parquet_reader.metadata(); +//! +//! for i in 0..metadata.num_row_groups() { +//! let row_group_reader = parquet_reader.get_row_group(i).unwrap(); +//! let row_group_metadata = metadata.row_group(i); +//! +//! for j in 0..row_group_metadata.num_columns() { +//! let column = row_group_metadata.column(j); +//! +//! // Extract column reader and map to typed column reader for required columns. +//! let column_reader = row_group_reader +//! .get_column_reader(j) +//! .expect("Valid column reader"); +//! +//! // Extract typed column reader for any INT32 column in the file. +//! // It is also possible to extract certain columns based on column descriptors +//! // from metadata. +//! +//! match column.column_type() { +//! Type::INT32 => { +//! let mut typed_column_reader = +//! get_typed_column_reader::(column_reader); +//! +//! // See `read_batch` method for comments on different parameters. +//! let mut values = vec![0; 16]; +//! let mut def_levels = vec![0; 16]; +//! let mut rep_levels = vec![0; 16]; +//! +//! let num_values = typed_column_reader.read_batch( +//! 8, // batch size +//! Some(&mut def_levels), // definition levels +//! Some(&mut rep_levels), // repetition levels +//! &mut values // read values +//! ); +//! +//! println!( +//! "Read {:?} values, values: {:?}, def_levels: {:?}, rep_levels: {:?}", +//! num_values, +//! values, +//! def_levels, +//! rep_levels, +//! ); +//! }, +//! _ => { +//! // Skip any other columns for now, but there could be similar processing. +//! println!( +//! "Skipped column {} of type {}", +//! column.column_path().string(), +//! column.column_type() +//! ); +//! } +//! } +//! } +//! } +//! ``` + pub mod page; pub mod reader; diff --git a/src/column/page.rs b/src/column/page.rs index ac121dd..cc24b39 100644 --- a/src/column/page.rs +++ b/src/column/page.rs @@ -15,12 +15,15 @@ // specific language governing permissions and limitations // under the License. +//! Contains Parquet Page definitions and page reader interface. + use basic::{PageType, Encoding}; use errors::Result; use util::memory::ByteBufferPtr; /// Parquet Page definition. /// +/// List of supported pages. /// These are 1-to-1 mapped from the equivalent Thrift definitions, except `buf` which /// used to store uncompressed bytes of the page. pub enum Page { @@ -50,44 +53,48 @@ pub enum Page { } impl Page { + /// Returns [`PageType`] for this page. pub fn page_type(&self) -> PageType { match self { - &Page::DataPage{ .. } => PageType::DATA_PAGE, - &Page::DataPageV2{ .. } => PageType::DATA_PAGE_V2, - &Page::DictionaryPage{ .. } => PageType::DICTIONARY_PAGE + &Page::DataPage { .. } => PageType::DATA_PAGE, + &Page::DataPageV2 { .. } => PageType::DATA_PAGE_V2, + &Page::DictionaryPage { .. } => PageType::DICTIONARY_PAGE } } + /// Returns internal byte buffer reference for this page. pub fn buffer(&self) -> &ByteBufferPtr { match self { - &Page::DataPage{ ref buf, .. } => &buf, - &Page::DataPageV2{ ref buf, .. } => &buf, - &Page::DictionaryPage{ ref buf, .. } => &buf + &Page::DataPage { ref buf, .. } => &buf, + &Page::DataPageV2 { ref buf, .. 
} => &buf,
+      &Page::DictionaryPage { ref buf, .. } => &buf
     }
   }
 
+  /// Returns number of values in this page.
   pub fn num_values(&self) -> u32 {
     match self {
-      &Page::DataPage{ num_values, .. } => num_values,
-      &Page::DataPageV2{ num_values, .. } => num_values,
-      &Page::DictionaryPage{ num_values, .. } => num_values
+      &Page::DataPage { num_values, .. } => num_values,
+      &Page::DataPageV2 { num_values, .. } => num_values,
+      &Page::DictionaryPage { num_values, .. } => num_values
     }
   }
 
+  /// Returns this page [`Encoding`].
   pub fn encoding(&self) -> Encoding {
     match self {
-      &Page::DataPage{ encoding, .. } => encoding,
-      &Page::DataPageV2{ encoding, .. } => encoding,
-      &Page::DictionaryPage{ encoding, .. } => encoding
+      &Page::DataPage { encoding, .. } => encoding,
+      &Page::DataPageV2 { encoding, .. } => encoding,
+      &Page::DictionaryPage { encoding, .. } => encoding
     }
   }
 }
 
-/// API for reading pages from a column chunk. This offers a iterator like API to get the
-/// next page.
+/// API for reading pages from a column chunk.
+/// This offers an iterator-like API to get the next page.
 pub trait PageReader {
   /// Gets the next page in the column chunk associated with this reader.
-  /// Returns `None` if there's no page left.
+  /// Returns `None` if there are no pages left.
   fn get_next_page(&mut self) -> Result<Option<Page>>;
 }
diff --git a/src/column/reader.rs b/src/column/reader.rs
index d3b5e0d..0961681 100644
--- a/src/column/reader.rs
+++ b/src/column/reader.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Contains column reader API.
+
 use std::cmp::{max, min};
 use std::collections::HashMap;
 use std::mem;
@@ -28,6 +30,7 @@ use errors::{Result, ParquetError};
 use schema::types::ColumnDescPtr;
 use util::memory::ByteBufferPtr;
 
+/// Column reader for a Parquet type.
 pub enum ColumnReader {
   BoolColumnReader(ColumnReaderImpl<BoolType>),
   Int32ColumnReader(ColumnReaderImpl<Int32Type>),
@@ -67,6 +70,7 @@ pub fn get_column_reader(
 
 /// Gets a typed column reader for the specific type `T`, by "up-casting" `col_reader` of
 /// non-generic type to a generic column reader type `ColumnReaderImpl<T>`.
+///
 /// NOTE: the caller MUST guarantee that the actual enum value for `col_reader` matches
 /// the type `T`. Otherwise, disastrous consequence could happen.
 pub fn get_typed_column_reader<T: DataType>(
@@ -84,7 +88,7 @@ pub fn get_typed_column_reader<T: DataType>(
   }
 }
 
-/// A value reader for a particular primitive column.
+/// Typed value reader for a particular primitive column.
 pub struct ColumnReaderImpl<T: DataType> {
   descr: ColumnDescPtr,
   def_level_decoder: Option<LevelDecoder>,
@@ -104,6 +108,7 @@ pub struct ColumnReaderImpl<T: DataType> {
 }
 
 impl<T: DataType> ColumnReaderImpl<T> where T: 'static {
+  /// Creates new column reader based on column descriptor and page reader.
   pub fn new(descr: ColumnDescPtr, page_reader: Box<PageReader>) -> Self {
     Self {
       descr: descr,
@@ -335,7 +340,7 @@ impl<T: DataType> ColumnReaderImpl<T> where T: 'static {
     Ok(true)
   }
 
-  // Resolves and updates encoding and set decoder for the current page
+  /// Resolves and updates encoding and sets the decoder for the current page.
   fn set_current_page_encoding(
     &mut self,
     mut encoding: Encoding,
diff --git a/src/lib.rs b/src/lib.rs
index 4bc0a48..b971fb4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -84,7 +84,7 @@
 //!
 //! # Read API
 //!
-//! Crate has several methods to read data from a Parquet file:
+//! Crate offers several methods to read data from a Parquet file:
 //! - Low level column reader API (see [`column`] module)
 //! - Arrow API (_TODO_)
 //!
- High level record API (see [`record`] module) From a7f3bb7e6455fb11f88cb54a538973aabcfc6dcb Mon Sep 17 00:00:00 2001 From: sadikovi Date: Tue, 10 Apr 2018 10:59:51 +1200 Subject: [PATCH 13/32] add file doc and some minor updates --- src/column/mod.rs | 3 +- src/file/metadata.rs | 79 ++++++++++++++++++++++++++++++++++---------- src/file/mod.rs | 21 ++++++++++++ src/file/reader.rs | 55 ++++++++++++++++++------------ src/lib.rs | 25 ++++++++++++-- 5 files changed, 140 insertions(+), 43 deletions(-) diff --git a/src/column/mod.rs b/src/column/mod.rs index 2683fc2..77cceba 100644 --- a/src/column/mod.rs +++ b/src/column/mod.rs @@ -18,12 +18,13 @@ //! Low level column reader API. //! //! This API is designed for the direct mapping with subsequent manual handling of -//! definition and repetition values and spacing. This allows to create column vectors +//! definition and repetition levels and spacing. This allows to create column vectors //! and batches and map them directly to Parquet data. //! //! See below an example of using the API. //! //! # Example +//! //! ```rust //! use std::fs::File; //! use std::path::Path; diff --git a/src/file/metadata.rs b/src/file/metadata.rs index d1fac90..8c36bd3 100644 --- a/src/file/metadata.rs +++ b/src/file/metadata.rs @@ -15,6 +15,18 @@ // specific language governing permissions and limitations // under the License. +//! Contains information about available Parquet metadata. +//! The hierarchy of metadata is as follows: +//! +//! [`ParquetMetaData`] contains [`FileMetaData`] and zero or more [`RowGroupMetaData`] +//! for each row group. +//! +//! Each [`RowGroupMetaData`] contains one or more [`ColumnChunkMetaData`] for each +//! column chunk. +//! +//! Each [`ColumnChunkMetaData`] contains information about the leaf (primitive) column, +//! including descriptor, page metadata, byte sizes, compression, encodings, etc. + use std::rc::Rc; use basic::{Compression, Encoding, Type}; @@ -23,14 +35,17 @@ use schema::types::{ColumnDescriptor, ColumnDescPtr, ColumnPath}; use schema::types::{SchemaDescriptor, SchemaDescPtr, Type as SchemaType, TypePtr}; use parquet_thrift::parquet::{ColumnChunk, ColumnMetaData, RowGroup}; +/// Reference counted pointer for [`ParquetMetaData`]. pub type ParquetMetaDataPtr = Rc; +/// Global Parquet metadata. pub struct ParquetMetaData { file_metadata: FileMetaDataPtr, row_groups: Vec } impl ParquetMetaData { + /// Creates new Parquet metadata from file metadata and list of row group metadata. pub fn new(file_metadata: FileMetaData, row_groups: Vec) -> Self { ParquetMetaData { file_metadata: Rc::new(file_metadata), @@ -38,26 +53,32 @@ impl ParquetMetaData { } } + /// Returns [`FileMetaData`] reference. pub fn file_metadata(&self) -> FileMetaDataPtr { self.file_metadata.clone() } + /// Returns number of row groups in this file. pub fn num_row_groups(&self) -> usize { self.row_groups.len() } + /// Returns row group for a particular position. + /// Position should be less than number of row groups `num_row_groups`. pub fn row_group(&self, i: usize) -> RowGroupMetaDataPtr { self.row_groups[i].clone() } + /// Returns slice of row group reference counted pointers in this file. pub fn row_groups(&self) -> &[RowGroupMetaDataPtr] { &self.row_groups.as_slice() } } +/// Reference counted pointer for [`FileMetaData`]. pub type FileMetaDataPtr = Rc; -/// Metadata for a Parquet file +/// Metadata for a Parquet file. 
pub struct FileMetaData { version: i32, num_rows: i64, @@ -67,6 +88,7 @@ pub struct FileMetaData { } impl FileMetaData { + /// Creates new file metadata. pub fn new( version: i32, num_rows: i64, @@ -83,34 +105,47 @@ impl FileMetaData { } } + /// Returns version of this file. pub fn version(&self) -> i32 { self.version } + /// Returns number of rows in the file. pub fn num_rows(&self) -> i64 { self.num_rows } + /// String message for application that wrote this file. This will have format + /// ` version (build )`. + /// For example, + /// ```shell + /// parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf) + /// impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9) + /// ``` pub fn created_by(&self) -> &Option { &self.created_by } + /// Returns reference to the Parquet schema. pub fn schema(&self) -> &SchemaType { self.schema.as_ref() } + /// Returns reference to schema descriptor for the file. pub fn schema_descr(&self) -> &SchemaDescriptor { &self.schema_descr } + /// Returns [`Rc`] for schema descriptor. pub fn schema_descr_ptr(&self) -> SchemaDescPtr { self.schema_descr.clone() } } +/// Reference counted pointer for [`RowGroupMetaData`]. pub type RowGroupMetaDataPtr = Rc; -/// Metadata for a row group +/// Metadata for a row group. pub struct RowGroupMetaData { columns: Vec, num_rows: i64, @@ -119,34 +154,42 @@ pub struct RowGroupMetaData { } impl RowGroupMetaData { + /// Number of columns in this row group. pub fn num_columns(&self) -> usize { self.columns.len() } + /// Returns reference to a [`ColumnChunkMetaData`] for a column index. pub fn column(&self, i: usize) -> &ColumnChunkMetaData { &self.columns[i] } + /// Returns slice of column chunk metadata [`Rc`] pointers. pub fn columns(&self) -> &[ColumnChunkMetaDataPtr] { &self.columns } + /// Returns number of rows in this row group. pub fn num_rows(&self) -> i64 { self.num_rows } + /// Returns total byte size of all the uncompressed column data in this row group. pub fn total_byte_size(&self) -> i64 { self.total_byte_size } + /// Returns reference to a schema descriptor. pub fn schema_descr(&self) -> &SchemaDescriptor { self.schema_descr.as_ref() } + /// Returns reference counted clone of schema descriptor. pub fn schema_descr_ptr(&self) -> SchemaDescPtr { self.schema_descr.clone() } + /// Internal conversion from Thrift method. pub fn from_thrift( schema_descr: SchemaDescPtr, mut rg: RowGroup @@ -168,9 +211,10 @@ impl RowGroupMetaData { } } +/// Reference counted pointer for [`ColumnChunkMetaData`]. pub type ColumnChunkMetaDataPtr = Rc; -/// Metadata for a column chunk +/// Metadata for a column chunk. pub struct ColumnChunkMetaData { column_type: Type, column_path: ColumnPath, @@ -187,7 +231,7 @@ pub struct ColumnChunkMetaData { dictionary_page_offset: Option } -/// Represents common operations for a column chunk +/// Represents common operations for a column chunk. impl ColumnChunkMetaData { /// File where the column chunk is stored. If not set, assumed to /// be at the same file as the metadata. @@ -206,73 +250,74 @@ impl ColumnChunkMetaData { self.column_type } - /// Path (or identifier) of this column + /// Path (or identifier) of this column. pub fn column_path(&self) -> &ColumnPath { &self.column_path } - /// Descriptor for this column + /// Descriptor for this column. pub fn column_descr(&self) -> &ColumnDescriptor { self.column_descr.as_ref() } - /// Reference counted clone of descriptor for this column + /// Reference counted clone of descriptor for this column. 
  pub fn column_descr_ptr(&self) -> ColumnDescPtr {
     self.column_descr.clone()
   }
 
-  /// All encodings used for this column
+  /// All encodings used for this column.
   pub fn encodings(&self) -> &Vec<Encoding> {
     &self.encodings
   }
 
-  /// Total number of values in this column chunk
+  /// Total number of values in this column chunk.
   pub fn num_values(&self) -> i64 {
     self.num_values
   }
 
+  /// Compression for this column.
   pub fn compression(&self) -> Compression {
     self.compression
   }
 
-  /// Get the total compressed data size of this column chunk
+  /// Returns the total compressed data size of this column chunk.
   pub fn compressed_size(&self) -> i64 {
     self.total_compressed_size
   }
 
-  /// Get the total uncompressed data size of this column chunk
+  /// Returns the total uncompressed data size of this column chunk.
   pub fn uncompressed_size(&self) -> i64 {
     self.total_uncompressed_size
   }
 
-  /// Get the offset for the column data
+  /// Returns the offset for the column data.
   pub fn data_page_offset(&self) -> i64 {
     self.data_page_offset
   }
 
-  /// Whether this column chunk contains a index page
+  /// Returns `true` if this column chunk contains an index page, `false` otherwise.
   pub fn has_index_page(&self) -> bool {
     self.index_page_offset.is_some()
   }
 
-  /// Get the offset for the index page
+  /// Returns the offset for the index page.
   pub fn index_page_offset(&self) -> Option<i64> {
     self.index_page_offset
   }
 
-  /// Whether this column chunk contains a dictionary page
+  /// Returns `true` if this column chunk contains a dictionary page, `false` otherwise.
   pub fn has_dictionary_page(&self) -> bool {
     self.dictionary_page_offset.is_some()
   }
 
   /// TODO: add statistics
 
-  /// Get the offset for the dictionary page, if any
+  /// Returns the offset for the dictionary page, if any.
   pub fn dictionary_page_offset(&self) -> Option<i64> {
     self.dictionary_page_offset
   }
 
-  /// Conversion from Thrift
+  /// Internal conversion from Thrift method.
   pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
     if cc.meta_data.is_none() {
       return Err(general_err!("Expected to have column metadata"));
diff --git a/src/file/mod.rs b/src/file/mod.rs
index 0b63bd8..fcc4784 100644
--- a/src/file/mod.rs
+++ b/src/file/mod.rs
@@ -15,5 +15,26 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Main entrypoint of working with Parquet files.
+//! Contains various file metadata, file and row group readers.
+//!
+//! # Example
+//!
+//! ```rust
+//! use std::fs::File;
+//! use std::path::Path;
+//! use parquet::file::reader::{FileReader, SerializedFileReader};
+//!
+//! // Opening a file
+//! let path = Path::new("data/alltypes_plain.parquet");
+//! let file = File::open(&path).expect("File should exist");
+//!
+//! let reader = SerializedFileReader::new(file).expect("Valid Parquet file");
+//!
+//! let parquet_metadata = reader.metadata();
+//! let row_group = reader.get_row_group(0);
+//! ```
+//!
+
 pub mod metadata;
 pub mod reader;
diff --git a/src/file/reader.rs b/src/file/reader.rs
index 33ce0dc..a5ce9c8 100644
--- a/src/file/reader.rs
+++ b/src/file/reader.rs
@@ -15,6 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Contains file reader API, and provides methods to access file metadata, row group
+//! readers to read individual column chunks, and the record iterator.
+
 use std::fs::File;
 use std::io::{self, BufReader, Read, Seek, SeekFrom};
 use std::rc::Rc;
@@ -38,22 +41,21 @@
 use util::memory::ByteBufferPtr;
 
 // APIs for file & row group readers
 
 /// Parquet file reader API.
With this, user can get metadata information about the -/// Parquet file, and can get reader for each row group. +/// Parquet file, can get reader for each row group, and access record iterator. pub trait FileReader { - /// Get metadata information about this file + /// Get metadata information about this file. fn metadata(&self) -> ParquetMetaDataPtr; - /// Get the total number of row groups for this file + /// Get the total number of row groups for this file. fn num_row_groups(&self) -> usize; /// Get the `i`th row group reader. Note this doesn't do bound check. - /// N.B.: Box<..> has 'static lifetime in default, but here we need the lifetime to be - /// the same as this. Otherwise, the row group metadata stored in the row group reader - /// may outlive the file reader. fn get_row_group(&self, i: usize) -> Result>; /// Get full iterator of `Row` from a file (over all row groups). + /// /// Iterator will automatically load the next row group to advance. + /// /// Projected schema can be a subset of or equal to the file schema, when it is None, /// full file schema is assumed. fn get_row_iter(&self, projection: Option) -> Result; @@ -65,13 +67,13 @@ pub trait RowGroupReader { /// Get metadata information about this row group. fn metadata(&self) -> RowGroupMetaDataPtr; - /// Get the total number of column chunks in this row group + /// Get the total number of column chunks in this row group. fn num_columns(&self) -> usize; - /// Get page reader for the `i`th column chunk + /// Get page reader for the `i`th column chunk. fn get_column_page_reader(&self, i: usize) -> Result>; - /// Get value reader for the `i`th column chunk + /// Get value reader for the `i`th column chunk. fn get_column_reader(&self, i: usize) -> Result; /// Get iterator of `Row` from this row group. @@ -80,14 +82,15 @@ pub trait RowGroupReader { fn get_row_iter(&self, projection: Option) -> Result; } - /// A thin wrapper on `T: Read` to be used by Thrift transport. Write is not supported. -pub struct TMemoryBuffer<'a, T> where T: 'a + Read { +struct TMemoryBuffer<'a, T> where T: 'a + Read { data: &'a mut T } impl<'a, T: 'a + Read> TMemoryBuffer<'a, T> { - pub fn new(data: &'a mut T) -> Self { Self { data: data } } + fn new(data: &'a mut T) -> Self { + Self { data: data } + } } impl<'a, T: 'a + Read> Read for TMemoryBuffer<'a, T> { @@ -103,12 +106,16 @@ impl<'a, T: 'a + Read> Read for TMemoryBuffer<'a, T> { const FOOTER_SIZE: usize = 8; const PARQUET_MAGIC: [u8; 4] = [b'P', b'A', b'R', b'1']; +/// A serialized implementation for a file reader. +/// Main entrypoint for using Parquet API. pub struct SerializedFileReader { buf: BufReader, metadata: ParquetMetaDataPtr } impl SerializedFileReader { + /// Creates file reader from a Parquet file. + /// Returns error if Parquet file does not exist or is corrupt. pub fn new(file: File) -> Result { let mut buf = BufReader::new(file); let metadata = Self::parse_metadata(&mut buf)?; @@ -125,25 +132,25 @@ impl SerializedFileReader { let file_metadata = buf.get_ref().metadata()?; let file_size = file_metadata.len(); if file_size < (FOOTER_SIZE as u64) { - return Err(general_err!("Invalid parquet file. Size is smaller than footer")); + return Err(general_err!("Invalid Parquet file. Size is smaller than footer")); } let mut footer_buffer: [u8; FOOTER_SIZE] = [0; FOOTER_SIZE]; buf.seek(SeekFrom::End(-(FOOTER_SIZE as i64)))?; buf.read_exact(&mut footer_buffer)?; if footer_buffer[4..] != PARQUET_MAGIC { - return Err(general_err!("Invalid parquet file. 
Corrupt footer")); + return Err(general_err!("Invalid Parquet file. Corrupt footer")); } let metadata_len = LittleEndian::read_i32(&footer_buffer[0..4]) as i64; if metadata_len < 0 { return Err(general_err!( - "Invalid parquet file. Metadata length is less than zero ({})", + "Invalid Parquet file. Metadata length is less than zero ({})", metadata_len )); } let metadata_start: i64 = file_size as i64 - FOOTER_SIZE as i64 - metadata_len; if metadata_start < 0 { return Err(general_err!( - "Invalid parquet file. Metadata start is less than zero ({})", + "Invalid Parquet file. Metadata start is less than zero ({})", metadata_start )); } @@ -185,6 +192,7 @@ impl FileReader for SerializedFileReader { fn get_row_group(&self, i: usize) -> Result> { let row_group_metadata = self.metadata.row_group(i); + // Row groups should be processed sequentially. let f = self.buf.get_ref().try_clone()?; Ok(Box::new(SerializedRowGroupReader::new(f, row_group_metadata))) } @@ -194,13 +202,14 @@ impl FileReader for SerializedFileReader { } } -/// A serialized impl for row group reader +/// A serialized implementation for row group reader. pub struct SerializedRowGroupReader { buf: BufReader, metadata: RowGroupMetaDataPtr } impl SerializedRowGroupReader { + /// Creates new serialized row group reader. pub fn new(file: File, metadata: RowGroupMetaDataPtr) -> Self { let buf = BufReader::new(file); Self { buf, metadata } @@ -262,7 +271,7 @@ impl RowGroupReader for SerializedRowGroupReader { } -/// A serialized impl for Parquet page reader +/// A serialized implementation for Parquet page reader. pub struct SerializedPageReader { // The file chunk buffer which references exactly the bytes for the column trunk to be // read by this page reader @@ -279,6 +288,7 @@ pub struct SerializedPageReader { } impl SerializedPageReader { + /// Creates new serialized page reader from file chunk. pub fn new( buf: FileChunk, total_num_values: i64, @@ -294,6 +304,7 @@ impl SerializedPageReader { Ok(result) } + /// Reads Page header from Thrift. fn read_page_header(&mut self) -> Result { let transport = TMemoryBuffer::new(&mut self.buf); let mut prot = TCompactInputProtocol::new(transport); @@ -421,7 +432,7 @@ mod tests { assert!(reader_result.is_err()); assert_eq!( reader_result.err().unwrap(), - general_err!("Invalid parquet file. Size is smaller than footer") + general_err!("Invalid Parquet file. Size is smaller than footer") ); } @@ -432,7 +443,7 @@ mod tests { assert!(reader_result.is_err()); assert_eq!( reader_result.err().unwrap(), - general_err!("Invalid parquet file. Corrupt footer") + general_err!("Invalid Parquet file. Corrupt footer") ); } @@ -444,7 +455,7 @@ mod tests { assert!(reader_result.is_err()); assert_eq!( reader_result.err().unwrap(), - general_err!("Invalid parquet file. Metadata length is less than zero (-16777216)") + general_err!("Invalid Parquet file. Metadata length is less than zero (-16777216)") ); } @@ -456,7 +467,7 @@ mod tests { assert!(reader_result.is_err()); assert_eq!( reader_result.err().unwrap(), - general_err!("Invalid parquet file. Metadata start is less than zero (-255)") + general_err!("Invalid Parquet file. Metadata start is less than zero (-255)") ); } diff --git a/src/lib.rs b/src/lib.rs index b971fb4..7dfa94e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,18 +23,38 @@ //! Crate provides API to access file schema and metadata from a Parquet file, extract //! row groups or column chunks from a file, and read records/values. //! +//! # Usage +//! +//! 
This crate is [on crates.io](https://crates.io/crates/parquet) and can be used by
+//! adding `parquet` to the list of dependencies in `Cargo.toml`.
+//!
+//! ```toml
+//! [dependencies]
+//! parquet = "0.1"
+//! ```
+//!
+//! and this to the project's crate root:
+//!
+//! ```rust
+//! extern crate parquet;
+//! ```
+//!
 //! # Example
+//!
+//! Below is an example of reading a Parquet file, listing Parquet metadata including
+//! column chunk metadata, using the record API and accessing row group readers.
+//!
 //! ```rust
 //! use std::fs::File;
 //! use std::path::Path;
 //! use parquet::file::reader::{FileReader, SerializedFileReader};
 //!
-//! // Opening a file
+//! // Creating a file reader
 //! let path = Path::new("data/alltypes_plain.parquet");
 //! let file = File::open(&path).expect("File should exist");
 //! let reader = SerializedFileReader::new(file).expect("Valid Parquet file");
 //!
-//! // Accessing Parquet metadata
+//! // Listing Parquet metadata
 //! let parquet_metadata = reader.metadata();
 //! let file_metadata = parquet_metadata.file_metadata();
 //! for i in 0..parquet_metadata.num_row_groups() {
@@ -57,7 +77,6 @@
 //! for i in 0..reader.num_row_groups() {
 //!   let row_group_reader = reader.get_row_group(i).expect("Should be okay");
 //! }
-//!
 //! ```
 //!
 //! # Metadata
From c2e1bdd5c04f4501d5e5b8a81538613fa807ee42 Mon Sep 17 00:00:00 2001
From: sadikovi
Date: Tue, 10 Apr 2018 11:49:20 +1200
Subject: [PATCH 14/32] update readme

---
 README.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/README.md b/README.md
index e134585..bfe27a3 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,33 @@
 An [Apache Parquet](https://parquet.apache.org/) implementation in Rust (work in progress)
 
+## Usage
+Add this to your Cargo.toml:
+```toml
+[dependencies]
+parquet = "0.1"
+```
+
+and this to your crate root:
+```rust
+extern crate parquet;
+```
+
+Example usage:
+```rust
+use std::fs::File;
+use std::path::Path;
+use parquet::file::reader::{FileReader, SerializedFileReader};
+
+let file = File::open(&Path::new("/path/to/file")).expect("File should exist");
+let reader = SerializedFileReader::new(file).expect("Valid Parquet file");
+let mut iter = reader.get_row_iter(None).unwrap();
+while let Some(record) = iter.next() {
+  println!("{}", record);
+}
+```
+See the crate documentation for the available API.
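+
+If only some of the columns are needed, a projected schema can be passed to
+`get_row_iter`. Below is a minimal sketch of this (the message type string and the
+`id` column are hypothetical placeholders - match them to the actual file schema):
+```rust
+use std::fs::File;
+use std::path::Path;
+use parquet::file::reader::{FileReader, SerializedFileReader};
+use parquet::schema::parser::parse_message_type;
+
+let file = File::open(&Path::new("/path/to/file")).expect("File should exist");
+let reader = SerializedFileReader::new(file).expect("Valid Parquet file");
+
+// Project only the `id` column instead of reading the full file schema.
+let schema = parse_message_type("
+  message schema {
+    REQUIRED INT32 id;
+  }
+").expect("Valid message type");
+
+let mut iter = reader.get_row_iter(Some(schema)).expect("Valid projection");
+while let Some(record) = iter.next() {
+  println!("{}", record);
+}
+```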
+ ## Requirements - Rust nightly - Thrift 0.11.0 or higher From 99a135390b19c394c87234c0f25eb68c39c9b930 Mon Sep 17 00:00:00 2001 From: sadikovi Date: Tue, 10 Apr 2018 11:56:45 +1200 Subject: [PATCH 15/32] add docs.rs link --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bfe27a3..a6c68ac 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![Coverage Status](https://coveralls.io/repos/github/sunchao/parquet-rs/badge.svg?branch=master)](https://coveralls.io/github/sunchao/parquet-rs?branch=master) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![](http://meritbadge.herokuapp.com/parquet)](https://crates.io/crates/parquet) +[![Documentation](https://docs.rs/parquet/badge.svg)](https://docs.rs/parquet) An [Apache Parquet](https://parquet.apache.org/) implementation in Rust (work in progress) @@ -25,8 +26,8 @@ use std::fs::File; use std::path::Path; use parquet::file::reader::{FileReader, SerializedFileReader}; -let file = File::open(&Path::new("/path/to/file")).expect("File should exist"); -let reader = SerializedFileReader::new(file).expect("Valid Parquet file"); +let file = File::open(&Path::new("/path/to/file")).unwrap(); +let reader = SerializedFileReader::new(file).unwrap(); let mut iter = reader.get_row_iter(None).unwrap(); while let Some(record) = iter.next() { println!("{}", record); From 5d34f72aeb568f18a7ec11cad1b57d91d4bce769 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Tue, 10 Apr 2018 20:13:23 +1200 Subject: [PATCH 16/32] update docs --- src/file/metadata.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/file/metadata.rs b/src/file/metadata.rs index 8c36bd3..9c764eb 100644 --- a/src/file/metadata.rs +++ b/src/file/metadata.rs @@ -16,6 +16,7 @@ // under the License. //! Contains information about available Parquet metadata. +//! //! The hierarchy of metadata is as follows: //! //! [`ParquetMetaData`] contains [`FileMetaData`] and zero or more [`RowGroupMetaData`] @@ -63,7 +64,7 @@ impl ParquetMetaData { self.row_groups.len() } - /// Returns row group for a particular position. + /// Returns row group metadata at `i`th position. /// Position should be less than number of row groups `num_row_groups`. pub fn row_group(&self, i: usize) -> RowGroupMetaDataPtr { self.row_groups[i].clone() @@ -136,7 +137,7 @@ impl FileMetaData { &self.schema_descr } - /// Returns [`Rc`] for schema descriptor. + /// Returns reference counted clone for schema descriptor. pub fn schema_descr_ptr(&self) -> SchemaDescPtr { self.schema_descr.clone() } @@ -159,7 +160,7 @@ impl RowGroupMetaData { self.columns.len() } - /// Returns reference to a [`ColumnChunkMetaData`] for a column index. + /// Returns reference to a [`ColumnChunkMetaData`] at `i`th index. pub fn column(&self, i: usize) -> &ColumnChunkMetaData { &self.columns[i] } @@ -174,7 +175,7 @@ impl RowGroupMetaData { self.num_rows } - /// Returns total byte size of all the uncompressed column data in this row group. + /// Returns total byte size of all uncompressed column data in this row group. pub fn total_byte_size(&self) -> i64 { self.total_byte_size } @@ -189,7 +190,7 @@ impl RowGroupMetaData { self.schema_descr.clone() } - /// Internal conversion from Thrift method. + /// Internal conversion from Thrift. 
pub fn from_thrift( schema_descr: SchemaDescPtr, mut rg: RowGroup @@ -317,7 +318,7 @@ impl ColumnChunkMetaData { self.dictionary_page_offset } - /// Internal conversion from Thrift method. + /// Internal conversion from Thrift. pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { if cc.meta_data.is_none() { return Err(general_err!("Expected to have column metadata")); From db3ae7a91d42ae2184f55b91636a25f07024ad8d Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Tue, 10 Apr 2018 20:23:25 +1200 Subject: [PATCH 17/32] update file docs --- src/file/metadata.rs | 47 ++++++++++++++++++++++++-------------------- src/file/mod.rs | 15 +++++++------- src/file/reader.rs | 29 +++++++++++++-------------- 3 files changed, 47 insertions(+), 44 deletions(-) diff --git a/src/file/metadata.rs b/src/file/metadata.rs index 9c764eb..245daf3 100644 --- a/src/file/metadata.rs +++ b/src/file/metadata.rs @@ -22,11 +22,13 @@ //! [`ParquetMetaData`] contains [`FileMetaData`] and zero or more [`RowGroupMetaData`] //! for each row group. //! -//! Each [`RowGroupMetaData`] contains one or more [`ColumnChunkMetaData`] for each -//! column chunk. +//! [`FileMetaData`] includes file version, application specific metadata. //! -//! Each [`ColumnChunkMetaData`] contains information about the leaf (primitive) column, -//! including descriptor, page metadata, byte sizes, compression, encodings, etc. +//! Each [`RowGroupMetaData`] contains information about row group and one or more +//! [`ColumnChunkMetaData`] for each column chunk. +//! +//! [`ColumnChunkMetaData`] has information about column chunk (primitive leaf column), +//! including encoding/compression, number of values, etc. use std::rc::Rc; @@ -46,7 +48,8 @@ pub struct ParquetMetaData { } impl ParquetMetaData { - /// Creates new Parquet metadata from file metadata and list of row group metadata. + /// Creates Parquet metadata from file metadata and a list of row group metadata for + /// each available row group. pub fn new(file_metadata: FileMetaData, row_groups: Vec) -> Self { ParquetMetaData { file_metadata: Rc::new(file_metadata), @@ -54,7 +57,7 @@ impl ParquetMetaData { } } - /// Returns [`FileMetaData`] reference. + /// Returns file metadata as reference counted clone. pub fn file_metadata(&self) -> FileMetaDataPtr { self.file_metadata.clone() } @@ -64,7 +67,7 @@ impl ParquetMetaData { self.row_groups.len() } - /// Returns row group metadata at `i`th position. + /// Returns row group metadata for `i`th position. /// Position should be less than number of row groups `num_row_groups`. pub fn row_group(&self, i: usize) -> RowGroupMetaDataPtr { self.row_groups[i].clone() @@ -116,23 +119,24 @@ impl FileMetaData { self.num_rows } - /// String message for application that wrote this file. This will have format - /// ` version (build )`. - /// For example, + /// String message for application that wrote this file. + /// + /// This should have the following format: + /// ` version (build )`. + /// /// ```shell - /// parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf) - /// impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9) + /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b) /// ``` pub fn created_by(&self) -> &Option { &self.created_by } - /// Returns reference to the Parquet schema. + /// Returns Parquet ['Type`] that describes schema in this file. pub fn schema(&self) -> &SchemaType { self.schema.as_ref() } - /// Returns reference to schema descriptor for the file. 
+ /// Returns a reference to schema descriptor. pub fn schema_descr(&self) -> &SchemaDescriptor { &self.schema_descr } @@ -160,7 +164,7 @@ impl RowGroupMetaData { self.columns.len() } - /// Returns reference to a [`ColumnChunkMetaData`] at `i`th index. + /// Returns column chunk metadata for `i`th column. pub fn column(&self, i: usize) -> &ColumnChunkMetaData { &self.columns[i] } @@ -170,12 +174,12 @@ impl RowGroupMetaData { &self.columns } - /// Returns number of rows in this row group. + /// Number of rows in this row group. pub fn num_rows(&self) -> i64 { self.num_rows } - /// Returns total byte size of all uncompressed column data in this row group. + /// Total byte size of all uncompressed column data in this row group. pub fn total_byte_size(&self) -> i64 { self.total_byte_size } @@ -190,7 +194,7 @@ impl RowGroupMetaData { self.schema_descr.clone() } - /// Internal conversion from Thrift. + /// Internal method to convert from Thrift. pub fn from_thrift( schema_descr: SchemaDescPtr, mut rg: RowGroup @@ -234,8 +238,9 @@ pub struct ColumnChunkMetaData { /// Represents common operations for a column chunk. impl ColumnChunkMetaData { - /// File where the column chunk is stored. If not set, assumed to - /// be at the same file as the metadata. + /// File where the column chunk is stored. + /// + /// If not set, assumed to belong to the same file as the metadata. /// This path is relative to the current file. pub fn file_path(&self) -> Option<&String> { self.file_path.as_ref() @@ -318,7 +323,7 @@ impl ColumnChunkMetaData { self.dictionary_page_offset } - /// Internal conversion from Thrift. + /// Internal method to convert from Thrift. pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { if cc.meta_data.is_none() { return Err(general_err!("Expected to have column metadata")); diff --git a/src/file/mod.rs b/src/file/mod.rs index fcc4784..8e66e9c 100644 --- a/src/file/mod.rs +++ b/src/file/mod.rs @@ -15,8 +15,11 @@ // specific language governing permissions and limitations // under the License. -//! Main entrypoint of working with Parquet files. -//! Contains various file metadata, file and row group readers. +//! Main entrypoint for working with Parquet API. +//! Provides access to file and row group readers, record API, etc. +//! +//! See [`reader::SerializedFileReader`] for a starting reference and +//! [`metadata::ParquetMetaData`] for file metadata. //! //! # Example //! @@ -25,16 +28,12 @@ //! use std::path::Path; //! use parquet::file::reader::{FileReader, SerializedFileReader}; //! -//! // Opening a file -//! let path = Path::new("data/alltypes_plain.parquet"); -//! let file = File::open(&path).expect("File should exist"); -//! -//! let reader = SerializedFileReader::new(file).expect("Valid Parquet file"); +//! let file = File::open(&Path::new("data/alltypes_plain.parquet")).unwrap(); +//! let reader = SerializedFileReader::new(file).unwrap(); //! //! let parquet_metadata = reader.metadata(); //! let row_group = reader.get_row_group(0); //! ``` -//! pub mod metadata; pub mod reader; diff --git a/src/file/reader.rs b/src/file/reader.rs index a5ce9c8..7f3b1d0 100644 --- a/src/file/reader.rs +++ b/src/file/reader.rs @@ -52,7 +52,7 @@ pub trait FileReader { /// Get the `i`th row group reader. Note this doesn't do bound check. fn get_row_group(&self, i: usize) -> Result>; - /// Get full iterator of `Row` from a file (over all row groups). + /// Get full iterator of `Row`s from a file (over all row groups). 
/// /// Iterator will automatically load the next row group to advance. /// @@ -76,7 +76,8 @@ pub trait RowGroupReader { /// Get value reader for the `i`th column chunk. fn get_column_reader(&self, i: usize) -> Result; - /// Get iterator of `Row` from this row group. + /// Get iterator of `Row`s from this row group. + /// /// Projected schema can be a subset of or equal to the file schema, when it is None, /// full file schema is assumed. fn get_row_iter(&self, projection: Option) -> Result; @@ -106,8 +107,7 @@ impl<'a, T: 'a + Read> Read for TMemoryBuffer<'a, T> { const FOOTER_SIZE: usize = 8; const PARQUET_MAGIC: [u8; 4] = [b'P', b'A', b'R', b'1']; -/// A serialized implementation for a file reader. -/// Main entrypoint for using Parquet API. +/// A serialized implementation for Parquet [`FileReader`]. pub struct SerializedFileReader { buf: BufReader, metadata: ParquetMetaDataPtr @@ -202,15 +202,15 @@ impl FileReader for SerializedFileReader { } } -/// A serialized implementation for row group reader. +/// A serialized implementation for Parquet [`RowGroupReader`]. pub struct SerializedRowGroupReader { buf: BufReader, metadata: RowGroupMetaDataPtr } impl SerializedRowGroupReader { - /// Creates new serialized row group reader. - pub fn new(file: File, metadata: RowGroupMetaDataPtr) -> Self { + /// Creates new row group reader from a file and row group metadata. + fn new(file: File, metadata: RowGroupMetaDataPtr) -> Self { let buf = BufReader::new(file); Self { buf, metadata } } @@ -270,26 +270,25 @@ impl RowGroupReader for SerializedRowGroupReader { } } - -/// A serialized implementation for Parquet page reader. +/// A serialized implementation for Parquet [`PageReader`]. pub struct SerializedPageReader { - // The file chunk buffer which references exactly the bytes for the column trunk to be - // read by this page reader + // The file chunk buffer which references exactly the bytes for the column trunk + // to be read by this page reader. buf: FileChunk, // The compression codec for this column chunk. Only set for non-PLAIN codec. decompressor: Option>, - // The number of values we have seen so far + // The number of values we have seen so far. seen_num_values: i64, - // The number of total values in this column chunk + // The number of total values in this column chunk. total_num_values: i64 } impl SerializedPageReader { - /// Creates new serialized page reader from file chunk. - pub fn new( + /// Creates a new serialized page reader from file chunk. 
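+  /// Returns an error if a decompression codec cannot be created for `compression`.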
+ fn new( buf: FileChunk, total_num_values: i64, compression: Compression From 08fd9e803de7fe97d1fb26027cbfe17b91db7575 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Tue, 10 Apr 2018 21:13:11 +1200 Subject: [PATCH 18/32] add comment for column/reader.rs tests --- src/column/reader.rs | 65 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 7 deletions(-) diff --git a/src/column/reader.rs b/src/column/reader.rs index 0961681..27ecfa8 100644 --- a/src/column/reader.rs +++ b/src/column/reader.rs @@ -624,8 +624,51 @@ mod tests { ); } - // == helper methods to make pages and test == - + // ---------------------------------------------------------------------- + // Helper methods to make pages and test + // + // # Overview + // + // Most of the test functionality is implemented in `ColumnReaderTester`, which + // provides some general data page test methods: + // - `test_read_batch_general` + // - `test_read_batch` + // + // There are also some high level wrappers that are part of `ColumnReaderTester`: + // - `plain_v1` -> call `test_read_batch_general` with data page v1 and plain encoding + // - `plain_v2` -> call `test_read_batch_general` with data page v2 and plain encoding + // - `dict_v1` -> call `test_read_batch_general` with data page v1 + dictionary page + // - `dict_v2` -> call `test_read_batch_general` with data page v2 + dictionary page + // + // And even higher level wrappers that simplify testing of almost the same test cases: + // - `get_test_int32_type`, provides dummy schema type + // - `get_test_int64_type`, provides dummy schema type + // - `test_read_batch_int32`, wrapper for `read_batch` tests, since they are basically + // the same, just different def/rep levels and batch size. + // + // # Page assembly + // + // Page construction and generation of values, definition and repetition levels happens + // in `make_pages` function. + // All values are randomly generated based on provided min/max, levels are calculated + // based on provided max level for column descriptor (which is basically either int32 + // or int64 type in tests) and `levels_per_page` variable. + // + // We use `DataPageBuilder` and its implementation `DataPageBuilderImpl` to actually + // turn values, definition and repetition levels into data pages (either v1 or v2). + // + // Those data pages are then stored as part of `TestPageReader` (we just pass vector + // of generated pages directly), which implements `PageReader` interface. + // + // # Comparison + // + // This allows us to pass test page reader into column reader, so we can test + // functionality of column reader - see `test_read_batch`, where we create column + // reader -> typed column reader, buffer values in `read_batch` method and compare + // output with generated data. + + // Returns dummy Parquet `Type` for primitive field, because most of our tests use + // INT32 physical type. fn get_test_int32_type() -> SchemaType { SchemaType::primitive_type_builder("a", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) @@ -635,6 +678,7 @@ mod tests { .expect("build() should be OK") } + // Returns dummy Parquet `Type` for INT64 physical type. fn get_test_int64_type() -> SchemaType { SchemaType::primitive_type_builder("a", PhysicalType::INT64) .with_repetition(Repetition::REQUIRED) @@ -644,7 +688,10 @@ mod tests { .expect("build() should be OK") } - // Tests `read_batch()` functionality for Int32Type. + // Tests `read_batch()` functionality for INT32. 
+ // + // This is a high level wrapper on `ColumnReaderTester` that allows us to specify some + // boilerplate code for setting up definition/repetition levels and column descriptor. fn test_read_batch_int32( batch_size: usize, values: &mut[i32], @@ -688,7 +735,7 @@ mod tests { Self { rep_levels: Vec::new(), def_levels: Vec::new(), values: Vec::new() } } - // method to generate and test data pages v1 + // Method to generate and test data pages v1 fn plain_v1( &mut self, desc: ColumnDescPtr, @@ -710,7 +757,7 @@ mod tests { ); } - // method to generate and test data pages v2 + // Method to generate and test data pages v2 fn plain_v2( &mut self, desc: ColumnDescPtr, @@ -732,6 +779,7 @@ mod tests { ); } + // Method to generate and test dictionary page + data pages v1 fn dict_v1( &mut self, desc: ColumnDescPtr, @@ -753,6 +801,7 @@ mod tests { ); } + // Method to generate and test dictionary page + data pages v2 fn dict_v2( &mut self, desc: ColumnDescPtr, @@ -1000,7 +1049,10 @@ mod tests { ) where T: 'static { assert!( self.num_values >= values.len() as u32, - "num_values: {}, values.len(): {}", self.num_values, values.len()); + "num_values: {}, values.len(): {}", + self.num_values, + values.len() + ); self.encoding = Some(encoding); let mut encoder: Box> = get_encoder::( self.desc.clone(), encoding, self.mem_tracker.clone() @@ -1037,7 +1089,6 @@ mod tests { } } } - } fn make_pages( From 655d5b6d1b579772bce1c4cbbd087427becbb32b Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Wed, 11 Apr 2018 20:34:47 +1200 Subject: [PATCH 19/32] add parquet version, minor updates --- README.md | 3 +++ src/basic.rs | 9 ++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a6c68ac..aedc5c7 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,9 @@ while let Some(record) = iter.next() { ``` See crate documentation on available API. +## Versions +- [Parquet-format](https://github.com/apache/parquet-format) 2.3.2 + ## Requirements - Rust nightly - Thrift 0.11.0 or higher diff --git a/src/basic.rs b/src/basic.rs index c416465..8fbe39d 100644 --- a/src/basic.rs +++ b/src/basic.rs @@ -177,38 +177,45 @@ pub enum Encoding { /// - FLOAT - 4 bytes per value, stored as little-endian. /// - DOUBLE - 8 bytes per value, stored as little-endian. /// - BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. - /// FIXED_LEN_BYTE_ARRAY - just the bytes are stored. + /// - FIXED_LEN_BYTE_ARRAY - just the bytes are stored. PLAIN, /// **Deprecated** dictionary encoding. + /// /// The values in the dictionary are encoded using PLAIN encoding. /// Since it is deprecated, RLE_DICTIONARY encoding is used for a data page, and PLAIN /// encoding is used for dictionary page. PLAIN_DICTIONARY, /// Group packed run length encoding. + /// /// Usable for definition/repetition levels encoding and boolean values. RLE, /// Bit packed encoding. + /// /// This can only be used if the data has a known max width. /// Usable for definition/repetition levels encoding. BIT_PACKED, /// Delta encoding for integers, either INT32 or INT64. + /// /// Works best on sorted data. DELTA_BINARY_PACKED, /// Encoding for byte arrays to separate the length values and the data. + /// /// The lengths are encoded using DELTA_BINARY_PACKED encoding. DELTA_LENGTH_BYTE_ARRAY, /// Incremental encoding for byte arrays. + /// /// Prefix lengths are encoded using DELTA_BINARY_PACKED encoding. /// Suffixes are stored using DELTA_LENGTH_BYTE_ARRAY encoding. DELTA_BYTE_ARRAY, /// Dictionary encoding. 
+ /// /// The ids are encoded using the RLE encoding. RLE_DICTIONARY } From c6f350ca194916f43a76cd5c98e97909ba0c32db Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Wed, 11 Apr 2018 20:47:36 +1200 Subject: [PATCH 20/32] update comment for types.rs --- src/schema/types.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/schema/types.rs b/src/schema/types.rs index 1277590..25e2056 100644 --- a/src/schema/types.rs +++ b/src/schema/types.rs @@ -738,8 +738,7 @@ fn build_tree( } } -/// Conversion from Thrift equivalents. -/// Should be considered private. +/// Internal method to convert from Thrift. pub fn from_thrift(elements: &mut [SchemaElement]) -> Result { let mut index = 0; let mut schema_nodes = Vec::new(); From 54a1c0dfdf29027cd0da8029f48918420f4e7e65 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Wed, 11 Apr 2018 21:01:40 +1200 Subject: [PATCH 21/32] reexport modules --- benches/decoding.rs | 6 +++--- benches/encoding.rs | 4 ++-- src/lib.rs | 7 ++++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/benches/decoding.rs b/benches/decoding.rs index 10be4a3..ec1f097 100644 --- a/benches/decoding.rs +++ b/benches/decoding.rs @@ -30,9 +30,9 @@ use std::rc::Rc; use parquet::basic::*; use parquet::data_type::*; -use parquet::encodings::decoding::*; -use parquet::encodings::encoding::*; -use parquet::util::memory::{ByteBufferPtr, MemTracker}; +use parquet::decoding::*; +use parquet::encoding::*; +use parquet::memory::{ByteBufferPtr, MemTracker}; macro_rules! plain { ($fname:ident, $num_values:expr, $batch_size:expr, $ty:ident, $pty:expr, diff --git a/benches/encoding.rs b/benches/encoding.rs index b23ed02..e7ab04c 100644 --- a/benches/encoding.rs +++ b/benches/encoding.rs @@ -30,8 +30,8 @@ use std::rc::Rc; use parquet::basic::*; use parquet::data_type::*; -use parquet::encodings::encoding::*; -use parquet::util::memory::MemTracker; +use parquet::encoding::*; +use parquet::memory::MemTracker; macro_rules! plain { ($fname:ident, $batch_size:expr, $ty:ident, $pty:expr, $gen_data_fn:expr) => { diff --git a/src/lib.rs b/src/lib.rs index 7dfa94e..ff17447 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -137,10 +137,15 @@ pub mod basic; pub mod data_type; mod parquet_thrift; +// Exported for external use, such as benchmarks +pub use util::memory; +pub use encodings::encoding; +pub use encodings::decoding; + #[macro_use] mod util; -mod compression; mod encodings; +pub mod compression; pub mod column; pub mod record; pub mod schema; From c1308345c53f0fa2af0d5e81e64873a5b47da057 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Wed, 11 Apr 2018 21:29:26 +1200 Subject: [PATCH 22/32] add compression docs --- src/compression.rs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/compression.rs b/src/compression.rs index 377653f..e54c0ec 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -15,6 +15,20 @@ // specific language governing permissions and limitations // under the License. +//! Contains codec interface and supported codec implementations. +//! +//! See [`Compression`](`CodecType`) for all available compression algorithms. +//! +//! # Example +//! +//! ```rust +//! use parquet::basic::Compression; +//! use parquet::compression::create_codec; +//! +//! let codec = create_codec(Compression::SNAPPY); +//! assert!(codec.is_ok()); +//! 
``` + use std::io::{Read, Write}; use basic::Compression as CodecType; @@ -25,6 +39,7 @@ use flate2::read::GzDecoder; use flate2::write::GzEncoder; use snap::{decompress_len, Decoder, Encoder}; +/// Parquet compression codec interface. pub trait Codec { /// Compresses data stored in slice `input_buf` and returns a new vector with the /// compressed data. @@ -37,8 +52,7 @@ pub trait Codec { fn decompress(&mut self, input_buf: &[u8], output_buf: &mut Vec) -> Result; } - -/// Given the compression type `codec`, returns a codec used to compress & decompress +/// Given the compression type `codec`, returns a codec used to compress and decompress /// bytes for the compression type. /// This returns `None` if the codec type is `UNCOMPRESSED`. pub fn create_codec(codec: CodecType) -> Result>> { @@ -51,12 +65,14 @@ pub fn create_codec(codec: CodecType) -> Result>> { } } +/// Codec for Snappy compression format. pub struct SnappyCodec { decoder: Decoder, encoder: Encoder } impl SnappyCodec { + /// Creates new Snappy compression codec. fn new() -> Self { Self { decoder: Decoder::new(), @@ -79,9 +95,11 @@ impl Codec for SnappyCodec { } } +/// Codec for GZIP compression algorithm. pub struct GZipCodec {} impl GZipCodec { + /// Creates new GZIP compression codec. fn new() -> Self { Self {} } @@ -108,9 +126,11 @@ const BROTLI_DEFAULT_BUFFER_SIZE: usize = 4096; const BROTLI_DEFAULT_COMPRESSION_QUALITY: u32 = 9; // supported levels 0-9 const BROTLI_DEFAULT_LG_WINDOW_SIZE: u32 = 22; // recommended between 20-22 +/// Codec for Brotli compression algorithm. pub struct BrotliCodec {} impl BrotliCodec { + /// Creates new Brotli compression codec. fn new() -> Self { Self {} } From e6c9803d4006b01e2140cf8a356c8b75b0c8f567 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Wed, 11 Apr 2018 21:42:11 +1200 Subject: [PATCH 23/32] update docs --- src/compression.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/compression.rs b/src/compression.rs index e54c0ec..a74c169 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -25,8 +25,18 @@ //! use parquet::basic::Compression; //! use parquet::compression::create_codec; //! -//! let codec = create_codec(Compression::SNAPPY); -//! assert!(codec.is_ok()); +//! let mut codec = match create_codec(Compression::SNAPPY) { +//! Ok(Some(codec)) => codec, +//! _ => panic!() +//! }; +//! +//! let data = vec![b'p', b'a', b'r', b'q', b'u', b'e', b't']; +//! let compressed = codec.compress(&data[..]).unwrap(); +//! +//! let mut output = vec![]; +//! codec.decompress(&compressed[..], &mut output).unwrap(); +//! +//! assert_eq!(output, data); //! ``` use std::io::{Read, Write}; From 2e5924598f581d960939e62e3f9af351a2d57f09 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Wed, 11 Apr 2018 22:05:47 +1200 Subject: [PATCH 24/32] add memory docs --- src/util/memory.rs | 47 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/src/util/memory.rs b/src/util/memory.rs index 0fe5106..72f0915 100644 --- a/src/util/memory.rs +++ b/src/util/memory.rs @@ -25,18 +25,21 @@ use std::rc::{Rc, Weak}; // ---------------------------------------------------------------------- // Memory Tracker classes +/// Reference counted pointer for [`MemTracker`]. pub type MemTrackerPtr = Rc; +/// Non-owning reference for [`MemTracker`]. pub type WeakMemTrackerPtr = Weak; +/// Struct to track memory usage information. #[derive(Debug)] pub struct MemTracker { - // Memory usage information tracked by this. 
In the tuple, the first element is the - // current memory allocated (in bytes), and the second element is the maximum memory - // allocated so far (in bytes). + // In the tuple, the first element is the current memory allocated (in bytes), + // and the second element is the maximum memory allocated so far (in bytes). memory_usage: Cell<(i64, i64)> } impl MemTracker { + /// Creates new memory tracker. #[inline] pub fn new() -> MemTracker { MemTracker { @@ -70,7 +73,9 @@ impl MemTracker { // ---------------------------------------------------------------------- // Buffer classes +/// Type alias for [`Buffer`]. pub type ByteBuffer = Buffer; +/// Type alias for [`BufferPtr`]. pub type ByteBufferPtr = BufferPtr; /// A resize-able buffer class with generic member, with optional memory tracker. @@ -88,6 +93,7 @@ pub struct Buffer { } impl Buffer { + /// Creates new empty buffer. pub fn new() -> Self { Buffer { data: vec![], @@ -96,6 +102,7 @@ impl Buffer { } } + /// Adds [`MemTracker`] for this buffer. #[inline] pub fn with_mem_tracker(mut self, mc: MemTrackerPtr) -> Self { mc.alloc((self.data.capacity() * self.type_length) as i64); @@ -103,11 +110,13 @@ impl Buffer { self } + /// Returns slice of data in this buffer. #[inline] pub fn data(&self) -> &[T] { self.data.as_slice() } + /// Sets data for this buffer. #[inline] pub fn set_data(&mut self, new_data: Vec) { if let Some(ref mc) = self.mem_tracker { @@ -117,6 +126,12 @@ impl Buffer { self.data = new_data; } + /// Resizes underlying data in place to a new length `new_size`. + /// + /// If `new_size` is less than current length, data is truncated, otherwise, it is + /// extended to `new_size` with provided default value `init_value`. + /// + /// Memory tracker is also updated, if available. #[inline] pub fn resize(&mut self, new_size: usize, init_value: T) { let old_capacity = self.data.capacity(); @@ -127,11 +142,15 @@ impl Buffer { } } + /// Clears underlying data. #[inline] pub fn clear(&mut self) { self.data.clear() } + /// Reserves capacity `additional_capacity` for underlying data vector. + /// + /// Memory tracker is also updated, if available. #[inline] pub fn reserve(&mut self, additional_capacity: usize) { let old_capacity = self.data.capacity(); @@ -144,6 +163,8 @@ impl Buffer { } } + /// Returns [`BufferPtr`] with buffer data. + /// Buffer data is reset. #[inline] pub fn consume(&mut self) -> BufferPtr { let old_data = mem::replace(&mut self.data, vec![]); @@ -154,26 +175,33 @@ impl Buffer { result } + /// Adds `value` to the buffer. #[inline] pub fn push(&mut self, value: T) { self.data.push(value) } + /// Returns current capacity for the buffer. #[inline] pub fn capacity(&self) -> usize { self.data.capacity() } + /// Returns current size for the buffer. #[inline] pub fn size(&self) -> usize { self.data.len() } + /// Returns `true` if memory tracker is added to buffer, `false` otherwise. #[inline] pub fn is_mem_tracked(&self) -> bool { self.mem_tracker.is_some() } + /// Returns memory tracker associated with this buffer. + /// This may panic, if memory tracker is not set, use method above to check if + /// memory tracker is available. #[inline] pub fn mem_tracker(&self) -> &MemTrackerPtr { self.mem_tracker.as_ref().unwrap() @@ -245,6 +273,7 @@ pub struct BufferPtr { } impl BufferPtr { + /// Creates new buffer from a vector. pub fn new(v: Vec) -> Self { let len = v.len(); Self { @@ -255,10 +284,14 @@ impl BufferPtr { } } + /// Returns slice of data in this buffer. 
pub fn data(&self) -> &[T] { &self.data[self.start..self.start + self.len] } + /// Updates this buffer with new `start` position and length `len`. + /// + /// Range should be within current start position and length. pub fn with_range(mut self, start: usize, len: usize) -> Self { assert!(start <= self.len); assert!(start + len <= self.len); @@ -267,23 +300,29 @@ impl BufferPtr { self } + /// Adds memory tracker to this buffer. pub fn with_mem_tracker(mut self, mc: MemTrackerPtr) -> Self { self.mem_tracker = Some(mc); self } + /// Returns start position of this buffer. pub fn start(&self) -> usize { self.start } + /// Returns length of this buffer pub fn len(&self) -> usize { self.len } + /// Returns `true` if this buffer has memory tracker, `false` otherwise. pub fn is_mem_tracked(&self) -> bool { self.mem_tracker.is_some() } + /// Returns a shallow copy of the buffer. + /// Reference counted pointer to the data is copied. pub fn all(&self) -> BufferPtr { BufferPtr { data: self.data.clone(), @@ -293,6 +332,7 @@ impl BufferPtr { } } + /// Returns a shallow copy of the buffer that starts with `start` position. pub fn start_from(&self, start: usize) -> BufferPtr { assert!(start <= self.len); BufferPtr { @@ -303,6 +343,7 @@ impl BufferPtr { } } + /// Returns a shallow copy that is a range slice within this buffer. pub fn range(&self, start: usize, len: usize) -> BufferPtr { assert!(start + len <= self.len); BufferPtr { From 5bc4cd60185f55c1b88c37f907eb08bb38c9c4e3 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Thu, 12 Apr 2018 10:19:31 +1200 Subject: [PATCH 25/32] update global doc --- src/basic.rs | 2 +- src/encodings/decoding.rs | 8 ++++---- src/lib.rs | 26 ++++++++++++++------------ 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/basic.rs b/src/basic.rs index 8fbe39d..b7fc80e 100644 --- a/src/basic.rs +++ b/src/basic.rs @@ -16,7 +16,7 @@ // under the License. //! Contains Rust mappings for Thrift definition. -//! See `parquet.thrift` file to see raw definitions of enums listed below. +//! Refer to `parquet.thrift` file to see raw definitions. use std::convert; use std::fmt; diff --git a/src/encodings/decoding.rs b/src/encodings/decoding.rs index 4013db0..29ff56f 100644 --- a/src/encodings/decoding.rs +++ b/src/encodings/decoding.rs @@ -31,6 +31,7 @@ use util::memory::{ByteBuffer, ByteBufferPtr}; // ---------------------------------------------------------------------- // Decoders +/// An Parquet decoder for the data type `T`. pub trait Decoder { /// Sets the data to decode to be `data`, which should contain `num_values` of values /// to decode. @@ -43,15 +44,15 @@ pub trait Decoder { /// unless the remaining number of values is less than `buffer.len()`. fn get(&mut self, buffer: &mut [T::T]) -> Result; - /// Returns the number of values left in this decoder stream + /// Returns the number of values left in this decoder stream. fn values_left(&self) -> usize; - /// Returns the encoding for this decoder + /// Returns the encoding for this decoder. fn encoding(&self) -> Encoding; } - /// Gets a decoder for the column descriptor `descr` and encoding type `encoding`. +/// /// NOTE: the primitive type in `descr` MUST match the data type `T`, otherwise /// disastrous consequence could occur. 
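+///
+/// A sketch of the intended call (`descr` is a hypothetical `ColumnDescPtr` obtained
+/// from the file schema, and the column is assumed to store INT32 values):
+/// ```ignore
+/// let mut decoder = get_decoder::<Int32Type>(descr, Encoding::PLAIN)?;
+/// decoder.set_data(page_data, num_values)?;
+/// ```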
pub fn get_decoder(
@@ -82,7 +83,6 @@ pub fn get_decoder(
   Ok(decoder)
 }
 
-
 // ----------------------------------------------------------------------
 // PLAIN Decoding
 
diff --git a/src/lib.rs b/src/lib.rs
index ff17447..6a8c47a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -81,25 +81,27 @@
 //!
 //! # Metadata
 //!
-//! Module [`file::metadata`] contains Parquet metadata structs, including file metadata,
-//! that has information about file schema, version, and number of rows, row group
-//! metadata with a set of column chunks that contain column type and encodings, number
-//! of values and compressed/uncompressed size in bytes.
+//! Module [`metadata`](`file::metadata`) contains Parquet metadata structs, including
+//! file metadata, which has information about file schema, version, and number of rows,
+//! and row group metadata with a set of column chunks that contain column type,
+//! encodings, number of values and compressed/uncompressed size in bytes.
 //!
 //! # Schema and type
 //!
-//! Parquet schema can be extracted from [`file::metadata::FileMetaData`] and is
-//! represented by Parquet type.
+//! Parquet schema can be extracted from [`FileMetaData`](`file::metadata::FileMetaData`)
+//! and is represented by Parquet type.
 //!
-//! Parquet type is described by [`schema::types::Type`], including top level message
-//! type or schema. Refer to the [`schema`] module for the detailed information on Type
-//! API, printing and parsing of message types.
+//! Parquet type is described by [`Type`](`schema::types::Type`), including top level
+//! message type or schema. Refer to the [`schema`] module for detailed information on
+//! the Type API, and on printing and parsing of message types.
 //!
 //! # File and row group API
 //!
-//! File reader [`file::reader::FileReader`] is a starting point for working with Parquet
-//! files. Provides set of methods to get file metadata, row group readers
-//! [`file::reader::RowGroupReader`] to get access to column readers and record iterator.
+//! Module [`file`] contains all definitions for exploring Parquet file metadata and data.
+//! File reader [`FileReader`](`file::reader::FileReader`) is a starting point for
+//! working with Parquet files - it provides a set of methods to get file metadata, and
+//! row group readers [`RowGroupReader`](`file::reader::RowGroupReader`) to get access
+//! to column readers and a record iterator.
 //!
 //! # Read API
 //!
From d566524c68e0f337594746147c842d15e98ca68d Mon Sep 17 00:00:00 2001
From: Ivan Sadikov 
Date: Thu, 12 Apr 2018 20:53:53 +1200
Subject: [PATCH 26/32] add decoding docs

---
 src/encodings/decoding.rs | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/src/encodings/decoding.rs b/src/encodings/decoding.rs
index 29ff56f..be05e75 100644
--- a/src/encodings/decoding.rs
+++ b/src/encodings/decoding.rs
@@ -31,7 +31,7 @@ use util::memory::{ByteBuffer, ByteBufferPtr};
 // ----------------------------------------------------------------------
 // Decoders
 
-/// An Parquet decoder for the data type `T`.
+/// A Parquet decoder for the data type `T`.
 pub trait Decoder<T: DataType> {
   /// Sets the data to decode to be `data`, which should contain `num_values` of values
   /// to decode.
@@ -86,6 +86,10 @@ pub fn get_decoder(
 // ----------------------------------------------------------------------
 // PLAIN Decoding
 
+/// Plain decoding that supports all types.
+/// Values are encoded back to back. For native types, data is encoded as little endian.
+/// Floating point types are encoded in IEEE.
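+///
+/// For example, the INT32 values `1, 2` are stored back to back as the eight bytes
+/// `01 00 00 00 02 00 00 00`, which the decoder reads again as little endian.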
+/// See [`PlainEncoder`](`::encoding::PlainEncoder`) for more information.
 pub struct PlainDecoder<T: DataType> {
   // The remaining number of values in the byte array
   num_values: usize,
@@ -107,6 +111,7 @@ pub struct PlainDecoder<T: DataType> {
 }
 
 impl<T: DataType> PlainDecoder<T> {
+  /// Creates new plain decoder.
   pub fn new(type_length: i32) -> Self {
     PlainDecoder {
       data: None,
@@ -253,10 +258,13 @@ impl<T: DataType> Decoder<T> for PlainDecoder<T> {
   }
 }
 
-
 // ----------------------------------------------------------------------
 // RLE_DICTIONARY/PLAIN_DICTIONARY Decoding
 
+/// Dictionary decoder.
+/// The dictionary encoding builds a dictionary of values encountered in a given column.
+/// The dictionary is stored in a dictionary page per column chunk.
+/// See [`DictEncoder`](`::encoding::DictEncoder`) for more information.
 pub struct DictDecoder<T: DataType> {
   // The dictionary, which maps ids to the values
   dictionary: Vec<T::T>,
@@ -272,6 +280,7 @@ pub struct DictDecoder<T: DataType> {
 }
 
 impl<T: DataType> DictDecoder<T> {
+  /// Creates new dictionary decoder.
   pub fn new() -> Self {
     Self {
       dictionary: vec![],
@@ -281,6 +290,7 @@ impl<T: DataType> DictDecoder<T> {
     }
   }
 
+  /// Decodes and sets values for dictionary using `decoder`.
   pub fn set_dict(&mut self, mut decoder: Box<Decoder<T>>) -> Result<()> {
     let num_values = decoder.values_left();
     self.dictionary.resize(num_values, T::T::default());
@@ -325,6 +335,7 @@ impl<T: DataType> Decoder<T> for DictDecoder<T> {
 
 /// RLE/Bit-Packing hybrid decoding for values.
 /// Currently is used only for data pages v2 and supports boolean types.
+/// See [`RleValueEncoder`](`::encoding::RleValueEncoder`) for more information.
 pub struct RleValueDecoder<T: DataType> {
   values_left: usize,
   decoder: Option<RleDecoder>,
@@ -396,6 +407,9 @@ impl<T: DataType> Decoder<T> for RleValueDecoder<T> {
 // ----------------------------------------------------------------------
 // DELTA_BINARY_PACKED Decoding
 
+/// Delta binary packed decoder.
+/// Supports INT32 and INT64 types.
+/// See [`DeltaBitPackEncoder`](`::encoding::DeltaBitPackEncoder`) for more information.
 pub struct DeltaBitPackDecoder<T: DataType> {
   bit_reader: BitReader,
   initialized: bool,
@@ -421,6 +435,7 @@ pub struct DeltaBitPackDecoder<T: DataType> {
 }
 
 impl<T: DataType> DeltaBitPackDecoder<T> {
+  /// Creates new delta bit packed decoder.
   pub fn new() -> Self {
     Self {
       bit_reader: BitReader::from(vec![]),
@@ -441,11 +456,13 @@ impl<T: DataType> DeltaBitPackDecoder<T> {
     }
   }
 
+  /// Returns underlying bit reader offset.
   pub fn get_offset(&self) -> usize {
     assert!(self.initialized, "Bit reader is not initialized");
     self.bit_reader.get_byte_offset()
   }
 
+  /// Initializes a new block.
   #[inline]
   fn init_block(&mut self) -> Result<()> {
     self.min_delta = self.bit_reader
@@ -467,6 +484,7 @@ impl<T: DataType> DeltaBitPackDecoder<T> {
     Ok(())
   }
 
+  /// Loads deltas for the current mini block.
   #[inline]
   fn load_deltas_in_mini_block(&mut self) -> Result<()> {
     self.deltas_in_mini_block.clear();
@@ -564,8 +582,9 @@ impl<T: DataType> Decoder<T> for DeltaBitPackDecoder<T> {
   }
 }
 
-// Helper trait to define specific conversions when decoding values
+/// Helper trait to define specific conversions when decoding values.
 trait DeltaBitPackDecoderConversion<T: DataType> {
+  /// Sets decoded value based on type `T`.
   #[inline]
   fn set_decoded_value(
     &self, buffer:
@@ -616,6 +635,11 @@ impl<T: DataType> DeltaBitPackDecoderConversion<T> for DeltaBitPackDecoder<T>
 // ----------------------------------------------------------------------
 // DELTA_LENGTH_BYTE_ARRAY Decoding
 
+/// Delta length byte array decoder.
+/// Only applied to byte arrays to separate the length values and the data. The lengths
+/// are encoded using DELTA_BINARY_PACKED encoding.
+/// See [`DeltaLengthByteArrayEncoder`](`::encoding::DeltaLengthByteArrayEncoder`)
+/// for more information.
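+///
+/// For example, the byte arrays `"one"` and `"two"` are represented as the
+/// DELTA_BINARY_PACKED lengths `3, 3` followed by the concatenated bytes `onetwo`
+/// (a simplified sketch - the exact layout of the lengths is defined by the
+/// DELTA_BINARY_PACKED format).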
pub struct DeltaLengthByteArrayDecoder<T: DataType> {
   // Lengths for each byte array in `data`
   // TODO: add memory tracker to this
   lengths: Vec<i32>,
@@ -638,6 +662,7 @@ pub struct DeltaLengthByteArrayDecoder<T: DataType> {
 }
 
 impl<T: DataType> DeltaLengthByteArrayDecoder<T> {
+  /// Creates new delta length byte array decoder.
   pub fn new() -> Self {
     Self {
       lengths: vec![],
@@ -703,6 +728,11 @@ impl<T: DataType> Decoder<T> for DeltaLengthByteArrayDecoder<T> {
 // ----------------------------------------------------------------------
 // DELTA_BYTE_ARRAY Decoding
 
+/// Delta byte array decoder.
+/// Prefix lengths are encoded using `DELTA_BINARY_PACKED` encoding; suffixes are stored
+/// using `DELTA_LENGTH_BYTE_ARRAY` encoding.
+/// See [`DeltaByteArrayEncoder`](`::encoding::DeltaByteArrayEncoder`) for more
+/// information.
 pub struct DeltaByteArrayDecoder<T: DataType> {
   // Prefix lengths for each byte array
   // TODO: add memory tracker to this
   prefix_lengths: Vec<i32>,
@@ -725,6 +755,7 @@ pub struct DeltaByteArrayDecoder<T: DataType> {
 }
 
 impl<T: DataType> DeltaByteArrayDecoder<T> {
+  /// Creates new delta byte array decoder.
   pub fn new() -> Self {
     Self {
       prefix_lengths: vec![],
From b96050be4ce6501a8df65417c36203768138d409 Mon Sep 17 00:00:00 2001
From: Ivan Sadikov 
Date: Thu, 12 Apr 2018 21:30:48 +1200
Subject: [PATCH 27/32] update docs

---
 src/encodings/decoding.rs |  2 ++
 src/encodings/encoding.rs | 59 +++++++++++++++++++++++++++++++++------
 src/schema/mod.rs         |  2 +-
 src/util/memory.rs        |  4 +--
 4 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/src/encodings/decoding.rs b/src/encodings/decoding.rs
index be05e75..2c77310 100644
--- a/src/encodings/decoding.rs
+++ b/src/encodings/decoding.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Contains all supported decoders for Parquet.
+
 use std::cmp;
 use std::marker::PhantomData;
 use std::mem;
diff --git a/src/encodings/encoding.rs b/src/encodings/encoding.rs
index f0ebb17..9de56b2 100644
--- a/src/encodings/encoding.rs
+++ b/src/encodings/encoding.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Contains all supported encoders for Parquet.
+
 use std::cmp;
 use std::io::Write;
 use std::marker::PhantomData;
@@ -49,7 +51,6 @@ pub trait Encoder<T: DataType> {
   fn flush_buffer(&mut self) -> Result<ByteBufferPtr>;
 }
 
-
 /// Gets an encoder for the particular data type `T` and encoding `encoding`. Memory usage
 /// for the encoder instance is tracked by `mem_tracker`.
 pub fn get_encoder(
@@ -81,10 +82,20 @@ pub fn get_encoder(
   Ok(encoder)
 }
 
-
 // ----------------------------------------------------------------------
 // Plain encoding
 
+/// Plain encoding that supports all types.
+/// Values are encoded back to back.
+/// The plain encoding is used whenever a more efficient encoding cannot be used.
+/// It stores the data in the following format:
+/// - BOOLEAN - 1 bit per value, 0 is false; 1 is true.
+/// - INT32 - 4 bytes per value, stored as little-endian.
+/// - INT64 - 8 bytes per value, stored as little-endian.
+/// - FLOAT - 4 bytes per value, stored as IEEE little-endian.
+/// - DOUBLE - 8 bytes per value, stored as IEEE little-endian.
+/// - BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.
+/// - FIXED_LEN_BYTE_ARRAY - just the bytes are stored.
 pub struct PlainEncoder<T: DataType> {
   buffer: ByteBuffer,
   bit_writer: BitWriter,
@@ -93,6 +104,7 @@ pub struct PlainEncoder<T: DataType> {
 }
 
 impl<T: DataType> PlainEncoder<T> {
+  /// Creates new plain encoder.
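+  /// `desc` is the descriptor of the column being encoded, `mem_tracker` accounts for
+  /// the memory used by the underlying byte buffer, and `vec` provides the initial
+  /// storage for the encoded bytes.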
pub fn new(desc: ColumnDescPtr, mem_tracker: MemTrackerPtr, vec: Vec) -> Self { let mut byte_buffer = ByteBuffer::new().with_mem_tracker(mem_tracker); byte_buffer.set_data(vec); @@ -171,7 +183,6 @@ impl Encoder for PlainEncoder { } } - // ---------------------------------------------------------------------- // Dictionary encoding @@ -179,6 +190,16 @@ const INITIAL_HASH_TABLE_SIZE: usize = 1024; const MAX_HASH_LOAD: f32 = 0.7; const HASH_SLOT_EMPTY: i32 = -1; +/// Dictionary encoder. +/// The dictionary encoding builds a dictionary of values encountered in a given column. +/// The dictionary page is written first, before the data pages of the column chunk. +/// +/// Dictionary page format: the entries in the dictionary - in dictionary order - +/// using the plain encoding. +/// +/// Data page format: the bit width used to encode the entry ids stored as 1 byte +/// (max bit width = 32), followed by the values encoded using RLE/Bit packed described +/// above (with the given bit width). pub struct DictEncoder { // Descriptor for the column to be encoded. desc: ColumnDescPtr, @@ -209,6 +230,7 @@ pub struct DictEncoder { } impl DictEncoder { + /// Creates new dictionary encoder. pub fn new(desc: ColumnDescPtr, mem_tracker: MemTrackerPtr) -> Self { let mut slots = Buffer::new().with_mem_tracker(mem_tracker.clone()); slots.resize(INITIAL_HASH_TABLE_SIZE, -1); @@ -224,6 +246,7 @@ impl DictEncoder { } } + /// Returns number of unique entries in the dictionary. pub fn num_entries(&self) -> usize { self.uniques.size() } @@ -368,6 +391,7 @@ pub struct RleValueEncoder { } impl RleValueEncoder { + /// Creates new rle value encoder. pub fn new() -> Self { Self { encoder: None, @@ -439,19 +463,29 @@ const MAX_BIT_WRITER_SIZE: usize = 10 * 1024 * 1024; const DEFAULT_BLOCK_SIZE: usize = 128; const DEFAULT_NUM_MINI_BLOCKS: usize = 4; +/// Delta bit packed encoder. +/// Consists of a header followed by blocks of delta encoded values binary packed. +/// /// Delta-binary-packing: +/// ``` /// [page-header] [block 1], [block 2], ... [block N] +/// ``` +/// /// Each page header consists of: +/// ``` /// [block size] [number of miniblocks in a block] [total value count] [first value] +/// ``` +/// /// Each block consists of: +/// ``` /// [min delta] [list of bitwidths of miniblocks] [miniblocks] +/// ``` /// /// Current implementation writes values in `put` method, multiple calls to `put` to /// existing block or start new block if block size is exceeded. Calling `flush_buffer` /// writes out all data and resets internal state, including page header. /// -/// Supports only Int32Type and Int64Type. -/// +/// Supports only INT32 and INT64. pub struct DeltaBitPackEncoder { page_header_writer: BitWriter, bit_writer: BitWriter, @@ -467,6 +501,7 @@ pub struct DeltaBitPackEncoder { } impl DeltaBitPackEncoder { + /// Creates new delta bit packed encoder. pub fn new() -> Self { let block_size = DEFAULT_BLOCK_SIZE; let num_mini_blocks = DEFAULT_NUM_MINI_BLOCKS; @@ -489,8 +524,8 @@ impl DeltaBitPackEncoder { } } - // Writes page header for blocks, this method is invoked when we are done encoding - // values. It is also okay to encode when no values have been provided + /// Writes page header for blocks, this method is invoked when we are done encoding + /// values. It is also okay to encode when no values have been provided fn write_page_header(&mut self) { // We ignore the result of each 'put' operation, because MAX_PAGE_HEADER_WRITER_SIZE // is chosen to fit all header values and guarantees that writes will not fail. 
@@ -629,7 +664,7 @@ impl Encoder for DeltaBitPackEncoder { } } -// Helper trait to define specific conversions and subtractions when computing deltas +/// Helper trait to define specific conversions and subtractions when computing deltas trait DeltaBitPackEncoderConversion { // Method should panic if type is not supported, otherwise no-op #[inline] @@ -709,6 +744,9 @@ impl DeltaBitPackEncoderConversion for DeltaBitPackEncoder } } +// ---------------------------------------------------------------------- +// DELTA_LENGTH_BYTE_ARRAY encoding + /// Encoding for byte arrays to separate the length values and the data. /// The lengths are encoded using DELTA_BINARY_PACKED encoding, data is /// stored as raw bytes. @@ -721,6 +759,7 @@ pub struct DeltaLengthByteArrayEncoder { } impl DeltaLengthByteArrayEncoder { + /// Creates new delta length byte array encoder. pub fn new() -> Self { Self { len_encoder: DeltaBitPackEncoder::new(), @@ -767,6 +806,9 @@ impl Encoder for DeltaLengthByteArrayEncoder { } } +// ---------------------------------------------------------------------- +// DELTA_BYTE_ARRAY encoding + /// Encoding for byte arrays, prefix lengths are encoded using DELTA_BINARY_PACKED /// encoding, followed by suffixes with DELTA_LENGTH_BYTE_ARRAY encoding. pub struct DeltaByteArrayEncoder { @@ -777,6 +819,7 @@ pub struct DeltaByteArrayEncoder { } impl DeltaByteArrayEncoder { + /// Creates new delta byte array encoder. pub fn new() -> Self { Self { prefix_len_encoder: DeltaBitPackEncoder::::new(), diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 5580ec1..ee086e0 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Contains definitions of Parquet schema and methods to print and parse schema. +//! Parquet schema definitions and methods to print and parse schema. //! //! # Example //! diff --git a/src/util/memory.rs b/src/util/memory.rs index 72f0915..4032eca 100644 --- a/src/util/memory.rs +++ b/src/util/memory.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Utility methods and structs for working with memory. + use std::cell::Cell; use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; use std::io::{Result as IoResult, Write}; @@ -69,7 +71,6 @@ impl MemTracker { } } - // ---------------------------------------------------------------------- // Buffer classes @@ -256,7 +257,6 @@ impl Drop for Buffer { } } - // ---------------------------------------------------------------------- // Immutable Buffer (BufferPtr) classes From 605a1fa32fb70733cc2634197628197f9a64cee4 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Thu, 12 Apr 2018 21:49:58 +1200 Subject: [PATCH 28/32] update docs --- src/column/page.rs | 4 ++-- src/compression.rs | 3 ++- src/record/reader.rs | 11 ++++++----- src/schema/parser.rs | 9 +++++---- src/schema/printer.rs | 7 ++++--- src/schema/types.rs | 22 +++++++++++----------- 6 files changed, 30 insertions(+), 26 deletions(-) diff --git a/src/column/page.rs b/src/column/page.rs index cc24b39..870532f 100644 --- a/src/column/page.rs +++ b/src/column/page.rs @@ -53,7 +53,7 @@ pub enum Page { } impl Page { - /// Returns [`PageType`] for this page. + /// Returns [`PageType`](`::basic::PageType`) for this page. pub fn page_type(&self) -> PageType { match self { &Page::DataPage { .. } => PageType::DATA_PAGE, @@ -80,7 +80,7 @@ impl Page { } } - /// Returns this page [`Encoding`]. 
+ /// Returns this page [`Encoding`](`::basic::Encoding`). pub fn encoding(&self) -> Encoding { match self { &Page::DataPage { encoding, .. } => encoding, diff --git a/src/compression.rs b/src/compression.rs index a74c169..e5c3326 100644 --- a/src/compression.rs +++ b/src/compression.rs @@ -17,7 +17,8 @@ //! Contains codec interface and supported codec implementations. //! -//! See [`Compression`](`CodecType`) for all available compression algorithms. +//! See [`Compression`](`::basic::Compression`) enum for all available compression +//! algorithms. //! //! # Example //! diff --git a/src/record/reader.rs b/src/record/reader.rs index c8b4eaf..24bd3b5 100644 --- a/src/record/reader.rs +++ b/src/record/reader.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -//! Contains implementation of record assembly and converting Parquet types into `Row`s. +//! Contains implementation of record assembly and converting Parquet types into +//! [`Row`](`::record::api::Row`)s. use std::collections::HashMap; use std::rc::Rc; @@ -479,7 +480,7 @@ impl Reader { // ---------------------------------------------------------------------- // Row iterators -/// Iterator of [`Row`]s. +/// Iterator of [`Row`](`::record::api::Row`)s. /// It is used either for a single row group to iterate over data in that row group, or /// an entire file with auto buffering of all row groups. pub struct RowIter<'a> { @@ -492,7 +493,7 @@ pub struct RowIter<'a> { } impl<'a> RowIter<'a> { - /// Creates iterator of [`Row`]s for all row groups in a file. + /// Creates iterator of [`Row`](`::record::api::Row`)s for all row groups in a file. pub fn from_file(proj: Option, reader: &'a FileReader) -> Result { let descr = Self::get_proj_descr(proj, reader.metadata().file_metadata().schema_descr_ptr())?; @@ -508,7 +509,7 @@ impl<'a> RowIter<'a> { }) } - /// Creates iterator of [`Row`]s for a specific row group. + /// Creates iterator of [`Row`](`::record::api::Row`)s for a specific row group. pub fn from_row_group(proj: Option, reader: &'a RowGroupReader) -> Result { let descr = Self::get_proj_descr(proj, reader.metadata().schema_descr_ptr())?; let tree_builder = Self::tree_builder(); @@ -582,7 +583,7 @@ impl<'a> Iterator for RowIter<'a> { } } -/// Internal iterator of [`Row`]s for a reader. +/// Internal iterator of [`Row`](`::record::api::Row`)s for a reader. pub struct ReaderIter { root_reader: Reader, records_left: usize diff --git a/src/schema/parser.rs b/src/schema/parser.rs index 7f704bc..15c0c45 100644 --- a/src/schema/parser.rs +++ b/src/schema/parser.rs @@ -16,7 +16,8 @@ // under the License. //! Parquet schema parser. -//! Provides methods to parse and validate string message type into Parquet [`Type`]. +//! Provides methods to parse and validate string message type into Parquet +//! [`Type`](`::schema::types::Type`). //! //! # Example //! @@ -47,9 +48,9 @@ use basic::{LogicalType, Repetition, Type as PhysicalType}; use errors::{ParquetError, Result}; use schema::types::{Type, TypePtr}; -/// Parses message type as string into a Parquet `Type` which, for example, could be -/// used to extract individual columns. Returns Parquet general error when parsing or -/// validation fails. +/// Parses message type as string into a Parquet [`Type`](`::schema::types::Type`) which, +/// for example, could be used to extract individual columns. Returns Parquet general +/// error when parsing or validation fails. 
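+///
+/// A minimal sketch (the message type string is a hypothetical example):
+/// ```
+/// use parquet::schema::parser::parse_message_type;
+///
+/// let schema = parse_message_type("message root { required int32 id; }")
+///   .expect("Message type should be valid");
+/// ```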
pub fn parse_message_type<'a>(message_type: &'a str) -> Result { let mut parser = Parser { tokenizer: &mut Tokenizer::from_str(message_type) }; parser.parse_message_type() diff --git a/src/schema/printer.rs b/src/schema/printer.rs index f17b302..f34a6e8 100644 --- a/src/schema/printer.rs +++ b/src/schema/printer.rs @@ -56,7 +56,8 @@ use file::metadata::{ }; use schema::types::Type; -/// Prints Parquet metadata [`ParquetMetaData`] information. +/// Prints Parquet metadata [`ParquetMetaData`](`::file::metadata::ParquetMetaData`) +/// information. #[allow(unused_must_use)] pub fn print_parquet_metadata(out: &mut io::Write, metadata: &ParquetMetaData) { print_file_metadata(out, &metadata.file_metadata()); @@ -72,7 +73,7 @@ pub fn print_parquet_metadata(out: &mut io::Write, metadata: &ParquetMetaData) { } } -/// Prints file metadata [`FileMetaData`] information. +/// Prints file metadata [`FileMetaData`](`::file::metadata::FileMetaData`) information. #[allow(unused_must_use)] pub fn print_file_metadata(out: &mut io::Write, file_metadata: &FileMetaData) { writeln!(out, "version: {}", file_metadata.version()); @@ -84,7 +85,7 @@ pub fn print_file_metadata(out: &mut io::Write, file_metadata: &FileMetaData) { print_schema(out, schema); } -/// Prints Parquet [`Type`] information. +/// Prints Parquet [`Type`](`::schema::types::Type`) information. #[allow(unused_must_use)] pub fn print_schema(out: &mut io::Write, tp: &Type) { // TODO: better if we can pass fmt::Write to Printer. diff --git a/src/schema/types.rs b/src/schema/types.rs index 25e2056..5fb4717 100644 --- a/src/schema/types.rs +++ b/src/schema/types.rs @@ -188,13 +188,13 @@ impl<'a> PrimitiveTypeBuilder<'a> { } } - /// Sets [`Repetition`] for this field and returns itself. + /// Sets [`Repetition`](`::basic::Repetition`) for this field and returns itself. pub fn with_repetition(mut self, repetition: Repetition) -> Self { self.repetition = repetition; self } - /// Sets [`LogicalType`] for this field and returns itself. + /// Sets [`LogicalType`](`::basic::LogicalType`) for this field and returns itself. pub fn with_logical_type(mut self, logical_type: LogicalType) -> Self { self.logical_type = logical_type; self @@ -339,13 +339,13 @@ impl<'a> GroupTypeBuilder<'a> { } } - /// Sets [`Repetition`] for this field and returns itself. + /// Sets [`Repetition`](`::basic::Repetition`) for this field and returns itself. pub fn with_repetition(mut self, repetition: Repetition) -> Self { self.repetition = Some(repetition); self } - /// Sets [`LogicalType`] for this field and returns itself. + /// Sets [`LogicalType`](`::basic::LogicalType`) for this field and returns itself. pub fn with_logical_type(mut self, logical_type: LogicalType) -> Self { self.logical_type = logical_type; self @@ -402,13 +402,13 @@ impl BasicTypeInfo { self.repetition.is_some() } - /// Returns [`Repetition`] value for the type. + /// Returns [`Repetition`](`::basic::Repetition`) value for the type. pub fn repetition(&self) -> Repetition { assert!(self.repetition.is_some()); self.repetition.unwrap() } - /// Returns [`LogicalType`] value for the type. + /// Returns [`LogicalType`](`::basic::LogicalType`) value for the type. pub fn logical_type(&self) -> LogicalType { self.logical_type } @@ -425,7 +425,6 @@ impl BasicTypeInfo { } } - // ---------------------------------------------------------------------- // Parquet descriptor definitions @@ -541,7 +540,8 @@ impl ColumnDescriptor { &self.path } - /// Returns root [`Type`] (most top-level parent field) for this leaf column. 
+ /// Returns root [`Type`](`::schema::types::Type`) (most top-level parent field) for + /// this leaf column. pub fn root_type(&self) -> &Type { assert!(self.root_type.is_some()); self.root_type.as_ref().unwrap() @@ -552,7 +552,7 @@ impl ColumnDescriptor { self.primitive_type.name() } - /// Returns [`LogicalType`] for this column. + /// Returns [`LogicalType`](`::basic::LogicalType`) for this column. pub fn logical_type(&self) -> LogicalType { self.primitive_type.get_basic_info().logical_type() } @@ -662,7 +662,7 @@ impl SchemaDescriptor { self.leaves.len() } - /// Returns column root [`Type`] for a field position. + /// Returns column root [`Type`](`::schema::types::Type`) for a field position. pub fn get_column_root(&self, i: usize) -> &Type { assert!( i < self.leaves.len(), @@ -675,7 +675,7 @@ impl SchemaDescriptor { result.unwrap().as_ref() } - /// Returns schema as [`Type`]. + /// Returns schema as [`Type`](`::schema::types::Type`). pub fn root_schema(&self) -> &Type { self.schema.as_ref() } From c78f550150dba5f1793e002c31cd25e2766930e4 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Thu, 12 Apr 2018 21:51:46 +1200 Subject: [PATCH 29/32] fix doc tests --- src/encodings/encoding.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/encodings/encoding.rs b/src/encodings/encoding.rs index 9de56b2..068b159 100644 --- a/src/encodings/encoding.rs +++ b/src/encodings/encoding.rs @@ -467,17 +467,17 @@ const DEFAULT_NUM_MINI_BLOCKS: usize = 4; /// Consists of a header followed by blocks of delta encoded values binary packed. /// /// Delta-binary-packing: -/// ``` +/// ```shell /// [page-header] [block 1], [block 2], ... [block N] /// ``` /// /// Each page header consists of: -/// ``` +/// ```shell /// [block size] [number of miniblocks in a block] [total value count] [first value] /// ``` /// /// Each block consists of: -/// ``` +/// ```shell /// [min delta] [list of bitwidths of miniblocks] [miniblocks] /// ``` /// From af1342170a6b9bc44e4bf81434f533ab7a154c41 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Thu, 12 Apr 2018 22:06:30 +1200 Subject: [PATCH 30/32] update from_thrift comments --- src/file/metadata.rs | 6 +++--- src/schema/types.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/file/metadata.rs b/src/file/metadata.rs index 245daf3..c0b070b 100644 --- a/src/file/metadata.rs +++ b/src/file/metadata.rs @@ -194,7 +194,7 @@ impl RowGroupMetaData { self.schema_descr.clone() } - /// Internal method to convert from Thrift. + /// Method to convert from Thrift. pub fn from_thrift( schema_descr: SchemaDescPtr, mut rg: RowGroup @@ -323,8 +323,8 @@ impl ColumnChunkMetaData { self.dictionary_page_offset } - /// Internal method to convert from Thrift. - pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { + /// Method to convert from Thrift. + fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { if cc.meta_data.is_none() { return Err(general_err!("Expected to have column metadata")); } diff --git a/src/schema/types.rs b/src/schema/types.rs index 5fb4717..0e176e7 100644 --- a/src/schema/types.rs +++ b/src/schema/types.rs @@ -738,7 +738,7 @@ fn build_tree( } } -/// Internal method to convert from Thrift. +/// Method to convert from Thrift. 
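+/// Reconstructs the schema tree from the flat list of Thrift `SchemaElement`s that is
+/// stored in the file footer and returns the root schema type.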
pub fn from_thrift(elements: &mut [SchemaElement]) -> Result<TypePtr> {
   let mut index = 0;
   let mut schema_nodes = Vec::new();
From 51a4965b8cb431d6b25499e08fb32fc114152530 Mon Sep 17 00:00:00 2001
From: Ivan Sadikov 
Date: Fri, 13 Apr 2018 20:28:13 +1200
Subject: [PATCH 31/32] update parquet version in readme

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index aedc5c7..48b83cd 100644
--- a/README.md
+++ b/README.md
@@ -35,8 +35,8 @@ while let Some(record) = iter.next() {
 ```
 See crate documentation on available API.
 
-## Versions
-- [Parquet-format](https://github.com/apache/parquet-format) 2.3.2
+## Supported Parquet Version
+- Parquet-format 2.4.0
 
 ## Requirements
 - Rust nightly
From 1a11586528897e8da652faa656699df00075f7a3 Mon Sep 17 00:00:00 2001
From: Ivan Sadikov 
Date: Fri, 13 Apr 2018 20:52:48 +1200
Subject: [PATCH 32/32] update comments

---
 src/schema/types.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/schema/types.rs b/src/schema/types.rs
index 0e176e7..2651752 100644
--- a/src/schema/types.rs
+++ b/src/schema/types.rs
@@ -229,8 +229,8 @@ impl<'a> PrimitiveTypeBuilder<'a> {
     self
   }
 
-  /// Creates a new `PrimitiveType` instance from the gathered attributes.
-  /// Also checks various illegal conditions and returns `Err` if that that happen.
+  /// Creates a new `PrimitiveType` instance from the collected attributes.
+  /// Returns `Err` if any of the building conditions are not met.
   pub fn build(self) -> Result<Type> {
     let basic_info = BasicTypeInfo {
       name: String::from(self.name),