chore: add docs, part of #37

- add pragma `#![warn(missing_docs)]` to `parquet`

This is the final component in the effort to make Arrow
fully documented. The entire project now generates warnings
for any missing docs (see the sketch after this list).

- arrow: replace `tonic`'s deprecated `compile_with_config`
  with the suggested `compile_protos_with_config`

- new deprecations:
The following types were not used anywhere and were possibly strays.
They've been marked as deprecated and will be removed in future
versions.

- `parquet::data_types::SliceAsBytesDataType`
- `parquet::column::writer::Level`
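
Below is a minimal, illustrative sketch (not part of this commit) of the two mechanisms used above: the crate-level `missing_docs` lint and the `#[deprecated]` attribute applied to the stray items. The item names are placeholders.

```rust
// Crate root (e.g. lib.rs). With this lint enabled, every public item that
// lacks a doc comment produces a compiler warning.
#![warn(missing_docs)]

//! Crate-level docs; `missing_docs` also warns if these are absent.

/// A documented public function; no warning is emitted for this item.
pub fn documented() {}

// A public item with no `///` comment here would trigger:
//   warning: missing documentation for a function

/// A stray type kept only until the next breaking release.
#[deprecated(since = "54.0.0", note = "unused; will be removed in a future release")]
pub struct StrayType;
```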
ByteBaker committed Oct 2, 2024
1 parent fc05eca commit bd846e4
Showing 24 changed files with 264 additions and 24 deletions.
10 changes: 5 additions & 5 deletions arrow-flight/gen/src/main.rs
@@ -26,18 +26,18 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let proto_path = Path::new("../format/Flight.proto");

tonic_build::configure()
// protoc in unbuntu builder needs this option
// protoc in Ubuntu builder needs this option
.protoc_arg("--experimental_allow_proto3_optional")
.out_dir("src")
.compile_with_config(prost_config(), &[proto_path], &[proto_dir])?;
.compile_protos_with_config(prost_config(), &[proto_path], &[proto_dir])?;

// read file contents to string
let mut file = OpenOptions::new()
.read(true)
.open("src/arrow.flight.protocol.rs")?;
let mut buffer = String::new();
file.read_to_string(&mut buffer)?;
// append warning that file was auto-generate
// append warning that file was auto-generated
let mut file = OpenOptions::new()
.write(true)
.truncate(true)
@@ -49,10 +49,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let proto_path = Path::new("../format/FlightSql.proto");

tonic_build::configure()
// protoc in ubuntu builder needs this option
// protoc in Ubuntu builder needs this option
.protoc_arg("--experimental_allow_proto3_optional")
.out_dir("src/sql")
.compile_with_config(prost_config(), &[proto_path], &[proto_dir])?;
.compile_protos_with_config(prost_config(), &[proto_path], &[proto_dir])?;

// read file contents to string
let mut file = OpenOptions::new()
1 change: 1 addition & 0 deletions parquet/src/arrow/async_reader/metadata.rs
@@ -29,6 +29,7 @@ use std::ops::Range;

/// A data source that can be used with [`MetadataLoader`] to load [`ParquetMetaData`]
pub trait MetadataFetch {
/// Fetches a range of bytes asynchronously
fn fetch(&mut self, range: Range<usize>) -> BoxFuture<'_, Result<Bytes>>;
}

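Not from the diff: a short sketch of implementing the newly documented `MetadataFetch` trait, assuming it is reachable at `parquet::arrow::async_reader::MetadataFetch` with parquet's `async` feature enabled, and that the `bytes` and `futures` crates are available; `InMemoryFetch` is a made-up type used purely for illustration.

```rust
use std::ops::Range;

use bytes::Bytes;
use futures::{future::BoxFuture, FutureExt};
use parquet::arrow::async_reader::MetadataFetch;
use parquet::errors::Result;

/// Serves byte ranges from a buffer that already lives in memory.
struct InMemoryFetch {
    data: Bytes,
}

impl MetadataFetch for InMemoryFetch {
    // "Fetches a range of bytes asynchronously" -- here the fetch is just a slice.
    fn fetch(&mut self, range: Range<usize>) -> BoxFuture<'_, Result<Bytes>> {
        let bytes = self.data.slice(range);
        async move { Ok(bytes) }.boxed()
    }
}
```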
47 changes: 45 additions & 2 deletions parquet/src/basic.rs
@@ -47,13 +47,21 @@ pub use crate::format::{
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(non_camel_case_types)]
pub enum Type {
/// A boolean value.
BOOLEAN,
/// 32-bit signed integer.
INT32,
/// 64-bit signed integer.
INT64,
/// 96-bit signed integer for timestamps.
INT96,
/// IEEE 754 single-precision floating point value.
FLOAT,
/// IEEE 754 double-precision floating point value.
DOUBLE,
/// Arbitrary length byte array.
BYTE_ARRAY,
/// Fixed length byte array.
FIXED_LEN_BYTE_ARRAY,
}

@@ -70,6 +78,7 @@ pub enum Type {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum ConvertedType {
/// No type conversion.
NONE,
/// A BYTE_ARRAY actually contains UTF8 encoded chars.
UTF8,
@@ -171,31 +180,53 @@ pub enum ConvertedType {
/// [`ConvertedType`]. Please see the README.md for more details.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LogicalType {
/// A UTF8 encoded string.
String,
/// A map of key-value pairs.
Map,
/// A list of elements.
List,
/// A set of predefined values.
Enum,
/// A decimal value with a specified scale and precision.
Decimal {
/// The number of digits after the decimal point.
scale: i32,
/// The total number of digits in the value.
precision: i32,
},
/// A date stored as days since Unix epoch.
Date,
/// A time stored as [`TimeUnit`] since midnight.
Time {
/// Whether the time is adjusted to UTC.
is_adjusted_to_u_t_c: bool,
/// The unit of time.
unit: TimeUnit,
},
/// A timestamp stored as [`TimeUnit`] since Unix epoch.
Timestamp {
/// Whether the timestamp is adjusted to UTC.
is_adjusted_to_u_t_c: bool,
/// The unit of time.
unit: TimeUnit,
},
/// An integer with a specified bit width and signedness.
Integer {
/// The number of bits in the integer.
bit_width: i8,
/// Whether the integer is signed.
is_signed: bool,
},
/// An unknown logical type.
Unknown,
/// A JSON document.
Json,
/// A BSON document.
Bson,
/// A UUID.
Uuid,
/// A 16-bit floating point number.
Float16,
}

@@ -350,13 +381,21 @@ impl FromStr for Encoding {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum Compression {
/// No compression.
UNCOMPRESSED,
/// [Snappy compression](https://en.wikipedia.org/wiki/Snappy_(compression))
SNAPPY,
/// [Gzip compression](https://www.ietf.org/rfc/rfc1952.txt)
GZIP(GzipLevel),
/// [LZO compression](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer)
LZO,
/// [Brotli compression](https://datatracker.ietf.org/doc/html/rfc7932)
BROTLI(BrotliLevel),
/// [LZ4 compression](https://lz4.org/), [(deprecated)](https://issues.apache.org/jira/browse/PARQUET-2032)
LZ4,
/// [ZSTD compression](https://datatracker.ietf.org/doc/html/rfc8878)
ZSTD(ZstdLevel),
/// [LZ4 compression](https://lz4.org/).
LZ4_RAW,
}

@@ -447,16 +486,20 @@ impl FromStr for Compression {
}

// ----------------------------------------------------------------------
// Mirrors `parquet::PageType`

/// Mirrors [parquet::PageType]
///
/// Available data pages for Parquet file format.
/// Note that some of the page types may not be supported.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum PageType {
/// Data page Parquet 1.0
DATA_PAGE,
/// Index page
INDEX_PAGE,
/// Dictionary page
DICTIONARY_PAGE,
/// Data page Parquet 2.0
DATA_PAGE_V2,
}

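Again not from the diff, just a small usage sketch of the enums documented above (`Type`, `LogicalType`, and `Compression` from `parquet::basic`); the particular values are arbitrary.

```rust
use parquet::basic::{Compression, LogicalType, Type};

fn main() {
    // A DECIMAL(precision = 9, scale = 2) annotation, commonly backed by INT32.
    let logical = LogicalType::Decimal { precision: 9, scale: 2 };
    let physical = Type::INT32;

    // The compression codec is chosen independently of the column types.
    let codec = Compression::SNAPPY;

    println!("{logical:?} stored as {physical:?}, compressed with {codec:?}");
}
```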
28 changes: 28 additions & 0 deletions parquet/src/column/page.rs
@@ -31,29 +31,51 @@ use crate::format::PageHeader;
/// used to store uncompressed bytes of the page.
#[derive(Clone)]
pub enum Page {
/// Data page Parquet format v1.
DataPage {
/// The underlying data buffer
buf: Bytes,
/// Number of values in this page
num_values: u32,
/// Encoding for values in this page
encoding: Encoding,
/// Definition level encoding
def_level_encoding: Encoding,
/// Repetition level encoding
rep_level_encoding: Encoding,
/// Optional statistics for this page
statistics: Option<Statistics>,
},
/// Data page Parquet format v2.
DataPageV2 {
/// The underlying data buffer
buf: Bytes,
/// Number of values in this page
num_values: u32,
/// Encoding for values in this page
encoding: Encoding,
/// Number of null values in this page
num_nulls: u32,
/// Number of rows in this page
num_rows: u32,
/// Length of definition levels
def_levels_byte_len: u32,
/// Length of repetition levels
rep_levels_byte_len: u32,
/// Is this page compressed
is_compressed: bool,
/// Optional statistics for this page
statistics: Option<Statistics>,
},
/// Dictionary page.
DictionaryPage {
/// The underlying data buffer
buf: Bytes,
/// Number of values in this page
num_values: u32,
/// Encoding for values in this page
encoding: Encoding,
/// Is dictionary page sorted
is_sorted: bool,
},
}
@@ -235,11 +257,17 @@ impl CompressedPage {

/// Contains page write metrics.
pub struct PageWriteSpec {
/// The type of page being written
pub page_type: PageType,
/// The total size of the page, before compression
pub uncompressed_size: usize,
/// The compressed size of the page
pub compressed_size: usize,
/// The number of values in the page
pub num_values: u32,
/// The offset of the page in the column chunk
pub offset: u64,
/// The number of bytes written to the underlying sink
pub bytes_written: u64,
}

8 changes: 8 additions & 0 deletions parquet/src/column/reader.rs
@@ -34,13 +34,21 @@ pub(crate) mod decoder;

/// Column reader for a Parquet type.
pub enum ColumnReader {
/// Column reader for boolean type
BoolColumnReader(ColumnReaderImpl<BoolType>),
/// Column reader for int32 type
Int32ColumnReader(ColumnReaderImpl<Int32Type>),
/// Column reader for int64 type
Int64ColumnReader(ColumnReaderImpl<Int64Type>),
/// Column reader for int96 type
Int96ColumnReader(ColumnReaderImpl<Int96Type>),
/// Column reader for float type
FloatColumnReader(ColumnReaderImpl<FloatType>),
/// Column reader for double type
DoubleColumnReader(ColumnReaderImpl<DoubleType>),
/// Column reader for byte array type
ByteArrayColumnReader(ColumnReaderImpl<ByteArrayType>),
/// Column reader for fixed length byte array type
FixedLenByteArrayColumnReader(ColumnReaderImpl<FixedLenByteArrayType>),
}

15 changes: 15 additions & 0 deletions parquet/src/column/writer/mod.rs
@@ -61,13 +61,21 @@ macro_rules! downcast_writer {

/// Column writer for a Parquet type.
pub enum ColumnWriter<'a> {
/// Column writer for boolean type
BoolColumnWriter(ColumnWriterImpl<'a, BoolType>),
/// Column writer for int32 type
Int32ColumnWriter(ColumnWriterImpl<'a, Int32Type>),
/// Column writer for int64 type
Int64ColumnWriter(ColumnWriterImpl<'a, Int64Type>),
/// Column writer for int96 (timestamp) type
Int96ColumnWriter(ColumnWriterImpl<'a, Int96Type>),
/// Column writer for float type
FloatColumnWriter(ColumnWriterImpl<'a, FloatType>),
/// Column writer for double type
DoubleColumnWriter(ColumnWriterImpl<'a, DoubleType>),
/// Column writer for byte array type
ByteArrayColumnWriter(ColumnWriterImpl<'a, ByteArrayType>),
/// Column writer for fixed length byte array type
FixedLenByteArrayColumnWriter(ColumnWriterImpl<'a, FixedLenByteArrayType>),
}

@@ -90,6 +98,11 @@ impl<'a> ColumnWriter<'a> {
}
}

#[deprecated(
since = "54.0.0",
note = "Seems like a stray and nobody knows what it's for. Will be removed in the next release."
)]
#[allow(missing_docs)]
pub enum Level {
Page,
Column,
Expand Down Expand Up @@ -309,6 +322,7 @@ impl<T: Default> ColumnMetrics<T> {
/// Typed column writer for a primitive column.
pub type ColumnWriterImpl<'a, T> = GenericColumnWriter<'a, ColumnValueEncoderImpl<T>>;

/// Generic column writer for a primitive column.
pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> {
// Column writer properties
descr: ColumnDescPtr,
@@ -344,6 +358,7 @@ pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> {
}

impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
/// Returns a new instance of [`GenericColumnWriter`].
pub fn new(
descr: ColumnDescPtr,
props: WriterPropertiesPtr,
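One last sketch, not part of the commit: dispatching on the now-documented `ColumnWriter` variants; `describe` is a hypothetical helper.

```rust
use parquet::column::writer::ColumnWriter;

/// Returns a human-readable label for the physical type a writer handles.
fn describe(writer: &ColumnWriter<'_>) -> &'static str {
    match writer {
        ColumnWriter::BoolColumnWriter(_) => "boolean column",
        ColumnWriter::Int32ColumnWriter(_) => "int32 column",
        ColumnWriter::Int64ColumnWriter(_) => "int64 column",
        ColumnWriter::ByteArrayColumnWriter(_) => "byte array column",
        _ => "other physical type",
    }
}
```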
