Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: add docs, part of #37 #6496

Merged
merged 1 commit into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions arrow-flight/gen/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,18 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let proto_path = Path::new("../format/Flight.proto");

tonic_build::configure()
// protoc in unbuntu builder needs this option
// protoc in Ubuntu builder needs this option
.protoc_arg("--experimental_allow_proto3_optional")
.out_dir("src")
.compile_with_config(prost_config(), &[proto_path], &[proto_dir])?;
.compile_protos_with_config(prost_config(), &[proto_path], &[proto_dir])?;
ByteBaker marked this conversation as resolved.
Show resolved Hide resolved

// read file contents to string
let mut file = OpenOptions::new()
.read(true)
.open("src/arrow.flight.protocol.rs")?;
let mut buffer = String::new();
file.read_to_string(&mut buffer)?;
// append warning that file was auto-generate
// append warning that file was auto-generated
let mut file = OpenOptions::new()
.write(true)
.truncate(true)
Expand All @@ -49,10 +49,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let proto_path = Path::new("../format/FlightSql.proto");

tonic_build::configure()
// protoc in ubuntu builder needs this option
// protoc in Ubuntu builder needs this option
.protoc_arg("--experimental_allow_proto3_optional")
.out_dir("src/sql")
.compile_with_config(prost_config(), &[proto_path], &[proto_dir])?;
.compile_protos_with_config(prost_config(), &[proto_path], &[proto_dir])?;

// read file contents to string
let mut file = OpenOptions::new()
Expand Down
1 change: 1 addition & 0 deletions parquet/src/arrow/async_reader/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use std::ops::Range;

/// A data source that can be used with [`MetadataLoader`] to load [`ParquetMetaData`]
pub trait MetadataFetch {
/// Fetches a range of bytes asynchronously.
///
/// Returns a future that resolves to the [`Bytes`] for `range`, or to an
/// error if the fetch fails.
fn fetch(&mut self, range: Range<usize>) -> BoxFuture<'_, Result<Bytes>>;
}

Expand Down
47 changes: 45 additions & 2 deletions parquet/src/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,21 @@ pub use crate::format::{
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
// Variant names are upper-case identifiers (e.g. `BYTE_ARRAY`) rather than
// CamelCase, so the naming lint is silenced for this enum.
#[allow(non_camel_case_types)]
pub enum Type {
/// A boolean value.
BOOLEAN,
/// 32-bit signed integer.
INT32,
/// 64-bit signed integer.
INT64,
/// 96-bit signed integer for timestamps.
INT96,
/// IEEE 754 single-precision floating point value.
FLOAT,
/// IEEE 754 double-precision floating point value.
DOUBLE,
/// Arbitrary length byte array.
BYTE_ARRAY,
/// Fixed length byte array.
FIXED_LEN_BYTE_ARRAY,
}

Expand All @@ -70,6 +78,7 @@ pub enum Type {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum ConvertedType {
/// No type conversion.
NONE,
/// A BYTE_ARRAY actually contains UTF8 encoded chars.
UTF8,
Expand Down Expand Up @@ -171,31 +180,53 @@ pub enum ConvertedType {
/// [`ConvertedType`]. Please see the README.md for more details.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LogicalType {
/// A UTF8 encoded string.
String,
/// A map of key-value pairs.
Map,
/// A list of elements.
List,
/// A set of predefined values.
Enum,
/// A decimal value with a specified scale and precision.
Decimal {
/// The number of digits to the right of the decimal point.
scale: i32,
/// The total number of digits in the decimal.
precision: i32,
},
/// A date stored as days since Unix epoch.
Date,
/// A time stored as [`TimeUnit`] since midnight.
Time {
/// Whether the time is adjusted to UTC.
is_adjusted_to_u_t_c: bool,
/// The unit of time.
unit: TimeUnit,
},
/// A timestamp stored as [`TimeUnit`] since Unix epoch.
Timestamp {
/// Whether the timestamp is adjusted to UTC.
is_adjusted_to_u_t_c: bool,
/// The unit of time.
unit: TimeUnit,
},
/// An integer with a specified bit width and signedness.
Integer {
/// The number of bits in the integer.
bit_width: i8,
/// Whether the integer is signed.
is_signed: bool,
},
/// An unknown logical type.
Unknown,
/// A JSON document.
Json,
/// A BSON document.
Bson,
/// A UUID.
Uuid,
/// A 16-bit floating point number.
Float16,
}

Expand Down Expand Up @@ -350,13 +381,21 @@ impl FromStr for Encoding {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// Variant names are upper-case identifiers (e.g. `LZ4_RAW`) rather than
// CamelCase, so the naming lint is silenced for this enum.
#[allow(non_camel_case_types)]
pub enum Compression {
/// No compression.
UNCOMPRESSED,
/// [Snappy compression](https://en.wikipedia.org/wiki/Snappy_(compression)).
SNAPPY,
/// [Gzip compression](https://www.ietf.org/rfc/rfc1952.txt), carrying a [`GzipLevel`].
GZIP(GzipLevel),
/// [LZO compression](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer).
LZO,
/// [Brotli compression](https://datatracker.ietf.org/doc/html/rfc7932), carrying a [`BrotliLevel`].
BROTLI(BrotliLevel),
/// [LZ4 compression](https://lz4.org/), [(deprecated)](https://issues.apache.org/jira/browse/PARQUET-2032).
LZ4,
/// [ZSTD compression](https://datatracker.ietf.org/doc/html/rfc8878), carrying a [`ZstdLevel`].
ZSTD(ZstdLevel),
/// [LZ4 compression](https://lz4.org/).
LZ4_RAW,
}

Expand Down Expand Up @@ -447,16 +486,20 @@ impl FromStr for Compression {
}

// ----------------------------------------------------------------------
// Mirrors `parquet::PageType`

/// Mirrors [parquet::PageType]
///
/// Available data pages for Parquet file format.
/// Note that some of the page types may not be supported.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// Variant names are upper-case identifiers (e.g. `DATA_PAGE_V2`) rather than
// CamelCase, so the naming lint is silenced for this enum.
#[allow(non_camel_case_types)]
pub enum PageType {
/// Data page (Parquet 1.0).
DATA_PAGE,
/// Index page.
INDEX_PAGE,
/// Dictionary page.
DICTIONARY_PAGE,
/// Data page (Parquet 2.0).
DATA_PAGE_V2,
}

Expand Down
28 changes: 28 additions & 0 deletions parquet/src/column/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,29 +31,51 @@ use crate::format::PageHeader;
/// used to store uncompressed bytes of the page.
#[derive(Clone)]
pub enum Page {
/// Data page Parquet format v1.
DataPage {
/// The underlying data buffer.
buf: Bytes,
/// Number of values in this page.
num_values: u32,
/// Encoding for values in this page.
encoding: Encoding,
/// Definition level encoding.
def_level_encoding: Encoding,
/// Repetition level encoding.
rep_level_encoding: Encoding,
/// Optional statistics for this page.
statistics: Option<Statistics>,
},
/// Data page Parquet format v2.
DataPageV2 {
/// The underlying data buffer.
buf: Bytes,
/// Number of values in this page.
num_values: u32,
/// Encoding for values in this page.
encoding: Encoding,
/// Number of null values in this page.
num_nulls: u32,
/// Number of rows in this page.
num_rows: u32,
/// Length of the definition levels, in bytes.
def_levels_byte_len: u32,
/// Length of the repetition levels, in bytes.
rep_levels_byte_len: u32,
/// Whether this page is compressed.
is_compressed: bool,
/// Optional statistics for this page.
statistics: Option<Statistics>,
},
/// Dictionary page.
DictionaryPage {
/// The underlying data buffer.
buf: Bytes,
/// Number of values in this page.
num_values: u32,
/// Encoding for values in this page.
encoding: Encoding,
/// Whether this dictionary page is sorted.
is_sorted: bool,
},
}
Expand Down Expand Up @@ -235,11 +257,17 @@ impl CompressedPage {

/// Contains page write metrics.
pub struct PageWriteSpec {
/// The type of page being written.
pub page_type: PageType,
/// The size of the page before compression.
pub uncompressed_size: usize,
/// The size of the page after compression.
pub compressed_size: usize,
/// The number of values in the page.
pub num_values: u32,
/// The offset of the page in the column chunk.
pub offset: u64,
/// The number of bytes written to the underlying sink.
pub bytes_written: u64,
}

Expand Down
8 changes: 8 additions & 0 deletions parquet/src/column/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,21 @@ pub(crate) mod decoder;

/// Column reader for a Parquet type.
///
/// Each variant wraps a [`ColumnReaderImpl`] specialized for the
/// corresponding data type.
pub enum ColumnReader {
/// Column reader for boolean type
BoolColumnReader(ColumnReaderImpl<BoolType>),
/// Column reader for int32 type
Int32ColumnReader(ColumnReaderImpl<Int32Type>),
/// Column reader for int64 type
Int64ColumnReader(ColumnReaderImpl<Int64Type>),
/// Column reader for int96 (timestamp) type
Int96ColumnReader(ColumnReaderImpl<Int96Type>),
/// Column reader for float type
FloatColumnReader(ColumnReaderImpl<FloatType>),
/// Column reader for double type
DoubleColumnReader(ColumnReaderImpl<DoubleType>),
/// Column reader for byte array type
ByteArrayColumnReader(ColumnReaderImpl<ByteArrayType>),
/// Column reader for fixed length byte array type
FixedLenByteArrayColumnReader(ColumnReaderImpl<FixedLenByteArrayType>),
}

Expand Down
15 changes: 15 additions & 0 deletions parquet/src/column/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,21 @@ macro_rules! downcast_writer {

/// Column writer for a Parquet type.
///
/// Each variant wraps a [`ColumnWriterImpl`] specialized for the
/// corresponding data type.
pub enum ColumnWriter<'a> {
/// Column writer for boolean type
BoolColumnWriter(ColumnWriterImpl<'a, BoolType>),
/// Column writer for int32 type
Int32ColumnWriter(ColumnWriterImpl<'a, Int32Type>),
/// Column writer for int64 type
Int64ColumnWriter(ColumnWriterImpl<'a, Int64Type>),
/// Column writer for int96 (timestamp) type
Int96ColumnWriter(ColumnWriterImpl<'a, Int96Type>),
/// Column writer for float type
FloatColumnWriter(ColumnWriterImpl<'a, FloatType>),
/// Column writer for double type
DoubleColumnWriter(ColumnWriterImpl<'a, DoubleType>),
/// Column writer for byte array type
ByteArrayColumnWriter(ColumnWriterImpl<'a, ByteArrayType>),
/// Column writer for fixed length byte array type
FixedLenByteArrayColumnWriter(ColumnWriterImpl<'a, FixedLenByteArrayType>),
}

Expand All @@ -90,6 +98,11 @@ impl<'a> ColumnWriter<'a> {
}
}

#[deprecated(
since = "54.0.0",
note = "Seems like a stray and nobody knows what's it for. Will be removed in the next release."
ByteBaker marked this conversation as resolved.
Show resolved Hide resolved
)]
#[allow(missing_docs)]
pub enum Level {
Page,
Column,
Expand Down Expand Up @@ -309,6 +322,7 @@ impl<T: Default> ColumnMetrics<T> {
/// Typed column writer for a primitive column.
pub type ColumnWriterImpl<'a, T> = GenericColumnWriter<'a, ColumnValueEncoderImpl<T>>;

/// Generic column writer for a primitive column.
pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> {
// Column writer properties
descr: ColumnDescPtr,
Expand Down Expand Up @@ -344,6 +358,7 @@ pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> {
}

impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
/// Returns a new instance of [`GenericColumnWriter`].
pub fn new(
descr: ColumnDescPtr,
props: WriterPropertiesPtr,
Expand Down
Loading
Loading