Skip to content

Commit

Permalink
Add support for reading orc column statistics (#6588)
Browse files Browse the repository at this point in the history
Summary:
Currently, the dwrf module only supports reading the column statistics of dwrf format, and does not support reading the column statistics of orc format. When I use velox to read the orc table, the following exception occurs:
```
presto:tpch_velox> create table region_orc with(format = 'ORC') as select * from tpch.sf10.region;
CREATE TABLE: 5 rows

Query 20230804_032253_00003_tubug, FINISHED, 1 node
Splits: 22 total, 22 done (100.00%)
0:35 [5 rows, 0B] [0 rows/s, 0B/s]

presto:tpch_velox> select * from region_orc;

Query 20230804_043524_00023_m8qw7, FAILED, 1 node
Splits: 2 total, 0 done (0.00%)
0:40 [0 rows, 0B] [0 rows/s, 0B/s]

Query 20230804_043524_00023_m8qw7 failed:  Operator::getOutput failed for [operator: TableScan, plan node ID: 0]: vector

```
with this pr, we support read orc column statistics, than we can read orc data through velox:
```
presto:tpch_velox> select * from region_orc;
 regionkey |    name     |                                                       comment
-----------+-------------+---------------------------------------------------------------------------------------------------------------------
         0 | AFRICA      | lar deposits. blithely final packages cajole. regular waters are final requests. regular accounts are according to
         1 | AMERICA     | hs use ironic, even requests. s
         2 | ASIA        | ges. thinly even pinto beans ca
         3 | EUROPE      | ly final courts cajole furiously final excuse
         4 | MIDDLE EAST | uickly special accounts cajole carefully blithely close requests. carefully final asymptotes haggle furiousl
(5 rows)

Query 20230804_044016_00026_m8qw7, FINISHED, 1 node
Splits: 2 total, 2 done (100.00%)
0:14 [5 rows, 603B] [0 rows/s, 44B/s]

presto:tpch_velox>
```

CC: Yuhta

Pull Request resolved: #6588

Reviewed By: Yuhta

Differential Revision: D58370230

Pulled By: kevinwilfong

fbshipit-source-id: 6957726497768533b56881eba18481c4c0472a24
  • Loading branch information
wypb authored and facebook-github-bot committed Jun 14, 2024
1 parent 39743cd commit 0c0a973
Show file tree
Hide file tree
Showing 14 changed files with 2,054 additions and 1,192 deletions.
340 changes: 337 additions & 3 deletions velox/dwio/dwrf/common/FileMetadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,334 @@ class UserMetadataItemWrapper : public ProtoWrapperBase {
}
};

class IntegerStatisticsWrapper : public ProtoWrapperBase {
public:
explicit IntegerStatisticsWrapper(
const proto::IntegerStatistics* intStatistics)
: ProtoWrapperBase(DwrfFormat::kDwrf, intStatistics) {}

explicit IntegerStatisticsWrapper(
const proto::orc::IntegerStatistics* intStatistics)
: ProtoWrapperBase(DwrfFormat::kOrc, intStatistics) {}

bool hasMinimum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_minimum()
: orcPtr()->has_minimum();
}

int64_t minimum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->minimum()
: orcPtr()->minimum();
}

bool hasMaximum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_maximum()
: orcPtr()->has_maximum();
}

int64_t maximum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->maximum()
: orcPtr()->maximum();
}

bool hasSum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_sum()
: orcPtr()->has_sum();
}

int64_t sum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->sum() : orcPtr()->sum();
}

private:
// private helper with no format checking
inline const proto::IntegerStatistics* dwrfPtr() const {
return reinterpret_cast<const proto::IntegerStatistics*>(rawProtoPtr());
}
inline const proto::orc::IntegerStatistics* orcPtr() const {
return reinterpret_cast<const proto::orc::IntegerStatistics*>(
rawProtoPtr());
}
};

class DoubleStatisticsWrapper : public ProtoWrapperBase {
public:
explicit DoubleStatisticsWrapper(
const proto::DoubleStatistics* doubleStatistics)
: ProtoWrapperBase(DwrfFormat::kDwrf, doubleStatistics) {}

explicit DoubleStatisticsWrapper(
const proto::orc::DoubleStatistics* doubleStatistics)
: ProtoWrapperBase(DwrfFormat::kOrc, doubleStatistics) {}

bool hasMinimum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_minimum()
: orcPtr()->has_minimum();
}

double minimum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->minimum()
: orcPtr()->minimum();
}

bool hasMaximum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_maximum()
: orcPtr()->has_maximum();
}

double maximum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->maximum()
: orcPtr()->maximum();
}

bool hasSum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_sum()
: orcPtr()->has_sum();
}

double sum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->sum() : orcPtr()->sum();
}

private:
// private helper with no format checking
inline const proto::DoubleStatistics* dwrfPtr() const {
return reinterpret_cast<const proto::DoubleStatistics*>(rawProtoPtr());
}
inline const proto::orc::DoubleStatistics* orcPtr() const {
return reinterpret_cast<const proto::orc::DoubleStatistics*>(rawProtoPtr());
}
};

class StringStatisticsWrapper : public ProtoWrapperBase {
public:
explicit StringStatisticsWrapper(
const proto::StringStatistics* stringStatistics)
: ProtoWrapperBase(DwrfFormat::kDwrf, stringStatistics) {}

explicit StringStatisticsWrapper(
const proto::orc::StringStatistics* stringStatistics)
: ProtoWrapperBase(DwrfFormat::kOrc, stringStatistics) {}

bool hasMinimum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_minimum()
: orcPtr()->has_minimum();
}

const std::string& minimum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->minimum()
: orcPtr()->minimum();
}

bool hasMaximum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_maximum()
: orcPtr()->has_maximum();
}

const std::string& maximum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->maximum()
: orcPtr()->maximum();
}

bool hasSum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_sum()
: orcPtr()->has_sum();
}

int64_t sum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->sum() : orcPtr()->sum();
}

private:
// private helper with no format checking
inline const proto::StringStatistics* dwrfPtr() const {
return reinterpret_cast<const proto::StringStatistics*>(rawProtoPtr());
}
inline const proto::orc::StringStatistics* orcPtr() const {
return reinterpret_cast<const proto::orc::StringStatistics*>(rawProtoPtr());
}
};

class BucketStatisticsWrapper : public ProtoWrapperBase {
public:
explicit BucketStatisticsWrapper(
const proto::BucketStatistics* bucketStatistics)
: ProtoWrapperBase(DwrfFormat::kDwrf, bucketStatistics) {}

explicit BucketStatisticsWrapper(
const proto::orc::BucketStatistics* bucketStatistics)
: ProtoWrapperBase(DwrfFormat::kOrc, bucketStatistics) {}

int countSize() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->count_size()
: orcPtr()->count_size();
}

uint64_t count(int index) const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->count(index)
: orcPtr()->count(index);
}

private:
// private helper with no format checking
inline const proto::BucketStatistics* dwrfPtr() const {
return reinterpret_cast<const proto::BucketStatistics*>(rawProtoPtr());
}
inline const proto::orc::BucketStatistics* orcPtr() const {
return reinterpret_cast<const proto::orc::BucketStatistics*>(rawProtoPtr());
}
};

class BinaryStatisticsWrapper : public ProtoWrapperBase {
public:
explicit BinaryStatisticsWrapper(
const proto::BinaryStatistics* binaryStatistics)
: ProtoWrapperBase(DwrfFormat::kDwrf, binaryStatistics) {}

explicit BinaryStatisticsWrapper(
const proto::orc::BinaryStatistics* binaryStatistics)
: ProtoWrapperBase(DwrfFormat::kOrc, binaryStatistics) {}

bool hasSum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_sum()
: orcPtr()->has_sum();
}

int64_t sum() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->sum() : orcPtr()->sum();
}

private:
// private helper with no format checking
inline const proto::BinaryStatistics* dwrfPtr() const {
return reinterpret_cast<const proto::BinaryStatistics*>(rawProtoPtr());
}
inline const proto::orc::BinaryStatistics* orcPtr() const {
return reinterpret_cast<const proto::orc::BinaryStatistics*>(rawProtoPtr());
}
};

class ColumnStatisticsWrapper : public ProtoWrapperBase {
public:
explicit ColumnStatisticsWrapper(
const proto::ColumnStatistics* columnStatistics)
: ProtoWrapperBase(DwrfFormat::kDwrf, columnStatistics) {}

explicit ColumnStatisticsWrapper(
const proto::orc::ColumnStatistics* columnStatistics)
: ProtoWrapperBase(DwrfFormat::kOrc, columnStatistics) {}

bool hasNumberOfValues() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_numberofvalues()
: orcPtr()->has_numberofvalues();
}

uint64_t numberOfValues() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->numberofvalues()
: orcPtr()->numberofvalues();
}

bool hasHasNull() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_hasnull()
: orcPtr()->has_hasnull();
}

bool hasNull() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->hasnull()
: orcPtr()->hasnull();
}

bool hasRawSize() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_rawsize() : false;
}

uint64_t rawSize() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->rawsize() : 0;
}

bool hasSize() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_size() : false;
}

uint64_t size() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->size() : 0;
}

bool hasIntStatistics() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_intstatistics()
: orcPtr()->has_intstatistics();
}

IntegerStatisticsWrapper intStatistics() const {
return format_ == DwrfFormat::kDwrf
? IntegerStatisticsWrapper(&dwrfPtr()->intstatistics())
: IntegerStatisticsWrapper(&orcPtr()->intstatistics());
}

bool hasDoubleStatistics() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_doublestatistics()
: orcPtr()->has_doublestatistics();
}

DoubleStatisticsWrapper doubleStatistics() const {
return format_ == DwrfFormat::kDwrf
? DoubleStatisticsWrapper(&dwrfPtr()->doublestatistics())
: DoubleStatisticsWrapper(&orcPtr()->doublestatistics());
}

bool hasStringStatistics() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_stringstatistics()
: orcPtr()->has_stringstatistics();
}

StringStatisticsWrapper stringStatistics() const {
return format_ == DwrfFormat::kDwrf
? StringStatisticsWrapper(&dwrfPtr()->stringstatistics())
: StringStatisticsWrapper(&orcPtr()->stringstatistics());
}

bool hasBucketStatistics() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_bucketstatistics()
: orcPtr()->has_bucketstatistics();
}

BucketStatisticsWrapper bucketStatistics() const {
return format_ == DwrfFormat::kDwrf
? BucketStatisticsWrapper(&dwrfPtr()->bucketstatistics())
: BucketStatisticsWrapper(&orcPtr()->bucketstatistics());
}

bool hasBinaryStatistics() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_binarystatistics()
: orcPtr()->has_binarystatistics();
}

BinaryStatisticsWrapper binaryStatistics() const {
return format_ == DwrfFormat::kDwrf
? BinaryStatisticsWrapper(&dwrfPtr()->binarystatistics())
: BinaryStatisticsWrapper(&orcPtr()->binarystatistics());
}

bool hasMapStatistics() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_mapstatistics()
: false;
}

const ::facebook::velox::dwrf::proto::MapStatistics& mapStatistics() const {
VELOX_CHECK_EQ(format_, DwrfFormat::kDwrf);
return dwrfPtr()->mapstatistics();
}

private:
// private helper with no format checking
inline const proto::ColumnStatistics* dwrfPtr() const {
return reinterpret_cast<const proto::ColumnStatistics*>(rawProtoPtr());
}
inline const proto::orc::ColumnStatistics* orcPtr() const {
return reinterpret_cast<const proto::orc::ColumnStatistics*>(rawProtoPtr());
}
};

class FooterWrapper : public ProtoWrapperBase {
public:
explicit FooterWrapper(const proto::Footer* footer)
Expand Down Expand Up @@ -424,9 +752,9 @@ class FooterWrapper : public ProtoWrapperBase {
return dwrfPtr()->stripecacheoffsets();
}

// TODO: ORC has not supported column statistics yet
int statisticsSize() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->statistics_size() : 0;
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->statistics_size()
: orcPtr()->statistics_size();
}

const ::google::protobuf::RepeatedPtrField<
Expand All @@ -436,12 +764,18 @@ class FooterWrapper : public ProtoWrapperBase {
return dwrfPtr()->statistics();
}

const ::facebook::velox::dwrf::proto::ColumnStatistics& statistics(
const ::facebook::velox::dwrf::proto::ColumnStatistics& dwrfStatistics(
int index) const {
VELOX_CHECK_EQ(format_, DwrfFormat::kDwrf);
return dwrfPtr()->statistics(index);
}

ColumnStatisticsWrapper statistics(int index) const {
return format_ == DwrfFormat::kDwrf
? ColumnStatisticsWrapper(&dwrfPtr()->statistics(index))
: ColumnStatisticsWrapper(&orcPtr()->statistics(index));
}

// TODO: ORC has not supported encryption yet
bool hasEncryption() const {
return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_encryption() : false;
Expand Down
Loading

0 comments on commit 0c0a973

Please sign in to comment.