diff --git a/velox/dwio/dwrf/common/FileMetadata.h b/velox/dwio/dwrf/common/FileMetadata.h index ac8bb09e6ebb..34e531a6b29d 100644 --- a/velox/dwio/dwrf/common/FileMetadata.h +++ b/velox/dwio/dwrf/common/FileMetadata.h @@ -336,6 +336,334 @@ class UserMetadataItemWrapper : public ProtoWrapperBase { } }; +class IntegerStatisticsWrapper : public ProtoWrapperBase { + public: + explicit IntegerStatisticsWrapper( + const proto::IntegerStatistics* intStatistics) + : ProtoWrapperBase(DwrfFormat::kDwrf, intStatistics) {} + + explicit IntegerStatisticsWrapper( + const proto::orc::IntegerStatistics* intStatistics) + : ProtoWrapperBase(DwrfFormat::kOrc, intStatistics) {} + + bool hasMinimum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_minimum() + : orcPtr()->has_minimum(); + } + + int64_t minimum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->minimum() + : orcPtr()->minimum(); + } + + bool hasMaximum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_maximum() + : orcPtr()->has_maximum(); + } + + int64_t maximum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->maximum() + : orcPtr()->maximum(); + } + + bool hasSum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_sum() + : orcPtr()->has_sum(); + } + + int64_t sum() const { + return format_ == DwrfFormat::kDwrf ? 
dwrfPtr()->sum() : orcPtr()->sum(); + } + + private: + // private helper with no format checking + inline const proto::IntegerStatistics* dwrfPtr() const { + return reinterpret_cast(rawProtoPtr()); + } + inline const proto::orc::IntegerStatistics* orcPtr() const { + return reinterpret_cast( + rawProtoPtr()); + } +}; + +class DoubleStatisticsWrapper : public ProtoWrapperBase { + public: + explicit DoubleStatisticsWrapper( + const proto::DoubleStatistics* doubleStatistics) + : ProtoWrapperBase(DwrfFormat::kDwrf, doubleStatistics) {} + + explicit DoubleStatisticsWrapper( + const proto::orc::DoubleStatistics* doubleStatistics) + : ProtoWrapperBase(DwrfFormat::kOrc, doubleStatistics) {} + + bool hasMinimum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_minimum() + : orcPtr()->has_minimum(); + } + + double minimum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->minimum() + : orcPtr()->minimum(); + } + + bool hasMaximum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_maximum() + : orcPtr()->has_maximum(); + } + + double maximum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->maximum() + : orcPtr()->maximum(); + } + + bool hasSum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_sum() + : orcPtr()->has_sum(); + } + + double sum() const { + return format_ == DwrfFormat::kDwrf ? 
dwrfPtr()->sum() : orcPtr()->sum(); + } + + private: + // private helper with no format checking + inline const proto::DoubleStatistics* dwrfPtr() const { + return reinterpret_cast(rawProtoPtr()); + } + inline const proto::orc::DoubleStatistics* orcPtr() const { + return reinterpret_cast(rawProtoPtr()); + } +}; + +class StringStatisticsWrapper : public ProtoWrapperBase { + public: + explicit StringStatisticsWrapper( + const proto::StringStatistics* stringStatistics) + : ProtoWrapperBase(DwrfFormat::kDwrf, stringStatistics) {} + + explicit StringStatisticsWrapper( + const proto::orc::StringStatistics* stringStatistics) + : ProtoWrapperBase(DwrfFormat::kOrc, stringStatistics) {} + + bool hasMinimum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_minimum() + : orcPtr()->has_minimum(); + } + + const std::string& minimum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->minimum() + : orcPtr()->minimum(); + } + + bool hasMaximum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_maximum() + : orcPtr()->has_maximum(); + } + + const std::string& maximum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->maximum() + : orcPtr()->maximum(); + } + + bool hasSum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_sum() + : orcPtr()->has_sum(); + } + + int64_t sum() const { + return format_ == DwrfFormat::kDwrf ? 
dwrfPtr()->sum() : orcPtr()->sum(); + } + + private: + // private helper with no format checking + inline const proto::StringStatistics* dwrfPtr() const { + return reinterpret_cast(rawProtoPtr()); + } + inline const proto::orc::StringStatistics* orcPtr() const { + return reinterpret_cast(rawProtoPtr()); + } +}; + +class BucketStatisticsWrapper : public ProtoWrapperBase { + public: + explicit BucketStatisticsWrapper( + const proto::BucketStatistics* bucketStatistics) + : ProtoWrapperBase(DwrfFormat::kDwrf, bucketStatistics) {} + + explicit BucketStatisticsWrapper( + const proto::orc::BucketStatistics* bucketStatistics) + : ProtoWrapperBase(DwrfFormat::kOrc, bucketStatistics) {} + + int countSize() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->count_size() + : orcPtr()->count_size(); + } + + uint64_t count(int index) const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->count(index) + : orcPtr()->count(index); + } + + private: + // private helper with no format checking + inline const proto::BucketStatistics* dwrfPtr() const { + return reinterpret_cast(rawProtoPtr()); + } + inline const proto::orc::BucketStatistics* orcPtr() const { + return reinterpret_cast(rawProtoPtr()); + } +}; + +class BinaryStatisticsWrapper : public ProtoWrapperBase { + public: + explicit BinaryStatisticsWrapper( + const proto::BinaryStatistics* binaryStatistics) + : ProtoWrapperBase(DwrfFormat::kDwrf, binaryStatistics) {} + + explicit BinaryStatisticsWrapper( + const proto::orc::BinaryStatistics* binaryStatistics) + : ProtoWrapperBase(DwrfFormat::kOrc, binaryStatistics) {} + + bool hasSum() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_sum() + : orcPtr()->has_sum(); + } + + int64_t sum() const { + return format_ == DwrfFormat::kDwrf ? 
dwrfPtr()->sum() : orcPtr()->sum(); + } + + private: + // private helper with no format checking + inline const proto::BinaryStatistics* dwrfPtr() const { + return reinterpret_cast(rawProtoPtr()); + } + inline const proto::orc::BinaryStatistics* orcPtr() const { + return reinterpret_cast(rawProtoPtr()); + } +}; + +class ColumnStatisticsWrapper : public ProtoWrapperBase { + public: + explicit ColumnStatisticsWrapper( + const proto::ColumnStatistics* columnStatistics) + : ProtoWrapperBase(DwrfFormat::kDwrf, columnStatistics) {} + + explicit ColumnStatisticsWrapper( + const proto::orc::ColumnStatistics* columnStatistics) + : ProtoWrapperBase(DwrfFormat::kOrc, columnStatistics) {} + + bool hasNumberOfValues() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_numberofvalues() + : orcPtr()->has_numberofvalues(); + } + + uint64_t numberOfValues() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->numberofvalues() + : orcPtr()->numberofvalues(); + } + + bool hasHasNull() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_hasnull() + : orcPtr()->has_hasnull(); + } + + bool hasNull() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->hasnull() + : orcPtr()->hasnull(); + } + + bool hasRawSize() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_rawsize() : false; + } + + uint64_t rawSize() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->rawsize() : 0; + } + + bool hasSize() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_size() : false; + } + + uint64_t size() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->size() : 0; + } + + bool hasIntStatistics() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_intstatistics() + : orcPtr()->has_intstatistics(); + } + + IntegerStatisticsWrapper intStatistics() const { + return format_ == DwrfFormat::kDwrf + ? 
IntegerStatisticsWrapper(&dwrfPtr()->intstatistics()) + : IntegerStatisticsWrapper(&orcPtr()->intstatistics()); + } + + bool hasDoubleStatistics() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_doublestatistics() + : orcPtr()->has_doublestatistics(); + } + + DoubleStatisticsWrapper doubleStatistics() const { + return format_ == DwrfFormat::kDwrf + ? DoubleStatisticsWrapper(&dwrfPtr()->doublestatistics()) + : DoubleStatisticsWrapper(&orcPtr()->doublestatistics()); + } + + bool hasStringStatistics() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_stringstatistics() + : orcPtr()->has_stringstatistics(); + } + + StringStatisticsWrapper stringStatistics() const { + return format_ == DwrfFormat::kDwrf + ? StringStatisticsWrapper(&dwrfPtr()->stringstatistics()) + : StringStatisticsWrapper(&orcPtr()->stringstatistics()); + } + + bool hasBucketStatistics() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_bucketstatistics() + : orcPtr()->has_bucketstatistics(); + } + + BucketStatisticsWrapper bucketStatistics() const { + return format_ == DwrfFormat::kDwrf + ? BucketStatisticsWrapper(&dwrfPtr()->bucketstatistics()) + : BucketStatisticsWrapper(&orcPtr()->bucketstatistics()); + } + + bool hasBinaryStatistics() const { + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->has_binarystatistics() + : orcPtr()->has_binarystatistics(); + } + + BinaryStatisticsWrapper binaryStatistics() const { + return format_ == DwrfFormat::kDwrf + ? BinaryStatisticsWrapper(&dwrfPtr()->binarystatistics()) + : BinaryStatisticsWrapper(&orcPtr()->binarystatistics()); + } + + bool hasMapStatistics() const { + return format_ == DwrfFormat::kDwrf ? 
dwrfPtr()->has_mapstatistics() + : false; + } + + const ::facebook::velox::dwrf::proto::MapStatistics& mapStatistics() const { + VELOX_CHECK_EQ(format_, DwrfFormat::kDwrf); + return dwrfPtr()->mapstatistics(); + } + + private: + // private helper with no format checking + inline const proto::ColumnStatistics* dwrfPtr() const { + return reinterpret_cast(rawProtoPtr()); + } + inline const proto::orc::ColumnStatistics* orcPtr() const { + return reinterpret_cast(rawProtoPtr()); + } +}; + class FooterWrapper : public ProtoWrapperBase { public: explicit FooterWrapper(const proto::Footer* footer) @@ -424,9 +752,9 @@ class FooterWrapper : public ProtoWrapperBase { return dwrfPtr()->stripecacheoffsets(); } - // TODO: ORC has not supported column statistics yet int statisticsSize() const { - return format_ == DwrfFormat::kDwrf ? dwrfPtr()->statistics_size() : 0; + return format_ == DwrfFormat::kDwrf ? dwrfPtr()->statistics_size() + : orcPtr()->statistics_size(); } const ::google::protobuf::RepeatedPtrField< @@ -436,12 +764,18 @@ class FooterWrapper : public ProtoWrapperBase { return dwrfPtr()->statistics(); } - const ::facebook::velox::dwrf::proto::ColumnStatistics& statistics( + const ::facebook::velox::dwrf::proto::ColumnStatistics& dwrfStatistics( int index) const { VELOX_CHECK_EQ(format_, DwrfFormat::kDwrf); return dwrfPtr()->statistics(index); } + ColumnStatisticsWrapper statistics(int index) const { + return format_ == DwrfFormat::kDwrf + ? ColumnStatisticsWrapper(&dwrfPtr()->statistics(index)) + : ColumnStatisticsWrapper(&orcPtr()->statistics(index)); + } + // TODO: ORC has not supported encryption yet bool hasEncryption() const { return format_ == DwrfFormat::kDwrf ? 
dwrfPtr()->has_encryption() : false; diff --git a/velox/dwio/dwrf/common/Statistics.cpp b/velox/dwio/dwrf/common/Statistics.cpp index af4bddcc8692..8252a289a87d 100644 --- a/velox/dwio/dwrf/common/Statistics.cpp +++ b/velox/dwio/dwrf/common/Statistics.cpp @@ -21,74 +21,75 @@ namespace facebook::velox::dwrf { using namespace dwio::common; std::unique_ptr buildColumnStatisticsFromProto( - const proto::ColumnStatistics& s, + const ColumnStatisticsWrapper& stats, const StatsContext& statsContext) { ColumnStatistics colStats( - s.has_numberofvalues() ? std::optional(s.numberofvalues()) : std::nullopt, - s.has_hasnull() ? std::optional(s.hasnull()) : std::nullopt, - s.has_rawsize() ? std::optional(s.rawsize()) : std::nullopt, - s.has_size() ? std::optional(s.size()) : std::nullopt); + stats.hasNumberOfValues() ? std::optional(stats.numberOfValues()) + : std::nullopt, + stats.hasHasNull() ? std::optional(stats.hasNull()) : std::nullopt, + stats.hasRawSize() ? std::optional(stats.rawSize()) : std::nullopt, + stats.hasSize() ? std::optional(stats.size()) : std::nullopt); // detailed stats is only defined when has non-null value - if (!s.has_numberofvalues() || s.numberofvalues() > 0) { - if (s.has_intstatistics()) { - const auto& intStats = s.intstatistics(); + if (!stats.hasNumberOfValues() || stats.numberOfValues() > 0) { + if (stats.hasIntStatistics()) { + const auto& intStats = stats.intStatistics(); return std::make_unique( colStats, - intStats.has_minimum() ? std::optional(intStats.minimum()) - : std::nullopt, - intStats.has_maximum() ? std::optional(intStats.maximum()) - : std::nullopt, - intStats.has_sum() ? std::optional(intStats.sum()) : std::nullopt); - } else if (s.has_doublestatistics()) { - const auto& dStats = s.doublestatistics(); + intStats.hasMinimum() ? std::optional(intStats.minimum()) + : std::nullopt, + intStats.hasMaximum() ? std::optional(intStats.maximum()) + : std::nullopt, + intStats.hasSum() ? 
std::optional(intStats.sum()) : std::nullopt); + } else if (stats.hasDoubleStatistics()) { + const auto& dStats = stats.doubleStatistics(); // Comparing against NaN doesn't make sense, and to prevent downstream // from incorrectly using it, need to make sure min/max/sum doens't have // NaN. - auto hasNan = (dStats.has_minimum() && std::isnan(dStats.minimum())) || - (dStats.has_maximum() && std::isnan(dStats.maximum())) || - (dStats.has_sum() && std::isnan(dStats.sum())); + auto hasNan = (dStats.hasMinimum() && std::isnan(dStats.minimum())) || + (dStats.hasMaximum() && std::isnan(dStats.maximum())) || + (dStats.hasSum() && std::isnan(dStats.sum())); if (!hasNan) { return std::make_unique( colStats, - dStats.has_minimum() ? std::optional(dStats.minimum()) - : std::nullopt, - dStats.has_maximum() ? std::optional(dStats.maximum()) - : std::nullopt, - dStats.has_sum() ? std::optional(dStats.sum()) : std::nullopt); + dStats.hasMinimum() ? std::optional(dStats.minimum()) + : std::nullopt, + dStats.hasMaximum() ? std::optional(dStats.maximum()) + : std::nullopt, + dStats.hasSum() ? std::optional(dStats.sum()) : std::nullopt); } - } else if (s.has_stringstatistics()) { + } else if (stats.hasStringStatistics()) { // DWRF_5_0 is the first version that string stats are saved as UTF8 // bytes, hence only process string stats for version >= DWRF_5_0 if (statsContext.writerVersion >= WriterVersion::DWRF_5_0 || statsContext.writerName == kPrestoWriter || statsContext.writerName == kDwioWriter) { - const auto& strStats = s.stringstatistics(); + const auto& strStats = stats.stringStatistics(); return std::make_unique( colStats, - strStats.has_minimum() ? std::optional(strStats.minimum()) - : std::nullopt, - strStats.has_maximum() ? std::optional(strStats.maximum()) - : std::nullopt, + strStats.hasMinimum() ? std::optional(strStats.minimum()) + : std::nullopt, + strStats.hasMaximum() ? std::optional(strStats.maximum()) + : std::nullopt, // In proto, length(sum) is defined as sint. 
We need to make sure // length is not negative - (strStats.has_sum() && strStats.sum() >= 0) + (strStats.hasSum() && strStats.sum() >= 0) ? std::optional(strStats.sum()) : std::nullopt); } - } else if (s.has_bucketstatistics()) { - const auto& bucketStats = s.bucketstatistics(); + } else if (stats.hasBucketStatistics()) { + const auto& bucketStats = stats.bucketStatistics(); // Need to make sure there is at least one bucket. True count is saved in // bucket 0 - if (bucketStats.count_size() > 0) { + if (bucketStats.countSize() > 0) { return std::make_unique( colStats, bucketStats.count(0)); } - } else if (s.has_binarystatistics()) { - const auto& binStats = s.binarystatistics(); + } else if (stats.hasBinaryStatistics()) { + const auto& binStats = stats.binaryStatistics(); // In proto, length(sum) is defined as sint. We need to make sure length // is not negative - if (binStats.has_sum() && binStats.sum() >= 0) { + if (binStats.hasSum() && binStats.sum() >= 0) { return std::make_unique( colStats, static_cast(binStats.sum())); } diff --git a/velox/dwio/dwrf/common/Statistics.h b/velox/dwio/dwrf/common/Statistics.h index 0f26682fe1db..864fb08db372 100644 --- a/velox/dwio/dwrf/common/Statistics.h +++ b/velox/dwio/dwrf/common/Statistics.h @@ -18,6 +18,7 @@ #include "velox/dwio/common/Statistics.h" #include "velox/dwio/dwrf/common/Common.h" +#include "velox/dwio/dwrf/common/FileMetadata.h" #include "velox/dwio/dwrf/common/wrap/dwrf-proto-wrapper.h" namespace facebook::velox::dwrf { @@ -39,7 +40,7 @@ struct StatsContext : public dwio::common::StatsContext { }; std::unique_ptr buildColumnStatisticsFromProto( - const proto::ColumnStatistics& stats, + const ColumnStatisticsWrapper& stats, const StatsContext& statsContext); } // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/reader/BinaryStreamReader.cpp b/velox/dwio/dwrf/reader/BinaryStreamReader.cpp index fe0fd4b132e3..6fbaef711435 100644 --- a/velox/dwio/dwrf/reader/BinaryStreamReader.cpp +++ 
b/velox/dwio/dwrf/reader/BinaryStreamReader.cpp @@ -125,7 +125,7 @@ BinaryStreamReader::getStatistics() const { for (auto node = 0; node < footerStatsSize; node++) { if (columnSelector_.shouldReadNode(node)) { stats[node] = - stripeReaderBase_.getReader().getFooter().statistics(node); + stripeReaderBase_.getReader().getFooter().dwrfStatistics(node); } } } diff --git a/velox/dwio/dwrf/reader/DwrfData.cpp b/velox/dwio/dwrf/reader/DwrfData.cpp index 1c42d741a9aa..94cb60c28b9a 100644 --- a/velox/dwio/dwrf/reader/DwrfData.cpp +++ b/velox/dwio/dwrf/reader/DwrfData.cpp @@ -160,8 +160,8 @@ void DwrfData::filterRowGroups( } for (auto i = 0; i < index_->entry_size(); i++) { const auto& entry = index_->entry(i); - auto columnStats = - buildColumnStatisticsFromProto(entry.statistics(), *dwrfContext); + auto columnStats = buildColumnStatisticsFromProto( + ColumnStatisticsWrapper(&entry.statistics()), *dwrfContext); if (filter && !testFilter( filter, columnStats.get(), rowGroupSize, fileType_->type())) { diff --git a/velox/dwio/dwrf/reader/ReaderBase.cpp b/velox/dwio/dwrf/reader/ReaderBase.cpp index 085ce2c501f5..bb45df2987c4 100644 --- a/velox/dwio/dwrf/reader/ReaderBase.cpp +++ b/velox/dwio/dwrf/reader/ReaderBase.cpp @@ -61,7 +61,8 @@ FooterStatisticsImpl::FooterStatisticsImpl( for (uint32_t statsIndex = 0; statsIndex < stats->statistics_size(); ++statsIndex) { colStats_[node + statsIndex] = buildColumnStatisticsFromProto( - stats->statistics(statsIndex), statsContext); + ColumnStatisticsWrapper(&stats->statistics(statsIndex)), + statsContext); } } } @@ -248,7 +249,7 @@ std::unique_ptr ReaderBase::getColumnStatistics( "column index out of range"); StatsContext statsContext(getWriterVersion()); if (!handler_->isEncrypted(index)) { - auto& stats = footer_->statistics(index); + auto stats = footer_->statistics(index); return buildColumnStatisticsFromProto(stats, statsContext); } @@ -259,7 +260,7 @@ std::unique_ptr ReaderBase::getColumnStatistics( // if key is not loaded, return 
plaintext stats if (!decrypter.isKeyLoaded()) { - auto& stats = footer_->statistics(index); + auto stats = footer_->statistics(index); return buildColumnStatisticsFromProto(stats, statsContext); } @@ -275,7 +276,7 @@ std::unique_ptr ReaderBase::getColumnStatistics( auto stats = readProtoFromString( group.statistics(nodeIndex), &decrypter); return buildColumnStatisticsFromProto( - stats->statistics(index - root), statsContext); + ColumnStatisticsWrapper(&stats->statistics(index - root)), statsContext); } std::shared_ptr ReaderBase::convertType( diff --git a/velox/dwio/dwrf/test/CMakeLists.txt b/velox/dwio/dwrf/test/CMakeLists.txt index fd3a0493f30c..d2883d006945 100644 --- a/velox/dwio/dwrf/test/CMakeLists.txt +++ b/velox/dwio/dwrf/test/CMakeLists.txt @@ -34,13 +34,22 @@ add_test(velox_dwio_dwrf_buffered_output_stream_test target_link_libraries(velox_dwio_dwrf_buffered_output_stream_test velox_link_libs Folly::folly ${TEST_LINK_LIBS}) -add_executable(velox_dwio_dwrf_column_statistics_test TestColumnStatistics.cpp) +add_executable(velox_dwio_dwrf_column_statistics_test + TestDwrfColumnStatistics.cpp) add_test(velox_dwio_dwrf_column_statistics_test velox_dwio_dwrf_column_statistics_test) target_link_libraries(velox_dwio_dwrf_column_statistics_test velox_link_libs Folly::folly ${TEST_LINK_LIBS}) +add_executable(velox_dwio_orc_column_statistics_test + TestOrcColumnStatistics.cpp) +add_test(velox_dwio_orc_column_statistics_test + velox_dwio_orc_column_statistics_test) + +target_link_libraries(velox_dwio_orc_column_statistics_test velox_link_libs + Folly::folly ${TEST_LINK_LIBS}) + add_executable(velox_dwio_dwrf_compression_test CompressionTest.cpp) add_test(velox_dwio_dwrf_compression_test velox_dwio_dwrf_compression_test) diff --git a/velox/dwio/dwrf/test/ColumnStatisticsBase.h b/velox/dwio/dwrf/test/ColumnStatisticsBase.h new file mode 100644 index 000000000000..32b09aacfcf0 --- /dev/null +++ b/velox/dwio/dwrf/test/ColumnStatisticsBase.h @@ -0,0 +1,975 @@ +/* + * 
Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "velox/dwio/common/Statistics.h" +#include "velox/dwio/dwrf/writer/StatisticsBuilder.h" + +namespace facebook::velox::dwrf { +class ColumnStatisticsBase { + public: + ColumnStatisticsBase() + : arena_(std::make_unique()) {} + + void testSize() { + { + StatisticsBuilder missingSize{options()}; + ASSERT_FALSE(missingSize.getSize().has_value()); + StatisticsBuilder hasSize{StatisticsBuilderOptions{ + /*stringLengthLimit=*/32, /*initialSize=*/10}}; + ASSERT_TRUE(hasSize.getSize().has_value()); + EXPECT_EQ(10, hasSize.getSize().value()); + + hasSize.merge(missingSize, /*ignoreSize=*/true); + EXPECT_FALSE(missingSize.getSize().has_value()); + ASSERT_TRUE(hasSize.getSize().has_value()); + EXPECT_EQ(10, hasSize.getSize().value()); + + // Coercing to missing/invalid size when not ignoring by default. 
+ hasSize.merge(missingSize); + EXPECT_FALSE(missingSize.getSize().has_value()); + EXPECT_FALSE(hasSize.getSize().has_value()); + } + { + StatisticsBuilder from{options()}; + ASSERT_FALSE(from.getSize().has_value()); + + StatisticsBuilder to{options()}; + ASSERT_FALSE(to.getSize().has_value()); + to.incrementSize(64); + ASSERT_FALSE(to.getSize().has_value()); + + to.ensureSize(); + ASSERT_EQ(0, to.getSize().value()); + to.incrementSize(64); + EXPECT_EQ(64, to.getSize().value()); + + to.merge(from, /*ignoreSize=*/true); + EXPECT_FALSE(from.getSize().has_value()); + EXPECT_EQ(64, to.getSize().value()); + + to.merge(from); + EXPECT_FALSE(from.getSize().has_value()); + EXPECT_FALSE(to.getSize().has_value()); + } + } + + void testInteger() { + IntegerStatisticsBuilder builder{options()}; + // empty builder should have all defaults + EXPECT_EQ(std::numeric_limits::max(), builder.getMinimum()); + EXPECT_EQ(std::numeric_limits::min(), builder.getMaximum()); + EXPECT_EQ(0, builder.getSum()); + EXPECT_EQ(0, builder.getNumberOfValues()); + + builder.addValues(3); + builder.addValues(1); + builder.addValues(5); + + // stats should be merged + IntegerStatisticsBuilder target{options()}; + target.merge(*builder.build()); + auto stats = as(target.build()); + EXPECT_EQ(3, stats->getNumberOfValues()); + EXPECT_EQ(1, stats->getMinimum()); + EXPECT_EQ(5, stats->getMaximum()); + EXPECT_EQ(9, stats->getSum()); + + // stats should be merged again + builder.addValues(0); + builder.addValues(6); + target.merge(*builder.build()); + stats = as(target.build()); + EXPECT_EQ(8, stats->getNumberOfValues()); + EXPECT_EQ(0, stats->getMinimum()); + EXPECT_EQ(6, stats->getMaximum()); + EXPECT_EQ(24, stats->getSum()); + + target.merge(*builder.build()); + stats = as(target.build()); + EXPECT_EQ(13, stats->getNumberOfValues()); + EXPECT_EQ(0, stats->getMinimum()); + EXPECT_EQ(6, stats->getMaximum()); + EXPECT_EQ(39, stats->getSum()); + + // add value + target.addValues(100, 2); + stats = 
as(target.build()); + EXPECT_EQ(15, stats->getNumberOfValues()); + EXPECT_EQ(0, stats->getMinimum()); + EXPECT_EQ(100, stats->getMaximum()); + EXPECT_EQ(239, stats->getSum()); + + // reset + builder.reset(); + EXPECT_EQ(std::numeric_limits::max(), builder.getMinimum()); + EXPECT_EQ(std::numeric_limits::min(), builder.getMaximum()); + EXPECT_EQ(0, builder.getSum()); + EXPECT_EQ(0, builder.getNumberOfValues()); + } + + void testIntegerMissingStats( + ColumnStatisticsWrapper columnStatisticsWrapper, + void* protoPtr, + DwrfFormat format) { + IntegerStatisticsBuilder target{options()}; + target.addValues(1, 5); + auto stats = as(target.build()); + EXPECT_EQ(5, stats->getSum()); + + // merge missing stats + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + + // merge again + if (format == DwrfFormat::kDwrf) { + auto intPtr = reinterpret_cast(protoPtr); + intPtr->set_minimum(0); + intPtr->set_maximum(1); + intPtr->set_sum(100); + } else { + auto intPtr = reinterpret_cast(protoPtr); + intPtr->set_minimum(0); + intPtr->set_maximum(1); + intPtr->set_sum(100); + } + + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + + // add again + target.addValues(2); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + } + + void testIntegerEmptyStats( + ColumnStatisticsWrapper columnStatisticsWrapper, + void* protoPtr, + DwrfFormat format) { + IntegerStatisticsBuilder target{options()}; + target.addValues(1, 5); + auto stats = as(target.build()); + EXPECT_EQ(5, stats->getSum()); + + // merge empty stats + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(5, stats->getSum()); + + // merge again + if (format == DwrfFormat::kDwrf) { + auto columnPtr = reinterpret_cast(protoPtr); + columnPtr->clear_numberofvalues(); 
+ } else { + auto columnPtr = + reinterpret_cast(protoPtr); + columnPtr->clear_numberofvalues(); + } + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + } + + void testIntegerOverflow() { + IntegerStatisticsBuilder target{options()}; + auto testMinMax = + [&](int64_t val1, int64_t val2, int64_t min, int64_t max) { + target.reset(); + target.addValues(val1); + auto stats = + as(target.build()); + EXPECT_EQ(val1, stats->getMaximum()); + EXPECT_EQ(val1, stats->getMinimum()); + EXPECT_EQ(val1, stats->getSum()); + + target.addValues(val2); + stats = as(target.build()); + EXPECT_EQ(max, stats->getMaximum()); + EXPECT_EQ(min, stats->getMinimum()); + EXPECT_FALSE(stats->getSum().has_value()); + }; + + testMinMax( + std::numeric_limits::min(), + -1, + std::numeric_limits::min(), + -1); + testMinMax( + std::numeric_limits::max(), + 1, + 1, + std::numeric_limits::max()); + + // make sure we also capture overflow that happened for adding multiple + // items + target.reset(); + target.addValues(std::numeric_limits::max() / 10, 11); + auto stats = as(target.build()); + EXPECT_EQ(11, stats->getNumberOfValues()); + EXPECT_EQ(stats->getMaximum().value(), stats->getMinimum().value()); + EXPECT_FALSE(stats->getSum().has_value()); + + // merge overflow + auto testMergeOverflow = [&](int64_t val1, int64_t val2) { + target.reset(); + target.addValues(val1); + IntegerStatisticsBuilder builder{options()}; + builder.addValues(val2); + target.merge(builder); + stats = as(target.build()); + EXPECT_FALSE(stats->getSum().has_value()); + }; + testMergeOverflow(std::numeric_limits::min(), -1); + testMergeOverflow(std::numeric_limits::max(), 1); + } + + void testDoubles() { + DoubleStatisticsBuilder builder{options()}; + // empty builder should have all defaults + EXPECT_EQ(std::numeric_limits::infinity(), builder.getMinimum()); + EXPECT_EQ(-std::numeric_limits::infinity(), builder.getMaximum()); + 
EXPECT_EQ(0, builder.getSum()); + EXPECT_EQ(0, builder.getNumberOfValues()); + + builder.addValues(3); + builder.addValues(1); + builder.addValues(5); + + // stats should be merged + DoubleStatisticsBuilder target{options()}; + target.merge(*builder.build()); + auto stats = as(target.build()); + EXPECT_EQ(3, stats->getNumberOfValues()); + EXPECT_EQ(1, stats->getMinimum()); + EXPECT_EQ(5, stats->getMaximum()); + EXPECT_EQ(9, stats->getSum()); + + // stats should be merged again + builder.addValues(0); + builder.addValues(6); + target.merge(*builder.build()); + stats = as(target.build()); + EXPECT_EQ(8, stats->getNumberOfValues()); + EXPECT_EQ(0, stats->getMinimum()); + EXPECT_EQ(6, stats->getMaximum()); + EXPECT_EQ(24, stats->getSum()); + + target.merge(*builder.build()); + stats = as(target.build()); + EXPECT_EQ(13, stats->getNumberOfValues()); + EXPECT_EQ(0, stats->getMinimum()); + EXPECT_EQ(6, stats->getMaximum()); + EXPECT_EQ(39, stats->getSum()); + + // add value + target.addValues(100, 2); + stats = as(target.build()); + EXPECT_EQ(15, stats->getNumberOfValues()); + EXPECT_EQ(0, stats->getMinimum()); + EXPECT_EQ(100, stats->getMaximum()); + EXPECT_EQ(239, stats->getSum()); + + // reset + builder.reset(); + EXPECT_EQ(std::numeric_limits::infinity(), builder.getMinimum()); + EXPECT_EQ(-std::numeric_limits::infinity(), builder.getMaximum()); + EXPECT_EQ(0, builder.getSum()); + EXPECT_EQ(0, builder.getNumberOfValues()); + } + + void testDoubleMissingStats( + ColumnStatisticsWrapper columnStatisticsWrapper, + void* protoPtr, + DwrfFormat format) { + DoubleStatisticsBuilder target{options()}; + target.addValues(1, 5); + auto stats = as(target.build()); + EXPECT_EQ(5, stats->getSum()); + + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + + // merge again + if (format == DwrfFormat::kDwrf) { + auto doubleProto = reinterpret_cast(protoPtr); + doubleProto->set_minimum(0); + 
doubleProto->set_maximum(1); + doubleProto->set_sum(100); + } else { + auto doubleProto = + reinterpret_cast(protoPtr); + doubleProto->set_minimum(0); + doubleProto->set_maximum(1); + doubleProto->set_sum(100); + } + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + + // add again + target.addValues(2); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + } + + void testDoubleEmptyStats( + ColumnStatisticsWrapper columnStatisticsWrapper, + void* protoPtr, + DwrfFormat format) { + DoubleStatisticsBuilder target{options()}; + target.addValues(1, 5); + auto stats = as(target.build()); + EXPECT_EQ(5, stats->getSum()); + + // merge empty stats + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(5, stats->getSum()); + + // merge again + if (format == DwrfFormat::kDwrf) { + auto columnPtr = reinterpret_cast(protoPtr); + columnPtr->clear_numberofvalues(); + } else { + auto columnPtr = + reinterpret_cast(protoPtr); + columnPtr->clear_numberofvalues(); + } + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + } + + void testDoubleNaN() { + DoubleStatisticsBuilder target{options()}; + // test nan. Nan causes fallback to basic stats. 
+ target.addValues(std::numeric_limits::quiet_NaN()); + auto stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + + target.reset(); + target.addValues(std::numeric_limits::infinity()); + target.addValues(-std::numeric_limits::infinity()); + stats = as(target.build()); + EXPECT_EQ(stats->getMaximum(), std::numeric_limits::infinity()); + EXPECT_EQ(stats->getMinimum(), -std::numeric_limits::infinity()); + EXPECT_FALSE(stats->getSum().has_value()); + + target.reset(); + DoubleStatisticsBuilder builder{options()}; + target.addValues(std::numeric_limits::infinity()); + builder.addValues(-std::numeric_limits::infinity()); + target.merge(*builder.build()); + stats = as(target.build()); + EXPECT_EQ(stats->getMaximum(), std::numeric_limits::infinity()); + EXPECT_EQ(stats->getMinimum(), -std::numeric_limits::infinity()); + EXPECT_FALSE(stats->getSum().has_value()); + } + + void testString() { + StringStatisticsBuilder builder{options()}; + // empty builder should have all defaults + EXPECT_FALSE(builder.getMinimum().has_value()); + EXPECT_FALSE(builder.getMaximum().has_value()); + EXPECT_EQ(0, builder.getTotalLength()); + EXPECT_EQ(0, builder.getNumberOfValues()); + + builder.addValues("xx"); + builder.addValues("bb"); + builder.addValues("yy"); + + // stats should be merged + StringStatisticsBuilder target{options()}; + target.merge(*builder.build()); + auto stats = as(target.build()); + EXPECT_EQ(3, stats->getNumberOfValues()); + EXPECT_EQ("bb", stats->getMinimum()); + EXPECT_EQ("yy", stats->getMaximum()); + EXPECT_EQ(6, stats->getTotalLength()); + + // stats should be merged again + builder.addValues("aa"); + builder.addValues("zz"); + target.merge(*builder.build()); + stats = as(target.build()); + EXPECT_EQ(8, stats->getNumberOfValues()); + EXPECT_EQ("aa", stats->getMinimum()); + EXPECT_EQ("zz", stats->getMaximum()); + EXPECT_EQ(16, stats->getTotalLength()); + + target.merge(*builder.build()); + stats = as(target.build()); + EXPECT_EQ(13, 
stats->getNumberOfValues()); + EXPECT_EQ("aa", stats->getMinimum()); + EXPECT_EQ("zz", stats->getMaximum()); + EXPECT_EQ(26, stats->getTotalLength()); + + // add value + target.addValues("zzz", 2); + stats = as(target.build()); + EXPECT_EQ(15, stats->getNumberOfValues()); + EXPECT_EQ("aa", stats->getMinimum()); + EXPECT_EQ("zzz", stats->getMaximum()); + EXPECT_EQ(32, stats->getTotalLength()); + + // reset + builder.reset(); + EXPECT_FALSE(builder.getMinimum().has_value()); + EXPECT_FALSE(builder.getMaximum().has_value()); + EXPECT_EQ(0, builder.getTotalLength()); + EXPECT_EQ(0, builder.getNumberOfValues()); + } + + void testStringMissingStats( + ColumnStatisticsWrapper columnStatisticsWrapper, + void* protoPtr, + DwrfFormat format) { + StringStatisticsBuilder target{options()}; + target.addValues("zz", 5); + auto stats = as(target.build()); + EXPECT_EQ(10, stats->getTotalLength()); + + // merge missing stats + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + + // merge again + if (format == DwrfFormat::kDwrf) { + auto strProto = reinterpret_cast(protoPtr); + strProto->set_minimum("aa"); + strProto->set_maximum("bb"); + strProto->set_sum(100); + } else { + auto strProto = reinterpret_cast(protoPtr); + strProto->set_minimum("aa"); + strProto->set_maximum("bb"); + strProto->set_sum(100); + } + + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + + // add again + target.addValues("aa"); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + } + + void testStringEmptyStats( + ColumnStatisticsWrapper columnStatisticsWrapper, + void* protoPtr, + DwrfFormat format) { + StringStatisticsBuilder target{options()}; + target.addValues("zz", 5); + auto stats = as(target.build()); + EXPECT_EQ(10, stats->getTotalLength()); + + // merge empty stats + target.merge( + 
*buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(10, stats->getTotalLength()); + + // merge again + if (format == DwrfFormat::kDwrf) { + auto proto = reinterpret_cast(protoPtr); + proto->clear_numberofvalues(); + } else { + auto proto = reinterpret_cast(protoPtr); + proto->clear_numberofvalues(); + } + + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + } + + void testStringLengthThreshold() { + StringStatisticsBuilder target{StatisticsBuilderOptions{2}}; + target.addValues("yyy"); + auto stats = as(target.build()); + EXPECT_FALSE(stats->getMinimum().has_value()); + EXPECT_FALSE(stats->getMaximum().has_value()); + + // merge empty stats + target.addValues("aa"); + target.addValues("zz"); + stats = as(target.build()); + EXPECT_EQ(stats->getMinimum(), "aa"); + EXPECT_EQ(stats->getMaximum(), "zz"); + } + + void testStringLengthOverflow( + ColumnStatisticsWrapper columnStatisticsWrapper, + void* protoPtr, + DwrfFormat format) { + // add value causing overflow + StringStatisticsBuilder target{options()}; + + if (format == DwrfFormat::kDwrf) { + auto strProto = reinterpret_cast(protoPtr); + strProto->set_sum(std::numeric_limits::max()); + strProto->set_minimum("foo"); + } else { + auto strProto = reinterpret_cast(protoPtr); + strProto->set_sum(std::numeric_limits::max()); + strProto->set_minimum("foo"); + } + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + EXPECT_TRUE(target.getTotalLength().has_value()); + auto stats = as(target.build()); + EXPECT_TRUE(stats->getTotalLength().has_value()); + + target.addValues("foo"); + EXPECT_TRUE(target.getTotalLength().has_value()); + stats = as(target.build()); + EXPECT_FALSE(stats->getTotalLength().has_value()); + + // merge causing overflow + target.reset(); + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, 
context())); + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + EXPECT_TRUE(target.getTotalLength().has_value()); + stats = as(target.build()); + EXPECT_FALSE(stats->getTotalLength().has_value()); + } + + void testBoolean() { + BooleanStatisticsBuilder builder{options()}; + // empty builder should have all defaults + EXPECT_EQ(0, builder.getTrueCount()); + EXPECT_EQ(0, builder.getNumberOfValues()); + + builder.addValues(true, 2); + + // stats should be merged + BooleanStatisticsBuilder target{options()}; + target.merge(*builder.build()); + auto stats = as(target.build()); + EXPECT_EQ(2, stats->getNumberOfValues()); + EXPECT_EQ(2, stats->getTrueCount()); + + // stats should be merged again + target.merge(*builder.build()); + stats = as(target.build()); + EXPECT_EQ(4, stats->getNumberOfValues()); + EXPECT_EQ(4, stats->getTrueCount()); + + // add value + target.addValues(false, 2); + stats = as(target.build()); + EXPECT_EQ(6, stats->getNumberOfValues()); + EXPECT_EQ(4, stats->getTrueCount()); + + // reset + builder.reset(); + EXPECT_EQ(0, builder.getTrueCount()); + EXPECT_EQ(0, builder.getNumberOfValues()); + } + + void testBooleanMissingStats( + ColumnStatisticsWrapper columnStatisticsWrapper, + void* protoPtr, + DwrfFormat format) { + BooleanStatisticsBuilder target{options()}; + target.addValues(true, 5); + auto stats = as(target.build()); + EXPECT_EQ(5, stats->getTrueCount()); + + // merge missing stats + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + + // merge again + if (format == DwrfFormat::kDwrf) { + auto boolProto = reinterpret_cast(protoPtr); + boolProto->add_count(1); + } else { + auto boolProto = + reinterpret_cast(protoPtr); + boolProto->add_count(1); + } + + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + + // add 
again + target.addValues(true); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + } + + void testBooleanEmptyStats( + ColumnStatisticsWrapper columnStatisticsWrapper, + void* protoPtr, + DwrfFormat format) { + BooleanStatisticsBuilder target{options()}; + target.addValues(true, 5); + auto stats = as(target.build()); + EXPECT_EQ(5, stats->getTrueCount()); + + // merge empty stats + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(5, stats->getTrueCount()); + + // merge again + if (format == DwrfFormat::kDwrf) { + auto proto = reinterpret_cast(protoPtr); + proto->clear_numberofvalues(); + } else { + auto proto = reinterpret_cast(protoPtr); + proto->clear_numberofvalues(); + } + + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + } + + void testBasic() { + StatisticsBuilder builder{options()}; + EXPECT_EQ(0, builder.getNumberOfValues()); + EXPECT_EQ(0, builder.getRawSize()); + EXPECT_FALSE(builder.hasNull().value()); + + builder.increaseValueCount(5); + builder.increaseRawSize(10); + builder.setHasNull(); + + // stats should be merged + StatisticsBuilder target{options()}; + target.merge(*builder.build()); + auto stats = target.build(); + EXPECT_EQ(5, stats->getNumberOfValues()); + EXPECT_EQ(10, stats->getRawSize()); + EXPECT_TRUE(stats->hasNull().value()); + + // stats should be merged again + target.merge(*builder.build()); + stats = target.build(); + EXPECT_EQ(10, stats->getNumberOfValues()); + EXPECT_EQ(20, stats->getRawSize()); + EXPECT_TRUE(stats->hasNull().value()); + + // add value + target.increaseValueCount(1); + target.increaseRawSize(2); + stats = target.build(); + EXPECT_EQ(11, stats->getNumberOfValues()); + EXPECT_EQ(22, stats->getRawSize()); + EXPECT_TRUE(stats->hasNull().value()); + + // reset + builder.reset(); + EXPECT_EQ(0, builder.getNumberOfValues()); + 
EXPECT_EQ(0, builder.getRawSize()); + EXPECT_FALSE(builder.hasNull().value()); + } + + void testBasicMissingStats(ColumnStatisticsWrapper columnStatisticsWrapper) { + StatisticsBuilder target{options()}; + target.increaseValueCount(5); + target.increaseRawSize(10); + auto stats = target.build(); + EXPECT_EQ(5, stats->getNumberOfValues()); + EXPECT_EQ(10, stats->getRawSize()); + EXPECT_FALSE(stats->hasNull().value()); + + // merge missing stats + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = target.build(); + EXPECT_FALSE(stats->getNumberOfValues().has_value()); + EXPECT_FALSE(stats->getRawSize().has_value()); + EXPECT_FALSE(stats->hasNull().has_value()); + + // add again + target.increaseValueCount(5); + target.increaseRawSize(10); + target.setHasNull(); + stats = target.build(); + EXPECT_FALSE(stats->getNumberOfValues().has_value()); + EXPECT_FALSE(stats->getRawSize().has_value()); + EXPECT_TRUE(stats->hasNull().value()); + } + + void testBasicHasNull( + ColumnStatisticsWrapper columnStatisticsWrapper, + DwrfFormat format) { + enum class State { kTrue = 0, kFalse, kMissing }; + auto test = [&](State to, State from, State expected) { + StatisticsBuilder target{options()}; + if (to == State::kTrue) { + target.setHasNull(); + } else if (to == State::kMissing) { + // merge against unknown + target.merge(*buildColumnStatisticsFromProto( + columnStatisticsWrapper, context())); + } + + if (format == DwrfFormat::kDwrf) { + auto columnStatistics = + google::protobuf::Arena::CreateMessage( + arena_.get()); + if (from == State::kFalse) { + columnStatistics->set_hasnull(false); + } else if (from == State::kTrue) { + columnStatistics->set_hasnull(true); + } + target.merge(*buildColumnStatisticsFromProto( + ColumnStatisticsWrapper(columnStatistics), context())); + } else { + auto columnStatistics = google::protobuf::Arena::CreateMessage< + proto::orc::ColumnStatistics>(arena_.get()); + if (from == State::kFalse) { + 
columnStatistics->set_hasnull(false); + } else if (from == State::kTrue) { + columnStatistics->set_hasnull(true); + } + target.merge(*buildColumnStatisticsFromProto( + ColumnStatisticsWrapper(columnStatistics), context())); + } + auto stats = target.build(); + if (expected == State::kFalse) { + EXPECT_FALSE(stats->hasNull().value()); + } else if (expected == State::kTrue) { + EXPECT_TRUE(stats->hasNull().value()); + } else { + EXPECT_FALSE(stats->hasNull().has_value()); + } + }; + + // true / any => true + test(State::kTrue, State::kTrue, State::kTrue); + test(State::kTrue, State::kFalse, State::kTrue); + test(State::kTrue, State::kMissing, State::kTrue); + // unknown / true => true + // unknown / unknown or false => unknown + test(State::kMissing, State::kTrue, State::kTrue); + test(State::kMissing, State::kFalse, State::kMissing); + test(State::kMissing, State::kMissing, State::kMissing); + // false / unknown => unknown + // false / false => false + // false / true => true + test(State::kFalse, State::kMissing, State::kMissing); + test(State::kFalse, State::kFalse, State::kFalse); + test(State::kFalse, State::kTrue, State::kTrue); + } + + void testBinary() { + BinaryStatisticsBuilder builder{options()}; + // empty builder should have all defaults + EXPECT_EQ(0, builder.getTotalLength()); + EXPECT_EQ(0, builder.getNumberOfValues()); + + builder.addValues(5, 2); + + // stats should be merged + BinaryStatisticsBuilder target{options()}; + target.merge(*builder.build()); + auto stats = as(target.build()); + EXPECT_EQ(2, stats->getNumberOfValues()); + EXPECT_EQ(10, stats->getTotalLength()); + + // stats should be merged again + target.merge(*builder.build()); + stats = as(target.build()); + EXPECT_EQ(4, stats->getNumberOfValues()); + EXPECT_EQ(20, stats->getTotalLength()); + + // add value + target.addValues(10); + stats = as(target.build()); + EXPECT_EQ(5, stats->getNumberOfValues()); + EXPECT_EQ(30, stats->getTotalLength()); + + // reset + builder.reset(); + 
EXPECT_EQ(0, builder.getTotalLength()); + EXPECT_EQ(0, builder.getNumberOfValues()); + } + + void testBinaryMissingStats( + ColumnStatisticsWrapper columnStatisticsWrapper, + void* protoPtr, + DwrfFormat format) { + BinaryStatisticsBuilder target{options()}; + target.addValues(5); + auto stats = as(target.build()); + EXPECT_EQ(5, stats->getTotalLength()); + + // merge missing stats + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + + // merge again + if (format == DwrfFormat::kDwrf) { + auto binProto = reinterpret_cast(protoPtr); + binProto->set_sum(100); + } else { + auto binProto = reinterpret_cast(protoPtr); + binProto->set_sum(100); + } + + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + + // add again + target.addValues(10); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + } + + void testBinaryEmptyStats( + ColumnStatisticsWrapper columnStatisticsWrapper, + void* protoPtr, + DwrfFormat format) { + BinaryStatisticsBuilder target{options()}; + target.addValues(5); + auto stats = as(target.build()); + EXPECT_EQ(5, stats->getTotalLength()); + + // merge empty stats + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(5, stats->getTotalLength()); + + // merge again + if (format == DwrfFormat::kDwrf) { + auto proto = reinterpret_cast(protoPtr); + proto->clear_numberofvalues(); + } else { + auto proto = reinterpret_cast(protoPtr); + proto->clear_numberofvalues(); + } + + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + } + + void testBinaryLengthOverflow( + ColumnStatisticsWrapper columnStatisticsWrapper, + void* protoPtr, + DwrfFormat format) { + // add value causing overflow + 
BinaryStatisticsBuilder target{options()}; + target.addValues(std::numeric_limits::max()); + auto stats = as(target.build()); + EXPECT_NE(stats, nullptr); + target.addValues(1); + EXPECT_TRUE(target.getTotalLength().has_value()); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + + // merge causing overflow + target.reset(); + target.addValues(std::numeric_limits::max()); + + if (format == DwrfFormat::kDwrf) { + auto binProto = reinterpret_cast(protoPtr); + binProto->set_sum(1); + } else { + auto binProto = reinterpret_cast(protoPtr); + binProto->set_sum(1); + } + + target.merge( + *buildColumnStatisticsFromProto(columnStatisticsWrapper, context())); + EXPECT_TRUE(target.getTotalLength().has_value()); + stats = as(target.build()); + EXPECT_EQ(stats, nullptr); + } + + void testInitialSize() { + StatisticsBuilder target{options()}; + target.increaseValueCount(1); + EXPECT_FALSE(target.getSize().has_value()); + auto stats = target.build(); + EXPECT_FALSE(stats->getSize().has_value()); + + StatisticsBuilder target2{StatisticsBuilderOptions{16, 100U}}; + target2.increaseValueCount(1); + EXPECT_EQ(target2.getSize().value(), 100); + stats = target2.build(); + EXPECT_EQ(stats->getSize().value(), 100); + target2.reset(); + EXPECT_EQ(target2.getSize().value(), 100); + stats = target2.build(); + EXPECT_EQ(stats->getSize().value(), 100); + } + + protected: + StatisticsBuilderOptions options() { + StatisticsBuilderOptions options{16}; + return options; + } + + facebook::velox::dwrf::StatsContext context() { + facebook::velox::dwrf::StatsContext context{WriterVersion_CURRENT}; + return context; + } + + template + std::unique_ptr as(std::unique_ptr&& ptr) { + auto p = ptr.release(); + if (auto cp = dynamic_cast(p)) { + return std::unique_ptr(cp); + } + delete p; + return nullptr; + } + + private: + std::unique_ptr arena_; +}; +} // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/test/ColumnWriterStatsTests.cpp 
b/velox/dwio/dwrf/test/ColumnWriterStatsTests.cpp index fd770677b35f..7a39d4fdc4c9 100644 --- a/velox/dwio/dwrf/test/ColumnWriterStatsTests.cpp +++ b/velox/dwio/dwrf/test/ColumnWriterStatsTests.cpp @@ -127,7 +127,7 @@ void verifyStats( for (auto count = 0; count < rowIndex->entry_size(); count++) { auto stridStatistics = buildColumnStatisticsFromProto( - rowIndex->entry(count).statistics(), + ColumnStatisticsWrapper(&rowIndex->entry(count).statistics()), dwrf::StatsContext(WriterVersion_CURRENT)); // TODO, take in a lambda to verify the entire statistics instead of Just // the rawSize. diff --git a/velox/dwio/dwrf/test/E2EWriterTest.cpp b/velox/dwio/dwrf/test/E2EWriterTest.cpp index 22d27f0f6d77..b526f055f332 100644 --- a/velox/dwio/dwrf/test/E2EWriterTest.cpp +++ b/velox/dwio/dwrf/test/E2EWriterTest.cpp @@ -229,10 +229,10 @@ class E2EWriterTest : public testing::Test { } } auto stats = reader->getFooter().statistics(mapTypeId); - ASSERT_TRUE(stats.has_mapstatistics()); - ASSERT_EQ(featureStreamSizes.size(), stats.mapstatistics().stats_size()); - for (size_t i = 0; i != stats.mapstatistics().stats_size(); ++i) { - const auto& entry = stats.mapstatistics().stats(i); + ASSERT_TRUE(stats.hasMapStatistics()); + ASSERT_EQ(featureStreamSizes.size(), stats.mapStatistics().stats_size()); + for (size_t i = 0; i != stats.mapStatistics().stats_size(); ++i) { + const auto& entry = stats.mapStatistics().stats(i); ASSERT_TRUE(entry.stats().has_size()); EXPECT_EQ( featureStreamSizes.at(dwrf::constructKey(entry.key())), diff --git a/velox/dwio/dwrf/test/TestColumnStatistics.cpp b/velox/dwio/dwrf/test/TestColumnStatistics.cpp deleted file mode 100644 index 99bef6ce8c26..000000000000 --- a/velox/dwio/dwrf/test/TestColumnStatistics.cpp +++ /dev/null @@ -1,1138 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include "velox/dwio/common/Statistics.h" -#include "velox/dwio/dwrf/writer/StatisticsBuilder.h" -#include "velox/type/fbhive/HiveTypeParser.h" - -using namespace facebook::velox::dwio::common; -using namespace facebook::velox::dwrf; -using facebook::velox::type::fbhive::HiveTypeParser; - -StatisticsBuilderOptions options{16}; - -facebook::velox::dwrf::StatsContext context{WriterVersion_CURRENT}; -template -std::unique_ptr as(std::unique_ptr&& ptr) { - auto p = ptr.release(); - if (auto cp = dynamic_cast(p)) { - return std::unique_ptr(cp); - } - delete p; - return nullptr; -} - -TEST(StatisticsBuilder, size) { - { - StatisticsBuilder missingSize{options}; - ASSERT_FALSE(missingSize.getSize().has_value()); - StatisticsBuilder hasSize{ - StatisticsBuilderOptions{/*stringLengthLimit=*/32, /*initialSize=*/10}}; - ASSERT_TRUE(hasSize.getSize().has_value()); - EXPECT_EQ(10, hasSize.getSize().value()); - - hasSize.merge(missingSize, /*ignoreSize=*/true); - EXPECT_FALSE(missingSize.getSize().has_value()); - ASSERT_TRUE(hasSize.getSize().has_value()); - EXPECT_EQ(10, hasSize.getSize().value()); - - // Coercing to missing/invalid size when not ignoring by default. 
- hasSize.merge(missingSize); - EXPECT_FALSE(missingSize.getSize().has_value()); - EXPECT_FALSE(hasSize.getSize().has_value()); - } - { - StatisticsBuilder from{options}; - ASSERT_FALSE(from.getSize().has_value()); - - StatisticsBuilder to{options}; - ASSERT_FALSE(to.getSize().has_value()); - to.incrementSize(64); - ASSERT_FALSE(to.getSize().has_value()); - - to.ensureSize(); - ASSERT_EQ(0, to.getSize().value()); - to.incrementSize(64); - EXPECT_EQ(64, to.getSize().value()); - - to.merge(from, /*ignoreSize=*/true); - EXPECT_FALSE(from.getSize().has_value()); - EXPECT_EQ(64, to.getSize().value()); - - to.merge(from); - EXPECT_FALSE(from.getSize().has_value()); - EXPECT_FALSE(to.getSize().has_value()); - } -} - -TEST(StatisticsBuilder, integer) { - IntegerStatisticsBuilder builder{options}; - // empty builder should have all defaults - EXPECT_EQ(std::numeric_limits::max(), builder.getMinimum()); - EXPECT_EQ(std::numeric_limits::min(), builder.getMaximum()); - EXPECT_EQ(0, builder.getSum()); - EXPECT_EQ(0, builder.getNumberOfValues()); - - builder.addValues(3); - builder.addValues(1); - builder.addValues(5); - - // stats should be merged - IntegerStatisticsBuilder target{options}; - target.merge(*builder.build()); - auto stats = as(target.build()); - EXPECT_EQ(3, stats->getNumberOfValues()); - EXPECT_EQ(1, stats->getMinimum()); - EXPECT_EQ(5, stats->getMaximum()); - EXPECT_EQ(9, stats->getSum()); - - // stats should be merged again - builder.addValues(0); - builder.addValues(6); - target.merge(*builder.build()); - stats = as(target.build()); - EXPECT_EQ(8, stats->getNumberOfValues()); - EXPECT_EQ(0, stats->getMinimum()); - EXPECT_EQ(6, stats->getMaximum()); - EXPECT_EQ(24, stats->getSum()); - - target.merge(*builder.build()); - stats = as(target.build()); - EXPECT_EQ(13, stats->getNumberOfValues()); - EXPECT_EQ(0, stats->getMinimum()); - EXPECT_EQ(6, stats->getMaximum()); - EXPECT_EQ(39, stats->getSum()); - - // add value - target.addValues(100, 2); - stats = 
as(target.build()); - EXPECT_EQ(15, stats->getNumberOfValues()); - EXPECT_EQ(0, stats->getMinimum()); - EXPECT_EQ(100, stats->getMaximum()); - EXPECT_EQ(239, stats->getSum()); - - // reset - builder.reset(); - EXPECT_EQ(std::numeric_limits::max(), builder.getMinimum()); - EXPECT_EQ(std::numeric_limits::min(), builder.getMaximum()); - EXPECT_EQ(0, builder.getSum()); - EXPECT_EQ(0, builder.getNumberOfValues()); -} - -TEST(StatisticsBuilder, integerMissingStats) { - IntegerStatisticsBuilder target{options}; - target.addValues(1, 5); - auto stats = as(target.build()); - EXPECT_EQ(5, stats->getSum()); - - // merge missing stats - proto::ColumnStatistics proto; - auto intProto = proto.mutable_intstatistics(); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); - - // merge again - intProto->set_minimum(0); - intProto->set_maximum(1); - intProto->set_sum(100); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); - - // add again - target.addValues(2); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); -} - -TEST(StatisticsBuilder, integerEmptyStats) { - IntegerStatisticsBuilder target{options}; - target.addValues(1, 5); - auto stats = as(target.build()); - EXPECT_EQ(5, stats->getSum()); - - // merge empty stats - proto::ColumnStatistics proto; - proto.set_numberofvalues(0); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(5, stats->getSum()); - - // merge again - proto.clear_numberofvalues(); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); -} - -TEST(StatisticsBuilder, integerOverflow) { - IntegerStatisticsBuilder target{options}; - auto testMinMax = [&](int64_t val1, int64_t val2, int64_t min, int64_t max) { - target.reset(); - target.addValues(val1); - auto stats = as(target.build()); - 
EXPECT_EQ(val1, stats->getMaximum()); - EXPECT_EQ(val1, stats->getMinimum()); - EXPECT_EQ(val1, stats->getSum()); - - target.addValues(val2); - stats = as(target.build()); - EXPECT_EQ(max, stats->getMaximum()); - EXPECT_EQ(min, stats->getMinimum()); - EXPECT_FALSE(stats->getSum().has_value()); - }; - - testMinMax( - std::numeric_limits::min(), - -1, - std::numeric_limits::min(), - -1); - testMinMax( - std::numeric_limits::max(), - 1, - 1, - std::numeric_limits::max()); - - // make sure we also capture overflow that happened for adding multiple items - target.reset(); - target.addValues(std::numeric_limits::max() / 10, 11); - auto stats = as(target.build()); - EXPECT_EQ(11, stats->getNumberOfValues()); - EXPECT_EQ(stats->getMaximum().value(), stats->getMinimum().value()); - EXPECT_FALSE(stats->getSum().has_value()); - - // merge overflow - auto testMergeOverflow = [&](int64_t val1, int64_t val2) { - target.reset(); - target.addValues(val1); - IntegerStatisticsBuilder builder{options}; - builder.addValues(val2); - target.merge(builder); - stats = as(target.build()); - EXPECT_FALSE(stats->getSum().has_value()); - }; - testMergeOverflow(std::numeric_limits::min(), -1); - testMergeOverflow(std::numeric_limits::max(), 1); -} - -TEST(StatisticsBuilder, doubles) { - DoubleStatisticsBuilder builder{options}; - // empty builder should have all defaults - EXPECT_EQ(std::numeric_limits::infinity(), builder.getMinimum()); - EXPECT_EQ(-std::numeric_limits::infinity(), builder.getMaximum()); - EXPECT_EQ(0, builder.getSum()); - EXPECT_EQ(0, builder.getNumberOfValues()); - - builder.addValues(3); - builder.addValues(1); - builder.addValues(5); - - // stats should be merged - DoubleStatisticsBuilder target{options}; - target.merge(*builder.build()); - auto stats = as(target.build()); - EXPECT_EQ(3, stats->getNumberOfValues()); - EXPECT_EQ(1, stats->getMinimum()); - EXPECT_EQ(5, stats->getMaximum()); - EXPECT_EQ(9, stats->getSum()); - - // stats should be merged again - 
builder.addValues(0); - builder.addValues(6); - target.merge(*builder.build()); - stats = as(target.build()); - EXPECT_EQ(8, stats->getNumberOfValues()); - EXPECT_EQ(0, stats->getMinimum()); - EXPECT_EQ(6, stats->getMaximum()); - EXPECT_EQ(24, stats->getSum()); - - target.merge(*builder.build()); - stats = as(target.build()); - EXPECT_EQ(13, stats->getNumberOfValues()); - EXPECT_EQ(0, stats->getMinimum()); - EXPECT_EQ(6, stats->getMaximum()); - EXPECT_EQ(39, stats->getSum()); - - // add value - target.addValues(100, 2); - stats = as(target.build()); - EXPECT_EQ(15, stats->getNumberOfValues()); - EXPECT_EQ(0, stats->getMinimum()); - EXPECT_EQ(100, stats->getMaximum()); - EXPECT_EQ(239, stats->getSum()); - - // reset - builder.reset(); - EXPECT_EQ(std::numeric_limits::infinity(), builder.getMinimum()); - EXPECT_EQ(-std::numeric_limits::infinity(), builder.getMaximum()); - EXPECT_EQ(0, builder.getSum()); - EXPECT_EQ(0, builder.getNumberOfValues()); -} - -TEST(StatisticsBuilder, doubleMissingStats) { - DoubleStatisticsBuilder target{options}; - target.addValues(1, 5); - auto stats = as(target.build()); - EXPECT_EQ(5, stats->getSum()); - - // merge missing stats - proto::ColumnStatistics proto; - auto doubleProto = proto.mutable_doublestatistics(); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); - - // merge again - doubleProto->set_minimum(0); - doubleProto->set_maximum(1); - doubleProto->set_sum(100); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); - - // add again - target.addValues(2); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); -} - -TEST(StatisticsBuilder, doubleEmptyStats) { - DoubleStatisticsBuilder target{options}; - target.addValues(1, 5); - auto stats = as(target.build()); - EXPECT_EQ(5, stats->getSum()); - - // merge empty stats - proto::ColumnStatistics proto; - proto.set_numberofvalues(0); - 
target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(5, stats->getSum()); - - // merge again - proto.clear_numberofvalues(); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); -} - -TEST(StatisticsBuilder, doubleNaN) { - DoubleStatisticsBuilder target{options}; - // test nan. Nan causes fallback to basic stats. - target.addValues(std::numeric_limits::quiet_NaN()); - auto stats = as(target.build()); - EXPECT_EQ(stats, nullptr); - - target.reset(); - target.addValues(std::numeric_limits::infinity()); - target.addValues(-std::numeric_limits::infinity()); - stats = as(target.build()); - EXPECT_EQ(stats->getMaximum(), std::numeric_limits::infinity()); - EXPECT_EQ(stats->getMinimum(), -std::numeric_limits::infinity()); - EXPECT_FALSE(stats->getSum().has_value()); - - target.reset(); - DoubleStatisticsBuilder builder{options}; - target.addValues(std::numeric_limits::infinity()); - builder.addValues(-std::numeric_limits::infinity()); - target.merge(*builder.build()); - stats = as(target.build()); - EXPECT_EQ(stats->getMaximum(), std::numeric_limits::infinity()); - EXPECT_EQ(stats->getMinimum(), -std::numeric_limits::infinity()); - EXPECT_FALSE(stats->getSum().has_value()); -} - -TEST(StatisticsBuilder, string) { - StringStatisticsBuilder builder{options}; - // empty builder should have all defaults - EXPECT_FALSE(builder.getMinimum().has_value()); - EXPECT_FALSE(builder.getMaximum().has_value()); - EXPECT_EQ(0, builder.getTotalLength()); - EXPECT_EQ(0, builder.getNumberOfValues()); - - builder.addValues("xx"); - builder.addValues("bb"); - builder.addValues("yy"); - - // stats should be merged - StringStatisticsBuilder target{options}; - target.merge(*builder.build()); - auto stats = as(target.build()); - EXPECT_EQ(3, stats->getNumberOfValues()); - EXPECT_EQ("bb", stats->getMinimum()); - EXPECT_EQ("yy", stats->getMaximum()); - EXPECT_EQ(6, 
stats->getTotalLength()); - - // stats should be merged again - builder.addValues("aa"); - builder.addValues("zz"); - target.merge(*builder.build()); - stats = as(target.build()); - EXPECT_EQ(8, stats->getNumberOfValues()); - EXPECT_EQ("aa", stats->getMinimum()); - EXPECT_EQ("zz", stats->getMaximum()); - EXPECT_EQ(16, stats->getTotalLength()); - - target.merge(*builder.build()); - stats = as(target.build()); - EXPECT_EQ(13, stats->getNumberOfValues()); - EXPECT_EQ("aa", stats->getMinimum()); - EXPECT_EQ("zz", stats->getMaximum()); - EXPECT_EQ(26, stats->getTotalLength()); - - // add value - target.addValues("zzz", 2); - stats = as(target.build()); - EXPECT_EQ(15, stats->getNumberOfValues()); - EXPECT_EQ("aa", stats->getMinimum()); - EXPECT_EQ("zzz", stats->getMaximum()); - EXPECT_EQ(32, stats->getTotalLength()); - - // reset - builder.reset(); - EXPECT_FALSE(builder.getMinimum().has_value()); - EXPECT_FALSE(builder.getMaximum().has_value()); - EXPECT_EQ(0, builder.getTotalLength()); - EXPECT_EQ(0, builder.getNumberOfValues()); -} - -TEST(StatisticsBuilder, stringMissingStats) { - StringStatisticsBuilder target{options}; - target.addValues("zz", 5); - auto stats = as(target.build()); - EXPECT_EQ(10, stats->getTotalLength()); - - // merge missing stats - proto::ColumnStatistics proto; - auto strProto = proto.mutable_stringstatistics(); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); - - // merge again - strProto->set_minimum("aa"); - strProto->set_maximum("bb"); - strProto->set_sum(100); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); - - // add again - target.addValues("aa"); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); -} - -TEST(StatisticsBuilder, stringEmptyStats) { - StringStatisticsBuilder target{options}; - target.addValues("zz", 5); - auto stats = as(target.build()); - EXPECT_EQ(10, 
stats->getTotalLength()); - - // merge empty stats - proto::ColumnStatistics proto; - proto.set_numberofvalues(0); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(10, stats->getTotalLength()); - - // merge again - proto.clear_numberofvalues(); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); -} - -TEST(StatisticsBuilder, stringLengthThreshold) { - StringStatisticsBuilder target{StatisticsBuilderOptions{2}}; - target.addValues("yyy"); - auto stats = as(target.build()); - EXPECT_FALSE(stats->getMinimum().has_value()); - EXPECT_FALSE(stats->getMaximum().has_value()); - - // merge empty stats - target.addValues("aa"); - target.addValues("zz"); - stats = as(target.build()); - EXPECT_EQ(stats->getMinimum(), "aa"); - EXPECT_EQ(stats->getMaximum(), "zz"); -} - -TEST(StatisticsBuilder, stringLengthOverflow) { - // add value causing overflow - StringStatisticsBuilder target{options}; - proto::ColumnStatistics proto; - proto.set_numberofvalues(1); - auto strProto = proto.mutable_stringstatistics(); - strProto->set_sum(std::numeric_limits::max()); - strProto->set_minimum("foo"); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - EXPECT_TRUE(target.getTotalLength().has_value()); - auto stats = as(target.build()); - EXPECT_TRUE(stats->getTotalLength().has_value()); - - target.addValues("foo"); - EXPECT_TRUE(target.getTotalLength().has_value()); - stats = as(target.build()); - EXPECT_FALSE(stats->getTotalLength().has_value()); - - // merge causing overflow - target.reset(); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - EXPECT_TRUE(target.getTotalLength().has_value()); - stats = as(target.build()); - EXPECT_FALSE(stats->getTotalLength().has_value()); -} - -TEST(StatisticsBuilder, boolean) { - BooleanStatisticsBuilder builder{options}; - // empty 
builder should have all defaults - EXPECT_EQ(0, builder.getTrueCount()); - EXPECT_EQ(0, builder.getNumberOfValues()); - - builder.addValues(true, 2); - - // stats should be merged - BooleanStatisticsBuilder target{options}; - target.merge(*builder.build()); - auto stats = as(target.build()); - EXPECT_EQ(2, stats->getNumberOfValues()); - EXPECT_EQ(2, stats->getTrueCount()); - - // stats should be merged again - target.merge(*builder.build()); - stats = as(target.build()); - EXPECT_EQ(4, stats->getNumberOfValues()); - EXPECT_EQ(4, stats->getTrueCount()); - - // add value - target.addValues(false, 2); - stats = as(target.build()); - EXPECT_EQ(6, stats->getNumberOfValues()); - EXPECT_EQ(4, stats->getTrueCount()); - - // reset - builder.reset(); - EXPECT_EQ(0, builder.getTrueCount()); - EXPECT_EQ(0, builder.getNumberOfValues()); -} - -TEST(StatisticsBuilder, booleanMissingStats) { - BooleanStatisticsBuilder target{options}; - target.addValues(true, 5); - auto stats = as(target.build()); - EXPECT_EQ(5, stats->getTrueCount()); - - // merge missing stats - proto::ColumnStatistics proto; - auto boolProto = proto.mutable_bucketstatistics(); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); - - // merge again - boolProto->add_count(1); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); - - // add again - target.addValues(true); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); -} - -TEST(StatisticsBuilder, booleanEmptyStats) { - BooleanStatisticsBuilder target{options}; - target.addValues(true, 5); - auto stats = as(target.build()); - EXPECT_EQ(5, stats->getTrueCount()); - - // merge empty stats - proto::ColumnStatistics proto; - proto.set_numberofvalues(0); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(5, stats->getTrueCount()); - - // merge again - 
proto.clear_numberofvalues(); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); -} - -TEST(StatisticsBuilder, basic) { - StatisticsBuilder builder{options}; - EXPECT_EQ(0, builder.getNumberOfValues()); - EXPECT_EQ(0, builder.getRawSize()); - EXPECT_FALSE(builder.hasNull().value()); - - builder.increaseValueCount(5); - builder.increaseRawSize(10); - builder.setHasNull(); - - // stats should be merged - StatisticsBuilder target{options}; - target.merge(*builder.build()); - auto stats = target.build(); - EXPECT_EQ(5, stats->getNumberOfValues()); - EXPECT_EQ(10, stats->getRawSize()); - EXPECT_TRUE(stats->hasNull().value()); - - // stats should be merged again - target.merge(*builder.build()); - stats = target.build(); - EXPECT_EQ(10, stats->getNumberOfValues()); - EXPECT_EQ(20, stats->getRawSize()); - EXPECT_TRUE(stats->hasNull().value()); - - // add value - target.increaseValueCount(1); - target.increaseRawSize(2); - stats = target.build(); - EXPECT_EQ(11, stats->getNumberOfValues()); - EXPECT_EQ(22, stats->getRawSize()); - EXPECT_TRUE(stats->hasNull().value()); - - // reset - builder.reset(); - EXPECT_EQ(0, builder.getNumberOfValues()); - EXPECT_EQ(0, builder.getRawSize()); - EXPECT_FALSE(builder.hasNull().value()); -} - -TEST(StatisticsBuilder, basicMissingStats) { - StatisticsBuilder target{options}; - target.increaseValueCount(5); - target.increaseRawSize(10); - auto stats = target.build(); - EXPECT_EQ(5, stats->getNumberOfValues()); - EXPECT_EQ(10, stats->getRawSize()); - EXPECT_FALSE(stats->hasNull().value()); - - // merge missing stats - proto::ColumnStatistics proto; - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = target.build(); - EXPECT_FALSE(stats->getNumberOfValues().has_value()); - EXPECT_FALSE(stats->getRawSize().has_value()); - EXPECT_FALSE(stats->hasNull().has_value()); - - // add again - target.increaseValueCount(5); - target.increaseRawSize(10); - 
target.setHasNull(); - stats = target.build(); - EXPECT_FALSE(stats->getNumberOfValues().has_value()); - EXPECT_FALSE(stats->getRawSize().has_value()); - EXPECT_TRUE(stats->hasNull().value()); -} - -TEST(StatisticsBuilder, basicHasNull) { - enum class State { kTrue = 0, kFalse, kMissing }; - auto test = [](State to, State from, State expected) { - StatisticsBuilder target{options}; - if (to == State::kTrue) { - target.setHasNull(); - } else if (to == State::kMissing) { - // merge against unknown - proto::ColumnStatistics proto; - target.merge(*buildColumnStatisticsFromProto(proto, context)); - } - - proto::ColumnStatistics proto; - if (from == State::kFalse) { - proto.set_hasnull(false); - } else if (from == State::kTrue) { - proto.set_hasnull(true); - } - - target.merge(*buildColumnStatisticsFromProto(proto, context)); - auto stats = target.build(); - if (expected == State::kFalse) { - EXPECT_FALSE(stats->hasNull().value()); - } else if (expected == State::kTrue) { - EXPECT_TRUE(stats->hasNull().value()); - } else { - EXPECT_FALSE(stats->hasNull().has_value()); - } - }; - - // true / any => true - test(State::kTrue, State::kTrue, State::kTrue); - test(State::kTrue, State::kFalse, State::kTrue); - test(State::kTrue, State::kMissing, State::kTrue); - // unknown / true => true - // unknown / unknown or false => unknown - test(State::kMissing, State::kTrue, State::kTrue); - test(State::kMissing, State::kFalse, State::kMissing); - test(State::kMissing, State::kMissing, State::kMissing); - // false / unknown => unknown - // false / false => false - // false / true => true - test(State::kFalse, State::kMissing, State::kMissing); - test(State::kFalse, State::kFalse, State::kFalse); - test(State::kFalse, State::kTrue, State::kTrue); -} - -TEST(StatisticsBuilder, binary) { - BinaryStatisticsBuilder builder{options}; - // empty builder should have all defaults - EXPECT_EQ(0, builder.getTotalLength()); - EXPECT_EQ(0, builder.getNumberOfValues()); - - builder.addValues(5, 2); 
- - // stats should be merged - BinaryStatisticsBuilder target{options}; - target.merge(*builder.build()); - auto stats = as(target.build()); - EXPECT_EQ(2, stats->getNumberOfValues()); - EXPECT_EQ(10, stats->getTotalLength()); - - // stats should be merged again - target.merge(*builder.build()); - stats = as(target.build()); - EXPECT_EQ(4, stats->getNumberOfValues()); - EXPECT_EQ(20, stats->getTotalLength()); - - // add value - target.addValues(10); - stats = as(target.build()); - EXPECT_EQ(5, stats->getNumberOfValues()); - EXPECT_EQ(30, stats->getTotalLength()); - - // reset - builder.reset(); - EXPECT_EQ(0, builder.getTotalLength()); - EXPECT_EQ(0, builder.getNumberOfValues()); -} - -TEST(StatisticsBuilder, binaryMissingStats) { - BinaryStatisticsBuilder target{options}; - target.addValues(5); - auto stats = as(target.build()); - EXPECT_EQ(5, stats->getTotalLength()); - - // merge missing stats - proto::ColumnStatistics proto; - auto binProto = proto.mutable_binarystatistics(); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); - - // merge again - binProto->set_sum(100); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); - - // add again - target.addValues(10); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); -} - -TEST(StatisticsBuilder, binaryEmptyStats) { - BinaryStatisticsBuilder target{options}; - target.addValues(5); - auto stats = as(target.build()); - EXPECT_EQ(5, stats->getTotalLength()); - - // merge empty stats - proto::ColumnStatistics proto; - proto.set_numberofvalues(0); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(5, stats->getTotalLength()); - - // merge again - proto.clear_numberofvalues(); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); -} - 
-TEST(StatisticsBuilder, binaryLengthOverflow) { - // add value causing overflow - BinaryStatisticsBuilder target{options}; - target.addValues(std::numeric_limits::max()); - auto stats = as(target.build()); - EXPECT_NE(stats, nullptr); - target.addValues(1); - EXPECT_TRUE(target.getTotalLength().has_value()); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); - - // merge causing overflow - target.reset(); - target.addValues(std::numeric_limits::max()); - proto::ColumnStatistics proto; - auto binProto = proto.mutable_binarystatistics(); - binProto->set_sum(1); - target.merge(*buildColumnStatisticsFromProto(proto, context)); - EXPECT_TRUE(target.getTotalLength().has_value()); - stats = as(target.build()); - EXPECT_EQ(stats, nullptr); -} - -TEST(StatisticsBuilder, initialSize) { - StatisticsBuilder target{options}; - target.increaseValueCount(1); - EXPECT_FALSE(target.getSize().has_value()); - auto stats = target.build(); - EXPECT_FALSE(stats->getSize().has_value()); - - StatisticsBuilder target2{StatisticsBuilderOptions{16, 100U}}; - target2.increaseValueCount(1); - EXPECT_EQ(target2.getSize().value(), 100); - stats = target2.build(); - EXPECT_EQ(stats->getSize().value(), 100); - target2.reset(); - EXPECT_EQ(target2.getSize().value(), 100); - stats = target2.build(); - EXPECT_EQ(stats->getSize().value(), 100); -} - -proto::KeyInfo createKeyInfo(int64_t key) { - proto::KeyInfo keyInfo; - keyInfo.set_intkey(key); - return keyInfo; -} - -inline bool operator==( - const ColumnStatistics& lhs, - const ColumnStatistics& rhs) { - return (lhs.hasNull() == rhs.hasNull()) && - (lhs.getNumberOfValues() == rhs.getNumberOfValues()) && - (lhs.getRawSize() == rhs.getRawSize()); -} - -void checkEntries( - const std::vector& entries, - const std::vector& expectedEntries) { - EXPECT_EQ(expectedEntries.size(), entries.size()); - for (const auto& entry : entries) { - EXPECT_NE( - std::find_if( - expectedEntries.begin(), - expectedEntries.end(), - [&](const ColumnStatistics& 
expectedStats) { - return expectedStats == entry; - }), - expectedEntries.end()); - } -} - -struct TestKeyStats { - TestKeyStats(int64_t key, bool hasNull, uint64_t valueCount, uint64_t rawSize) - : key{key}, hasNull{hasNull}, valueCount{valueCount}, rawSize{rawSize} {} - - int64_t key; - bool hasNull; - uint64_t valueCount; - uint64_t rawSize; -}; - -struct MapStatsAddValueTestCase { - explicit MapStatsAddValueTestCase( - const std::vector& input, - const std::vector& expected) - : input{input}, expected{expected} {} - - std::vector input; - std::vector expected; -}; - -class MapStatisticsBuilderAddValueTest - : public ::testing::Test, - public ::testing::WithParamInterface {}; - -TEST_P(MapStatisticsBuilderAddValueTest, addValues) { - auto type = HiveTypeParser{}.parse("map"); - MapStatisticsBuilder mapStatsBuilder{*type, options}; - - for (const auto& entry : GetParam().input) { - StatisticsBuilder statsBuilder{options}; - if (entry.hasNull) { - statsBuilder.setHasNull(); - } - statsBuilder.increaseValueCount(entry.valueCount); - statsBuilder.increaseRawSize(entry.rawSize); - mapStatsBuilder.addValues(createKeyInfo(entry.key), statsBuilder); - } - - const auto& expectedTestEntries = GetParam().expected; - std::vector expectedEntryStats{}; - expectedEntryStats.reserve(expectedTestEntries.size()); - for (const auto& entry : expectedTestEntries) { - StatisticsBuilder statsBuilder{options}; - if (entry.hasNull) { - statsBuilder.setHasNull(); - } - statsBuilder.increaseValueCount(entry.valueCount); - statsBuilder.increaseRawSize(entry.rawSize); - expectedEntryStats.push_back(statsBuilder); - } - - std::vector entryStats; - const auto& outputEntries = mapStatsBuilder.getEntryStatistics(); - entryStats.reserve(outputEntries.size()); - for (const auto& entry : outputEntries) { - entryStats.push_back(*entry.second); - } - - checkEntries(entryStats, expectedEntryStats); -} - -INSTANTIATE_TEST_SUITE_P( - MapStatisticsBuilderAddValueTestSuite, - 
MapStatisticsBuilderAddValueTest, - testing::Values( - MapStatsAddValueTestCase{{}, {}}, - MapStatsAddValueTestCase{ - {TestKeyStats{1, false, 1, 21}}, - {TestKeyStats{1, false, 1, 21}}}, - MapStatsAddValueTestCase{ - {TestKeyStats{1, false, 1, 21}, TestKeyStats{1, true, 3, 42}}, - {TestKeyStats{1, true, 4, 63}}}, - MapStatsAddValueTestCase{ - {TestKeyStats{1, false, 1, 21}, TestKeyStats{2, true, 3, 42}}, - {TestKeyStats{1, false, 1, 21}, TestKeyStats{2, true, 3, 42}}}, - MapStatsAddValueTestCase{ - {TestKeyStats{1, false, 1, 21}, - TestKeyStats{2, false, 3, 42}, - TestKeyStats{2, false, 3, 42}, - TestKeyStats{1, true, 1, 42}}, - {TestKeyStats{1, true, 2, 63}, TestKeyStats{2, false, 6, 84}}})); - -struct MapStatsMergeTestCase { - std::vector> inputs; - std::vector expected; -}; - -class MapStatisticsBuilderMergeTest - : public ::testing::Test, - public ::testing::WithParamInterface {}; - -TEST_P(MapStatisticsBuilderMergeTest, addValues) { - auto type = HiveTypeParser{}.parse("map"); - - const auto& inputTestEntries = GetParam().inputs; - std::vector> mapStatsBuilders; - mapStatsBuilders.reserve(inputTestEntries.size()); - for (const auto& input : inputTestEntries) { - std::unique_ptr mapStatsBuilder = - std::make_unique(*type, options); - for (const auto& entry : input) { - StatisticsBuilder statsBuilder{options}; - if (entry.hasNull) { - statsBuilder.setHasNull(); - } - statsBuilder.increaseValueCount(entry.valueCount); - statsBuilder.increaseRawSize(entry.rawSize); - mapStatsBuilder->addValues(createKeyInfo(entry.key), statsBuilder); - } - mapStatsBuilders.push_back(std::move(mapStatsBuilder)); - } - - MapStatisticsBuilder aggregateMapStatsBuilder{*type, options}; - for (const auto& mapStatsBuilder : mapStatsBuilders) { - aggregateMapStatsBuilder.merge(*mapStatsBuilder); - } - - const auto& expectedTestEntries = GetParam().expected; - std::vector expectedEntryStats{}; - expectedEntryStats.reserve(expectedTestEntries.size()); - for (const auto& entry : 
expectedTestEntries) { - StatisticsBuilder statsBuilder{options}; - if (entry.hasNull) { - statsBuilder.setHasNull(); - } - statsBuilder.increaseValueCount(entry.valueCount); - statsBuilder.increaseRawSize(entry.rawSize); - expectedEntryStats.push_back(statsBuilder); - } - - std::vector entryStats; - const auto& aggregatedEntries = aggregateMapStatsBuilder.getEntryStatistics(); - entryStats.reserve(aggregatedEntries.size()); - for (const auto& entry : aggregatedEntries) { - entryStats.push_back(*entry.second); - } - - checkEntries(entryStats, expectedEntryStats); -} - -INSTANTIATE_TEST_SUITE_P( - MapStatisticsBuilderMergeTestSuite, - MapStatisticsBuilderMergeTest, - testing::Values( - MapStatsMergeTestCase{{}, {}}, - MapStatsMergeTestCase{ - {{TestKeyStats{1, false, 1, 21}}}, - {TestKeyStats{1, false, 1, 21}}}, - MapStatsMergeTestCase{ - {{}, {TestKeyStats{1, false, 1, 21}}}, - {TestKeyStats{1, false, 1, 21}}}, - MapStatsMergeTestCase{ - {{TestKeyStats{1, false, 1, 21}}, {TestKeyStats{1, true, 3, 42}}}, - {TestKeyStats{1, true, 4, 63}}}, - MapStatsMergeTestCase{ - {{TestKeyStats{1, false, 1, 21}}, {TestKeyStats{2, true, 3, 42}}}, - {TestKeyStats{1, false, 1, 21}, TestKeyStats{2, true, 3, 42}}}, - MapStatsMergeTestCase{ - {{TestKeyStats{1, false, 1, 21}, TestKeyStats{2, false, 3, 42}}, - {TestKeyStats{2, false, 3, 42}}, - {TestKeyStats{1, true, 1, 42}}}, - {TestKeyStats{1, true, 2, 63}, TestKeyStats{2, false, 6, 84}}})); - -TEST(MapStatisticsBuilderTest, innerStatsType) { - { - auto type = HiveTypeParser{}.parse("map"); - MapStatisticsBuilder mapStatsBuilder{*type, options}; - - DoubleStatisticsBuilder statsBuilder{options}; - statsBuilder.addValues(0.1); - statsBuilder.addValues(1.0); - mapStatsBuilder.addValues(createKeyInfo(1), statsBuilder); - - auto& doubleStats = dynamic_cast( - *mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})); - - EXPECT_EQ(0.1, doubleStats.getMinimum()); - EXPECT_EQ(1.0, doubleStats.getMaximum()); - } - { - auto type = 
HiveTypeParser{}.parse("map"); - MapStatisticsBuilder mapStatsBuilder{*type, options}; - - IntegerStatisticsBuilder statsBuilder{options}; - statsBuilder.addValues(1); - statsBuilder.addValues(2); - mapStatsBuilder.addValues(createKeyInfo(1), statsBuilder); - - auto& intStats = dynamic_cast( - *mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})); - - EXPECT_EQ(1, intStats.getMinimum()); - EXPECT_EQ(2, intStats.getMaximum()); - EXPECT_EQ(3, intStats.getSum()); - } -} - -TEST(MapStatisticsBuilderTest, incrementSize) { - auto type = HiveTypeParser{}.parse("map"); - MapStatisticsBuilder mapStatsBuilder{*type, options}; - - DoubleStatisticsBuilder statsBuilder1{options}; - statsBuilder1.addValues(0.1); - statsBuilder1.addValues(1.0); - ASSERT_FALSE(statsBuilder1.getSize().has_value()); - mapStatsBuilder.addValues(createKeyInfo(1), statsBuilder1); - EXPECT_FALSE(mapStatsBuilder.getEntryStatistics() - .at(KeyInfo{1}) - ->getSize() - .has_value()); - - DoubleStatisticsBuilder statsBuilder2{options}; - statsBuilder2.addValues(0.3); - statsBuilder2.addValues(3.0); - ASSERT_FALSE(statsBuilder2.getSize().has_value()); - mapStatsBuilder.addValues(createKeyInfo(2), statsBuilder2); - EXPECT_FALSE(mapStatsBuilder.getEntryStatistics() - .at(KeyInfo{2}) - ->getSize() - .has_value()); - - mapStatsBuilder.incrementSize(createKeyInfo(1), 4); - ASSERT_TRUE(mapStatsBuilder.getEntryStatistics() - .at(KeyInfo{1}) - ->getSize() - .has_value()); - EXPECT_EQ( - 4, - mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})->getSize().value()); - ASSERT_FALSE(mapStatsBuilder.getEntryStatistics() - .at(KeyInfo{2}) - ->getSize() - .has_value()); - mapStatsBuilder.incrementSize(createKeyInfo(2), 8); - ASSERT_TRUE(mapStatsBuilder.getEntryStatistics() - .at(KeyInfo{1}) - ->getSize() - .has_value()); - EXPECT_EQ( - 4, - mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})->getSize().value()); - ASSERT_TRUE(mapStatsBuilder.getEntryStatistics() - .at(KeyInfo{2}) - ->getSize() - .has_value()); - EXPECT_EQ( - 
8, - mapStatsBuilder.getEntryStatistics().at(KeyInfo{2})->getSize().value()); - - mapStatsBuilder.incrementSize(createKeyInfo(1), 8); - ASSERT_TRUE(mapStatsBuilder.getEntryStatistics() - .at(KeyInfo{1}) - ->getSize() - .has_value()); - EXPECT_EQ( - 12, - mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})->getSize().value()); - ASSERT_TRUE(mapStatsBuilder.getEntryStatistics() - .at(KeyInfo{2}) - ->getSize() - .has_value()); - EXPECT_EQ( - 8, - mapStatsBuilder.getEntryStatistics().at(KeyInfo{2})->getSize().value()); -} - -TEST(MapStatisticsBuilderTest, mergeKeyStats) { - auto type = HiveTypeParser{}.parse("map"); - MapStatisticsBuilder mapStatsBuilder{*type, options}; - - mapStatsBuilder.incrementSize(createKeyInfo(1), 42); - auto& keyStats = dynamic_cast( - *mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})); - ASSERT_EQ(0, keyStats.getNumberOfValues()); - ASSERT_TRUE(keyStats.getRawSize().has_value()); - ASSERT_EQ(0, keyStats.getRawSize().value()); - ASSERT_TRUE(keyStats.getSize().has_value()); - ASSERT_EQ(42, keyStats.getSize().value()); - - IntegerStatisticsBuilder statsBuilder{options}; - statsBuilder.addValues(1); - statsBuilder.addValues(2); - statsBuilder.increaseRawSize(8); - mapStatsBuilder.addValues(createKeyInfo(1), statsBuilder); - - keyStats = dynamic_cast( - *mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})); - ASSERT_EQ(2, keyStats.getNumberOfValues()); - ASSERT_TRUE(keyStats.getRawSize().has_value()); - ASSERT_EQ(8, keyStats.getRawSize().value()); - EXPECT_TRUE(keyStats.getSize().has_value()); - EXPECT_EQ(42, keyStats.getSize().value()); -} diff --git a/velox/dwio/dwrf/test/TestDwrfColumnStatistics.cpp b/velox/dwio/dwrf/test/TestDwrfColumnStatistics.cpp new file mode 100644 index 000000000000..cf13c6df826b --- /dev/null +++ b/velox/dwio/dwrf/test/TestDwrfColumnStatistics.cpp @@ -0,0 +1,500 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/dwio/dwrf/test/ColumnStatisticsBase.h" +#include "velox/type/fbhive/HiveTypeParser.h" + +using namespace facebook::velox::dwio::common; +using namespace facebook::velox::dwrf; +using facebook::velox::type::fbhive::HiveTypeParser; + +class ColumnStatisticsTest : public ::testing::Test, + public ColumnStatisticsBase {}; + +TEST_F(ColumnStatisticsTest, size) { + testSize(); +} + +TEST_F(ColumnStatisticsTest, integer) { + testInteger(); +} + +TEST_F(ColumnStatisticsTest, integerMissingStats) { + proto::ColumnStatistics proto; + auto intProto = proto.mutable_intstatistics(); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testIntegerMissingStats(columnStatisticsWrapper, intProto, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, integerEmptyStats) { + proto::ColumnStatistics proto; + proto.set_numberofvalues(0); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testIntegerEmptyStats( + columnStatisticsWrapper, (void*)&proto, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, integerOverflow) { + testIntegerOverflow(); +} + +TEST_F(ColumnStatisticsTest, doubles) { + testDoubles(); +} + +TEST_F(ColumnStatisticsTest, doubleMissingStats) { + proto::ColumnStatistics proto; + auto doubleProto = proto.mutable_doublestatistics(); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testDoubleMissingStats( + columnStatisticsWrapper, 
doubleProto, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, doubleEmptyStats) { + proto::ColumnStatistics proto; + proto.set_numberofvalues(0); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testDoubleEmptyStats( + columnStatisticsWrapper, (void*)&proto, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, doubleNaN) { + testDoubleNaN(); +} + +TEST_F(ColumnStatisticsTest, string) { + testString(); +} + +TEST_F(ColumnStatisticsTest, stringMissingStats) { + proto::ColumnStatistics proto; + auto strProto = proto.mutable_stringstatistics(); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testStringMissingStats(columnStatisticsWrapper, strProto, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, stringEmptyStats) { + proto::ColumnStatistics proto; + proto.set_numberofvalues(0); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testStringEmptyStats( + columnStatisticsWrapper, (void*)&proto, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, stringLengthThreshold) { + testStringLengthThreshold(); +} + +TEST_F(ColumnStatisticsTest, stringLengthOverflow) { + proto::ColumnStatistics proto; + proto.set_numberofvalues(1); + auto strProto = proto.mutable_stringstatistics(); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testStringLengthOverflow( + columnStatisticsWrapper, strProto, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, boolean) { + testBoolean(); +} + +TEST_F(ColumnStatisticsTest, booleanMissingStats) { + proto::ColumnStatistics proto; + auto boolProto = proto.mutable_bucketstatistics(); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBooleanMissingStats( + columnStatisticsWrapper, boolProto, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, booleanEmptyStats) { + proto::ColumnStatistics proto; + proto.set_numberofvalues(0); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBooleanEmptyStats( + 
columnStatisticsWrapper, (void*)&proto, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, basic) { + testBasic(); +} + +TEST_F(ColumnStatisticsTest, basicMissingStats) { + proto::ColumnStatistics proto; + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBasicMissingStats(columnStatisticsWrapper); +} + +TEST_F(ColumnStatisticsTest, basicHasNull) { + proto::ColumnStatistics proto; + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBasicHasNull(columnStatisticsWrapper, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, binary) { + testBinary(); +} + +TEST_F(ColumnStatisticsTest, binaryMissingStats) { + proto::ColumnStatistics proto; + auto binProto = proto.mutable_binarystatistics(); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBinaryMissingStats(columnStatisticsWrapper, binProto, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, binaryEmptyStats) { + proto::ColumnStatistics proto; + proto.set_numberofvalues(0); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBinaryEmptyStats( + columnStatisticsWrapper, (void*)&proto, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, binaryLengthOverflow) { + proto::ColumnStatistics proto; + auto binProto = proto.mutable_binarystatistics(); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBinaryLengthOverflow( + columnStatisticsWrapper, binProto, DwrfFormat::kDwrf); +} + +TEST_F(ColumnStatisticsTest, initialSize) { + testInitialSize(); +} + +proto::KeyInfo createKeyInfo(int64_t key) { + proto::KeyInfo keyInfo; + keyInfo.set_intkey(key); + return keyInfo; +} + +inline bool operator==( + const ColumnStatistics& lhs, + const ColumnStatistics& rhs) { + return (lhs.hasNull() == rhs.hasNull()) && + (lhs.getNumberOfValues() == rhs.getNumberOfValues()) && + (lhs.getRawSize() == rhs.getRawSize()); +} + +void checkEntries( + const std::vector& entries, + const std::vector& expectedEntries) { + 
EXPECT_EQ(expectedEntries.size(), entries.size()); + for (const auto& entry : entries) { + EXPECT_NE( + std::find_if( + expectedEntries.begin(), + expectedEntries.end(), + [&](const ColumnStatistics& expectedStats) { + return expectedStats == entry; + }), + expectedEntries.end()); + } +} + +struct TestKeyStats { + TestKeyStats(int64_t key, bool hasNull, uint64_t valueCount, uint64_t rawSize) + : key{key}, hasNull{hasNull}, valueCount{valueCount}, rawSize{rawSize} {} + + int64_t key; + bool hasNull; + uint64_t valueCount; + uint64_t rawSize; +}; + +struct MapStatsAddValueTestCase { + explicit MapStatsAddValueTestCase( + const std::vector& input, + const std::vector& expected) + : input{input}, expected{expected} {} + + std::vector input; + std::vector expected; +}; + +class MapStatisticsBuilderAddValueTest + : public ::testing::Test, + public ::testing::WithParamInterface {}; + +StatisticsBuilderOptions options{16}; +TEST_P(MapStatisticsBuilderAddValueTest, addValues) { + auto type = HiveTypeParser{}.parse("map"); + MapStatisticsBuilder mapStatsBuilder{*type, options}; + + for (const auto& entry : GetParam().input) { + StatisticsBuilder statsBuilder{options}; + if (entry.hasNull) { + statsBuilder.setHasNull(); + } + statsBuilder.increaseValueCount(entry.valueCount); + statsBuilder.increaseRawSize(entry.rawSize); + mapStatsBuilder.addValues(createKeyInfo(entry.key), statsBuilder); + } + + const auto& expectedTestEntries = GetParam().expected; + std::vector expectedEntryStats{}; + expectedEntryStats.reserve(expectedTestEntries.size()); + for (const auto& entry : expectedTestEntries) { + StatisticsBuilder statsBuilder{options}; + if (entry.hasNull) { + statsBuilder.setHasNull(); + } + statsBuilder.increaseValueCount(entry.valueCount); + statsBuilder.increaseRawSize(entry.rawSize); + expectedEntryStats.push_back(statsBuilder); + } + + std::vector entryStats; + const auto& outputEntries = mapStatsBuilder.getEntryStatistics(); + entryStats.reserve(outputEntries.size()); + 
for (const auto& entry : outputEntries) { + entryStats.push_back(*entry.second); + } + + checkEntries(entryStats, expectedEntryStats); +} + +INSTANTIATE_TEST_SUITE_P( + MapStatisticsBuilderAddValueTestSuite, + MapStatisticsBuilderAddValueTest, + testing::Values( + MapStatsAddValueTestCase{{}, {}}, + MapStatsAddValueTestCase{ + {TestKeyStats{1, false, 1, 21}}, + {TestKeyStats{1, false, 1, 21}}}, + MapStatsAddValueTestCase{ + {TestKeyStats{1, false, 1, 21}, TestKeyStats{1, true, 3, 42}}, + {TestKeyStats{1, true, 4, 63}}}, + MapStatsAddValueTestCase{ + {TestKeyStats{1, false, 1, 21}, TestKeyStats{2, true, 3, 42}}, + {TestKeyStats{1, false, 1, 21}, TestKeyStats{2, true, 3, 42}}}, + MapStatsAddValueTestCase{ + {TestKeyStats{1, false, 1, 21}, + TestKeyStats{2, false, 3, 42}, + TestKeyStats{2, false, 3, 42}, + TestKeyStats{1, true, 1, 42}}, + {TestKeyStats{1, true, 2, 63}, TestKeyStats{2, false, 6, 84}}})); + +struct MapStatsMergeTestCase { + std::vector> inputs; + std::vector expected; +}; + +class MapStatisticsBuilderMergeTest + : public ::testing::Test, + public ::testing::WithParamInterface {}; + +TEST_P(MapStatisticsBuilderMergeTest, addValues) { + auto type = HiveTypeParser{}.parse("map"); + + const auto& inputTestEntries = GetParam().inputs; + std::vector> mapStatsBuilders; + mapStatsBuilders.reserve(inputTestEntries.size()); + for (const auto& input : inputTestEntries) { + std::unique_ptr mapStatsBuilder = + std::make_unique(*type, options); + for (const auto& entry : input) { + StatisticsBuilder statsBuilder{options}; + if (entry.hasNull) { + statsBuilder.setHasNull(); + } + statsBuilder.increaseValueCount(entry.valueCount); + statsBuilder.increaseRawSize(entry.rawSize); + mapStatsBuilder->addValues(createKeyInfo(entry.key), statsBuilder); + } + mapStatsBuilders.push_back(std::move(mapStatsBuilder)); + } + + MapStatisticsBuilder aggregateMapStatsBuilder{*type, options}; + for (const auto& mapStatsBuilder : mapStatsBuilders) { + 
aggregateMapStatsBuilder.merge(*mapStatsBuilder); + } + + const auto& expectedTestEntries = GetParam().expected; + std::vector expectedEntryStats{}; + expectedEntryStats.reserve(expectedTestEntries.size()); + for (const auto& entry : expectedTestEntries) { + StatisticsBuilder statsBuilder{options}; + if (entry.hasNull) { + statsBuilder.setHasNull(); + } + statsBuilder.increaseValueCount(entry.valueCount); + statsBuilder.increaseRawSize(entry.rawSize); + expectedEntryStats.push_back(statsBuilder); + } + + std::vector entryStats; + const auto& aggregatedEntries = aggregateMapStatsBuilder.getEntryStatistics(); + entryStats.reserve(aggregatedEntries.size()); + for (const auto& entry : aggregatedEntries) { + entryStats.push_back(*entry.second); + } + + checkEntries(entryStats, expectedEntryStats); +} + +INSTANTIATE_TEST_SUITE_P( + MapStatisticsBuilderMergeTestSuite, + MapStatisticsBuilderMergeTest, + testing::Values( + MapStatsMergeTestCase{{}, {}}, + MapStatsMergeTestCase{ + {{TestKeyStats{1, false, 1, 21}}}, + {TestKeyStats{1, false, 1, 21}}}, + MapStatsMergeTestCase{ + {{}, {TestKeyStats{1, false, 1, 21}}}, + {TestKeyStats{1, false, 1, 21}}}, + MapStatsMergeTestCase{ + {{TestKeyStats{1, false, 1, 21}}, {TestKeyStats{1, true, 3, 42}}}, + {TestKeyStats{1, true, 4, 63}}}, + MapStatsMergeTestCase{ + {{TestKeyStats{1, false, 1, 21}}, {TestKeyStats{2, true, 3, 42}}}, + {TestKeyStats{1, false, 1, 21}, TestKeyStats{2, true, 3, 42}}}, + MapStatsMergeTestCase{ + {{TestKeyStats{1, false, 1, 21}, TestKeyStats{2, false, 3, 42}}, + {TestKeyStats{2, false, 3, 42}}, + {TestKeyStats{1, true, 1, 42}}}, + {TestKeyStats{1, true, 2, 63}, TestKeyStats{2, false, 6, 84}}})); + +TEST(MapStatisticsBuilderTest, innerStatsType) { + { + auto type = HiveTypeParser{}.parse("map"); + MapStatisticsBuilder mapStatsBuilder{*type, options}; + + DoubleStatisticsBuilder statsBuilder{options}; + statsBuilder.addValues(0.1); + statsBuilder.addValues(1.0); + mapStatsBuilder.addValues(createKeyInfo(1), 
statsBuilder); + + auto& doubleStats = dynamic_cast( + *mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})); + + EXPECT_EQ(0.1, doubleStats.getMinimum()); + EXPECT_EQ(1.0, doubleStats.getMaximum()); + } + /* Integer entry case: after adding the values 1 and 2, the entry stored under key 1 must report minimum 1, maximum 2 and sum 3. */{ + auto type = HiveTypeParser{}.parse("map"); + MapStatisticsBuilder mapStatsBuilder{*type, options}; + + IntegerStatisticsBuilder statsBuilder{options}; + statsBuilder.addValues(1); + statsBuilder.addValues(2); + mapStatsBuilder.addValues(createKeyInfo(1), statsBuilder); + + auto& intStats = dynamic_cast( + *mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})); + + EXPECT_EQ(1, intStats.getMinimum()); + EXPECT_EQ(2, intStats.getMaximum()); + EXPECT_EQ(3, intStats.getSum()); + } +} + +/* Checks that addValues() with builders that carry no size leaves each key's size unset, and that incrementSize() then accumulates per key without affecting the other key (key 1: 4, then 12; key 2: unset, then 8). */TEST(MapStatisticsBuilderTest, incrementSize) { + auto type = HiveTypeParser{}.parse("map"); + MapStatisticsBuilder mapStatsBuilder{*type, options}; + + DoubleStatisticsBuilder statsBuilder1{options}; + statsBuilder1.addValues(0.1); + statsBuilder1.addValues(1.0); + ASSERT_FALSE(statsBuilder1.getSize().has_value()); + mapStatsBuilder.addValues(createKeyInfo(1), statsBuilder1); + EXPECT_FALSE(mapStatsBuilder.getEntryStatistics() + .at(KeyInfo{1}) + ->getSize() + .has_value()); + + DoubleStatisticsBuilder statsBuilder2{options}; + statsBuilder2.addValues(0.3); + statsBuilder2.addValues(3.0); + ASSERT_FALSE(statsBuilder2.getSize().has_value()); + mapStatsBuilder.addValues(createKeyInfo(2), statsBuilder2); + EXPECT_FALSE(mapStatsBuilder.getEntryStatistics() + .at(KeyInfo{2}) + ->getSize() + .has_value()); + + /* First increment on key 1 only: key 1 gains a size of 4 while key 2 must still have none. */mapStatsBuilder.incrementSize(createKeyInfo(1), 4); + ASSERT_TRUE(mapStatsBuilder.getEntryStatistics() + .at(KeyInfo{1}) + ->getSize() + .has_value()); + EXPECT_EQ( + 4, + mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})->getSize().value()); + ASSERT_FALSE(mapStatsBuilder.getEntryStatistics() + .at(KeyInfo{2}) + ->getSize() + .has_value()); + mapStatsBuilder.incrementSize(createKeyInfo(2), 8); + ASSERT_TRUE(mapStatsBuilder.getEntryStatistics() + .at(KeyInfo{1}) + ->getSize() + 
.has_value()); + EXPECT_EQ( + 4, + mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})->getSize().value()); + ASSERT_TRUE(mapStatsBuilder.getEntryStatistics() + .at(KeyInfo{2}) + ->getSize() + .has_value()); + EXPECT_EQ( + 8, + mapStatsBuilder.getEntryStatistics().at(KeyInfo{2})->getSize().value()); + + /* Second increment on key 1: sizes add up (4 + 8 = 12) and key 2 stays at 8. */mapStatsBuilder.incrementSize(createKeyInfo(1), 8); + ASSERT_TRUE(mapStatsBuilder.getEntryStatistics() + .at(KeyInfo{1}) + ->getSize() + .has_value()); + EXPECT_EQ( + 12, + mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})->getSize().value()); + ASSERT_TRUE(mapStatsBuilder.getEntryStatistics() + .at(KeyInfo{2}) + ->getSize() + .has_value()); + EXPECT_EQ( + 8, + mapStatsBuilder.getEntryStatistics().at(KeyInfo{2})->getSize().value()); +} + +/* Checks that an entry created by incrementSize() alone (0 values, raw size 0, size 42) picks up the value count (2) and raw size (8) of a builder merged in through addValues() while keeping its size of 42. */TEST(MapStatisticsBuilderTest, mergeKeyStats) { + auto type = HiveTypeParser{}.parse("map"); + MapStatisticsBuilder mapStatsBuilder{*type, options}; + + mapStatsBuilder.incrementSize(createKeyInfo(1), 42); + auto& keyStats = dynamic_cast( + *mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})); + ASSERT_EQ(0, keyStats.getNumberOfValues()); + ASSERT_TRUE(keyStats.getRawSize().has_value()); + ASSERT_EQ(0, keyStats.getRawSize().value()); + ASSERT_TRUE(keyStats.getSize().has_value()); + ASSERT_EQ(42, keyStats.getSize().value()); + + IntegerStatisticsBuilder statsBuilder{options}; + statsBuilder.addValues(1); + statsBuilder.addValues(2); + statsBuilder.increaseRawSize(8); + mapStatsBuilder.addValues(createKeyInfo(1), statsBuilder); + + /* NOTE(review): keyStats was declared as auto&, so this statement assigns through the existing reference rather than rebinding it to the looked-up entry; presumably a fresh lookup was intended - confirm, and consider a second reference variable. */keyStats = dynamic_cast( + *mapStatsBuilder.getEntryStatistics().at(KeyInfo{1})); + ASSERT_EQ(2, keyStats.getNumberOfValues()); + ASSERT_TRUE(keyStats.getRawSize().has_value()); + ASSERT_EQ(8, keyStats.getRawSize().value()); + EXPECT_TRUE(keyStats.getSize().has_value()); + EXPECT_EQ(42, keyStats.getSize().value()); +} diff --git a/velox/dwio/dwrf/test/TestOrcColumnStatistics.cpp b/velox/dwio/dwrf/test/TestOrcColumnStatistics.cpp new file mode 100644 index 000000000000..db11bf2439aa --- /dev/null +++ 
b/velox/dwio/dwrf/test/TestOrcColumnStatistics.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/dwio/dwrf/test/ColumnStatisticsBase.h" + +using namespace facebook::velox::dwrf; + +/* Test fixture that combines gtest's Test with ColumnStatisticsBase. Every TEST_F below forwards to a test* helper (presumably inherited from that base - confirm); the cases that build a proto here use proto::orc::ColumnStatistics wrapped in a ColumnStatisticsWrapper. */class ColumnStatisticsTest : public ::testing::Test, + public ColumnStatisticsBase {}; + +TEST_F(ColumnStatisticsTest, size) { + testSize(); +} + +TEST_F(ColumnStatisticsTest, integer) { + testInteger(); +} + +/* Wraps a default-constructed ORC proto, then creates its integer sub-message through mutable_intstatistics() and hands wrapper, sub-message and DwrfFormat::kOrc to the shared helper. The other *MissingStats cases follow the same shape with their own sub-message. */TEST_F(ColumnStatisticsTest, integerMissingStats) { + proto::orc::ColumnStatistics proto; + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + auto intProto = proto.mutable_intstatistics(); + testIntegerMissingStats(columnStatisticsWrapper, intProto, DwrfFormat::kOrc); +} + +/* Sets numberOfValues to 0 and passes the proto itself as an untyped void* together with DwrfFormat::kOrc; presumably the helper casts it back according to that format tag - TODO confirm in ColumnStatisticsBase. The other *EmptyStats cases use the same pattern. */TEST_F(ColumnStatisticsTest, integerEmptyStats) { + proto::orc::ColumnStatistics proto; + proto.set_numberofvalues(0); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testIntegerEmptyStats( + columnStatisticsWrapper, (void*)&proto, DwrfFormat::kOrc); +} + +TEST_F(ColumnStatisticsTest, integerOverflow) { + testIntegerOverflow(); +} + +TEST_F(ColumnStatisticsTest, doubles) { + testDoubles(); +} + +TEST_F(ColumnStatisticsTest, doubleMissingStats) { + proto::orc::ColumnStatistics proto; + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + auto doubleProto = proto.mutable_doublestatistics(); + testDoubleMissingStats( + columnStatisticsWrapper, doubleProto, 
DwrfFormat::kOrc); +} + +TEST_F(ColumnStatisticsTest, doubleEmptyStats) { + proto::orc::ColumnStatistics proto; + proto.set_numberofvalues(0); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testDoubleEmptyStats( + columnStatisticsWrapper, (void*)&proto, DwrfFormat::kOrc); +} + +TEST_F(ColumnStatisticsTest, doubleNaN) { + testDoubleNaN(); +} + +TEST_F(ColumnStatisticsTest, string) { + testString(); +} + +TEST_F(ColumnStatisticsTest, stringMissingStats) { + proto::orc::ColumnStatistics proto; + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + auto strProto = proto.mutable_stringstatistics(); + testStringMissingStats(columnStatisticsWrapper, strProto, DwrfFormat::kOrc); +} + +TEST_F(ColumnStatisticsTest, stringEmptyStats) { + proto::orc::ColumnStatistics proto; + proto.set_numberofvalues(0); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testStringEmptyStats( + columnStatisticsWrapper, (void*)&proto, DwrfFormat::kOrc); +} + +TEST_F(ColumnStatisticsTest, stringLengthThreshold) { + testStringLengthThreshold(); +} + +/* Sets numberOfValues to 1 and creates the string sub-message before wrapping the proto, then delegates to the shared helper with DwrfFormat::kOrc. */TEST_F(ColumnStatisticsTest, stringLengthOverflow) { + proto::orc::ColumnStatistics proto; + proto.set_numberofvalues(1); + auto strProto = proto.mutable_stringstatistics(); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testStringLengthOverflow(columnStatisticsWrapper, strProto, DwrfFormat::kOrc); +} + +TEST_F(ColumnStatisticsTest, boolean) { + testBoolean(); +} + +TEST_F(ColumnStatisticsTest, booleanMissingStats) { + proto::orc::ColumnStatistics proto; + auto boolProto = proto.mutable_bucketstatistics(); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBooleanMissingStats(columnStatisticsWrapper, boolProto, DwrfFormat::kOrc); +} + +TEST_F(ColumnStatisticsTest, booleanEmptyStats) { + proto::orc::ColumnStatistics proto; + proto.set_numberofvalues(0); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBooleanEmptyStats( + 
columnStatisticsWrapper, (void*)&proto, DwrfFormat::kOrc); +} + +TEST_F(ColumnStatisticsTest, basic) { + testBasic(); +} + +TEST_F(ColumnStatisticsTest, basicMissingStats) { + proto::orc::ColumnStatistics proto; + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBasicMissingStats(columnStatisticsWrapper); +} + +TEST_F(ColumnStatisticsTest, basicHasNull) { + proto::orc::ColumnStatistics proto; + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBasicHasNull(columnStatisticsWrapper, DwrfFormat::kOrc); +} + +TEST_F(ColumnStatisticsTest, binary) { + testBinary(); +} + +TEST_F(ColumnStatisticsTest, binaryMissingStats) { + proto::orc::ColumnStatistics proto; + auto binProto = proto.mutable_binarystatistics(); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBinaryMissingStats(columnStatisticsWrapper, binProto, DwrfFormat::kOrc); +} + +TEST_F(ColumnStatisticsTest, binaryEmptyStats) { + proto::orc::ColumnStatistics proto; + proto.set_numberofvalues(0); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBinaryEmptyStats( + columnStatisticsWrapper, (void*)&proto, DwrfFormat::kOrc); +} + +TEST_F(ColumnStatisticsTest, binaryLengthOverflow) { + proto::orc::ColumnStatistics proto; + auto binProto = proto.mutable_binarystatistics(); + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + testBinaryLengthOverflow(columnStatisticsWrapper, binProto, DwrfFormat::kOrc); +} + +TEST_F(ColumnStatisticsTest, initialSize) { + testInitialSize(); +} + +/* For a wrapper over an ORC proto, hasMapStatistics() must be false and calling mapStatistics() must throw VeloxRuntimeError. */TEST(MapStatistics, orcUnsupportedMapStatistics) { + proto::orc::ColumnStatistics proto; + auto columnStatisticsWrapper = ColumnStatisticsWrapper(&proto); + ASSERT_FALSE(columnStatisticsWrapper.hasMapStatistics()); + ASSERT_THROW( + columnStatisticsWrapper.mapStatistics(), + ::facebook::velox::VeloxRuntimeError); +} diff --git a/velox/dwio/dwrf/writer/StatisticsBuilder.cpp b/velox/dwio/dwrf/writer/StatisticsBuilder.cpp index 
bf9e690f59de..3940e29f17b1 100644 --- a/velox/dwio/dwrf/writer/StatisticsBuilder.cpp +++ b/velox/dwio/dwrf/writer/StatisticsBuilder.cpp @@ -115,7 +115,8 @@ std::unique_ptr StatisticsBuilder::build() proto::ColumnStatistics stats; toProto(stats); StatsContext context{WriterVersion_CURRENT}; - return buildColumnStatisticsFromProto(stats, context); + return buildColumnStatisticsFromProto( + ColumnStatisticsWrapper(&stats), context); } std::unique_ptr StatisticsBuilder::create(