Reduce execution time of Parquet C++ tests #14750

Merged · 10 commits · Jan 17, 2024
5 changes: 2 additions & 3 deletions cpp/tests/io/parquet_misc_test.cpp

@@ -138,9 +138,8 @@ TEST_P(ParquetSizedTest, DictionaryTest)
unsigned int const cardinality = (1 << (GetParam() - 1)) + 1;
unsigned int const nrows = std::max(cardinality * 3 / 2, 3'000'000U);

- auto elements = cudf::detail::make_counting_transform_iterator(0, [cardinality](auto i) {
-   return "a unique string value suffixed with " + std::to_string(i % cardinality);
- });
+ auto const elements = cudf::detail::make_counting_transform_iterator(
+   0, [cardinality](auto i) { return std::to_string(i % cardinality); });
auto const col0 = cudf::test::strings_column_wrapper(elements, elements + nrows);
auto const expected = table_view{{col0}};

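A note on the helper this hunk touches: as used in these tests, `cudf::detail::make_counting_transform_iterator(start, f)` behaves like a transform over a counting sequence, generating each value lazily. A standalone C++20 sketch of the same idea (an illustration of the pattern, not cudf's implementation) shows why shortening the generated strings cuts runtime without changing what the dictionary test exercises:

```cpp
#include <iostream>
#include <ranges>
#include <string>

int main()
{
  // With a cardinality of 5, only five distinct strings ever appear --
  // exactly what a dictionary-encoding test needs -- while each value stays
  // a few bytes instead of carrying the long
  // "a unique string value suffixed with " prefix the old test used.
  auto elements = std::views::iota(0) |
                  std::views::transform([](int i) { return std::to_string(i % 5); });
  for (int i : {0, 4, 5, 11}) { std::cout << elements[i] << ' '; }  // prints: 0 4 0 1
  std::cout << '\n';
  return 0;
}
```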
40 changes: 22 additions & 18 deletions cpp/tests/io/parquet_reader_test.cpp

@@ -241,7 +241,7 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes)

TEST_F(ParquetReaderTest, UserBoundsWithNullsLarge)
{
- constexpr int num_rows = 30 * 1000000;
+ constexpr int num_rows = 30 * 10000;

std::mt19937 gen(6747);
std::bernoulli_distribution bn(0.7f);
@@ -251,21 +251,23 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsLarge)

cudf::test::fixed_width_column_wrapper<int> col(values, values + num_rows, valids);

- // this file will have row groups of 1,000,000 each
+ // this file will have row groups of 10,000 each
cudf::table_view tbl({col});
auto filepath = temp_env->get_temp_filepath("UserBoundsWithNullsLarge.parquet");
cudf::io::parquet_writer_options out_args =
- cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl);
+ cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl)
+     .row_group_size_rows(10000)
+     .max_page_size_rows(1000);
cudf::io::write_parquet(out_args);

// skip_rows / num_rows
// clang-format off
- std::vector<std::pair<int, int>> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {1613470, -1}, {1999999, -1},
+ std::vector<std::pair<int, int>> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {16130, -1}, {19999, -1},
{31, 1}, {32, 1}, {33, 1},
// deliberately span some row group boundaries
- {999000, 1001}, {999000, 2000}, {2999999, 2}, {13999997, -1},
- {16785678, 3}, {22996176, 31},
- {24001231, 17}, {29000001, 989999}, {29999999, 1} };
+ {9900, 1001}, {9900, 2000}, {29999, 2}, {139997, -1},
+ {167878, 3}, {229976, 31},
+ {240031, 17}, {290001, 9899}, {299999, 1} };
// clang-format on
for (auto p : params) {
cudf::io::parquet_reader_options read_args =
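(The loop body is truncated by the diff view here.) The pattern this hunk applies is worth spelling out: instead of relying on the default 1,000,000-row row groups, the test now pins row-group and page sizes so the ~30x smaller file keeps the same shape — many row groups, several pages each — which is what the boundary-spanning skip_rows/num_rows pairs above depend on. A hypothetical standalone helper using the same builder calls as the diff:

```cpp
#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

#include <string>

// Hypothetical helper (a sketch, not code from the PR): write `tbl` so that a
// 300'000-row column still spans 30 row groups of 10 pages each, preserving
// the row-group- and page-boundary crossings the test parameters target.
void write_small_but_structured(cudf::table_view const& tbl, std::string const& filepath)
{
  auto const out_args =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl)
      .row_group_size_rows(10000)  // replaces the 1'000'000-row default
      .max_page_size_rows(1000)    // forces multiple pages per row group
      .build();
  cudf::io::write_parquet(out_args);
}
```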
@@ -285,25 +287,27 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsLarge)

TEST_F(ParquetReaderTest, ListUserBoundsWithNullsLarge)
{
- constexpr int num_rows = 5 * 1000000;
+ constexpr int num_rows = 5 * 10000;
auto colp = make_parquet_list_list_col<int>(0, num_rows, 5, 8, true);
cudf::column_view col = *colp;

- // this file will have row groups of 1,000,000 each
+ // this file will have row groups of 10,000 each
cudf::table_view tbl({col});
auto filepath = temp_env->get_temp_filepath("ListUserBoundsWithNullsLarge.parquet");
cudf::io::parquet_writer_options out_args =
- cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl);
+ cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl)
+     .row_group_size_rows(10000)
+     .max_page_size_rows(1000);
cudf::io::write_parquet(out_args);

// skip_rows / num_rows
// clang-format off
- std::vector<std::pair<int, int>> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {161470, -1}, {4499997, -1},
+ std::vector<std::pair<int, int>> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {1670, -1}, {44997, -1},
{31, 1}, {32, 1}, {33, 1},
// deliberately span some row group boundaries
- {999000, 1001}, {999000, 2000}, {2999999, 2},
- {1678567, 3}, {4299676, 31},
- {4001231, 17}, {1900000, 989999}, {4999999, 1} };
+ {9900, 1001}, {9900, 2000}, {29999, 2},
+ {16567, 3}, {42976, 31},
+ {40231, 17}, {19000, 9899}, {49999, 1} };
// clang-format on
for (auto p : params) {
cudf::io::parquet_reader_options read_args =
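(The loop body is truncated by the diff view here too.) The visible lines suggest each {skip_rows, num_rows} pair is fed to the reader options, with -1 meaning the option is left at its default; a sketch of that pattern, under that assumption (the helper name is hypothetical):

```cpp
#include <cudf/io/parquet.hpp>

#include <string>
#include <utility>

// Sketch: read back a slice of the file for one {skip_rows, num_rows} pair.
cudf::io::table_with_metadata read_slice(std::string const& filepath, std::pair<int, int> p)
{
  auto builder = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
  if (p.first >= 0) { builder.skip_rows(p.first); }   // skip into the file
  if (p.second >= 0) { builder.num_rows(p.second); }  // truncate the read
  return cudf::io::read_parquet(builder.build());
}
```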
@@ -1951,7 +1955,7 @@ TEST_F(ParquetReaderTest, RepeatedNoAnnotations)

TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls)
{
- constexpr int num_rows = 50'000;
+ constexpr int num_rows = 10'000;
constexpr auto seed = 21337;

std::mt19937 engine{seed};
@@ -2003,7 +2007,7 @@ TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls)
.stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN)
.compression(cudf::io::compression_type::NONE)
.dictionary_policy(cudf::io::dictionary_policy::NEVER)
-     .max_page_size_rows(20'000)
+     .max_page_size_rows(5'000)
.write_v2_headers(true)
.build();
cudf::io::write_parquet(out_opts);
@@ -2018,7 +2022,7 @@ TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls)
// skip and truncate
{1, 32}, {1, 33}, {32, 32}, {33, 139},
// cross page boundaries
-   {10'000, 20'000}
+   {3'000, 5'000}
};

// clang-format on
@@ -2044,7 +2048,7 @@ TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls)
.stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN)
.compression(cudf::io::compression_type::NONE)
.dictionary_policy(cudf::io::dictionary_policy::NEVER)
-     .max_page_size_rows(20'000)
+     .max_page_size_rows(5'000)
.write_v2_headers(true);
cudf::io::write_parquet(out_opts2);
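One detail worth surfacing for DeltaSkipRowsWithNulls: the writer options combine `write_v2_headers(true)` with `dictionary_policy::NEVER`, which is what steers the writer toward DELTA-encoded V2 pages (an inference from the test's name and options, not stated in the diff). That configuration, condensed into a sketch with a hypothetical helper name:

```cpp
#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

#include <string>

// Sketch: writer configuration that makes DELTA-encoded pages likely, with
// page sizes scaled down to match the smaller 10'000-row test input.
void write_delta_encoded(cudf::table_view const& tbl, std::string const& filepath)
{
  auto const out_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl)
      .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN)
      .compression(cudf::io::compression_type::NONE)
      .dictionary_policy(cudf::io::dictionary_policy::NEVER)  // rule out dictionary pages
      .max_page_size_rows(5'000)  // two pages per 10'000-row column
      .write_v2_headers(true)     // the setting this test pairs with DELTA encodings
      .build();
  cudf::io::write_parquet(out_opts);
}
```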

94 changes: 21 additions & 73 deletions cpp/tests/io/parquet_v2_test.cpp

@@ -23,6 +23,8 @@

#include <cudf/io/parquet.hpp>

+ using cudf::test::iterators::no_nulls;
+
// Base test fixture for V2 header tests
class ParquetV2Test : public ::cudf::test::BaseFixtureWithParam<bool> {};

@@ -33,7 +35,7 @@ INSTANTIATE_TEST_SUITE_P(ParquetV2ReadWriteTest,

TEST_P(ParquetV2Test, MultiColumn)
{
- constexpr auto num_rows = 100000;
+ constexpr auto num_rows = 50000;
auto const is_v2 = GetParam();

// auto col0_data = random_values<bool>(num_rows);
@@ -45,27 +47,25 @@ TEST_P(ParquetV2Test, MultiColumn)
auto col6_vals = random_values<int16_t>(num_rows);
auto col7_vals = random_values<int32_t>(num_rows);
auto col8_vals = random_values<int64_t>(num_rows);
- auto col6_data = cudf::detail::make_counting_transform_iterator(0, [col6_vals](auto i) {
+ auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) {
return numeric::decimal32{col6_vals[i], numeric::scale_type{5}};
});
- auto col7_data = cudf::detail::make_counting_transform_iterator(0, [col7_vals](auto i) {
+ auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) {
return numeric::decimal64{col7_vals[i], numeric::scale_type{-5}};
});
- auto col8_data = cudf::detail::make_counting_transform_iterator(0, [col8_vals](auto i) {
+ auto col8_data = cudf::detail::make_counting_transform_iterator(0, [&col8_vals](auto i) {
return numeric::decimal128{col8_vals[i], numeric::scale_type{-6}};
});
- auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });

- // column_wrapper<bool> col0{
- // col0_data.begin(), col0_data.end(), validity};
- column_wrapper<int8_t> col1{col1_data.begin(), col1_data.end(), validity};
- column_wrapper<int16_t> col2{col2_data.begin(), col2_data.end(), validity};
- column_wrapper<int32_t> col3{col3_data.begin(), col3_data.end(), validity};
- column_wrapper<float> col4{col4_data.begin(), col4_data.end(), validity};
- column_wrapper<double> col5{col5_data.begin(), col5_data.end(), validity};
- column_wrapper<numeric::decimal32> col6{col6_data, col6_data + num_rows, validity};
- column_wrapper<numeric::decimal64> col7{col7_data, col7_data + num_rows, validity};
- column_wrapper<numeric::decimal128> col8{col8_data, col8_data + num_rows, validity};
+ // column_wrapper<bool> col0{col0_data.begin(), col0_data.end(), no_nulls()};
+ column_wrapper<int8_t> col1{col1_data.begin(), col1_data.end(), no_nulls()};
+ column_wrapper<int16_t> col2{col2_data.begin(), col2_data.end(), no_nulls()};
+ column_wrapper<int32_t> col3{col3_data.begin(), col3_data.end(), no_nulls()};
+ column_wrapper<float> col4{col4_data.begin(), col4_data.end(), no_nulls()};
+ column_wrapper<double> col5{col5_data.begin(), col5_data.end(), no_nulls()};
+ column_wrapper<numeric::decimal32> col6{col6_data, col6_data + num_rows, no_nulls()};
+ column_wrapper<numeric::decimal64> col7{col7_data, col7_data + num_rows, no_nulls()};
+ column_wrapper<numeric::decimal128> col8{col8_data, col8_data + num_rows, no_nulls()};

auto expected = table_view{{col1, col2, col3, col4, col5, col6, col7, col8}};
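Two small changes in this hunk beyond the row count: the lambdas now capture the value vectors by reference, avoiding a copy of each vector into its closure, and the hand-rolled always-true validity iterator is replaced by `cudf::test::iterators::no_nulls()`. A sketch of the equivalence being relied on (assuming, as the diff does, that `no_nulls()` yields `true` at every position; hypothetical function, compiled the same way the tests are):

```cpp
#include <cudf/detail/iterator.cuh>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/iterator_utilities.hpp>

#include <vector>

void equivalent_columns()
{
  std::vector<int> data{1, 2, 3};

  // Before: an explicit counting-transform iterator that always returns true.
  auto validity =
    cudf::detail::make_counting_transform_iterator(0, [](auto) { return true; });
  cudf::test::fixed_width_column_wrapper<int> a{data.begin(), data.end(), validity};

  // After: the test-utility spelling of the same all-valid null mask.
  cudf::test::fixed_width_column_wrapper<int> b{
    data.begin(), data.end(), cudf::test::iterators::no_nulls()};
}
```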

@@ -108,17 +108,17 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls)
auto col5_data = random_values<double>(num_rows);
auto col6_vals = random_values<int32_t>(num_rows);
auto col7_vals = random_values<int64_t>(num_rows);
- auto col6_data = cudf::detail::make_counting_transform_iterator(0, [col6_vals](auto i) {
+ auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) {
return numeric::decimal32{col6_vals[i], numeric::scale_type{-2}};
});
- auto col7_data = cudf::detail::make_counting_transform_iterator(0, [col7_vals](auto i) {
+ auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) {
return numeric::decimal64{col7_vals[i], numeric::scale_type{-8}};
});
// auto col0_mask = cudf::detail::make_counting_transform_iterator(
// 0, [](auto i) { return (i % 2); });
auto col1_mask =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 10); });
- auto col2_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
+ auto col2_mask = no_nulls();
auto col3_mask =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); });
auto col4_mask =
@@ -181,11 +181,10 @@ TEST_P(ParquetV2Test, Strings)

auto seq_col0 = random_values<int>(num_rows);
auto seq_col2 = random_values<float>(num_rows);
- auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });

- column_wrapper<int> col0{seq_col0.begin(), seq_col0.end(), validity};
+ column_wrapper<int> col0{seq_col0.begin(), seq_col0.end(), no_nulls()};
column_wrapper<cudf::string_view> col1{strings.begin(), strings.end()};
- column_wrapper<float> col2{seq_col2.begin(), seq_col2.end(), validity};
+ column_wrapper<float> col2{seq_col2.begin(), seq_col2.end(), no_nulls()};

auto expected = table_view{{col0, col1, col2}};

@@ -688,60 +687,9 @@ TEST_P(ParquetV2Test, PartitionedWriteEmptyColumns)
CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view());
}

Review thread on the test removed below:

Contributor: Can you help me understand why this test has been removed?

Author: That's a great question, I should have left a comment before reviews. This test was included only to test the case where the writer writes the data in two batches (not the same as chunks!). Batching has since been disabled, so we don't need this (huge) test.

Contributor: Understood, thanks!

- TEST_P(ParquetV2Test, LargeColumnIndex)
- {
-   // create a file large enough to be written in 2 batches (currently 1GB per batch)
-   // pick fragment size that num_rows is divisible by, so we'll get equal sized row groups
-   const std::string s1(1000, 'a');
-   const std::string s2(1000, 'b');
-   constexpr auto num_rows  = 512 * 1024;
-   constexpr auto frag_size = num_rows / 128;
-   auto const is_v2 = GetParam();
-
-   auto col0_elements = cudf::detail::make_counting_transform_iterator(
-     0, [&](auto i) { return (i < num_rows) ? s1 : s2; });
-   auto col0 = cudf::test::strings_column_wrapper(col0_elements, col0_elements + 2 * num_rows);
-
-   auto const expected = table_view{{col0, col0}};
-
-   auto const filepath = temp_env->get_temp_filepath("LargeColumnIndex.parquet");
-   const cudf::io::parquet_writer_options out_opts =
-     cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
-       .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN)
-       .compression(cudf::io::compression_type::NONE)
-       .dictionary_policy(cudf::io::dictionary_policy::NEVER)
-       .write_v2_headers(is_v2)
-       .max_page_fragment_size(frag_size)
-       .row_group_size_bytes(1024 * 1024 * 1024)
-       .row_group_size_rows(num_rows);
-   cudf::io::write_parquet(out_opts);
-
-   auto const source = cudf::io::datasource::create(filepath);
-   cudf::io::parquet::detail::FileMetaData fmd;
-
-   read_footer(source, &fmd);
-
-   for (auto const& rg : fmd.row_groups) {
-     for (size_t c = 0; c < rg.columns.size(); c++) {
-       auto const& chunk = rg.columns[c];
-
-       auto const ci    = read_column_index(source, chunk);
-       auto const stats = get_statistics(chunk);
-
-       // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max
-       auto const ptype = fmd.schema[c + 1].type;
-       auto const ctype = fmd.schema[c + 1].converted_type;
-       ASSERT_TRUE(stats.min_value.has_value());
-       ASSERT_TRUE(stats.max_value.has_value());
-       EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0);
-       EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0);
-     }
-   }
- }

TEST_P(ParquetV2Test, CheckColumnOffsetIndex)
{
- constexpr auto num_rows = 100000;
+ constexpr auto num_rows = 50000;
auto const is_v2 = GetParam();
auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2
: cudf::io::parquet::detail::PageType::DATA_PAGE;