diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index 471cd060dc3..1e36b2b2797 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -60,17 +60,7 @@ struct sort_groupby_helper { */ sort_groupby_helper(table_view const& keys, null_policy include_null_keys = null_policy::EXCLUDE, - sorted keys_pre_sorted = sorted::NO) - : _keys(keys), - _num_keys(-1), - _keys_pre_sorted(keys_pre_sorted), - _include_null_keys(include_null_keys) - { - if (keys_pre_sorted == sorted::YES and include_null_keys == null_policy::EXCLUDE and - has_nulls(keys)) { - _keys_pre_sorted = sorted::NO; - } - }; + sorted keys_pre_sorted = sorted::NO); ~sort_groupby_helper() = default; sort_groupby_helper(sort_groupby_helper const&) = delete; @@ -227,6 +217,8 @@ struct sort_groupby_helper { column_ptr _unsorted_keys_labels; ///< Group labels for unsorted _keys column_ptr _keys_bitmask_column; ///< Column representing rows with one or more nulls values table_view _keys; ///< Input keys to sort by + table_view _unflattened_keys; ///< Input keys, unflattened and possibly nested + std::vector _struct_null_vectors; ///< Null vectors for struct columns in _keys index_vector_ptr _group_offsets; ///< Indices into sorted _keys indicating starting index of each groups diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index a26d69e3d46..533f193d692 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -62,6 +63,8 @@ std::pair, std::vector> groupby::disp rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + using namespace cudf::structs::detail; + // If sort groupby has been called once on this groupby object, then // always use sort groupby from now on. Because once keys are sorted, // all the aggs that can be done by hash groupby are efficiently done by @@ -70,7 +73,13 @@ std::pair, std::vector> groupby::disp // satisfied with a hash implementation if (_keys_are_sorted == sorted::NO and not _helper and detail::hash::can_use_hash_groupby(_keys, requests)) { - return detail::hash::groupby(_keys, requests, _include_null_keys, stream, mr); + // Optionally flatten nested key columns. + auto [flattened_keys, _, __, ___] = + flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE); + auto [grouped_keys, results] = + detail::hash::groupby(flattened_keys, requests, _include_null_keys, stream, mr); + return std::make_pair(unflatten_nested_columns(std::move(grouped_keys), _keys), + std::move(results)); } else { return sort_aggregate(requests, stream, mr); } diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 5e944f75712..69d68f7b6bc 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -88,6 +89,31 @@ namespace cudf { namespace groupby { namespace detail { namespace sort { + +sort_groupby_helper::sort_groupby_helper(table_view const& keys, + null_policy include_null_keys, + sorted keys_pre_sorted) + : _unflattened_keys(keys), + _num_keys(-1), + _keys_pre_sorted(keys_pre_sorted), + _include_null_keys(include_null_keys) +{ + using namespace cudf::structs::detail; + + auto [flattened_keys, _, __, struct_null_vectors] = + flatten_nested_columns(keys, {}, {}, column_nullability::FORCE); + _struct_null_vectors = std::move(struct_null_vectors); + _keys = flattened_keys; + + // Cannot depend on caller's sorting if the column contains nulls, + // and null values are to be excluded. + // Re-sort the data, to filter out nulls more easily. + if (keys_pre_sorted == sorted::YES and include_null_keys == null_policy::EXCLUDE and + has_nulls(keys)) { + _keys_pre_sorted = sorted::NO; + } +}; + size_type sort_groupby_helper::num_keys(rmm::cuda_stream_view stream) { if (_num_keys > -1) return _num_keys; @@ -309,7 +335,7 @@ std::unique_ptr sort_groupby_helper::unique_keys(rmm::cuda_stream_view st auto gather_map_it = thrust::make_transform_iterator( group_offsets(stream).begin(), [idx_data] __device__(size_type i) { return idx_data[i]; }); - return cudf::detail::gather(_keys, + return cudf::detail::gather(_unflattened_keys, gather_map_it, gather_map_it + num_groups(stream), out_of_bounds_policy::DONT_CHECK, @@ -320,7 +346,7 @@ std::unique_ptr
sort_groupby_helper::unique_keys(rmm::cuda_stream_view st std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return cudf::detail::gather(_keys, + return cudf::detail::gather(_unflattened_keys, key_sort_order(stream), cudf::out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index edfbba74eb1..d9553d463ab 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -80,6 +80,7 @@ ConfigureTest(GROUPBY_TEST groupby/replace_nulls_tests.cpp groupby/shift_tests.cpp groupby/std_tests.cpp + groupby/structs_tests.cpp groupby/sum_of_squares_tests.cpp groupby/sum_scan_tests.cpp groupby/sum_tests.cpp diff --git a/cpp/tests/groupby/structs_tests.cpp b/cpp/tests/groupby/structs_tests.cpp new file mode 100644 index 00000000000..00126a4a5a0 --- /dev/null +++ b/cpp/tests/groupby/structs_tests.cpp @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include "cudf/aggregation.hpp" +#include "cudf/types.hpp" + +using namespace cudf::test::iterators; + +namespace cudf { +namespace test { + +template +struct groupby_structs_test : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(groupby_structs_test, cudf::test::FixedWidthTypes); + +using V = int32_t; // Type of Aggregation Column. +using M0 = int32_t; // Type of STRUCT's first (i.e. 0th) member. +using R = cudf::detail::target_type_t; // Type of aggregation result. +using offsets = std::vector; +using strings = strings_column_wrapper; +using structs = structs_column_wrapper; + +template +using fwcw = fixed_width_column_wrapper; + +template +using lcw = lists_column_wrapper; + +namespace { +static constexpr auto null = -1; // Signifies null value. + +// Checking with a single aggregation, and aggregation column. +// This test is orthogonal to the aggregation type; it focuses on testing the grouping +// with STRUCT keys. +auto sum_agg() { return cudf::make_sum_aggregation(); } + +// Set this to true to enable printing, for debugging. +auto constexpr print_enabled = false; + +void print_agg_results(column_view const& keys, column_view const& vals) +{ + if constexpr (print_enabled) { + auto requests = std::vector{}; + requests.push_back(groupby::aggregation_request{}); + requests.back().values = vals; + requests.back().aggregations.push_back(sum_agg()); + requests.back().aggregations.push_back( + cudf::make_nth_element_aggregation(0)); + + auto gby = groupby::groupby{table_view({keys}), null_policy::INCLUDE, sorted::NO, {}, {}}; + auto result = gby.aggregate(requests); + std::cout << "Results: Keys: " << std::endl; + print(result.first->get_column(0).view()); + std::cout << "Results: Values: " << std::endl; + print(result.second.front().results[0]->view()); + } +} + +void test_sum_agg(column_view const& keys, + column_view const& values, + column_view const& expected_keys, + column_view const& expected_values) +{ + test_single_agg(keys, + values, + expected_keys, + expected_values, + sum_agg(), + force_use_sort_impl::NO, + null_policy::INCLUDE); + test_single_agg(keys, + values, + expected_keys, + expected_values, + sum_agg(), + force_use_sort_impl::YES, + null_policy::INCLUDE); +} + +} // namespace + +TYPED_TEST(groupby_structs_test, basic) +{ + using M1 = TypeParam; // Type of STRUCT's second (i.e. 1th) member. + + // clang-format off + auto values = fwcw { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto member_0 = fwcw{ 1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto member_1 = fwcw{ 11, 22, 33, 11, 22, 22, 11, 33, 33, 22}; + auto member_2 = strings {"11", "22", "33", "11", "22", "22", "11", "33", "33", "22"}; + auto keys = structs{member_0, member_1, member_2}; + + auto expected_values = fwcw { 9, 19, 17 }; + auto expected_member_0 = fwcw{ 1, 2, 3 }; + auto expected_member_1 = fwcw{ 11, 22, 33 }; + auto expected_member_2 = strings {"11", "22", "33"}; + auto expected_keys = structs{expected_member_0, expected_member_1, expected_member_2}; + // clang-format on + + test_sum_agg(keys, values, expected_keys, expected_values); +} + +TYPED_TEST(groupby_structs_test, structs_with_nulls_in_members) +{ + using M1 = TypeParam; // Type of STRUCT's second (i.e. 1th) member. + + // clang-format off + auto values = fwcw { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + auto member_0 = fwcw{{ 1, null, 3, 1, 2, 2, 1, 3, 3, 2 }, null_at(1)}; + auto member_1 = fwcw{{ 11, 22, 33, 11, 22, 22, 11, null, 33, 22 }, null_at(7)}; + auto member_2 = strings { "11", "22", "33", "11", "22", "22", "11", "33", "33", "22"}; + auto keys = structs{{member_0, member_1, member_2}}; + // clang-format on + + print_agg_results(keys, values); + + // clang-format off + auto expected_values = fwcw { 9, 18, 10, 7, 1 }; + auto expected_member_0 = fwcw{ { 1, 2, 3, 3, null }, null_at(4)}; + auto expected_member_1 = fwcw{ { 11, 22, 33, null, 22 }, null_at(3)}; + auto expected_member_2 = strings { "11", "22", "33", "33", "22" }; + auto expected_keys = structs{expected_member_0, expected_member_1, expected_member_2}; + // clang-format on + + test_sum_agg(keys, values, expected_keys, expected_values); +} + +TYPED_TEST(groupby_structs_test, structs_with_null_rows) +{ + using M1 = TypeParam; // Type of STRUCT's second (i.e. 1th) member. + + // clang-format off + auto values = fwcw { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto member_0 = fwcw{ 1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto member_1 = fwcw{ 11, 22, 33, 11, 22, 22, 11, 33, 33, 22}; + auto member_2 = strings {"11", "22", "33", "11", "22", "22", "11", "33", "33", "22"}; + auto keys = structs{{member_0, member_1, member_2}, nulls_at({0, 3})}; + + auto expected_values = fwcw { 6, 19, 17, 3 }; + auto expected_member_0 = fwcw{ { 1, 2, 3, null }, null_at(3)}; + auto expected_member_1 = fwcw{ { 11, 22, 33, null }, null_at(3)}; + auto expected_member_2 = strings { {"11", "22", "33", "null" }, null_at(3)}; + auto expected_keys = structs{{expected_member_0, expected_member_1, expected_member_2}, null_at(3)}; + // clang-format on + + print_agg_results(keys, values); + + test_sum_agg(keys, values, expected_keys, expected_values); +} + +TYPED_TEST(groupby_structs_test, structs_with_nulls_in_rows_and_members) +{ + using M1 = TypeParam; // Type of STRUCT's second (i.e. 1th) member. + + // clang-format off + auto values = fwcw { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + auto member_0 = fwcw{{ 1, 2, 3, 1, 2, 2, 1, 3, 3, 2 }, null_at(1)}; + auto member_1 = fwcw{{ 11, 22, 33, 11, 22, 22, 11, 33, 33, 22 }, null_at(7)}; + auto member_2 = strings { "11", "22", "33", "11", "22", "22", "11", "33", "33", "22"}; + auto keys = structs{{member_0, member_1, member_2}, null_at(4)}; + // clang-format on + + print_agg_results(keys, values); + + // clang-format off + auto expected_values = fwcw { 9, 14, 10, 7, 1, 4 }; + auto expected_member_0 = fwcw{{ 1, 2, 3, 3, null, null }, nulls_at({4,5})}; + auto expected_member_1 = fwcw{{ 11, 22, 33, null, 22, null }, nulls_at({3,5})}; + auto expected_member_2 = strings {{ "11", "22", "33", "33", "22", "null" }, null_at(5)}; + auto expected_keys = structs{{expected_member_0, expected_member_1, expected_member_2}, null_at(5)}; + // clang-format on + + print_agg_results(keys, values); + test_sum_agg(keys, values, expected_keys, expected_values); +} + +TYPED_TEST(groupby_structs_test, null_members_differ_from_null_structs) +{ + // This test specifically confirms that a non-null STRUCT row `{null, null, null}` is grouped + // differently from a null STRUCT row (whose members are incidentally null). + + using M1 = TypeParam; // Type of STRUCT's second (i.e. 1th) member. + + // clang-format off + auto values = fwcw { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + auto member_0 = fwcw{{ 1, null, 3, 1, 2, 2, 1, 3, 3, 2 }, null_at(1)}; + auto member_1 = fwcw{{ 11, null, 33, 11, 22, 22, 11, 33, 33, 22 }, null_at(1)}; + auto member_2 = strings {{ "11", "null", "33", "11", "22", "22", "11", "33", "33", "22"}, null_at(1)}; + auto keys = structs{{member_0, member_1, member_2}, null_at(4)}; + // clang-format on + + print_agg_results(keys, values); + + // Index-3 => Non-null Struct row, with nulls for all members. + // Index-4 => Null Struct row. + + // clang-format off + auto expected_values = fwcw { 9, 14, 17, 1, 4 }; + auto expected_member_0 = fwcw{ { 1, 2, 3, null, null }, nulls_at({3,4})}; + auto expected_member_1 = fwcw{ { 11, 22, 33, null, null }, nulls_at({3,4})}; + auto expected_member_2 = strings { {"11", "22", "33", "null", "null" }, nulls_at({3,4})}; + auto expected_keys = structs{{expected_member_0, expected_member_1, expected_member_2}, null_at(4)}; + // clang-format on + + test_sum_agg(keys, values, expected_keys, expected_values); +} + +TYPED_TEST(groupby_structs_test, structs_of_structs) +{ + using M1 = TypeParam; // Type of STRUCT's second (i.e. 1th) member. + + // clang-format off + auto values = fwcw { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + auto struct_0_member_0 = fwcw{{ 1, null, 3, 1, 2, 2, 1, 3, 3, 2 }, null_at(1)}; + auto struct_0_member_1 = fwcw{{ 11, null, 33, 11, 22, 22, 11, 33, 33, 22 }, null_at(1)}; + auto struct_0_member_2 = strings {{ "11", "null", "33", "11", "22", "22", "11", "33", "33", "22"}, null_at(1)}; + // clang-format on + + auto struct_0 = structs{{struct_0_member_0, struct_0_member_1, struct_0_member_2}, null_at(4)}; + auto struct_1_member_1 = fwcw{8, 9, 6, 8, 0, 7, 8, 6, 6, 7}; + + auto keys = structs{{struct_0, struct_1_member_1}}; // Struct of structs. + + print_agg_results(keys, values); + + // clang-format off + auto expected_values = fwcw { 9, 14, 17, 1, 4 }; + auto expected_member_0 = fwcw{ { 1, 2, 3, null, null }, nulls_at({3,4})}; + auto expected_member_1 = fwcw{ { 11, 22, 33, null, null }, nulls_at({3,4})}; + auto expected_member_2 = strings { {"11", "22", "33", "null", "null" }, nulls_at({3,4})}; + auto expected_structs = structs{{expected_member_0, expected_member_1, expected_member_2}, null_at(4)}; + auto expected_struct_1_member_1 = fwcw{ 8, 7, 6, 9, 0 }; + auto expected_keys = structs{{expected_structs, expected_struct_1_member_1}}; + // clang-format on + + test_sum_agg(keys, values, expected_keys, expected_values); +} + +TYPED_TEST(groupby_structs_test, empty_input) +{ + using M1 = TypeParam; // Type of STRUCT's second (i.e. 1th) member. + + // clang-format off + auto values = fwcw {}; + auto member_0 = fwcw{}; + auto member_1 = fwcw{}; + auto member_2 = strings {}; + auto keys = structs{member_0, member_1, member_2}; + + auto expected_values = fwcw {}; + auto expected_member_0 = fwcw{}; + auto expected_member_1 = fwcw{}; + auto expected_member_2 = strings {}; + auto expected_keys = structs{expected_member_0, expected_member_1, expected_member_2}; + // clang-format on + + test_sum_agg(keys, values, expected_keys, expected_values); +} + +TYPED_TEST(groupby_structs_test, all_null_input) +{ + using M1 = TypeParam; // Type of STRUCT's second (i.e. 1th) member. + + // clang-format off + auto values = fwcw { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto member_0 = fwcw{ 1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto member_1 = fwcw{ 11, 22, 33, 11, 22, 22, 11, 33, 33, 22}; + auto member_2 = strings {"11", "22", "33", "11", "22", "22", "11", "33", "33", "22"}; + auto keys = structs{{member_0, member_1, member_2}, all_nulls()}; + + auto expected_values = fwcw { 45 }; + auto expected_member_0 = fwcw{ null }; + auto expected_member_1 = fwcw{ null }; + auto expected_member_2 = strings {"null"}; + auto expected_keys = structs{{expected_member_0, expected_member_1, expected_member_2}, all_nulls()}; + // clang-format on + + test_sum_agg(keys, values, expected_keys, expected_values); +} + +TYPED_TEST(groupby_structs_test, lists_are_unsupported) +{ + using M1 = TypeParam; // Type of STRUCT's second (i.e. 1th) member. + + // clang-format off + auto values = fwcw { 0, 1, 2, 3, 4 }; + auto member_0 = lcw { {1,1}, {2,2}, {3,3}, {1,1}, {2,2} }; + auto member_1 = fwcw{ 1, 2, 3, 1, 2 }; + // clang-format on + auto keys = structs{{member_0, member_1}}; + + EXPECT_THROW(test_sum_agg(keys, values, keys, values), cudf::logic_error); +} + +} // namespace test +} // namespace cudf