diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index bdc72cc4535..95f0db895a8 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -153,8 +153,12 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( - STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp - stream_compaction/unique.cpp stream_compaction/unique_count.cpp + STREAM_COMPACTION_NVBENCH + stream_compaction/distinct.cpp + stream_compaction/distinct_count.cpp + stream_compaction/stable_distinct.cpp + stream_compaction/unique.cpp + stream_compaction/unique_count.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/stream_compaction/stable_distinct.cpp b/cpp/benchmarks/stream_compaction/stable_distinct.cpp new file mode 100644 index 00000000000..6a9542c83a6 --- /dev/null +++ b/cpp/benchmarks/stream_compaction/stable_distinct.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include + +#include + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); + +template +void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list) +{ + cudf::size_type const num_rows = state.get_int64("NumRows"); + + data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + + auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + auto input_column = source_column->view(); + auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::stable_distinct(input_table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL); + }); +} + +using data_type = nvbench::type_list; + +NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type)) + .set_name("stable_distinct") + .set_type_axes_names({"Type"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + +template +void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list) +{ + auto const size = state.get_int64("ColumnSize"); + auto const dtype = cudf::type_to_id(); + double const null_probability = state.get_float64("null_probability"); + + auto builder = data_profile_builder().null_probability(null_probability); + if (dtype == cudf::type_id::LIST) { + builder.distribution(dtype, distribution_id::UNIFORM, 0, 4) + .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4) + .list_depth(1); + } else { + // We're comparing stable_distinct() on a non-nested column to that on a list column with the + // same number of distinct rows. 
The max list size is 4 and the number of distinct values + // in the list's child is 5. So the number of distinct rows in the list = 1 + 5 + 5^2 + 5^3 + + // 5^4 = 781. We want this column to also have 781 distinct values. + builder.distribution(dtype, distribution_id::UNIFORM, 0, 781); + } + + auto const table = create_random_table( + {dtype}, table_size_bytes{static_cast(size)}, data_profile{builder}, 0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::stable_distinct(*table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL); + }); +} + +NVBENCH_BENCH_TYPES(nvbench_stable_distinct_list, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("stable_distinct_list") + .set_type_axes_names({"Type"}) + .add_float64_axis("null_probability", {0.0, 0.1}) + .add_int64_axis("ColumnSize", {100'000'000}); diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index e0fc7b71cd9..5476000fc29 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -86,24 +86,9 @@ std::unique_ptr distinct(table_view const& input, rmm::mr::device_memory_resource* mr); /** - * @brief Create a new table without duplicate rows. + * @copydoc cudf::stable_distinct * - * Given an `input` table_view, each row is copied to the output table to create a set of distinct - * rows. The row order is guaranteed to be preserved as in the input. - * - * If there are duplicate rows, which row to be copied depends on the specified value of the `keep` - * parameter. - * - * This API produces exactly the same set of output rows as `cudf::distinct`. 
- * - * @param input The input table - * @param keys Vector of indices indicating key columns in the `input` table - * @param keep Copy any, first, last, or none of the found duplicates - * @param nulls_equal Flag to specify whether null elements should be considered as equal - * @param nans_equal Flag to specify whether NaN elements should be considered as equal - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned table - * @return A table containing the resulting distinct rows + * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
stable_distinct(table_view const& input, std::vector const& keys, diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index e2a6b97256f..984e3037cd1 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -254,22 +254,19 @@ std::unique_ptr
unique( * @brief Create a new table without duplicate rows. * * Given an `input` table_view, each row is copied to the output table to create a set of distinct - * rows. If there are duplicate rows, which row to be copied depends on the specified value of - * the `keep` parameter. + * rows. If there are duplicate rows, which row is copied depends on the `keep` parameter. * * The order of rows in the output table is not specified. * * Performance hint: if the input is pre-sorted, `cudf::unique` can produce an equivalent result * (i.e., same set of output rows) but with less running time than `cudf::distinct`. * - * @param[in] input input table_view to copy only distinct rows - * @param[in] keys vector of indices representing key columns from `input` - * @param[in] keep keep any, first, last, or none of the found duplicates - * @param[in] nulls_equal flag to control if nulls are compared equal or not - * @param[in] nans_equal flag to control if floating-point NaN values are compared equal or not - * @param[in] mr Device memory resource used to allocate the returned table's device - * memory - * + * @param input The input table + * @param keys Vector of indices indicating key columns in the `input` table + * @param keep Copy any, first, last, or none of the found duplicates + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param mr Device memory resource used to allocate the returned table * @return Table with distinct rows in an unspecified order */ std::unique_ptr
distinct( @@ -280,6 +277,36 @@ std::unique_ptr
distinct( nan_equality nans_equal = nan_equality::ALL_EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a new table without duplicate rows, preserving input order. + * + * Given an `input` table_view, each row is copied to the output table to create a set of distinct + * rows. The input row order is preserved. If there are duplicate rows, which row is copied depends + * on the `keep` parameter. + * + * This API produces the same output rows as `cudf::distinct`, but with input order preserved. + * + * Note that when `keep` is `KEEP_ANY`, the choice of which duplicate row to keep is arbitrary, but + * the returned table will retain the input order. That is, if the key column contained `1, 2, 1` + * with another values column `3, 4, 5`, the result could contain values `3, 4` or `4, 5` but not + * `4, 3` or `5, 4`. + * + * @param input The input table + * @param keys Vector of indices indicating key columns in the `input` table + * @param keep Copy any, first, last, or none of the found duplicates + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param mr Device memory resource used to allocate the returned table + * @return Table with distinct rows, preserving input order + */ +std::unique_ptr
stable_distinct( + table_view const& input, + std::vector const& keys, + duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Count the number of consecutive groups of equivalent rows in a column. * diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu index d45897930b0..45a2de9288b 100644 --- a/cpp/src/stream_compaction/stable_distinct.cu +++ b/cpp/src/stream_compaction/stable_distinct.cu @@ -19,12 +19,14 @@ #include #include #include +#include #include #include #include -namespace cudf::detail { +namespace cudf { +namespace detail { std::unique_ptr
stable_distinct(table_view const& input, std::vector const& keys, @@ -45,7 +47,13 @@ std::unique_ptr
stable_distinct(table_view const& input, stream, rmm::mr::get_current_device_resource()); - // Markers to denote which rows to be copied to the output. + // The only difference between this implementation and the unstable version + // is that the stable implementation must retain the input order. The + // distinct indices are not sorted, so we cannot simply copy the rows in the + // order of the distinct indices and retain the input order. Instead, we use + // a boolean mask to indicate which rows to copy to the output. This avoids + // the need to sort the distinct indices, which is slower. + auto const output_markers = [&] { auto markers = rmm::device_uvector(input.num_rows(), stream); thrust::uninitialized_fill(rmm::exec_policy(stream), markers.begin(), markers.end(), false); @@ -58,13 +66,22 @@ std::unique_ptr
stable_distinct(table_view const& input, return markers; }(); - return cudf::detail::copy_if( - input, - [output_markers = output_markers.begin()] __device__(auto const idx) { - return *(output_markers + idx); - }, - stream, - mr); + return cudf::detail::apply_boolean_mask( + input, cudf::device_span(output_markers), stream, mr); +} + +} // namespace detail + +std::unique_ptr
stable_distinct(table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::stable_distinct( + input, keys, keep, nulls_equal, nans_equal, cudf::get_default_stream(), mr); } -} // namespace cudf::detail +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7f2807fc30e..1262e065041 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -401,8 +401,9 @@ ConfigureTest( stream_compaction/apply_boolean_mask_tests.cpp stream_compaction/distinct_count_tests.cpp stream_compaction/distinct_tests.cpp - stream_compaction/drop_nulls_tests.cpp stream_compaction/drop_nans_tests.cpp + stream_compaction/drop_nulls_tests.cpp + stream_compaction/stable_distinct_tests.cpp stream_compaction/unique_count_tests.cpp stream_compaction/unique_tests.cpp ) diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp index 85955ce7fc9..586792b4b30 100644 --- a/cpp/tests/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_tests.cpp @@ -126,7 +126,7 @@ TEST_F(DistinctKeepAny, EmptyInputTable) { int32s_col col(std::initializer_list{}); cudf::table_view input{{col}}; - std::vector key_idx{1, 2}; + std::vector key_idx{0}; auto got = cudf::distinct(input, key_idx, KEEP_ANY); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); @@ -1217,11 +1217,11 @@ TEST_F(DistinctKeepAny, StructsOfStructs) // 8 | { {2, 1}, 5} | auto s1 = [&] { - auto a = int32s_col{1, 1, XXX, XXX, 2, 1, 1, XXX, 2}; - auto b = int32s_col{1, 2, XXX, XXX, 2, 1, 1, XXX, 1}; + auto a = int32s_col{1, 1, XXX, XXX, XXX, XXX, 1, XXX, 2}; + auto b = int32s_col{1, 2, XXX, XXX, XXX, XXX, 1, XXX, 1}; auto s2 = structs_col{{a, b}, nulls_at({2, 3, 7})}; - auto c = int32s_col{5, 4, 6, 4, 3, 3, 5, 4, 5}; + auto c = int32s_col{5, 4, 6, 4, XXX, XXX, 5, 4, 5}; 
std::vector> s1_children; s1_children.emplace_back(s2.release()); s1_children.emplace_back(c.release()); @@ -1270,11 +1270,11 @@ TEST_F(DistinctKeepAny, SlicedStructsOfStructs) // 8 | { {2, 1}, 5} | auto s1 = [&] { - auto a = int32s_col{1, 1, 2, 2, 2, 1, 1, 1, 2}; - auto b = int32s_col{1, 2, 1, 2, 2, 1, 1, 1, 1}; + auto a = int32s_col{1, 1, XXX, XXX, XXX, XXX, 1, XXX, 2}; + auto b = int32s_col{1, 2, XXX, XXX, XXX, XXX, 1, XXX, 1}; auto s2 = structs_col{{a, b}, nulls_at({2, 3, 7})}; - auto c = int32s_col{5, 4, 6, 4, 3, 3, 5, 4, 5}; + auto c = int32s_col{5, 4, 6, 4, XXX, XXX, 5, 4, 5}; std::vector> s1_children; s1_children.emplace_back(s2.release()); s1_children.emplace_back(c.release()); diff --git a/cpp/tests/stream_compaction/stable_distinct_tests.cpp b/cpp/tests/stream_compaction/stable_distinct_tests.cpp new file mode 100644 index 00000000000..e28b96fc8be --- /dev/null +++ b/cpp/tests/stream_compaction/stable_distinct_tests.cpp @@ -0,0 +1,1354 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +auto constexpr null{0}; // null at current level +auto constexpr XXX{0}; // null pushed down from parent level +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; +auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; +auto constexpr KEEP_LAST = cudf::duplicate_keep_option::KEEP_LAST; +auto constexpr KEEP_NONE = cudf::duplicate_keep_option::KEEP_NONE; +auto constexpr NULL_EQUAL = cudf::null_equality::EQUAL; +auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; +auto constexpr NAN_EQUAL = cudf::nan_equality::ALL_EQUAL; +auto constexpr NAN_UNEQUAL = cudf::nan_equality::UNEQUAL; + +using int32s_col = cudf::test::fixed_width_column_wrapper; +using floats_col = cudf::test::fixed_width_column_wrapper; +using lists_col = cudf::test::lists_column_wrapper; +using strings_col = cudf::test::strings_column_wrapper; +using structs_col = cudf::test::structs_column_wrapper; + +using cudf::nan_policy; +using cudf::null_equality; +using cudf::null_policy; +using cudf::test::iterators::no_nulls; +using cudf::test::iterators::null_at; +using cudf::test::iterators::nulls_at; + +struct StableDistinctKeepAny : public cudf::test::BaseFixture {}; + +struct StableDistinctKeepFirstLastNone : public cudf::test::BaseFixture {}; + +TEST_F(StableDistinctKeepAny, StringKeyColumn) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. 
+ auto const col = int32s_col{{5, 5, null, null, 5, 8, 1}, nulls_at({2, 3})}; + auto const keys = + strings_col{{"all", "all", "new", "new", "" /*NULL*/, "the", "strings"}, null_at(4)}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + auto const exp_col = int32s_col{{5, null, 5, 8, 1}, null_at(1)}; + auto const exp_keys = strings_col{{"all", "new", "" /*NULL*/, "the", "strings"}, null_at(2)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StableDistinctKeepFirstLastNone, StringKeyColumn) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const col = int32s_col{{0, null, 2, 3, 4, 5, 6}, null_at(1)}; + auto const keys = + strings_col{{"all", "new", "new", "all", "" /*NULL*/, "the", "strings"}, null_at(4)}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col = int32s_col{{0, null, 4, 5, 6}, null_at(1)}; + auto const exp_keys = strings_col{{"all", "new", "" /*NULL*/, "the", "strings"}, null_at(2)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col = int32s_col{{2, 3, 4, 5, 6}, no_nulls()}; + auto const exp_keys = strings_col{{"new", "all", "" /*NULL*/, "the", "strings"}, null_at(2)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col = int32s_col{{4, 5, 6}, no_nulls()}; + auto const exp_keys = strings_col{{"" /*NULL*/, "the", "strings"}, null_at(0)}; + auto const expected = 
cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, EmptyInputTable) +{ + int32s_col col(std::initializer_list{}); + cudf::table_view input{{col}}; + std::vector key_idx{0}; + + auto got = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); +} + +TEST_F(StableDistinctKeepAny, NoColumnInputTable) +{ + cudf::table_view input{std::vector()}; + std::vector key_idx{1, 2}; + + auto got = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); +} + +TEST_F(StableDistinctKeepAny, EmptyKeys) +{ + int32s_col col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + int32s_col empty_col{}; + cudf::table_view input{{col}}; + std::vector key_idx{}; + + auto got = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); +} + +TEST_F(StableDistinctKeepAny, NoNullsTable) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. 
+ auto const col1 = int32s_col{6, 6, 6, 3, 5, 8, 5}; + auto const col2 = floats_col{6, 6, 6, 3, 4, 9, 4}; + auto const keys1 = int32s_col{20, 20, 20, 20, 19, 21, 9}; + auto const keys2 = int32s_col{19, 19, 19, 20, 20, 9, 21}; + + auto const input = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const key_idx = std::vector{2, 3}; + + auto const exp_col1 = int32s_col{6, 3, 5, 8, 5}; + auto const exp_col2 = floats_col{6, 3, 4, 9, 4}; + auto const exp_keys1 = int32s_col{20, 20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{19, 20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StableDistinctKeepAny, NoNullsTableWithNaNs) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const col1 = int32s_col{6, 6, 6, 1, 1, 1, 3, 5, 8, 5}; + auto const col2 = floats_col{6, 6, 6, 1, 1, 1, 3, 4, 9, 4}; + auto const keys1 = int32s_col{20, 20, 20, 15, 15, 15, 20, 19, 21, 9}; + auto const keys2 = floats_col{19., 19., 19., NaN, NaN, NaN, 20., 20., 9., 21.}; + + auto const input = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const key_idx = std::vector{2, 3}; + + // NaNs are unequal. + { + auto const exp_col1 = int32s_col{6, 1, 1, 1, 3, 5, 8, 5}; + auto const exp_col2 = floats_col{6, 1, 1, 1, 3, 4, 9, 4}; + auto const exp_keys1 = int32s_col{20, 15, 15, 15, 20, 19, 21, 9}; + auto const exp_keys2 = floats_col{19., NaN, NaN, NaN, 20., 20., 9., 21.}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // NaNs are equal. 
+ { + auto const exp_col1 = int32s_col{6, 1, 3, 5, 8, 5}; + auto const exp_col2 = floats_col{6, 1, 3, 4, 9, 4}; + auto const exp_keys1 = int32s_col{20, 15, 20, 19, 21, 9}; + auto const exp_keys2 = floats_col{19., NaN, 20., 20., 9., 21.}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, NoNullsTable) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const col1 = int32s_col{0, 1, 2, 3, 4, 5, 6}; + auto const col2 = floats_col{10, 11, 12, 13, 14, 15, 16}; + auto const keys1 = int32s_col{20, 20, 20, 20, 19, 21, 9}; + auto const keys2 = int32s_col{19, 19, 19, 20, 20, 9, 21}; + + auto const input = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const key_idx = std::vector{2, 3}; + + // KEEP_FIRST + { + auto const exp_col1 = int32s_col{0, 3, 4, 5, 6}; + auto const exp_col2 = floats_col{10, 13, 14, 15, 16}; + auto const exp_keys1 = int32s_col{20, 20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{19, 20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col1 = int32s_col{2, 3, 4, 5, 6}; + auto const exp_col2 = floats_col{12, 13, 14, 15, 16}; + auto const exp_keys1 = int32s_col{20, 20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{19, 20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col1 = int32s_col{3, 4, 5, 6}; + auto const exp_col2 = floats_col{13, 14, 15, 16}; + 
auto const exp_keys1 = int32s_col{20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, SlicedNoNullsTable) +{ + auto constexpr dont_care = int32_t{0}; + + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const col1 = int32s_col{dont_care, dont_care, 6, 6, 6, 3, 5, 8, 5, dont_care}; + auto const col2 = floats_col{dont_care, dont_care, 6, 6, 6, 3, 4, 9, 4, dont_care}; + auto const keys1 = int32s_col{dont_care, dont_care, 20, 20, 20, 20, 19, 21, 9, dont_care}; + auto const keys2 = int32s_col{dont_care, dont_care, 19, 19, 19, 20, 20, 9, 21, dont_care}; + + auto const input_original = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const input = cudf::slice(input_original, {2, 9})[0]; + auto const key_idx = std::vector{2, 3}; + + auto const exp_col1 = int32s_col{6, 3, 5, 8, 5}; + auto const exp_col2 = floats_col{6, 3, 4, 9, 4}; + auto const exp_keys1 = int32s_col{20, 20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{19, 20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StableDistinctKeepFirstLastNone, SlicedNoNullsTable) +{ + auto constexpr dont_care = int32_t{0}; + + // Column(s) used to test needs to have different rows for the same keys. 
+ // clang-format off + auto const col1 = int32s_col{0, 1, 2, // <- don't care + 3, 4, 5, 6, 7, 8, 9, dont_care}; + auto const col2 = floats_col{10, 11, 12, // <- don't care + 13, 14, 15, 16, 17, 18, 19, dont_care}; + auto const keys1 = int32s_col{20, 20, 20, // <- don't care + 20, 20, 20, 20, 19, 21, 9, dont_care}; + auto const keys2 = int32s_col{19, 19, 19, // <- don't care + 19, 19, 19, 20, 20, 9, 21, dont_care}; + // clang-format on + auto const input_original = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const input = cudf::slice(input_original, {3, 10})[0]; + auto const key_idx = std::vector{2, 3}; + + // KEEP_FIRST + { + auto const exp_col1 = int32s_col{3, 6, 7, 8, 9}; + auto const exp_col2 = floats_col{13, 16, 17, 18, 19}; + auto const exp_keys1 = int32s_col{20, 20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{19, 20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col1 = int32s_col{5, 6, 7, 8, 9}; + auto const exp_col2 = floats_col{15, 16, 17, 18, 19}; + auto const exp_keys1 = int32s_col{20, 20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{19, 20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col1 = int32s_col{6, 7, 8, 9}; + auto const exp_col2 = floats_col{16, 17, 18, 19}; + auto const exp_keys1 = int32s_col{20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + 
+TEST_F(StableDistinctKeepAny, InputWithNulls) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const col = int32s_col{5, 4, 4, 1, 1, 8}; + auto const keys = int32s_col{{20, null, null, 19, 19, 21}, nulls_at({1, 2})}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const exp_col = int32s_col{5, 4, 1, 8}; + auto const exp_keys = int32s_col{{20, null, 19, 21}, null_at(1)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are unequal. + { + auto const exp_col = int32s_col{5, 4, 4, 1, 8}; + auto const exp_keys = int32s_col{{20, null, null, 19, 21}, nulls_at({1, 2})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, InputWithNullsAndNaNs) +{ + auto constexpr null{0.0}; // shadow the global `null` variable of type int + + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const col = int32s_col{5, 4, 4, 1, 1, 1, 8, 8, 1}; + auto const keys = floats_col{{20., null, null, NaN, NaN, NaN, 19., 19., 21.}, nulls_at({1, 2})}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal, NaNs are unequal. 
+ { + auto const exp_col = int32s_col{5, 4, 1, 1, 1, 8, 1}; + auto const exp_keys = floats_col{{20., null, NaN, NaN, NaN, 19., 21.}, null_at(1)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are equal, NaNs are equal. + { + auto const exp_col = int32s_col{5, 4, 1, 8, 1}; + auto const exp_keys = floats_col{{20., null, NaN, 19., 21.}, null_at(1)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are unequal, NaNs are unequal. + { + auto const exp_col = int32s_col{5, 4, 4, 1, 1, 1, 8, 1}; + auto const exp_keys = floats_col{{20., null, null, NaN, NaN, NaN, 19., 21.}, nulls_at({1, 2})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are unequal, NaNs are equal. + { + auto const exp_col = int32s_col{5, 4, 4, 1, 8, 1}; + auto const exp_keys = floats_col{{20., null, null, NaN, 19., 21.}, nulls_at({1, 2})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, InputWithNullsEqual) +{ + // Column(s) used to test needs to have different rows for the same keys. 
+ auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6}; + auto const keys = int32s_col{{20, null, null, 19, 21, 19, 22}, nulls_at({1, 2})}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col = int32s_col{0, 1, 3, 4, 6}; + auto const exp_keys = int32s_col{{20, null, 19, 21, 22}, null_at(1)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST, NULL_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col = int32s_col{0, 2, 4, 5, 6}; + auto const exp_keys = int32s_col{{20, null, 21, 19, 22}, null_at(1)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST, NULL_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col = int32s_col{0, 4, 6}; + auto const exp_keys = int32s_col{{20, 21, 22}, no_nulls()}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE, NULL_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, InputWithNullsUnequal) +{ + // Column(s) used to test needs to have different rows for the same keys. 
+ auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6, 7}; + auto const keys = int32s_col{{20, null, null, 19, 21, 19, 22, 20}, nulls_at({1, 2})}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col = int32s_col{0, 1, 2, 3, 4, 6}; + auto const exp_keys = int32s_col{{20, null, null, 19, 21, 22}, nulls_at({1, 2})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col = int32s_col{1, 2, 4, 5, 6, 7}; + auto const exp_keys = int32s_col{{null, null, 21, 19, 22, 20}, nulls_at({0, 1})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col = int32s_col{1, 2, 4, 6}; + auto const exp_keys = int32s_col{{null, null, 21, 22}, nulls_at({0, 1})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsEqual) +{ + // Column(s) used to test needs to have different rows for the same keys. 
+ auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6}; + auto const keys = floats_col{20., NaN, NaN, 19., 21., 19., 22.}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col = int32s_col{0, 1, 3, 4, 6}; + auto const exp_keys = floats_col{20., NaN, 19., 21., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST, NULL_EQUAL, NAN_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col = int32s_col{0, 2, 4, 5, 6}; + auto const exp_keys = floats_col{20., NaN, 21., 19., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST, NULL_EQUAL, NAN_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col = int32s_col{0, 4, 6}; + auto const exp_keys = floats_col{20., 21., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE, NULL_EQUAL, NAN_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsUnequal) +{ + // Column(s) used to test needs to have different rows for the same keys. 
+ auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6, 7}; + auto const keys = floats_col{20., NaN, NaN, 19., 21., 19., 22., 20.}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col = int32s_col{0, 1, 2, 3, 4, 6}; + auto const exp_keys = floats_col{20., NaN, NaN, 19., 21., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = + cudf::stable_distinct(input, key_idx, KEEP_FIRST, NULL_UNEQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col = int32s_col{1, 2, 4, 5, 6, 7}; + auto const exp_keys = floats_col{NaN, NaN, 21., 19., 22., 20.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST, NULL_UNEQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col = int32s_col{1, 2, 4, 6}; + auto const exp_keys = floats_col{NaN, NaN, 21., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE, NULL_UNEQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, BasicLists) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. 
+ // clang-format off + auto const idx = int32s_col{ 0, 0, 1, 1, 2, 3, 4, 4, 4, 5, 5, 6}; + auto const keys = lists_col{{}, {}, {1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}}; + // clang-format on + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + auto const exp_idx = int32s_col{0, 1, 2, 3, 4, 5, 6}; + auto const exp_keys = lists_col{{}, {1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StableDistinctKeepFirstLastNone, BasicLists) +{ + // Column(s) used to test needs to have different rows for the same keys. + // clang-format off + auto const idx = int32s_col{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + auto const keys = lists_col{{}, {}, {1}, {1, 1}, {1}, {1, 2}, {2, 2}, {2}, {2}, {2, 1}, {2, 2}, {2, 2}}; + // clang-format on + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_idx = int32s_col{0, 2, 3, 5, 6, 7, 9}; + auto const exp_keys = lists_col{{}, {1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_idx = int32s_col{1, 3, 4, 5, 8, 9, 11}; + auto const exp_keys = lists_col{{}, {1, 1}, {1}, {1, 2}, {2}, {2, 1}, {2, 2}}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_idx = int32s_col{3, 5, 9}; + auto const exp_keys = lists_col{{1, 1}, {1, 2}, {2, 1}}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = 
cudf::stable_distinct(input, key_idx, KEEP_NONE); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, SlicedBasicLists) +{ + auto constexpr dont_care = int32_t{0}; + + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const idx = int32s_col{dont_care, dont_care, 1, 1, 2, 3, 4, 4, 4, 5, 5, 6, dont_care}; + auto const keys = lists_col{ + {0, 0}, {0, 0}, {1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}, {5, 5}}; + auto const input_original = cudf::table_view{{idx, keys}}; + auto const input = cudf::slice(input_original, {2, 12})[0]; + auto const key_idx = std::vector{1}; + + auto const exp_idx = int32s_col{1, 2, 3, 4, 5, 6}; + auto const exp_val = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto const expected = cudf::table_view{{exp_idx, exp_val}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StableDistinctKeepAny, NullableLists) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const idx = int32s_col{0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4}; + auto const keys = + lists_col{{{}, {}, {1}, {1}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {} /*NULL*/, {} /*NULL*/}, + nulls_at({9, 10})}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const exp_idx = int32s_col{0, 1, 2, 3, 4}; + auto const exp_keys = lists_col{{{}, {1}, {2, 2}, {2}, {} /*NULL*/}, null_at(4)}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are unequal. 
+ { + auto const exp_idx = int32s_col{0, 1, 2, 3, 4, 4}; + auto const exp_keys = + lists_col{{{}, {1}, {2, 2}, {2}, {} /*NULL*/, {} /*NULL*/}, nulls_at({4, 5})}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, ListsWithNullsEqual) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const idx = int32s_col{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + auto const keys = + lists_col{{{}, {}, {1}, {1}, {2, 2}, {2}, {2}, {} /*NULL*/, {2, 2}, {2, 2}, {} /*NULL*/}, + nulls_at({7, 10})}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_idx = int32s_col{0, 2, 4, 5, 7}; + auto const exp_keys = lists_col{{{}, {1}, {2, 2}, {2}, {} /*NULL*/}, null_at(4)}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST, NULL_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_idx = int32s_col{1, 3, 6, 9, 10}; + auto const exp_keys = lists_col{{{}, {1}, {2}, {2, 2}, {} /*NULL*/}, null_at(4)}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST, NULL_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_idx = int32s_col{}; + auto const exp_keys = lists_col{}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE, NULL_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, ListsWithNullsUnequal) +{ + // Column(s) used to test needs to have different rows for the same keys. 
+ auto const idx = int32s_col{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + auto const keys = + lists_col{{{}, {}, {1}, {1}, {2, 2}, {2}, {2}, {} /*NULL*/, {2, 2}, {2, 2}, {} /*NULL*/}, + nulls_at({7, 10})}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_idx = int32s_col{0, 2, 4, 5, 7, 10}; + auto const exp_keys = + lists_col{{{}, {1}, {2, 2}, {2}, {} /*NULL*/, {} /*NULL*/}, nulls_at({4, 5})}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_idx = int32s_col{1, 3, 6, 7, 9, 10}; + auto const exp_keys = + lists_col{{{}, {1}, {2}, {} /*NULL*/, {2, 2}, {} /*NULL*/}, nulls_at({3, 5})}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_idx = int32s_col{7, 10}; + auto const exp_keys = lists_col{{lists_col{} /*NULL*/, lists_col{} /*NULL*/}, nulls_at({0, 1})}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, ListsOfStructs) +{ + // Constructing a list of structs of two elements + // 0. [] == + // 1. [] != + // 2. Null == + // 3. Null != + // 4. [Null, Null] != + // 5. [Null] == + // 6. [Null] == + // 7. [Null] != + // 8. [{Null, Null}] != + // 9. [{1,'a'}, {2,'b'}] != + // 10. [{0,'a'}, {2,'b'}] != + // 11. [{0,'a'}, {2,'c'}] == + // 12. [{0,'a'}, {2,'c'}] != + // 13. [{0,Null}] == + // 14. [{0,Null}] != + // 15. [{Null, 'b'}] == + // 16. 
[{Null, 'b'}] + + auto const structs = [] { + auto child1 = + int32s_col{{XXX, XXX, XXX, XXX, XXX, null, 1, 2, 0, 2, 0, 2, 0, 2, 0, 0, null, null}, + nulls_at({5, 16, 17})}; + auto child2 = strings_col{{"" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*null*/, + "a", + "b", + "a", + "b", + "a", + "c", + "a", + "c", + "" /*null*/, + "" /*null*/, + "b", + "b"}, + nulls_at({5, 14, 15})}; + + return structs_col{{child1, child2}, nulls_at({0, 1, 2, 3, 4})}; + }(); + + auto const offsets = int32s_col{0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 8, 10, 12, 14, 15, 16, 17, 18}; + auto const null_it = nulls_at({2, 3}); + + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_it, null_it + 17); + + auto const keys = cudf::column_view(cudf::data_type(cudf::type_id::LIST), + 17, + nullptr, + static_cast(null_mask.data()), + null_count, + 0, + {offsets, structs}); + + auto const idx = int32s_col{1, 1, 2, 2, 3, 4, 4, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const expect_map = int32s_col{0, 2, 4, 5, 8, 9, 10, 11, 13, 15}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // Nulls are unequal. + { + auto const expect_map = int32s_col{0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, ListsOfStructs) +{ + // Constructing a list of structs of two elements + // 0. [] == + // 1. [] != + // 2. Null == + // 3. Null != + // 4. [Null, Null] != + // 5. [Null] == + // 6. [Null] == + // 7. [Null] != + // 8. [{Null, Null}] != + // 9. 
[{1,'a'}, {2,'b'}] != + // 10. [{0,'a'}, {2,'b'}] != + // 11. [{0,'a'}, {2,'c'}] == + // 12. [{0,'a'}, {2,'c'}] != + // 13. [{0,Null}] == + // 14. [{0,Null}] != + // 15. [{Null, 'b'}] == + // 16. [{Null, 'b'}] + + auto const structs = [] { + auto child1 = + int32s_col{{XXX, XXX, XXX, XXX, XXX, null, 1, 2, 0, 2, 0, 2, 0, 2, 0, 0, null, null}, + nulls_at({5, 16, 17})}; + auto child2 = strings_col{{"" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*null*/, + "a", + "b", + "a", + "b", + "a", + "c", + "a", + "c", + "" /*null*/, + "" /*null*/, + "b", + "b"}, + nulls_at({5, 14, 15})}; + + return structs_col{{child1, child2}, nulls_at({0, 1, 2, 3, 4})}; + }(); + + auto const offsets = int32s_col{0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 8, 10, 12, 14, 15, 16, 17, 18}; + auto const null_it = nulls_at({2, 3}); + + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_it, null_it + 17); + + auto const keys = cudf::column_view(cudf::data_type(cudf::type_id::LIST), + 17, + nullptr, + static_cast(null_mask.data()), + null_count, + 0, + {offsets, structs}); + + auto const idx = int32s_col{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const expect_map = int32s_col{0, 2, 4, 5, 8, 9, 10, 11, 13, 15}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // KEEP_LAST + { + auto const expect_map = int32s_col{1, 3, 4, 7, 8, 9, 10, 12, 14, 16}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // KEEP_NONE + { + auto const expect_map = int32s_col{4, 8, 9, 10}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const 
result = cudf::stable_distinct(input, key_idx, KEEP_NONE); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, SlicedListsOfStructs) +{ + // Constructing a list of struct of two elements + // 0. [] == <- Don't care + // 1. [] != <- Don't care + // 2. Null == <- Don't care + // 3. Null != <- Don't care + // 4. [Null, Null] != <- Don't care + // 5. [Null] == <- Don't care + // 6. [Null] == <- Don't care + // 7. [Null] != <- Don't care + // 8. [{Null, Null}] != + // 9. [{1,'a'}, {2,'b'}] != + // 10. [{0,'a'}, {2,'b'}] != + // 11. [{0,'a'}, {2,'c'}] == + // 12. [{0,'a'}, {2,'c'}] != + // 13. [{0,Null}] == + // 14. [{0,Null}] != + // 15. [{Null, 'b'}] == <- Don't care + // 16. [{Null, 'b'}] <- Don't care + + auto const structs = [] { + auto child1 = + int32s_col{{XXX, XXX, XXX, XXX, XXX, null, 1, 2, 0, 2, 0, 2, 0, 2, 0, 0, null, null}, + nulls_at({5, 16, 17})}; + auto child2 = strings_col{{"" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*null*/, + "a", + "b", + "a", + "b", + "a", + "c", + "a", + "c", + "" /*null*/, + "" /*null*/, + "b", + "b"}, + nulls_at({5, 14, 15})}; + + return structs_col{{child1, child2}, nulls_at({0, 1, 2, 3, 4})}; + }(); + + auto const offsets = int32s_col{0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 8, 10, 12, 14, 15, 16, 17, 18}; + auto const null_it = nulls_at({2, 3}); + + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_it, null_it + 17); + + auto const keys = cudf::column_view(cudf::data_type(cudf::type_id::LIST), + 17, + nullptr, + static_cast(null_mask.data()), + null_count, + 0, + {offsets, structs}); + + auto const idx = int32s_col{1, 1, 2, 2, 3, 4, 4, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10}; + auto const input_original = cudf::table_view{{idx, keys}}; + auto const input = cudf::slice(input_original, {8, 15})[0]; + auto const key_idx = std::vector{1}; + + // Nulls are equal. 
+ { + auto const expect_map = int32s_col{8, 9, 10, 11, 13}; + auto const expect_table = cudf::gather(input_original, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expect_table, *result); + } + + // Nulls are unequal. + { + auto const expect_map = int32s_col{8, 9, 10, 11, 13, 14}; + auto const expect_table = cudf::gather(input_original, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, ListsOfEmptyStructs) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + + // 0. [] == + // 1. [] != + // 2. Null == + // 3. Null != + // 4. [Null, Null] == + // 5. [Null, Null] == + // 6. [Null, Null] != + // 7. [Null] == + // 8. [Null] != + // 9. [{}] == + // 10. [{}] != + // 11. [{}, {}] == + // 12. 
[{}, {}] + + auto const structs_null_it = nulls_at({0, 1, 2, 3, 4, 5, 6, 7}); + auto [structs_null_mask, structs_null_count] = + cudf::test::detail::make_null_mask(structs_null_it, structs_null_it + 14); + auto const structs = + cudf::column_view(cudf::data_type(cudf::type_id::STRUCT), + 14, + nullptr, + static_cast(structs_null_mask.data()), + structs_null_count); + + auto const offsets = int32s_col{0, 0, 0, 0, 0, 2, 4, 6, 7, 8, 9, 10, 12, 14}; + auto const lists_null_it = nulls_at({2, 3}); + auto [lists_null_mask, lists_null_count] = + cudf::test::detail::make_null_mask(lists_null_it, lists_null_it + 13); + auto const keys = + cudf::column_view(cudf::data_type(cudf::type_id::LIST), + 13, + nullptr, + static_cast(lists_null_mask.data()), + lists_null_count, + 0, + {offsets, structs}); + + auto const idx = int32s_col{1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const expect_map = int32s_col{0, 2, 4, 7, 9, 11}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // Nulls are unequal. + { + auto const expect_map = int32s_col{0, 2, 3, 4, 5, 6, 7, 8, 9, 11}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, EmptyDeepList) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + + // List>, where all lists are empty: + // + // 0. [] + // 1. [] + // 2. Null + // 3. 
Null + + auto const keys = + lists_col{{lists_col{}, lists_col{}, lists_col{}, lists_col{}}, nulls_at({2, 3})}; + + auto const idx = int32s_col{1, 1, 2, 2}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const expect_map = int32s_col{0, 2}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // Nulls are unequal. + { + auto const expect_map = int32s_col{0, 2, 3}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, StructsOfStructs) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + + // +-----------------+ + // | s1{s2{a,b}, c} | + // +-----------------+ + // 0 | { {1, 1}, 5} | + // 1 | { {1, 1}, 5} | // Same as 0 + // 2 | { {1, 2}, 4} | + // 3 | { Null, 6} | + // 4 | { Null, 4} | + // 5 | { Null, 4} | // Same as 4 + // 6 | Null | + // 7 | Null | // Same as 6 + // 8 | { {2, 1}, 5} | + + auto s1 = [&] { + auto a = int32s_col{1, 1, 1, XXX, XXX, XXX, XXX, XXX, 2}; + auto b = int32s_col{1, 1, 2, XXX, XXX, XXX, XXX, XXX, 1}; + auto s2 = structs_col{{a, b}, nulls_at({3, 4, 5})}; + + auto c = int32s_col{5, 5, 4, 6, 4, 4, XXX, XXX, 5}; + std::vector> s1_children; + s1_children.emplace_back(s2.release()); + s1_children.emplace_back(c.release()); + auto const null_it = nulls_at({6, 7}); + return structs_col(std::move(s1_children), std::vector{null_it, null_it + 9}); + }(); + + auto const idx = int32s_col{0, 0, 2, 3, 4, 4, 6, 6, 8}; + auto const input = cudf::table_view{{idx, s1}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal. 
+ { + auto const expect_map = int32s_col{0, 2, 3, 4, 6, 8}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // Nulls are unequal. + { + auto const expect_map = int32s_col{0, 2, 3, 4, 4, 6, 6, 8}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, SlicedStructsOfStructs) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + + // +-----------------+ + // | s1{s2{a,b}, c} | + // +-----------------+ + // 0 | { {1, 1}, 5} | + // 1 | { {1, 1}, 5} | // Same as 0 + // 2 | { {1, 2}, 4} | + // 3 | { Null, 6} | + // 4 | { Null, 4} | + // 5 | { Null, 4} | // Same as 4 + // 6 | Null | + // 7 | Null | // Same as 6 + // 8 | { {2, 1}, 5} | + + auto s1 = [&] { + auto a = int32s_col{1, 1, XXX, XXX, XXX, XXX, 1, XXX, 2}; + auto b = int32s_col{1, 2, XXX, XXX, XXX, XXX, 1, XXX, 1}; + auto s2 = structs_col{{a, b}, nulls_at({3, 4, 5})}; + + auto c = int32s_col{5, 4, 6, 4, XXX, XXX, 5, 4, 5}; + std::vector> s1_children; + s1_children.emplace_back(s2.release()); + s1_children.emplace_back(c.release()); + auto const null_it = nulls_at({6, 7}); + return structs_col(std::move(s1_children), std::vector{null_it, null_it + 9}); + }(); + + auto const idx = int32s_col{0, 0, 2, 3, 4, 4, 6, 6, 8}; + auto const input_original = cudf::table_view{{idx, s1}}; + auto const input = cudf::slice(input_original, {1, 7})[0]; + auto const key_idx = std::vector{1}; + + // Nulls are equal. 
+ { + auto const expect_map = int32s_col{1, 2, 3, 4, 6}; + auto const expect_table = cudf::gather(input_original, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // Nulls are unequal. + { + auto const expect_map = int32s_col{1, 2, 3, 4, 4, 6}; + auto const expect_table = cudf::gather(input_original, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, StructsOfLists) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + + auto const idx = int32s_col{1, 1, 2, 3, 4, 4, 4, 5, 5, 6}; + auto const keys = [] { + // All child columns are identical. + auto child1 = lists_col{{1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}}; + auto child2 = lists_col{{1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}}; + auto child3 = lists_col{{1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}}; + return structs_col{{child1, child2, child3}}; + }(); + + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + auto const exp_idx = int32s_col{1, 2, 3, 4, 5, 6}; + auto const exp_keys = [] { + auto child1 = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto child2 = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto child3 = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + return structs_col{{child1, child2, child3}}; + }(); + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StableDistinctKeepFirstLastNone, StructsOfLists) +{ + auto const idx = int32s_col{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto const 
keys = [] { + // All child columns are identical. + auto child1 = lists_col{{1}, {1, 1}, {1}, {1, 2}, {2, 2}, {2}, {2}, {2, 1}, {2, 2}, {2, 2}}; + auto child2 = lists_col{{1}, {1, 1}, {1}, {1, 2}, {2, 2}, {2}, {2}, {2, 1}, {2, 2}, {2, 2}}; + auto child3 = lists_col{{1}, {1, 1}, {1}, {1, 2}, {2, 2}, {2}, {2}, {2, 1}, {2, 2}, {2, 2}}; + return structs_col{{child1, child2, child3}}; + }(); + + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const expect_map = int32s_col{0, 1, 3, 4, 5, 7}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // KEEP_LAST + { + auto const expect_map = int32s_col{1, 2, 3, 6, 7, 9}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // KEEP_NONE + { + auto const expect_map = int32s_col{1, 3, 7}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, SlicedStructsOfLists) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + + auto constexpr dont_care = int32_t{0}; + + auto const idx = int32s_col{dont_care, dont_care, 1, 1, 2, 3, 4, 4, 4, 5, 5, 6, dont_care}; + auto const keys = [] { + // All child columns are identical. 
+ auto child1 = lists_col{ + {0, 0}, {0, 0}, {1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}, {5, 5}}; + auto child2 = lists_col{ + {0, 0}, {0, 0}, {1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}, {5, 5}}; + auto child3 = lists_col{ + {0, 0}, {0, 0}, {1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}, {5, 5}}; + return structs_col{{child1, child2, child3}}; + }(); + + auto const input_original = cudf::table_view{{idx, keys}}; + auto const input = cudf::slice(input_original, {2, 12})[0]; + auto const key_idx = std::vector{1}; + + auto const exp_idx = int32s_col{1, 2, 3, 4, 5, 6}; + auto const exp_keys = [] { + auto child1 = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto child2 = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto child3 = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + return structs_col{{child1, child2, child3}}; + }(); + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +}