Skip to content

Commit

Permalink
Add cudf::stable_distinct public API, tests, and benchmarks. (#13392)
Browse files Browse the repository at this point in the history
This PR supersedes part of #11656.

It adds a public API for `cudf::stable_distinct`, mirroring that of `cudf::distinct` but preserving the order of the input table. The `stable_distinct` implementation was refactored to use `apply_boolean_mask`, which reduces the number of kernels needed. I also added tests/benchmarks for `cudf::stable_distinct`.

I split the C++ changes out of #11656 because that PR was getting too large. These C++ changes are also non-breaking, whereas the Python changes are breaking (and depend on these C++ changes), so separating them into a new PR seemed like a good idea.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Robert Maynard (https://github.com/robertmaynard)

URL: #13392
  • Loading branch information
bdice authored May 22, 2023
1 parent 383c3cf commit c32cb9a
Show file tree
Hide file tree
Showing 8 changed files with 1,533 additions and 48 deletions.
8 changes: 6 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,12 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp
# ##################################################################################################
# * stream_compaction benchmark -------------------------------------------------------------------
ConfigureNVBench(
STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp
stream_compaction/unique.cpp stream_compaction/unique_count.cpp
STREAM_COMPACTION_NVBENCH
stream_compaction/distinct.cpp
stream_compaction/distinct_count.cpp
stream_compaction/stable_distinct.cpp
stream_compaction/unique.cpp
stream_compaction/unique_count.cpp
)

# ##################################################################################################
Expand Down
97 changes: 97 additions & 0 deletions cpp/benchmarks/stream_compaction/stable_distinct.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/lists/list_view.hpp>
#include <cudf/stream_compaction.hpp>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>

// Tell nvbench how to print cudf::timestamp_ms, which is one of the entries on the "Type" axis of
// the stable_distinct benchmark registered further down in this file.
NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");

/**
 * @brief nvbench driver that times `cudf::stable_distinct` on a table of `Type` columns.
 *
 * A single random column with "NumRows" rows is generated (uniform distribution with bounds 0 and
 * 100, null probability 0.01) and viewed four times to form a four-column table. The timed region
 * deduplicates that table using column 0 as the only key.
 *
 * @tparam Type Element type of the generated column
 * @param state nvbench state that supplies the "NumRows" axis value and runs the measurement
 */
template <typename Type>
void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list<Type>)
{
  auto const dtype             = cudf::type_to_id<Type>();
  cudf::size_type const n_rows = state.get_int64("NumRows");

  // Describe the random data: values of `dtype` drawn uniformly, with a small fraction of nulls.
  data_profile gen_profile = data_profile_builder()
                               .cardinality(0)
                               .null_probability(0.01)
                               .distribution(dtype, distribution_id::UNIFORM, 0, 100);

  auto generated = create_random_column(dtype, row_count{n_rows}, gen_profile);

  // The same column is reused for every table column; only column 0 acts as the key below.
  auto const col_view    = generated->view();
  auto const keyed_table = cudf::table_view({col_view, col_view, col_view, col_view});

  // Time on the stream cudf uses by default so the measurement covers the work being launched.
  auto const bench_stream = nvbench::make_cuda_stream_view(cudf::get_default_stream().value());
  state.set_cuda_stream(bench_stream);

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    auto deduplicated = cudf::stable_distinct(keyed_table,
                                              {0},
                                              cudf::duplicate_keep_option::KEEP_ANY,
                                              cudf::null_equality::EQUAL,
                                              cudf::nan_equality::ALL_EQUAL);
  });
}

// Element types swept by the "Type" axis of the stable_distinct benchmark.
using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;

// Register nvbench_stable_distinct once per element type above and per "NumRows" value
// (10 thousand up to 10 million rows).
NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("stable_distinct")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});

/**
 * @brief nvbench driver comparing `cudf::stable_distinct` on a list column against a non-nested
 * column of comparable cardinality.
 *
 * A single-column random table of "ColumnSize" bytes is generated with the requested
 * "null_probability", and the timed region deduplicates it using column 0 as the key.
 *
 * @tparam Type Either a non-nested element type or `cudf::list_view`
 * @param state nvbench state that supplies the "ColumnSize" and "null_probability" axis values
 */
template <typename Type>
void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
{
  auto const dtype        = cudf::type_to_id<Type>();
  auto const total_bytes  = state.get_int64("ColumnSize");
  double const null_ratio = state.get_float64("null_probability");

  auto profile_builder = data_profile_builder().null_probability(null_ratio);
  if (dtype != cudf::type_id::LIST) {
    // We're comparing stable_distinct() on a non-nested column to that on a list column with the
    // same number of distinct rows. The max list size is 4 and the number of distinct values in
    // the list's child is 5, so the number of distinct rows in the list column is
    // 1 + 5 + 5^2 + 5^3 + 5^4 = 781. We want this column to also have 781 distinct values.
    // NOTE(review): if the distribution bounds are inclusive (as "0..4 gives 5 values" suggests),
    // 0..781 spans 782 values rather than 781 -- confirm whether this is intended.
    profile_builder.distribution(dtype, distribution_id::UNIFORM, 0, 781);
  } else {
    // Depth-1 lists whose length and INT32 child values are both drawn with bounds 0 and 4.
    profile_builder.distribution(dtype, distribution_id::UNIFORM, 0, 4)
      .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4)
      .list_depth(1);
  }

  auto const input = create_random_table(
    {dtype}, table_size_bytes{static_cast<size_t>(total_bytes)}, data_profile{profile_builder}, 0);

  // Time on the stream cudf uses by default so the measurement covers the work being launched.
  auto const bench_stream = nvbench::make_cuda_stream_view(cudf::get_default_stream().value());
  state.set_cuda_stream(bench_stream);

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    auto deduplicated = cudf::stable_distinct(*input,
                                              {0},
                                              cudf::duplicate_keep_option::KEEP_ANY,
                                              cudf::null_equality::EQUAL,
                                              cudf::nan_equality::ALL_EQUAL);
  });
}

// Register nvbench_stable_distinct_list for a non-nested type (int32_t) and for list columns, so
// the two can be compared at the same "ColumnSize" (passed to the generator as a size in bytes)
// with and without nulls.
NVBENCH_BENCH_TYPES(nvbench_stable_distinct_list,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
.set_name("stable_distinct_list")
.set_type_axes_names({"Type"})
.add_float64_axis("null_probability", {0.0, 0.1})
.add_int64_axis("ColumnSize", {100'000'000});
19 changes: 2 additions & 17 deletions cpp/include/cudf/detail/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,24 +86,9 @@ std::unique_ptr<table> distinct(table_view const& input,
rmm::mr::device_memory_resource* mr);

/**
* @brief Create a new table without duplicate rows.
* @copydoc cudf::stable_distinct
*
* Given an `input` table_view, each row is copied to the output table to create a set of distinct
* rows. The row order is guaranteed to be preserved as in the input.
*
* If there are duplicate rows, which row to be copied depends on the specified value of the `keep`
* parameter.
*
* This API produces exactly the same set of output rows as `cudf::distinct`.
*
* @param input The input table
* @param keys Vector of indices indicating key columns in the `input` table
* @param keep Copy any, first, last, or none of the found duplicates
* @param nulls_equal Flag to specify whether null elements should be considered as equal
* @param nans_equal Flag to specify whether NaN elements should be considered as equal
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table
* @return A table containing the resulting distinct rows
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<table> stable_distinct(table_view const& input,
std::vector<size_type> const& keys,
Expand Down
49 changes: 38 additions & 11 deletions cpp/include/cudf/stream_compaction.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -254,22 +254,19 @@ std::unique_ptr<table> unique(
* @brief Create a new table without duplicate rows.
*
* Given an `input` table_view, each row is copied to the output table to create a set of distinct
* rows. If there are duplicate rows, which row to be copied depends on the specified value of
* the `keep` parameter.
* rows. If there are duplicate rows, which row is copied depends on the `keep` parameter.
*
* The order of rows in the output table is not specified.
*
* Performance hint: if the input is pre-sorted, `cudf::unique` can produce an equivalent result
* (i.e., same set of output rows) but with less running time than `cudf::distinct`.
*
* @param[in] input input table_view to copy only distinct rows
* @param[in] keys vector of indices representing key columns from `input`
* @param[in] keep keep any, first, last, or none of the found duplicates
* @param[in] nulls_equal flag to control if nulls are compared equal or not
* @param[in] nans_equal flag to control if floating-point NaN values are compared equal or not
* @param[in] mr Device memory resource used to allocate the returned table's device
* memory
*
* @param input The input table
* @param keys Vector of indices indicating key columns in the `input` table
* @param keep Copy any, first, last, or none of the found duplicates
* @param nulls_equal Flag to specify whether null elements should be considered as equal
* @param nans_equal Flag to specify whether NaN elements should be considered as equal
* @param mr Device memory resource used to allocate the returned table
* @return Table with distinct rows in an unspecified order
*/
std::unique_ptr<table> distinct(
Expand All @@ -280,6 +277,36 @@ std::unique_ptr<table> distinct(
nan_equality nans_equal = nan_equality::ALL_EQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a new table without duplicate rows, preserving input order.
*
* Given an `input` table_view, each row is copied to the output table to create a set of distinct
* rows. The input row order is preserved. If there are duplicate rows, which row is copied depends
* on the `keep` parameter.
*
* This API produces the same output rows as `cudf::distinct`, but with input order preserved.
*
* Note that when `keep` is `KEEP_ANY`, the choice of which duplicate row to keep is arbitrary, but
* the returned table still retains the input order. For example, if the key column contains
* `1, 2, 1` and a values column contains `3, 4, 5`, the resulting values may be `3, 4` (first
* duplicate kept) or `4, 5` (last duplicate kept), but never `4, 3` or `5, 4`.
*
* @param input The input table
* @param keys Vector of indices indicating key columns in the `input` table
* @param keep Copy any, first, last, or none of the found duplicates
* @param nulls_equal Flag to specify whether null elements should be considered as equal
* @param nans_equal Flag to specify whether NaN elements should be considered as equal
* @param mr Device memory resource used to allocate the returned table
* @return Table with distinct rows, preserving input order
*/
std::unique_ptr<table> stable_distinct(
table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::ALL_EQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Count the number of consecutive groups of equivalent rows in a column.
*
Expand Down
37 changes: 27 additions & 10 deletions cpp/src/stream_compaction/stable_distinct.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

#include <thrust/iterator/constant_iterator.h>
#include <thrust/scatter.h>
#include <thrust/uninitialized_fill.h>

namespace cudf::detail {
namespace cudf {
namespace detail {

std::unique_ptr<table> stable_distinct(table_view const& input,
std::vector<size_type> const& keys,
Expand All @@ -45,7 +47,13 @@ std::unique_ptr<table> stable_distinct(table_view const& input,
stream,
rmm::mr::get_current_device_resource());

// Markers to denote which rows to be copied to the output.
// The only difference between this implementation and the unstable version
// is that the stable implementation must retain the input order. The
// distinct indices are not sorted, so we cannot simply copy the rows in the
// order of the distinct indices and retain the input order. Instead, we use
// a boolean mask to indicate which rows to copy to the output. This avoids
// the need to sort the distinct indices, which is slower.

auto const output_markers = [&] {
auto markers = rmm::device_uvector<bool>(input.num_rows(), stream);
thrust::uninitialized_fill(rmm::exec_policy(stream), markers.begin(), markers.end(), false);
Expand All @@ -58,13 +66,22 @@ std::unique_ptr<table> stable_distinct(table_view const& input,
return markers;
}();

return cudf::detail::copy_if(
input,
[output_markers = output_markers.begin()] __device__(auto const idx) {
return *(output_markers + idx);
},
stream,
mr);
return cudf::detail::apply_boolean_mask(
input, cudf::device_span<bool const>(output_markers), stream, mr);
}

} // namespace detail

// Public entry point for stable_distinct: opens a profiling range for the call and forwards every
// argument to detail::stable_distinct, running it on cudf's default stream.
std::unique_ptr<table> stable_distinct(table_view const& input,
                                       std::vector<size_type> const& keys,
                                       duplicate_keep_option keep,
                                       null_equality nulls_equal,
                                       nan_equality nans_equal,
                                       rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = cudf::get_default_stream();
  return detail::stable_distinct(input, keys, keep, nulls_equal, nans_equal, stream, mr);
}

} // namespace cudf::detail
} // namespace cudf
3 changes: 2 additions & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -401,8 +401,9 @@ ConfigureTest(
stream_compaction/apply_boolean_mask_tests.cpp
stream_compaction/distinct_count_tests.cpp
stream_compaction/distinct_tests.cpp
stream_compaction/drop_nulls_tests.cpp
stream_compaction/drop_nans_tests.cpp
stream_compaction/drop_nulls_tests.cpp
stream_compaction/stable_distinct_tests.cpp
stream_compaction/unique_count_tests.cpp
stream_compaction/unique_tests.cpp
)
Expand Down
14 changes: 7 additions & 7 deletions cpp/tests/stream_compaction/distinct_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ TEST_F(DistinctKeepAny, EmptyInputTable)
{
int32s_col col(std::initializer_list<int32_t>{});
cudf::table_view input{{col}};
std::vector<cudf::size_type> key_idx{1, 2};
std::vector<cudf::size_type> key_idx{0};

auto got = cudf::distinct(input, key_idx, KEEP_ANY);
CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view());
Expand Down Expand Up @@ -1217,11 +1217,11 @@ TEST_F(DistinctKeepAny, StructsOfStructs)
// 8 | { {2, 1}, 5} |

auto s1 = [&] {
auto a = int32s_col{1, 1, XXX, XXX, 2, 1, 1, XXX, 2};
auto b = int32s_col{1, 2, XXX, XXX, 2, 1, 1, XXX, 1};
auto a = int32s_col{1, 1, XXX, XXX, XXX, XXX, 1, XXX, 2};
auto b = int32s_col{1, 2, XXX, XXX, XXX, XXX, 1, XXX, 1};
auto s2 = structs_col{{a, b}, nulls_at({2, 3, 7})};

auto c = int32s_col{5, 4, 6, 4, 3, 3, 5, 4, 5};
auto c = int32s_col{5, 4, 6, 4, XXX, XXX, 5, 4, 5};
std::vector<std::unique_ptr<cudf::column>> s1_children;
s1_children.emplace_back(s2.release());
s1_children.emplace_back(c.release());
Expand Down Expand Up @@ -1270,11 +1270,11 @@ TEST_F(DistinctKeepAny, SlicedStructsOfStructs)
// 8 | { {2, 1}, 5} |

auto s1 = [&] {
auto a = int32s_col{1, 1, 2, 2, 2, 1, 1, 1, 2};
auto b = int32s_col{1, 2, 1, 2, 2, 1, 1, 1, 1};
auto a = int32s_col{1, 1, XXX, XXX, XXX, XXX, 1, XXX, 2};
auto b = int32s_col{1, 2, XXX, XXX, XXX, XXX, 1, XXX, 1};
auto s2 = structs_col{{a, b}, nulls_at({2, 3, 7})};

auto c = int32s_col{5, 4, 6, 4, 3, 3, 5, 4, 5};
auto c = int32s_col{5, 4, 6, 4, XXX, XXX, 5, 4, 5};
std::vector<std::unique_ptr<cudf::column>> s1_children;
s1_children.emplace_back(s2.release());
s1_children.emplace_back(c.release());
Expand Down
Loading

0 comments on commit c32cb9a

Please sign in to comment.