Skip to content

Commit

Permalink
Fix typos, rename types, and add null_probability benchmark axis for distinct (rapidsai#17546)
Browse files Browse the repository at this point in the history

This PR addresses several minor issues discovered while working on rapidsai#17467:

- Corrected a misnamed template parameter: `RowHasher` should have been `RowEqual`, because it names the row equality comparator type, not the hasher
- Renamed `hash_set_type` to `distinct_set_t`
- Added a `null_probability` benchmark axis for the distinct benchmark, similar to other stream compaction benchmarks

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: rapidsai#17546
  • Loading branch information
PointKernel authored Dec 10, 2024
1 parent 5306eca commit 657f50b
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 14 deletions.
4 changes: 3 additions & 1 deletion cpp/benchmarks/stream_compaction/distinct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)
cudf::size_type const num_rows = state.get_int64("NumRows");
auto const keep = get_keep(state.get_string("keep"));
cudf::size_type const cardinality = state.get_int64("cardinality");
auto const null_probability = state.get_float64("null_probability");

if (cardinality > num_rows) {
state.skip("cardinality > num_rows");
Expand All @@ -42,7 +43,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)

data_profile profile = data_profile_builder()
.cardinality(cardinality)
.null_probability(0.01)
.null_probability(null_probability)
.distribution(cudf::type_to_id<Type>(),
distribution_id::UNIFORM,
static_cast<Type>(0),
Expand All @@ -65,6 +66,7 @@ using data_type = nvbench::type_list<int32_t, int64_t>;
// Registers the "distinct" benchmark, instantiating nvbench_distinct for each type in `data_type`.
// The axis names below must match the strings nvbench_distinct reads from the state:
//   - "null_probability": float64 axis forwarded to the data profile (currently one value, 0.01)
//   - "keep": string axis converted by get_keep() inside the benchmark
//   - "cardinality" / "NumRows": int64 axes; nvbench_distinct skips any combination where
//     cardinality > NumRows, so not every point of the cross product is actually run.
NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
.set_name("distinct")
.set_type_axes_names({"Type"})
.add_float64_axis("null_probability", {0.01})
.add_string_axis("keep", {"any", "first", "last", "none"})
.add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000})
.add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000});
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/stream_compaction/distinct.cu
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ rmm::device_uvector<size_type> distinct_indices(table_view const& input,
auto const row_equal = cudf::experimental::row::equality::self_comparator(preprocessed_input);

auto const helper_func = [&](auto const& d_equal) {
using RowHasher = std::decay_t<decltype(d_equal)>;
auto set = hash_set_type<RowHasher>{
using RowEqual = std::decay_t<decltype(d_equal)>;
auto set = distinct_set_t<RowEqual>{
num_rows,
0.5, // desired load factor
cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
Expand Down
12 changes: 6 additions & 6 deletions cpp/src/stream_compaction/distinct_helpers.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@

namespace cudf::detail {

template <typename RowHasher>
rmm::device_uvector<size_type> reduce_by_row(hash_set_type<RowHasher>& set,
template <typename RowEqual>
rmm::device_uvector<size_type> reduce_by_row(distinct_set_t<RowEqual>& set,
size_type num_rows,
duplicate_keep_option keep,
rmm::cuda_stream_view stream,
Expand Down Expand Up @@ -100,7 +100,7 @@ rmm::device_uvector<size_type> reduce_by_row(hash_set_type<RowHasher>& set,
}

template rmm::device_uvector<size_type> reduce_by_row(
hash_set_type<cudf::experimental::row::equality::device_row_comparator<
distinct_set_t<cudf::experimental::row::equality::device_row_comparator<
false,
cudf::nullate::DYNAMIC,
cudf::experimental::row::equality::nan_equal_physical_equality_comparator>>& set,
Expand All @@ -110,7 +110,7 @@ template rmm::device_uvector<size_type> reduce_by_row(
rmm::device_async_resource_ref mr);

template rmm::device_uvector<size_type> reduce_by_row(
hash_set_type<cudf::experimental::row::equality::device_row_comparator<
distinct_set_t<cudf::experimental::row::equality::device_row_comparator<
true,
cudf::nullate::DYNAMIC,
cudf::experimental::row::equality::nan_equal_physical_equality_comparator>>& set,
Expand All @@ -120,7 +120,7 @@ template rmm::device_uvector<size_type> reduce_by_row(
rmm::device_async_resource_ref mr);

template rmm::device_uvector<size_type> reduce_by_row(
hash_set_type<cudf::experimental::row::equality::device_row_comparator<
distinct_set_t<cudf::experimental::row::equality::device_row_comparator<
false,
cudf::nullate::DYNAMIC,
cudf::experimental::row::equality::physical_equality_comparator>>& set,
Expand All @@ -130,7 +130,7 @@ template rmm::device_uvector<size_type> reduce_by_row(
rmm::device_async_resource_ref mr);

template rmm::device_uvector<size_type> reduce_by_row(
hash_set_type<cudf::experimental::row::equality::device_row_comparator<
distinct_set_t<cudf::experimental::row::equality::device_row_comparator<
true,
cudf::nullate::DYNAMIC,
cudf::experimental::row::equality::physical_equality_comparator>>& set,
Expand Down
12 changes: 7 additions & 5 deletions cpp/src/stream_compaction/distinct_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,12 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
}
}

template <typename RowHasher>
using hash_set_type =
template <typename RowEqual>
using distinct_set_t =
cuco::static_set<size_type,
cuco::extent<int64_t>,
cuda::thread_scope_device,
RowHasher,
RowEqual,
cuco::linear_probing<1,
cudf::experimental::row::hash::device_row_hasher<
cudf::hashing::detail::default_hash,
Expand All @@ -79,6 +79,8 @@ using hash_set_type =
* the `reduction_init_value()` function. Then, the reduction result for each row group is written
* into the output array at the index of an unspecified row in the group.
*
* @tparam RowEqual The type of row equality comparator
*
* @param set The auxiliary set to perform reduction
* @param set_size The number of elements in set
* @param num_rows The number of all input rows
Expand All @@ -87,8 +89,8 @@ using hash_set_type =
* @param mr Device memory resource used to allocate the returned vector
* @return A device_uvector containing the output indices
*/
template <typename RowHasher>
rmm::device_uvector<size_type> reduce_by_row(hash_set_type<RowHasher>& set,
template <typename RowEqual>
rmm::device_uvector<size_type> reduce_by_row(distinct_set_t<RowEqual>& set,
size_type num_rows,
duplicate_keep_option keep,
rmm::cuda_stream_view stream,
Expand Down

0 comments on commit 657f50b

Please sign in to comment.