Skip to content

Commit

Permalink
Fix hash join when the input tables have nulls on only one side (#13120)
Browse files Browse the repository at this point in the history
This is very similar to #11284, which fixes a bug when only one input table has nulls while the other doesn't. This is due to the new experimental hasher producing different hash values depending on an input flag `has_nulls`. In order to properly use it, `has_nulls` must be computed by checking all the possible input tables, or set to a constant value (`true`).

Closes:
 * #13109

Authors:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Divye Gala (https://github.com/divyegala)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #13120
  • Loading branch information
ttnghia authored Apr 13, 2023
1 parent 3069f1e commit d415ffe
Show file tree
Hide file tree
Showing 8 changed files with 263 additions and 68 deletions.
15 changes: 12 additions & 3 deletions cpp/benchmarks/join/join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ void nvbench_inner_join(nvbench::state& state,
cudf::table_view const& right_input,
cudf::null_equality compare_nulls,
rmm::cuda_stream_view stream) {
cudf::hash_join hj_obj(left_input, compare_nulls, stream);
auto const has_nulls = cudf::has_nested_nulls(left_input) || cudf::has_nested_nulls(right_input)
? cudf::nullable_join::YES
: cudf::nullable_join::NO;
cudf::hash_join hj_obj(left_input, has_nulls, compare_nulls, stream);
return hj_obj.inner_join(right_input, std::nullopt, stream);
};

Expand All @@ -44,7 +47,10 @@ void nvbench_left_join(nvbench::state& state,
cudf::table_view const& right_input,
cudf::null_equality compare_nulls,
rmm::cuda_stream_view stream) {
cudf::hash_join hj_obj(left_input, compare_nulls, stream);
auto const has_nulls = cudf::has_nested_nulls(left_input) || cudf::has_nested_nulls(right_input)
? cudf::nullable_join::YES
: cudf::nullable_join::NO;
cudf::hash_join hj_obj(left_input, has_nulls, compare_nulls, stream);
return hj_obj.left_join(right_input, std::nullopt, stream);
};

Expand All @@ -61,7 +67,10 @@ void nvbench_full_join(nvbench::state& state,
cudf::table_view const& right_input,
cudf::null_equality compare_nulls,
rmm::cuda_stream_view stream) {
cudf::hash_join hj_obj(left_input, compare_nulls, stream);
auto const has_nulls = cudf::has_nested_nulls(left_input) || cudf::has_nested_nulls(right_input)
? cudf::nullable_join::YES
: cudf::nullable_join::NO;
cudf::hash_join hj_obj(left_input, has_nulls, compare_nulls, stream);
return hj_obj.full_join(right_input, std::nullopt, stream);
};

Expand Down
11 changes: 7 additions & 4 deletions cpp/include/cudf/detail/join.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,10 @@ struct hash_join {
hash_join& operator=(hash_join&&) = delete;

private:
bool const _is_empty; ///< true if `_hash_table` is empty
rmm::device_buffer const _composite_bitmask; ///< Bitmask to denote whether a row is valid
cudf::null_equality const _nulls_equal; ///< whether to consider nulls as equal
cudf::table_view _build; ///< input table to build the hash map
bool const _is_empty; ///< true if `_hash_table` is empty
bool const _has_nulls; ///< true if nulls are present in either build table or any probe table
cudf::null_equality const _nulls_equal; ///< whether to consider nulls as equal
cudf::table_view _build; ///< input table to build the hash map
std::shared_ptr<cudf::experimental::row::equality::preprocessed_table>
_preprocessed_build; ///< input table preprocssed for row operators
map_type _hash_table; ///< hash table built on `_build`
Expand All @@ -89,10 +89,13 @@ struct hash_join {
* @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE.
*
* @param build The build table, from which the hash table is built.
* @param has_nulls Flag to indicate whether there exist any nulls in the `build` table or
* any `probe` table that will be used later for join.
* @param compare_nulls Controls whether null join-key values should match or not.
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
hash_join(cudf::table_view const& build,
bool has_nulls,
cudf::null_equality compare_nulls,
rmm::cuda_stream_view stream);

Expand Down
41 changes: 40 additions & 1 deletion cpp/include/cudf/join.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -256,6 +256,16 @@ std::unique_ptr<cudf::table> cross_join(
cudf::table_view const& right,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Tells a hash_join object whether nulls may occur in any table taking part in the join,
* i.e. the `build` table or any `probe` table supplied later.
*
* The value is passed when the hash_join object is constructed and describes all possible input
* tables together. When the existence of nulls is not known in advance, `YES` should be used as
* the default option.
*
* @note The underlying type is `bool` and the enumerators follow declaration order, so `YES` has
* the underlying value `false` and `NO` has the underlying value `true`. Compare against the
* enumerators instead of converting the value to `bool`.
*/
enum class nullable_join : bool { YES = false, NO = true };

/**
* @brief Hash join that builds hash table in creation and probes results in subsequent `*_join`
* member functions.
Expand Down Expand Up @@ -289,6 +299,17 @@ class hash_join {
null_equality compare_nulls,
rmm::cuda_stream_view stream = cudf::get_default_stream());

/**
* @copydoc hash_join(cudf::table_view const&, null_equality, rmm::cuda_stream_view)
*
* @param has_nulls Flag to indicate whether there exist any nulls in the `build` table or
* any `probe` table that will be used later for join. Use `nullable_join::YES` when this is
* not known in advance.
*/
hash_join(cudf::table_view const& build,
nullable_join has_nulls,
null_equality compare_nulls,
rmm::cuda_stream_view stream = cudf::get_default_stream());

/**
* Returns the row indices that can be used to construct the result of performing
* an inner join between two tables. @see cudf::inner_join(). Behavior is undefined if the
Expand All @@ -300,6 +321,9 @@ class hash_join {
* @param mr Device memory resource used to allocate the returned table and columns' device
* memory.
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
* the result of performing an inner join between two tables with `build` and `probe`
* as the join keys.
Expand All @@ -322,6 +346,9 @@ class hash_join {
* @param mr Device memory resource used to allocate the returned table and columns' device
* memory.
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
* the result of performing a left join between two tables with `build` and `probe`
* as the join keys.
Expand All @@ -344,6 +371,9 @@ class hash_join {
* @param mr Device memory resource used to allocate the returned table and columns' device
* memory.
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
* the result of performing a full join between two tables with `build` and `probe`
* as the join keys.
Expand All @@ -362,6 +392,9 @@ class hash_join {
* @param probe The probe table, from which the tuples are probed
* @param stream CUDA stream used for device memory operations and kernel launches
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return The exact number of output when performing an inner join between two tables with
* `build` and `probe` as the join keys.
*/
Expand All @@ -375,6 +408,9 @@ class hash_join {
* @param probe The probe table, from which the tuples are probed
* @param stream CUDA stream used for device memory operations and kernel launches
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return The exact number of output when performing a left join between two tables with `build`
* and `probe` as the join keys.
*/
Expand All @@ -390,6 +426,9 @@ class hash_join {
* @param mr Device memory resource used to allocate the intermediate table and columns' device
* memory.
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return The exact number of output when performing a full join between two tables with `build`
* and `probe` as the join keys.
*/
Expand Down
118 changes: 71 additions & 47 deletions cpp/src/join/hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -359,11 +359,11 @@ std::size_t get_full_join_size(

template <typename Hasher>
hash_join<Hasher>::hash_join(cudf::table_view const& build,
bool has_nulls,
cudf::null_equality compare_nulls,
rmm::cuda_stream_view stream)
: _is_empty{build.num_rows() == 0},
_composite_bitmask{
cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()).first},
: _has_nulls(has_nulls),
_is_empty{build.num_rows() == 0},
_nulls_equal{compare_nulls},
_hash_table{compute_hash_table_size(build.num_rows()),
cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
Expand All @@ -381,11 +381,14 @@ hash_join<Hasher>::hash_join(cudf::table_view const& build,

if (_is_empty) { return; }

auto const row_bitmask =
cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()).first;
cudf::detail::build_join_hash_table(_build,
_preprocessed_build,
_hash_table,
_has_nulls,
_nulls_equal,
static_cast<bitmask_type const*>(_composite_bitmask.data()),
reinterpret_cast<bitmask_type const*>(row_bitmask.data()),
stream);
}

Expand Down Expand Up @@ -434,19 +437,21 @@ std::size_t hash_join<Hasher>::inner_join_size(cudf::table_view const& probe,
// Return directly if build table is empty
if (_is_empty) { return 0; }

CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe),
"Probe table has nulls while build table was not hashed with null check.");

auto const preprocessed_probe =
cudf::experimental::row::equality::preprocessed_table::create(probe, stream);

return cudf::detail::compute_join_output_size(
_build,
probe,
_preprocessed_build,
preprocessed_probe,
_hash_table,
cudf::detail::join_kind::INNER_JOIN,
cudf::has_nested_nulls(probe) || cudf::has_nested_nulls(_build),
_nulls_equal,
stream);
return cudf::detail::compute_join_output_size(_build,
probe,
_preprocessed_build,
preprocessed_probe,
_hash_table,
cudf::detail::join_kind::INNER_JOIN,
_has_nulls,
_nulls_equal,
stream);
}

template <typename Hasher>
Expand All @@ -458,19 +463,21 @@ std::size_t hash_join<Hasher>::left_join_size(cudf::table_view const& probe,
// Trivial left join case - exit early
if (_is_empty) { return probe.num_rows(); }

CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe),
"Probe table has nulls while build table was not hashed with null check.");

auto const preprocessed_probe =
cudf::experimental::row::equality::preprocessed_table::create(probe, stream);

return cudf::detail::compute_join_output_size(
_build,
probe,
_preprocessed_build,
preprocessed_probe,
_hash_table,
cudf::detail::join_kind::LEFT_JOIN,
cudf::has_nested_nulls(probe) || cudf::has_nested_nulls(_build),
_nulls_equal,
stream);
return cudf::detail::compute_join_output_size(_build,
probe,
_preprocessed_build,
preprocessed_probe,
_hash_table,
cudf::detail::join_kind::LEFT_JOIN,
_has_nulls,
_nulls_equal,
stream);
}

template <typename Hasher>
Expand All @@ -483,19 +490,21 @@ std::size_t hash_join<Hasher>::full_join_size(cudf::table_view const& probe,
// Trivial full join case - exit early
if (_is_empty) { return probe.num_rows(); }

CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe),
"Probe table has nulls while build table was not hashed with null check.");

auto const preprocessed_probe =
cudf::experimental::row::equality::preprocessed_table::create(probe, stream);

return cudf::detail::get_full_join_size(
_build,
probe,
_preprocessed_build,
preprocessed_probe,
_hash_table,
cudf::has_nested_nulls(probe) || cudf::has_nested_nulls(_build),
_nulls_equal,
stream,
mr);
return cudf::detail::get_full_join_size(_build,
probe,
_preprocessed_build,
preprocessed_probe,
_hash_table,
_has_nulls,
_nulls_equal,
stream,
mr);
}

template <typename Hasher>
Expand All @@ -514,20 +523,22 @@ hash_join<Hasher>::probe_join_indices(cudf::table_view const& probe_table,

CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null.");

CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe_table),
"Probe table has nulls while build table was not hashed with null check.");

auto const preprocessed_probe =
cudf::experimental::row::equality::preprocessed_table::create(probe_table, stream);
auto join_indices = cudf::detail::probe_join_hash_table(
_build,
probe_table,
_preprocessed_build,
preprocessed_probe,
_hash_table,
join,
cudf::has_nested_nulls(probe_table) || cudf::has_nested_nulls(_build),
_nulls_equal,
output_size,
stream,
mr);
auto join_indices = cudf::detail::probe_join_hash_table(_build,
probe_table,
_preprocessed_build,
preprocessed_probe,
_hash_table,
join,
_has_nulls,
_nulls_equal,
output_size,
stream,
mr);

if (join == cudf::detail::join_kind::FULL_JOIN) {
auto complement_indices = detail::get_left_join_indices_complement(
Expand All @@ -553,6 +564,9 @@ hash_join<Hasher>::compute_hash_join(cudf::table_view const& probe,
CUDF_EXPECTS(_build.num_columns() == probe.num_columns(),
"Mismatch in number of columns to be joined on");

CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe),
"Probe table has nulls while build table was not hashed with null check.");

if (is_trivial_join(probe, _build, join)) {
return std::pair(std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr),
std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
Expand All @@ -574,7 +588,17 @@ hash_join::~hash_join() = default;
hash_join::hash_join(cudf::table_view const& build,
null_equality compare_nulls,
rmm::cuda_stream_view stream)
: _impl{std::make_unique<const impl_type>(build, compare_nulls, stream)}
// If we cannot know beforehand about null existence then let's assume that there are nulls.
: hash_join(build, nullable_join::YES, compare_nulls, stream)
{
}

// Constructs the implementation object, translating the public `nullable_join` flag into the
// boolean it takes: `true` only when the caller passed `nullable_join::YES`. The comparison is
// used rather than a cast to `bool`.
hash_join::hash_join(cudf::table_view const& build,
nullable_join has_nulls,
null_equality compare_nulls,
rmm::cuda_stream_view stream)
: _impl{std::make_unique<const impl_type>(
build, has_nulls == nullable_join::YES, compare_nulls, stream)}
{
}

Expand Down
Loading

0 comments on commit d415ffe

Please sign in to comment.