Skip to content

Commit

Permalink
Change cudf::detail::concatenate_masks to return null-count (#13330)
Browse files Browse the repository at this point in the history
Changes `cudf::detail::concatenate_masks` to also return the null count. This saves computing the null count in a separate kernel launch where this function is used. Also changes the `detail/concatenate.cuh` header to `detail/concatenate_masks.hpp` since it contains no device code and only includes the `detail::concatenate_masks()` functions. The `detail::concatenate()` functions are already declared in `detail/concatenate.hpp`.
This is marked as non-breaking since it only affects a detail function.
Reference: #11968

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Bradley Dice (https://github.com/bdice)
  - AJ Schmidt (https://github.com/ajschmidt8)

URL: #13330
  • Loading branch information
davidwendt authored May 15, 2023
1 parent b3f89c7 commit 23a77a6
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 58 deletions.
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ outputs:
- test -f $PREFIX/include/cudf/detail/binaryop.hpp
- test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh
- test -f $PREFIX/include/cudf/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/detail/concatenate_masks.hpp
- test -f $PREFIX/include/cudf/detail/contiguous_split.hpp
- test -f $PREFIX/include/cudf/detail/copy.hpp
- test -f $PREFIX/include/cudf/detail/datetime.hpp
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,14 +17,11 @@

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_view.hpp>
#include <cudf/concatenate.hpp>
#include <cudf/detail/concatenate.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <vector>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

namespace cudf {
//! Inner interfaces and implementations
Expand All @@ -39,12 +36,13 @@ namespace detail {
* @param dest_mask The output buffer to copy null masks into
* @param output_size The total number of null masks bits that are being copied
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return The number of nulls
*/
void concatenate_masks(device_span<column_device_view const> d_views,
device_span<size_t const> d_offsets,
bitmask_type* dest_mask,
size_type output_size,
rmm::cuda_stream_view stream);
size_type concatenate_masks(device_span<column_device_view const> d_views,
device_span<size_t const> d_offsets,
bitmask_type* dest_mask,
size_type output_size,
rmm::cuda_stream_view stream);

/**
* @brief Concatenates `views[i]`'s bitmask from the bits
Expand All @@ -54,10 +52,11 @@ void concatenate_masks(device_span<column_device_view const> d_views,
* @param views Column views whose bitmasks will be concatenated
* @param dest_mask The output buffer to copy null masks into
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return The number of nulls
*/
void concatenate_masks(host_span<column_view const> views,
bitmask_type* dest_mask,
rmm::cuda_stream_view stream);
size_type concatenate_masks(host_span<column_view const> views,
bitmask_type* dest_mask,
rmm::cuda_stream_view stream);

/**
* @copydoc cudf::concatenate_masks(host_span<column_view const>, rmm::mr::device_memory_resource*)
Expand Down
65 changes: 42 additions & 23 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cudf/detail/concatenate.cuh>

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/concatenate_masks.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
Expand Down Expand Up @@ -49,6 +49,7 @@

namespace cudf {
namespace detail {
namespace {

// From benchmark data, the fused kernel optimization appears to perform better
// when there are more than a trivial number of columns, or when the null mask
Expand Down Expand Up @@ -100,23 +101,29 @@ auto create_device_views(host_span<column_view const> views, rmm::cuda_stream_vi
* @brief Concatenates the null mask bits of all the column device views in the
* `views` array to the destination bitmask.
*
* @tparam block_size Block size for using with single_lane_block_sum_reduce
*
* @param views Array of column_device_view
* @param output_offsets Prefix sum of sizes of elements of `views`
* @param number_of_views Size of `views` array
* @param dest_mask The output buffer to copy null masks into
* @param number_of_mask_bits The total number of null masks bits that are being
* copied
* @param number_of_mask_bits The total number of null masks bits that are being copied
* @param out_valid_count To hold the total number of valid bits set
*/
template <size_type block_size>
__global__ void concatenate_masks_kernel(column_device_view const* views,
size_t const* output_offsets,
size_type number_of_views,
bitmask_type* dest_mask,
size_type number_of_mask_bits)
size_type number_of_mask_bits,
size_type* out_valid_count)
{
size_type mask_index = threadIdx.x + blockIdx.x * blockDim.x;

auto active_mask = __ballot_sync(0xFFFF'FFFFu, mask_index < number_of_mask_bits);

size_type warp_valid_count = 0;

while (mask_index < number_of_mask_bits) {
size_type const source_view_index =
thrust::upper_bound(
Expand All @@ -129,42 +136,55 @@ __global__ void concatenate_masks_kernel(column_device_view const* views,
}
bitmask_type const new_word = __ballot_sync(active_mask, bit_is_set);

if (threadIdx.x % detail::warp_size == 0) { dest_mask[word_index(mask_index)] = new_word; }
if (threadIdx.x % detail::warp_size == 0) {
dest_mask[word_index(mask_index)] = new_word;
warp_valid_count += __popc(new_word);
}

mask_index += blockDim.x * gridDim.x;
active_mask = __ballot_sync(active_mask, mask_index < number_of_mask_bits);
}

using detail::single_lane_block_sum_reduce;
auto const block_valid_count = single_lane_block_sum_reduce<block_size, 0>(warp_valid_count);
if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); }
}
} // namespace

void concatenate_masks(device_span<column_device_view const> d_views,
device_span<size_t const> d_offsets,
bitmask_type* dest_mask,
size_type output_size,
rmm::cuda_stream_view stream)
size_type concatenate_masks(device_span<column_device_view const> d_views,
device_span<size_t const> d_offsets,
bitmask_type* dest_mask,
size_type output_size,
rmm::cuda_stream_view stream)
{
rmm::device_scalar<size_type> d_valid_count(0, stream);
constexpr size_type block_size{256};
cudf::detail::grid_1d config(output_size, block_size);
concatenate_masks_kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
d_views.data(),
d_offsets.data(),
static_cast<size_type>(d_views.size()),
dest_mask,
output_size);
concatenate_masks_kernel<block_size>
<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
d_views.data(),
d_offsets.data(),
static_cast<size_type>(d_views.size()),
dest_mask,
output_size,
d_valid_count.data());
return output_size - d_valid_count.value(stream);
}

void concatenate_masks(host_span<column_view const> views,
bitmask_type* dest_mask,
rmm::cuda_stream_view stream)
size_type concatenate_masks(host_span<column_view const> views,
bitmask_type* dest_mask,
rmm::cuda_stream_view stream)
{
// Preprocess and upload inputs to device memory
auto const device_views = create_device_views(views, stream);
auto const& d_views = std::get<1>(device_views);
auto const& d_offsets = std::get<2>(device_views);
auto const output_size = std::get<3>(device_views);

concatenate_masks(d_views, d_offsets, dest_mask, output_size, stream);
return concatenate_masks(d_views, d_offsets, dest_mask, output_size, stream);
}

namespace {
template <typename T, size_type block_size, bool Nullable>
__global__ void fused_concatenate_kernel(column_device_view const* input_views,
size_t const* input_offsets,
Expand Down Expand Up @@ -287,7 +307,8 @@ std::unique_ptr<column> for_each_concatenate(host_span<column_view const> views,

// If concatenated column is nullable, proceed to calculate it
if (has_nulls) {
cudf::detail::concatenate_masks(views, (col->mutable_view()).null_mask(), stream);
col->set_null_count(
cudf::detail::concatenate_masks(views, (col->mutable_view()).null_mask(), stream));
} else {
col->set_null_count(0); // prevent null count from being materialized
}
Expand Down Expand Up @@ -340,8 +361,6 @@ std::unique_ptr<column> concatenate_dispatch::operator()<cudf::struct_view>()
return cudf::structs::detail::concatenate(views, stream, mr);
}

namespace {

void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_view stream);

/**
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dictionary/add_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

#include <cudf/detail/concatenate.cuh>
#include <cudf/detail/concatenate.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dictionary/detail/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/concatenate.cuh>
#include <cudf/detail/concatenate.hpp>
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/sorting.hpp>
Expand Down
1 change: 0 additions & 1 deletion cpp/src/join/hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#include "join_common_utils.cuh"

#include <cudf/copying.hpp>
#include <cudf/detail/concatenate.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/join.hpp>
#include <cudf/detail/structs/utilities.hpp>
Expand Down
15 changes: 6 additions & 9 deletions cpp/src/lists/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/concatenate.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/concatenate.cuh>
#include <cudf/detail/concatenate.hpp>
#include <cudf/detail/concatenate_masks.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.cuh>
#include <cudf/lists/lists_column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand Down Expand Up @@ -124,14 +126,9 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
rmm::device_buffer null_mask = create_null_mask(
total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED);
cudf::size_type null_count{0};
if (has_nulls) {
cudf::detail::concatenate_masks(columns, static_cast<bitmask_type*>(null_mask.data()), stream);
null_count =
std::transform_reduce(columns.begin(), columns.end(), 0, std::plus{}, [](auto const& col) {
return col.null_count();
});
}
auto null_mask_data = static_cast<bitmask_type*>(null_mask.data());
auto const null_count =
has_nulls ? cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0};

// assemble into outgoing list column
return make_lists_column(total_list_count,
Expand Down
14 changes: 5 additions & 9 deletions cpp/src/structs/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/concatenate.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/concatenate.cuh>
#include <cudf/detail/concatenate.hpp>
#include <cudf/detail/concatenate_masks.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/structs/utilities.hpp>
#include <cudf/structs/structs_column_view.hpp>
Expand Down Expand Up @@ -65,14 +66,9 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
rmm::device_buffer null_mask =
create_null_mask(total_length, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED);
cudf::size_type null_count{0};
if (has_nulls) {
cudf::detail::concatenate_masks(columns, static_cast<bitmask_type*>(null_mask.data()), stream);
null_count =
std::transform_reduce(columns.begin(), columns.end(), 0, std::plus{}, [](auto const& col) {
return col.null_count();
});
}
auto null_mask_data = static_cast<bitmask_type*>(null_mask.data());
auto const null_count =
has_nulls ? cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0};

// assemble into outgoing list column
return make_structs_column(
Expand Down

0 comments on commit 23a77a6

Please sign in to comment.