Skip to content

Commit

Permalink
Fix valid count computation in offset_bitmask_binop kernel (#13489)
Browse files Browse the repository at this point in the history
Fixes the valid count calculation in the `offset_bitmask_binop` kernel when the mask may contain slack bits. The slack bits must be accounted for in the final bitmask word to correctly compute the valid count.

Closes #13479 

Authors:
   - David Wendt (https://github.com/davidwendt)

Approvers:
   - Vyas Ramasubramani (https://github.com/vyasr)
   - Bradley Dice (https://github.com/bdice)
   - Lawrence Mitchell (https://github.com/wence-)
  • Loading branch information
davidwendt authored Jun 2, 2023
1 parent 5541e64 commit 6bb0328
Showing 1 changed file with 12 additions and 11 deletions.
23 changes: 12 additions & 11 deletions cpp/include/cudf/detail/null_mask.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,11 @@ __global__ void offset_bitmask_binop(Binop op,
size_type source_size_bits,
size_type* count_ptr)
{
constexpr auto const word_size{detail::size_in_bits<bitmask_type>()};
auto const tid = threadIdx.x + blockIdx.x * blockDim.x;

auto const last_bit_index = source_size_bits - 1;
auto const last_word_index = cudf::word_index(last_bit_index);

size_type thread_count = 0;

for (size_type destination_word_index = tid; destination_word_index < destination.size();
Expand All @@ -86,20 +88,19 @@ __global__ void offset_bitmask_binop(Binop op,
source_begin_bits[i] + source_size_bits));
}

if (destination_word_index == last_word_index) {
// mask out any bits not part of this word
auto const num_bits_in_last_word = intra_word_index(last_bit_index);
if (num_bits_in_last_word <
static_cast<size_type>(detail::size_in_bits<bitmask_type>() - 1)) {
destination_word &= set_least_significant_bits(num_bits_in_last_word + 1);
}
}

destination[destination_word_index] = destination_word;
thread_count += __popc(destination_word);
}

// Subtract any slack bits from the last word
if (tid == 0) {
size_type const last_bit_index = source_size_bits - 1;
size_type const num_slack_bits = word_size - (last_bit_index % word_size) - 1;
if (num_slack_bits > 0) {
size_type const word_index = cudf::word_index(last_bit_index);
thread_count -= __popc(destination[word_index] & set_most_significant_bits(num_slack_bits));
}
}

using BlockReduce = cub::BlockReduce<size_type, block_size>;
__shared__ typename BlockReduce::TempStorage temp_storage;
size_type block_count = BlockReduce(temp_storage).Sum(thread_count);
Expand Down

0 comments on commit 6bb0328

Please sign in to comment.