shared mem atomics to keep track of per-submap erases
niskos99 committed Apr 7, 2022
1 parent fd4db73 commit 3906909
Showing 5 changed files with 141 additions and 80 deletions.
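The core of the change: the erase kernels stop using one `cub::BlockReduce` per submap (which required per-submap reduce storage and a fixed-size `submap_thread_num_successes[4]` array that only worked for four submaps) and instead keep one 64-bit counter per submap in dynamic shared memory. Threads bump the shared counter with `atomicAdd` as they erase, and thread 0 flushes each block's totals to the global per-submap counters, so there is one global atomic per block per submap rather than one per erased key. Below is a minimal standalone sketch of that pattern — not cuco code; the modulo "bucket" stands in for the submap probe, and all names are illustrative:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// One shared-memory counter per bucket: threads atomicAdd into shared memory,
// then thread 0 flushes the per-block totals to global memory.
__global__ void count_per_bucket(const int* keys,
                                 std::size_t n,
                                 int num_buckets,
                                 unsigned long long* bucket_totals)
{
  extern __shared__ unsigned long long block_counts[];

  // cooperative zero-init; also handles num_buckets > blockDim.x
  for (int i = threadIdx.x; i < num_buckets; i += blockDim.x) {
    block_counts[i] = 0;
  }
  __syncthreads();

  for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n;
       idx += gridDim.x * blockDim.x) {
    int bucket = keys[idx] % num_buckets;  // stand-in for "which submap held this key"
    atomicAdd(&block_counts[bucket], 1ULL);
  }
  __syncthreads();

  // one global atomic per (block, bucket) instead of one per erased key
  if (threadIdx.x == 0) {
    for (int i = 0; i < num_buckets; ++i) {
      atomicAdd(&bucket_totals[i], block_counts[i]);
    }
  }
}

int main()
{
  constexpr int num_buckets = 4;
  constexpr std::size_t n   = 1 << 20;

  int* d_keys;
  unsigned long long* d_totals;
  cudaMalloc(&d_keys, n * sizeof(int));
  cudaMalloc(&d_totals, num_buckets * sizeof(unsigned long long));
  cudaMemset(d_keys, 0, n * sizeof(int));  // all keys land in bucket 0
  cudaMemset(d_totals, 0, num_buckets * sizeof(unsigned long long));

  // dynamic shared memory: one 64-bit counter per bucket, sized like the
  // commit's submaps_.size() * sizeof(unsigned long long)
  auto const shmem_bytes = num_buckets * sizeof(unsigned long long);
  count_per_bucket<<<256, 128, shmem_bytes>>>(d_keys, n, num_buckets, d_totals);

  unsigned long long h_totals[num_buckets];
  cudaMemcpy(h_totals, d_totals, sizeof(h_totals), cudaMemcpyDeviceToHost);
  for (int i = 0; i < num_buckets; ++i) {
    std::printf("bucket %d: %llu\n", i, h_totals[i]);
  }

  cudaFree(d_keys);
  cudaFree(d_totals);
}
```

The actual kernels in this commit still run the grand total through `cub::BlockReduce` and `cuda::std::atomic::fetch_add`; the sketch strips that to show just the per-submap shared-memory path.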
12 changes: 6 additions & 6 deletions benchmarks/hash_table/dynamic_map_bench.cu
@@ -264,12 +264,12 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();
*/
BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();
/*
BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
@@ -304,22 +304,22 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();
*/
BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();
/*
BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();
*/
BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();
/*
BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
67 changes: 30 additions & 37 deletions include/cuco/detail/dynamic_map.inl
@@ -39,8 +39,6 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
alloc));
submap_views_.push_back(submaps_[0]->get_device_view());
submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
submap_num_successes_.push_back(submaps_[0]->get_num_successes());

num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
}

@@ -70,7 +68,7 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
submap_views_.push_back(submaps_[0]->get_device_view());
submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
submap_num_successes_.push_back(submaps_[0]->get_num_successes());

d_submap_num_successes_ = submap_num_successes_;
num_successes_ = std::allocator_traits<counter_allocator_type>::allocate(counter_allocator_, 1);
}

@@ -102,6 +100,8 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
sentinel::empty_value<Value>{empty_value_sentinel_},
sentinel::erased_key<Key>{erased_key_sentinel_},
alloc_));
submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes());
d_submap_num_successes_ = submap_num_successes_;
} else {
submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
submap_capacity,
@@ -111,8 +111,6 @@
}
submap_views_.push_back(submaps_[submap_idx]->get_device_view());
submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
submap_num_successes_.push_back(submaps_[submap_idx]->get_num_successes());

capacity_ *= 2;
}

Expand Down Expand Up @@ -164,7 +162,6 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
std::size_t h_num_successes;
CUCO_CUDA_TRY(cudaMemcpy(
&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));

submaps_[submap_idx]->size_ += h_num_successes;
size_ += h_num_successes;
first += n;
@@ -193,46 +190,42 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(InputIt first,
CUCO_CUDA_TRY(cudaMemset(num_successes_, 0, sizeof(atomic_ctr_type)));

// zero out submap success counters
static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
for (int i = 0; i < submaps_.size(); ++i) {
CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
if(submaps_.size() > 1) {
static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
for(int i = 0; i < submaps_.size(); ++i) {
CUCO_CUDA_TRY(cudaMemset(submap_num_successes_[i], 0, sizeof(atomic_ctr_type)));
}
}

// TODO: hacky, improve this
// provide device-accessible vector for each submap num_successes variable
thrust::device_vector<atomic_ctr_type*> d_submap_num_successes(submap_num_successes_);

// TODO: hack (how to get size on host?)
// use dynamic shared memory to hold block reduce space for each submap's erases
constexpr size_t temp_storage_size_one_block = 48;
auto const temp_storage_size = submaps_.size() * temp_storage_size_one_block;

auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long);

detail::erase<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
<<<grid_size, block_size, temp_storage_size>>>(first,
first + num_keys,
submap_views_.data().get(),
submap_mutable_views_.data().get(),
num_successes_,
d_submap_num_successes.data().get(),
submaps_.size(),
hash,
key_equal);
<<<grid_size, block_size, temp_storage_size>>>(
first,
first + num_keys,
submap_views_.data().get(),
submap_mutable_views_.data().get(),
num_successes_,
d_submap_num_successes_.data().get(),
submaps_.size(),
hash,
key_equal);

// update total dynamic map size
std::size_t h_num_successes;
CUCO_CUDA_TRY(
cudaMemcpy(&h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
size_ -= h_num_successes;

// TODO: if only one submap, skip this step
// update each submap's size
for (int i = 0; i < submaps_.size(); ++i) {
std::size_t h_submap_num_successes;
CUCO_CUDA_TRY(cudaMemcpy(&h_submap_num_successes,
submap_num_successes_[i],
sizeof(atomic_ctr_type),
cudaMemcpyDeviceToHost));
submaps_[i]->size_ -= h_submap_num_successes;
if(submaps_.size() == 1) {
submaps_[0]->size_ -= h_num_successes;
} else {
for(int i = 0; i < submaps_.size(); ++i) {
std::size_t h_submap_num_successes;
CUCO_CUDA_TRY(cudaMemcpy(
&h_submap_num_successes, submap_num_successes_[i], sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost));
submaps_[i]->size_ -= h_submap_num_successes;
}
}
}

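One detail in the host code above deserves a callout: the counters are `cuda::std::atomic` objects, yet they are zeroed with `cudaMemset` and read back with a raw `cudaMemcpy`. That is only sound while the atomic is layout-compatible with its underlying `std::size_t`, which is exactly what the `static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type))` pins down. A minimal sketch of that counter lifecycle, under the same assumption and assuming a device architecture supported by libcu++ atomics (names are illustrative):

```cuda
#include <cstdio>
#include <cuda/std/atomic>
#include <cuda_runtime.h>

using atomic_ctr = cuda::std::atomic<std::size_t>;

__global__ void bump(atomic_ctr* ctr)
{
  // relaxed is enough: we only need the count, not ordering with other data
  ctr->fetch_add(1, cuda::std::memory_order_relaxed);
}

int main()
{
  // the memset/memcpy below assume the atomic has the same object
  // representation as its underlying integer -- the commit's static_assert
  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr));

  atomic_ctr* d_ctr;
  cudaMalloc(&d_ctr, sizeof(atomic_ctr));
  cudaMemset(d_ctr, 0, sizeof(atomic_ctr));  // "construct" by zero-fill

  bump<<<64, 128>>>(d_ctr);

  std::size_t h_count;
  cudaMemcpy(&h_count, d_ctr, sizeof(atomic_ctr), cudaMemcpyDeviceToHost);
  std::printf("count = %zu\n", h_count);  // expect 64 * 128 = 8192

  cudaFree(d_ctr);
}
```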
122 changes: 97 additions & 25 deletions include/cuco/detail/dynamic_map_kernels.cuh
@@ -180,7 +180,6 @@ __global__ void insert(InputIt first,
}

template <uint32_t block_size,
uint32_t tile_size,
typename pair_type,
typename InputIt,
typename viewT,
@@ -199,48 +198,121 @@ __global__ void erase(InputIt first,
KeyEqual key_equal)
{
typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
extern __shared__ typename BlockReduce::TempStorage temp_submap_storage[];
__shared__ typename BlockReduce::TempStorage temp_storage;
extern __shared__ unsigned long long submap_block_num_successes[];

std::size_t thread_num_successes = 0;

// TODO: find permanent solution (only works for four submaps)
std::size_t submap_thread_num_successes[4] = {0, 0, 0, 0};
auto tid = block_size * blockIdx.x + threadIdx.x;
auto it = first + tid;

if(num_submaps > 1) {
for(int i = threadIdx.x; i < num_submaps; i += block_size)
submap_block_num_successes[i] = 0;
__syncthreads();

while (it < last) {
int i;
for (i = 0; i < num_submaps; ++i) {
if (submap_mutable_views[i].erase(*it, hash, key_equal)) {
thread_num_successes++;
atomicAdd(&submap_block_num_successes[i], 1);
break;
}
}
it += gridDim.x * blockDim.x;
}
} else {
while (it < last) {
if(submap_mutable_views[0].erase(*it, hash, key_equal))
thread_num_successes++;
it += gridDim.x * blockDim.x;
}
}

std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
if (threadIdx.x == 0) {
num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
}

if(num_submaps > 1) {
for(int i = 0; i < num_submaps; ++i) {
if(threadIdx.x == 0) {
submap_num_successes[i]->fetch_add(
static_cast<std::size_t>(submap_block_num_successes[i]), cuda::std::memory_order_relaxed);
}
}
}
}

template <uint32_t block_size,
uint32_t tile_size,
typename pair_type,
typename InputIt,
typename viewT,
typename mutableViewT,
typename atomicT,
typename Hash,
typename KeyEqual>
__global__ void erase(InputIt first,
InputIt last,
viewT* submap_views,
mutableViewT* submap_mutable_views,
atomicT* num_successes,
atomicT** submap_num_successes,
const uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
extern __shared__ unsigned long long submap_block_num_successes[];

std::size_t thread_num_successes = 0;

auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
auto tid = block_size * blockIdx.x + threadIdx.x;
auto it = first + tid / tile_size;

while (it < last) {
auto erased = false;

// manually check for duplicates in those submaps we are not inserting into
int i;
for (i = 0; i < num_submaps; ++i) {
erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal);
if (erased) { break; }
}
if (erased && tile.thread_rank() == 0) {
thread_num_successes++;
submap_thread_num_successes[i]++;
if(num_submaps > 1) {
for(int i = threadIdx.x; i < num_submaps; i += block_size)
submap_block_num_successes[i] = 0;
__syncthreads();

while (it < last) {
auto erased = false;
int i;
for (i = 0; i < num_submaps; ++i) {
erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal);
if (erased) { break; }
}
if (erased && tile.thread_rank() == 0) {
thread_num_successes++;
atomicAdd(&submap_block_num_successes[i], 1);
}
it += (gridDim.x * blockDim.x) / tile_size;
}
} else {
while (it < last) {
auto erased = submap_mutable_views[0].erase(tile, *it, hash, key_equal);
if (erased && tile.thread_rank() == 0)
thread_num_successes++;

it += (gridDim.x * blockDim.x) / tile_size;
it += (gridDim.x * blockDim.x) / tile_size;
}
}

std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
if (threadIdx.x == 0) {
num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
}

// TODO: if there's only one submap, skip this step
// update submap thread counts
for (int i = 0; i < num_submaps; ++i) {
std::size_t submap_block_num_successes =
BlockReduce(temp_submap_storage[i]).Sum(submap_thread_num_successes[i]);
if (threadIdx.x == 0) {
submap_num_successes[i]->fetch_add(submap_block_num_successes,
cuda::std::memory_order_relaxed);
if(num_submaps > 1) {
for(int i = 0; i < num_submaps; ++i) {
if(threadIdx.x == 0) {
submap_num_successes[i]->fetch_add(
static_cast<std::size_t>(submap_block_num_successes[i]), cuda::std::memory_order_relaxed);
}
}
}
}
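In the tiled `erase` overload above, a whole cooperative-groups tile probes each key together, so only the tile's rank-0 thread may record a success; otherwise each erase would be counted `tile_size` times. The iteration stride likewise shrinks to `(gridDim.x * blockDim.x) / tile_size`, because a tile rather than a thread owns each key. A standalone sketch of just that counting discipline — illustrative names, with a trivial even/odd test standing in for the cooperative erase:

```cuda
#include <cooperative_groups.h>
#include <cstdio>
#include <cuda_runtime.h>

namespace cg = cooperative_groups;

// Each tile_size-wide tile owns one item per iteration; only rank 0 counts.
template <int tile_size>
__global__ void tiled_count(const int* items, int n, unsigned long long* total)
{
  auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
  auto tid  = blockIdx.x * blockDim.x + threadIdx.x;

  for (int i = tid / tile_size; i < n; i += (gridDim.x * blockDim.x) / tile_size) {
    bool hit = (items[i] % 2 == 0);        // stand-in for a cooperative erase
    if (hit && tile.thread_rank() == 0) {  // count once per tile, not per thread
      atomicAdd(total, 1ULL);
    }
  }
}

int main()
{
  constexpr int n = 1024;
  int* d_items;
  unsigned long long* d_total;
  cudaMalloc(&d_items, n * sizeof(int));
  cudaMalloc(&d_total, sizeof(unsigned long long));
  cudaMemset(d_items, 0, n * sizeof(int));  // all even -> every item is a hit
  cudaMemset(d_total, 0, sizeof(unsigned long long));

  tiled_count<4><<<8, 128>>>(d_items, n, d_total);

  unsigned long long h_total;
  cudaMemcpy(&h_total, d_total, sizeof(h_total), cudaMemcpyDeviceToHost);
  std::printf("hits: %llu of %d\n", h_total, n);  // expect 1024, not 4096

  cudaFree(d_items);
  cudaFree(d_total);
}
```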
8 changes: 4 additions & 4 deletions include/cuco/dynamic_map.cuh
@@ -18,10 +18,10 @@

#include <cooperative_groups.h>
#include <cub/cub.cuh>
#include <cuco/detail/dynamic_map_kernels.cuh>
#include <cuco/detail/error.hpp>
#include <cuco/static_map.cuh>
#include <cuda/std/atomic>
#include <cuco/detail/dynamic_map_kernels.cuh>
#include <thrust/device_vector.h>

namespace cuco {
@@ -277,9 +277,9 @@ class dynamic_map {
submap_mutable_views_; ///< vector of mutable device views for each submap
std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert
atomic_ctr_type* num_successes_; ///< number of successfully inserted keys on insert
std::vector<atomic_ctr_type*>
submap_num_successes_; ///< number of successfully erased keys for each submap
Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage
std::vector<atomic_ctr_type*> submap_num_successes_; ///< number of successfully erased keys for each submap
thrust::device_vector<atomic_ctr_type*> d_submap_num_successes_; ///< device-side copy of `submap_num_successes_`
Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage
counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate `num_successes_`
};
} // namespace cuco
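The new `d_submap_num_successes_` member is a device-side mirror of `submap_num_successes_` (a host vector of device counter pointers). It is refreshed once whenever a submap is created, so `erase` no longer rebuilds a `thrust::device_vector` on every call — which is what the removed "TODO: hacky" block used to do. A small sketch of that pointer-table mirroring, with hypothetical names:

```cuda
#include <thrust/device_vector.h>
#include <cstdio>
#include <vector>

// Dereferences device pointers that are themselves stored in device memory.
__global__ void poke(int** ptrs, int n)
{
  int i = threadIdx.x;
  if (i < n) { *ptrs[i] = i; }
}

int main()
{
  std::vector<int*> h_ptrs;  // host-side table of device pointers
  for (int i = 0; i < 4; ++i) {
    int* p;
    cudaMalloc(&p, sizeof(int));
    h_ptrs.push_back(p);
  }

  // assigning a std::vector to a thrust::device_vector copies the pointer
  // table to the device once, as in d_submap_num_successes_ = submap_num_successes_
  thrust::device_vector<int*> d_ptrs = h_ptrs;

  // .data().get() yields the raw device pointer a kernel can index
  poke<<<1, 4>>>(d_ptrs.data().get(), 4);
  cudaDeviceSynchronize();

  for (int* p : h_ptrs) {
    int v;
    cudaMemcpy(&v, p, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("%d ", v);  // prints 0 1 2 3
    cudaFree(p);
  }
  std::printf("\n");
}
```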
12 changes: 4 additions & 8 deletions tests/dynamic_map/erase_test.cu
@@ -107,11 +107,9 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
map.insert(pairs_begin2, pairs_begin2 + 4 * num_keys);

// map should resize twice if the erased slots are successfully reused
REQUIRE(map.get_capacity() == 8 * num_keys);

REQUIRE(map.get_capacity() == 8*num_keys);
// check that keys can be successfully deleted from only the first and second submaps
map.erase(d_keys2.begin(), d_keys2.begin() + 2 * num_keys);

map.erase(d_keys2.begin(), d_keys2.begin() + 2*num_keys);
map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin());

REQUIRE(cuco::test::none_of(d_keys_exist2.begin(),
Expand All @@ -122,10 +120,8 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t))
d_keys_exist2.end(),
[] __device__(const bool key_found) { return key_found; }));

REQUIRE(map.get_size() == 2 * num_keys);

// check that keys can be successfully deleted from all submaps (some will be unsuccessful
// erases)
REQUIRE(map.get_size() == 2*num_keys);
// check that keys can be successfully deleted from all submaps (some will be unsuccessful erases)
map.erase(d_keys2.begin(), d_keys2.end());

map.contains(d_keys2.begin(), d_keys2.end(), d_keys_exist2.begin());
