Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Erase Functionality for dynamic_map #149

Merged
merged 38 commits into from
Dec 21, 2022
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
976a746
dynamic map erase, still needs work
niskos99 Mar 9, 2022
eead8b8
minor clarity changes
niskos99 Mar 9, 2022
20ac7a3
erase bug fix
niskos99 Apr 5, 2022
ede50d6
dynamic map erase working, only 4 submaps for now
niskos99 Apr 5, 2022
1d8fbd0
type wrappers added
niskos99 Apr 5, 2022
63dd4eb
prevent implicit type conversion of sentinels during construction
niskos99 Apr 5, 2022
52d83f6
erase benchmark added
niskos99 Apr 5, 2022
0878216
num_successes managed pointer updated
niskos99 Apr 6, 2022
7eac9d1
more efficient block reduce
niskos99 Apr 6, 2022
4d10631
doc changes
niskos99 Apr 6, 2022
b59a16b
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Apr 6, 2022
b00fcba
shared mem atomics to keep track of per-submap erases
niskos99 Apr 7, 2022
c146f9d
doc improvements
niskos99 Apr 30, 2022
faf8224
warning fixes
niskos99 Aug 31, 2022
e4b548e
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Aug 31, 2022
93b7983
removed nvtx file
niskos99 Aug 31, 2022
cd21190
num_successes_ removed
niskos99 Oct 7, 2022
4c1952d
doxygen warning fixes
niskos99 Oct 7, 2022
80f4d14
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Oct 7, 2022
6616889
code cleanup
niskos99 Nov 14, 2022
2df247c
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Nov 14, 2022
af7706d
switched typedef to using
niskos99 Nov 14, 2022
54ae254
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Nov 14, 2022
593fe12
responding to PR comments
niskos99 Nov 18, 2022
7598e47
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Nov 18, 2022
ecd3945
Merge remote-tracking branch 'upstream/dev' into erase
PointKernel Dec 20, 2022
1e6ad99
Add more data types for erase tests
PointKernel Dec 20, 2022
9e324fc
Use public murmurhash
PointKernel Dec 20, 2022
bb0e4e9
Update static map benchmark: fix runtime stall bug, remove redundant …
PointKernel Dec 20, 2022
10fd08a
Update dynamic map benchmark: fix conversion warning, add search_none…
PointKernel Dec 20, 2022
788ad29
Cleanups: get rid of host-side counter vector, remove get_ prefixes a…
PointKernel Dec 20, 2022
c71dd60
Get rid of num_successes getter
PointKernel Dec 20, 2022
ab4ef0c
Fix comments
PointKernel Dec 20, 2022
d72e403
Update tests
PointKernel Dec 20, 2022
9478650
Update include/cuco/detail/dynamic_map_kernels.cuh
PointKernel Dec 20, 2022
82e0f2e
Cleanups: relaxed memory atomic, static_assert instead of runtime exp…
PointKernel Dec 21, 2022
412b7f9
Merge branch 'dynamic_map_erase' of github.com:Nicolas-Iskos/cuCollec…
PointKernel Dec 21, 2022
f5ec677
Reorder header groups + remove unused counter allocator
PointKernel Dec 21, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 170 additions & 7 deletions benchmarks/hash_table/dynamic_map_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end)

static void gen_final_size(benchmark::internal::Benchmark* b)
{
for (auto size = 10'000'000; size <= 150'000'000; size += 20'000'000) {
for (auto size = 10'000'000; size <= 310'000'000; size += 20'000'000) {
b->Args({size});
}
}
Expand Down Expand Up @@ -135,6 +135,128 @@ static void BM_dynamic_search_all(::benchmark::State& state)
int64_t(state.range(0)));
}

template <typename Key, typename Value, dist_type Dist>
static void BM_dynamic_search_none(::benchmark::State& state)
{
using map_type = cuco::dynamic_map<Key, Value>;

std::size_t num_keys = state.range(0);
std::size_t initial_size = 1 << 27;

std::vector<Key> h_keys(num_keys);
std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);

generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());

for (std::size_t i = 0; i < num_keys; ++i) {
Key key = h_keys[i] + num_keys;
Value val = h_keys[i] + num_keys;
h_pairs[i].first = key;
h_pairs[i].second = val;
}

thrust::device_vector<Key> d_keys(h_keys);
thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
thrust::device_vector<Value> d_results(num_keys);

map_type map{initial_size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
map.insert(d_pairs.begin(), d_pairs.end());

for (auto _ : state) {
cuda_event_timer raii{state};
map.find(d_keys.begin(), d_keys.end(), d_results.begin());
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
int64_t(state.range(0)));
}

template <typename Key, typename Value, dist_type Dist>
static void BM_dynamic_erase_all(::benchmark::State& state)
{
using map_type = cuco::dynamic_map<Key, Value>;

std::size_t num_keys = state.range(0);
std::size_t initial_size = 1 << 27;

std::vector<Key> h_keys(num_keys);
std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);

generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());

for (uint32_t i = 0; i < num_keys; ++i) {
Key key = h_keys[i];
Value val = h_keys[i];
h_pairs[i].first = key;
h_pairs[i].second = val;
}

thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
thrust::device_vector<Key> d_keys(h_keys);

std::size_t batch_size = 1E6;
for (auto _ : state) {
map_type map{initial_size,
cuco::empty_key<Key>{-1},
cuco::empty_value<Value>{-1},
cuco::erased_key<Key>{-2}};
for (uint32_t i = 0; i < num_keys; i += batch_size) {
map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
}
{
cuda_event_timer raii{state};
for (uint32_t i = 0; i < num_keys; i += batch_size) {
map.erase(d_keys.begin() + i, d_keys.begin() + i + batch_size);
}
}
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
int64_t(state.range(0)));
}

template <typename Key, typename Value, dist_type Dist>
static void BM_dynamic_erase_none(::benchmark::State& state)
{
using map_type = cuco::dynamic_map<Key, Value>;

std::size_t num_keys = state.range(0);
std::size_t initial_size = 1 << 27;

std::vector<Key> h_keys(num_keys);
std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);

generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());

for (std::size_t i = 0; i < num_keys; ++i) {
Key key = h_keys[i] + num_keys;
Value val = h_keys[i] + num_keys;
h_pairs[i].first = key;
h_pairs[i].second = val;
}

thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
thrust::device_vector<Key> d_keys(h_keys);

std::size_t batch_size = 1E6;
for (auto _ : state) {
map_type map{initial_size,
cuco::empty_key<Key>{-1},
cuco::empty_value<Value>{-1},
cuco::erased_key<Key>{-2}};
for (std::size_t i = 0; i < num_keys; i += batch_size) {
map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
}
{
cuda_event_timer raii{state};
map.erase(d_keys.begin(), d_keys.end());
}
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
int64_t(state.range(0)));
}

BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
Expand All @@ -145,32 +267,37 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM)
BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN)
BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN)
BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE)
BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE)
BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();
Expand All @@ -185,6 +312,26 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIFORM)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
Expand All @@ -194,3 +341,19 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

// TODO: comprehensive tests for erase_none and search_none?
BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();

BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->Apply(gen_final_size)
->UseManualTime();
104 changes: 103 additions & 1 deletion benchmarks/hash_table/static_map_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,53 @@ static void BM_static_map_search_all(::benchmark::State& state)
int64_t(state.range(0)));
}

template <typename Key, typename Value, dist_type Dist>
static void BM_static_map_search_none(::benchmark::State& state)
{
using map_type = cuco::static_map<Key, Value>;

std::size_t num_keys = state.range(0);
float occupancy = state.range(1) / float{100};
std::size_t size = num_keys / occupancy;

map_type map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};

std::vector<Key> h_keys(num_keys);
std::vector<Value> h_values(num_keys);
std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
std::vector<Value> h_results(num_keys);

generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());

for (std::size_t i = 0; i < num_keys; ++i) {
Key key = h_keys[i];
Value val = h_keys[i];
h_pairs[i].first = key;
h_pairs[i].second = val;
}

// diff keys
for (std::size_t i = 0; i < num_keys; ++i) {
h_keys[i] += num_keys;
}

thrust::device_vector<Key> d_keys(h_keys);
thrust::device_vector<Value> d_results(num_keys);
thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);

map.insert(d_pairs.begin(), d_pairs.end());

for (auto _ : state) {
map.find(d_keys.begin(), d_keys.end(), d_results.begin());
// TODO: get rid of sync and rewrite the benchmark with `nvbench`
// once https://github.com/NVIDIA/nvbench/pull/80 is merged
cudaDeviceSynchronize();
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
int64_t(state.range(0)));
}

template <typename Key, typename Value, dist_type Dist>
static void BM_static_map_erase_all(::benchmark::State& state)
{
Expand Down Expand Up @@ -198,6 +245,52 @@ static void BM_static_map_erase_all(::benchmark::State& state)
int64_t(state.range(0)));
}

template <typename Key, typename Value, dist_type Dist>
static void BM_static_map_erase_none(::benchmark::State& state)
{
using map_type = cuco::static_map<Key, Value>;

std::size_t num_keys = state.range(0);
float occupancy = state.range(1) / float{100};
std::size_t size = num_keys / occupancy;

map_type map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}, cuco::erased_key{-2}};

std::vector<Key> h_keys(num_keys);
std::vector<Value> h_values(num_keys);
std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
std::vector<Value> h_results(num_keys);

generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());

for (std::size_t i = 0; i < num_keys; ++i) {
Key key = h_keys[i];
Value val = h_keys[i];
h_pairs[i].first = key;
h_pairs[i].second = val;
}

// diff keys
for (std::size_t i = 0; i < num_keys; ++i) {
h_keys[i] += num_keys;
}

thrust::device_vector<Key> d_keys(h_keys);
thrust::device_vector<bool> d_results(num_keys);
thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);

for (auto _ : state) {
state.PauseTiming();
map.insert(d_pairs.begin(), d_pairs.end());
state.ResumeTiming();

map.erase(d_keys.begin(), d_keys.end());
}

state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
int64_t(state.range(0)));
}

BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->Apply(generate_size_and_occupancy)
Expand Down Expand Up @@ -252,6 +345,15 @@ BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSI
->Unit(benchmark::kMillisecond)
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE)
// TODO: comprehensive tests for erase_all, erase_none and search_none
BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_search_none, int32_t, int32_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->Apply(generate_size_and_occupancy);
Loading