HDBSCAN bug on A100 (rapidsai#4024)

While this issue only appeared on A100, it could have appeared on any other GPU. In this kernel https://github.com/rapidsai/cuml/blob/c6f992a5fcbccf5677ca6d639af6b84e93aa8108/cpp/src/hdbscan/detail/kernels/condense.cuh#L85, we launch a thread for every node of a binary tree on the GPU. The problem is that, within a single kernel launch:

1. Each node in the frontier marks itself out of the frontier https://github.com/rapidsai/cuml/blob/c6f992a5fcbccf5677ca6d639af6b84e93aa8108/cpp/src/hdbscan/detail/kernels/condense.cuh#L94
2. Every node in the frontier that is not a leaf marks its left and right children into the frontier https://github.com/rapidsai/cuml/blob/c6f992a5fcbccf5677ca6d639af6b84e93aa8108/cpp/src/hdbscan/detail/kernels/condense.cuh#L117

This is undefined behavior: the thread for a node in the frontier may be marking that node out of the frontier at the same time as the thread for its parent is marking it into the frontier. These unsynchronized writes to the same `frontier` element are a data race, so the outcome depends on thread scheduling.
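
To illustrate, here is a minimal sketch of the racy pattern, assuming a simplified tree layout with hypothetical `left`/`right` child arrays (not the actual cuml kernel):

```cuda
// Hypothetical, simplified version of the racy single-frontier BFS step.
// Each thread owns one tree node; leaves have indices < n_leaves.
__global__ void racy_bfs_step(bool* frontier, const int* left,
                              const int* right, int n_nodes, int n_leaves) {
  int node = blockIdx.x * blockDim.x + threadIdx.x;
  if (node >= n_nodes || !frontier[node]) return;

  frontier[node] = false;  // (1) remove self from the frontier
  if (node >= n_leaves) {
    frontier[left[node]] = true;   // (2) add children to the frontier;
    frontier[right[node]] = true;  // races with (1) run by a child's thread
  }
}
```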

Edit: dropped the `__threadfence()` solution as it wasn't fully correct. Using a `next_frontier` array instead to track the frontier for the next BFS iteration.
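
Below is a sketch of the double-buffered pattern under the same assumptions as the sketch above: the self-removal write still goes to `frontier`, but child insertions go only to `next_frontier`, so no `frontier` element is written by more than one thread within a launch. Between launches, the host copies `next_frontier` into `frontier` and clears `next_frontier`, as in the diffs below.

```cuda
// Hypothetical double-buffered version of the same BFS step.
__global__ void bfs_step(bool* frontier, bool* next_frontier, const int* left,
                         const int* right, int n_nodes, int n_leaves) {
  int node = blockIdx.x * blockDim.x + threadIdx.x;
  if (node >= n_nodes || !frontier[node]) return;

  // Only this node's own thread writes frontier[node] during this launch.
  frontier[node] = false;
  if (node >= n_leaves) {
    // Parents publish children into the *next* frontier, never the current
    // one, eliminating concurrent writes to the same element.
    next_frontier[left[node]] = true;
    next_frontier[right[node]] = true;
  }
}
```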

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: rapidsai#4024
divyegala authored Jul 6, 2021
1 parent f09341a commit 041a7fc
Showing 5 changed files with 27 additions and 21 deletions.
13 changes: 10 additions & 3 deletions cpp/src/hdbscan/detail/condense.cuh
```diff
@@ -83,8 +83,10 @@ void build_condensed_hierarchy(
              "Cannot find single-linkage solution.");
 
   rmm::device_uvector<bool> frontier(root + 1, stream);
+  rmm::device_uvector<bool> next_frontier(root + 1, stream);
 
   thrust::fill(exec_policy, frontier.begin(), frontier.end(), false);
+  thrust::fill(exec_policy, next_frontier.begin(), next_frontier.end(), false);
 
   // Array to propagate the lambda of subtrees actively being collapsed
   // through multiple bfs iterations.
@@ -121,9 +123,14 @@ void build_condensed_hierarchy(
   // TODO: Investigate whether it would be worth performing a gather/argmatch in order
   // to schedule only the number of threads needed. (it might not be worth it)
   condense_hierarchy_kernel<<<grid, tpb, 0, handle.get_stream()>>>(
-    frontier.data(), ignore.data(), relabel.data(), children, delta, sizes,
-    n_leaves, min_cluster_size, out_parent.data(), out_child.data(),
-    out_lambda.data(), out_size.data());
+    frontier.data(), next_frontier.data(), ignore.data(), relabel.data(),
+    children, delta, sizes, n_leaves, min_cluster_size, out_parent.data(),
+    out_child.data(), out_lambda.data(), out_size.data());
+
+  thrust::copy(exec_policy, next_frontier.begin(), next_frontier.end(),
+               frontier.begin());
+  thrust::fill(exec_policy, next_frontier.begin(), next_frontier.end(),
+               false);
 
   n_elements_to_traverse = thrust::reduce(exec_policy, frontier.data(),
                                           frontier.data() + root + 1, 0);
```

6 changes: 3 additions & 3 deletions cpp/src/hdbscan/detail/kernels/condense.cuh
```diff
@@ -83,7 +83,7 @@ __device__ inline value_t get_lambda(value_idx node, value_idx num_points,
  */
 template <typename value_idx, typename value_t>
 __global__ void condense_hierarchy_kernel(
-  bool *frontier, value_t *ignore, value_idx *relabel,
+  bool *frontier, bool *next_frontier, value_t *ignore, value_idx *relabel,
   const value_idx *children, const value_t *deltas, const value_idx *sizes,
   int n_leaves, int min_cluster_size, value_idx *out_parent,
   value_idx *out_child, value_t *out_lambda, value_idx *out_count) {
@@ -114,8 +114,8 @@ __global__ void condense_hierarchy_kernel(
     value_idx right_child = children[((node - n_leaves) * 2) + 1];
 
     // flip frontier for children
-    frontier[left_child] = true;
-    frontier[right_child] = true;
+    next_frontier[left_child] = true;
+    next_frontier[right_child] = true;
 
     // propagate ignore down to children
     ignore[left_child] = should_ignore ? subtree_lambda : -1;
```

10 changes: 4 additions & 6 deletions cpp/src/hdbscan/detail/kernels/select.cuh
```diff
@@ -33,11 +33,9 @@ namespace Select {
  * @param[in] n_clusters number of clusters
  */
 template <typename value_idx>
-__global__ void propagate_cluster_negation_kernel(const value_idx *indptr,
-                                                  const value_idx *children,
-                                                  int *frontier,
-                                                  int *is_cluster,
-                                                  int n_clusters) {
+__global__ void propagate_cluster_negation_kernel(
+  const value_idx *indptr, const value_idx *children, int *frontier,
+  int *next_frontier, int *is_cluster, int n_clusters) {
   int cluster = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (cluster < n_clusters && frontier[cluster]) {
@@ -47,7 +45,7 @@ __global__ void propagate_cluster_negation_kernel(
     value_idx children_stop = indptr[cluster + 1];
     for (int i = children_start; i < children_stop; i++) {
       value_idx child = children[i];
-      frontier[child] = true;
+      next_frontier[child] = true;
       is_cluster[child] = false;
     }
   }
```

12 changes: 10 additions & 2 deletions cpp/src/hdbscan/detail/select.cuh
```diff
@@ -68,6 +68,9 @@ void perform_bfs(const raft::handle_t &handle, const value_idx *indptr,
   auto stream = handle.get_stream();
   auto thrust_policy = rmm::exec_policy(stream);
 
+  rmm::device_uvector<int> next_frontier(n_clusters, stream);
+  thrust::fill(thrust_policy, next_frontier.begin(), next_frontier.end(), 0);
+
   value_idx n_elements_to_traverse =
     thrust::reduce(thrust_policy, frontier, frontier + n_clusters, 0);
@@ -78,11 +81,16 @@ void perform_bfs(const raft::handle_t &handle, const value_idx *indptr,
   size_t grid = raft::ceildiv(n_clusters, tpb);
 
   while (n_elements_to_traverse > 0) {
-    bfs_kernel<<<grid, tpb, 0, stream>>>(indptr, children, frontier, is_cluster,
-                                         n_clusters);
+    bfs_kernel<<<grid, tpb, 0, stream>>>(
+      indptr, children, frontier, next_frontier.data(), is_cluster, n_clusters);
+
+    thrust::copy(thrust_policy, next_frontier.begin(), next_frontier.end(),
+                 frontier);
+    thrust::fill(thrust_policy, next_frontier.begin(), next_frontier.end(), 0);
 
     n_elements_to_traverse =
       thrust::reduce(thrust_policy, frontier, frontier + n_clusters, 0);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 }
```

7 changes: 0 additions & 7 deletions python/cuml/test/test_hdbscan.py
```diff
@@ -32,10 +32,6 @@
 
 import cupy as cp
 
-import rmm
-
-IS_A100 = 'A100' in rmm._cuda.gpu.deviceGetName(0)
-
 test_datasets = {
     "digits": datasets.load_digits(),
     "boston": datasets.load_boston(),
@@ -56,7 +52,6 @@ def assert_cluster_counts(sk_agg, cuml_agg, digits=25):
     np.testing.assert_almost_equal(sk_counts, cu_counts, decimal=-1 * digits)
 
 
-@pytest.mark.skipif(IS_A100, reason="Skipping test on A100")
 @pytest.mark.parametrize('nrows', [500])
 @pytest.mark.parametrize('ncols', [25])
 @pytest.mark.parametrize('nclusters', [2, 5])
@@ -114,7 +109,6 @@ def test_hdbscan_blobs(nrows, ncols, nclusters,
                     np.sort(cuml_agg.cluster_persistence_), rtol=0.01, atol=0.01)
 
 
-@pytest.mark.skipif(IS_A100, reason="Skipping test on A100")
 @pytest.mark.parametrize('dataset', test_datasets.values())
 @pytest.mark.parametrize('cluster_selection_epsilon', [0.0, 50.0, 150.0])
 @pytest.mark.parametrize('min_samples_cluster_size_bounds', [(150, 150, 0),
@@ -167,7 +161,6 @@ def test_hdbscan_sklearn_datasets(dataset,
                     np.sort(cuml_agg.cluster_persistence_), rtol=0.1, atol=0.1)
 
 
-@pytest.mark.skipif(IS_A100, reason="Skipping test on A100")
 @pytest.mark.parametrize('nrows', [1000])
 @pytest.mark.parametrize('dataset', dataset_names)
 @pytest.mark.parametrize('min_samples', [15])
```
