rapidsai · rapids-bot · Jul 9, 2021 · Jun 23, 2021 · Jun 23, 2021 · Jun 23, 2021
@@ -308,4 +308,11 @@ void hdbscan(const raft::handle_t &handle, const float *X, size_t m, size_t n,
              raft::distance::DistanceType metric,
              HDBSCAN::Common::HDBSCANParams &params,
              HDBSCAN::Common::hdbscan_output<int, float> &out);
+
+/**
+ * @brief Extracts flat cluster labels and membership probabilities from a
+ *        condensed hierarchy that the caller supplies as raw arrays.
+ *
+ * NOTE(review): the array pointers are presumably device pointers (the
+ * implementation copies them with raft::copy on the handle's stream) --
+ * confirm against callers.
+ *
+ * @param[in]  handle raft handle; its stream is used for the work
+ * @param[in]  n_leaves number of leaves in the condensed tree
+ * @param[in]  n_edges number of entries in each of parents, children,
+ *             lambdas and sizes
+ * @param[in]  parents parent id of every condensed-tree edge
+ * @param[in]  children child id of every condensed-tree edge
+ * @param[in]  lambdas lambda value of every condensed-tree edge
+ * @param[in]  sizes child size of every condensed-tree edge
+ * @param[out] labels output cluster labels (presumably n_leaves entries)
+ * @param[out] probabilities output probabilities (presumably n_leaves
+ *             entries)
+ * @param[in]  cluster_selection_method strategy used to select clusters
+ * @param[in]  allow_single_cluster forwarded unchanged to the extraction step
+ * @param[in]  max_cluster_size forwarded unchanged to the extraction step
+ * @param[in]  cluster_selection_epsilon forwarded unchanged to the
+ *             extraction step
+ */
+void _extract_clusters(
+  const raft::handle_t &handle, size_t n_leaves, int n_edges, int *parents,
+  int *children, float *lambdas, int *sizes, int *labels, float *probabilities,
+  HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
+  bool allow_single_cluster, int max_cluster_size,
+  float cluster_selection_epsilon);
 }  // END namespace ML
@@ -71,15 +71,10 @@ CondensedHierarchy<value_idx, value_t>::CondensedHierarchy(
   : handle(handle_),
     n_leaves(n_leaves_),
     n_edges(n_edges_),
-    parents(0, handle.get_stream()),
-    children(0, handle.get_stream()),
-    lambdas(0, handle.get_stream()),
-    sizes(0, handle.get_stream()) {
-  parents.resize(n_edges_, handle.get_stream());
-  children.resize(n_edges_, handle.get_stream());
-  lambdas.resize(n_edges_, handle.get_stream());
-  sizes.resize(n_edges_, handle.get_stream());
-
+    parents(n_edges_, handle.get_stream()),
+    children(n_edges_, handle.get_stream()),
+    lambdas(n_edges_, handle.get_stream()),
+    sizes(n_edges_, handle.get_stream()) {
   raft::copy(parents.begin(), parents_, n_edges_, handle.get_stream());
   raft::copy(children.begin(), children_, n_edges_, handle.get_stream());
   raft::copy(lambdas.begin(), lambdas_, n_edges_, handle.get_stream());

@@ -27,4 +27,23 @@ void hdbscan(const raft::handle_t &handle, const float *X, size_t m, size_t n,
   HDBSCAN::_fit_hdbscan(handle, X, m, n, metric, params, out);
 }
 
+/**
+ * Cluster-extraction entry point that works on a caller-provided condensed
+ * tree instead of running the whole HDBSCAN fit.
+ *
+ * The edge arrays are wrapped in a CondensedHierarchy, scratch buffers for
+ * the per-cluster stabilities and the label map are allocated on the
+ * handle's stream, and everything is handed to
+ * HDBSCAN::detail::Extract::extract_clusters, which fills `labels` and
+ * `probabilities`. The scratch buffers are released when this function
+ * returns.
+ */
+void _extract_clusters(
+  const raft::handle_t &handle, size_t n_leaves, int n_edges, int *parents,
+  int *children, float *lambdas, int *sizes, int *labels, float *probabilities,
+  HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
+  bool allow_single_cluster, int max_cluster_size,
+  float cluster_selection_epsilon) {
+  // Look the stream up once; every allocation below is tied to it.
+  const auto stream = handle.get_stream();
+
+  HDBSCAN::Common::CondensedHierarchy tree(handle, n_leaves, n_edges, parents,
+                                           children, lambdas, sizes);
+
+  // One stability slot per cluster in the tree, one label-map slot per leaf.
+  rmm::device_uvector<float> cluster_stabilities(tree.get_n_clusters(),
+                                                 stream);
+  rmm::device_uvector<int> leaf_label_map(n_leaves, stream);
+
+  HDBSCAN::detail::Extract::extract_clusters(
+    handle, tree, n_leaves, labels, cluster_stabilities.data(), probabilities,
+    leaf_label_map.data(), cluster_selection_method, allow_single_cluster,
+    max_cluster_size, cluster_selection_epsilon);
+}
+
 };  // end namespace ML
@@ -89,6 +89,14 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML":
                  HDBSCANParams & params,
                  hdbscan_output & output)
 
+    # Native cluster-extraction routine: takes the condensed tree as four
+    # parallel arrays of length n_edges (parents, children, lambdas, sizes)
+    # and writes its results through the labels and probabilities pointers.
+    void _extract_clusters(const handle_t &handle, size_t n_leaves,
+                           int n_edges, int *parents, int *children,
+                           float *lambdas, int *sizes, int *labels,
+                           float *probabilities,
+                           CLUSTER_SELECTION_METHOD cluster_selection_method,
+                           bool allow_single_cluster, int max_cluster_size,
+                           float cluster_selection_epsilon)
+
 _metrics_mapping = {
     'l1': DistanceType.L1,
     'cityblock': DistanceType.L1,
@@ -537,6 +545,60 @@ class HDBSCAN(Base, ClusterMixin, CMajorInputTagMixin):
         """
         return self.fit(X).labels_
 
+    def _extract_clusters(self, condensed_tree):
+        """
+        Run only the cluster-selection stage on an existing condensed tree.
+
+        The tree's columns are converted to cuML arrays and passed to the
+        native ``_extract_clusters`` routine. Results are stored on the
+        estimator as ``labels_test`` and ``probabilities_test``; nothing is
+        returned.
+
+        Parameters
+        ----------
+        condensed_tree : object exposing ``to_numpy()``
+            ``to_numpy()`` must return a structured array with the fields
+            ``parent``, ``child``, ``lambda_val`` and ``child_size``
+            (presumably the ``condensed_tree_`` attribute of a fitted
+            reference ``hdbscan.HDBSCAN`` estimator -- confirm with callers).
+
+        Raises
+        ------
+        ValueError
+            If ``self.cluster_selection_method`` is neither ``'eom'`` nor
+            ``'leaf'``.
+        """
+        # Resolve the selection method first so that an unsupported value
+        # fails with a clear message instead of leaving the local unbound
+        # when the native call is reached.
+        if self.cluster_selection_method == 'eom':
+            cluster_selection_method = CLUSTER_SELECTION_METHOD.EOM
+        elif self.cluster_selection_method == 'leaf':
+            cluster_selection_method = CLUSTER_SELECTION_METHOD.LEAF
+        else:
+            raise ValueError("Cluster selection method not supported. "
+                             "Must be one of {'eom', 'leaf'}")
+
+        # Materialise the structured array once rather than once per column.
+        tree = condensed_tree.to_numpy()
+
+        # n_edges is the row count reported for the first converted column.
+        parents, n_edges, _, _ = \
+            input_to_cuml_array(tree['parent'],
+                                order='C',
+                                convert_to_dtype=np.int32)
+
+        children, _, _, _ = \
+            input_to_cuml_array(tree['child'],
+                                order='C',
+                                convert_to_dtype=np.int32)
+
+        lambdas, _, _, _ = \
+            input_to_cuml_array(tree['lambda_val'],
+                                order='C',
+                                convert_to_dtype=np.float32)
+
+        sizes, _, _, _ = \
+            input_to_cuml_array(tree['child_size'],
+                                order='C',
+                                convert_to_dtype=np.int32)
+
+        # The smallest parent id is used as the leaf count.
+        # NOTE(review): presumably leaves are numbered 0..n_leaves-1 and the
+        # root cluster id follows directly after them -- confirm.
+        n_leaves = int(tree['parent'].min())
+
+        self.labels_test = CumlArray.empty(n_leaves, dtype="int32")
+        self.probabilities_test = CumlArray.empty(n_leaves, dtype="float32")
+
+        cdef uintptr_t labels_ptr = self.labels_test.ptr
+        cdef uintptr_t parents_ptr = parents.ptr
+        cdef uintptr_t children_ptr = children.ptr
+        cdef uintptr_t sizes_ptr = sizes.ptr
+        cdef uintptr_t lambdas_ptr = lambdas.ptr
+        cdef uintptr_t probabilities_ptr = self.probabilities_test.ptr
+
+        cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()
+
+        _extract_clusters(handle_[0],
+                          <size_t> n_leaves,
+                          <int> n_edges,
+                          <int*> parents_ptr,
+                          <int*> children_ptr,
+                          <float*> lambdas_ptr,
+                          <int*> sizes_ptr,
+                          <int*> labels_ptr,
+                          <float*> probabilities_ptr,
+                          <CLUSTER_SELECTION_METHOD> cluster_selection_method,
+                          <bool> self.allow_single_cluster,
+                          <int> self.max_cluster_size,
+                          <float> self.cluster_selection_epsilon)
+
     def get_param_names(self):
         return super().get_param_names() + [
             "n_neighbors",

@@ -161,14 +161,61 @@ def test_hdbscan_sklearn_datasets(dataset,
            np.sort(cuml_agg.cluster_persistence_), rtol=0.1, atol=0.1)
 
 
+@pytest.mark.parametrize('dataset', test_datasets.values())
+@pytest.mark.parametrize('cluster_selection_epsilon', [0.0, 50.0, 150.0])
+@pytest.mark.parametrize('min_samples', [150, 50, 5, 400])
+@pytest.mark.parametrize('min_cluster_size', [150, 25, 5, 250])
+@pytest.mark.parametrize('max_cluster_size', [0])
+@pytest.mark.parametrize('allow_single_cluster', [True, False])
+@pytest.mark.parametrize('cluster_selection_method', ['eom', 'leaf'])
+@pytest.mark.parametrize('connectivity', ['knn'])
+def test_hdbscan_sklearn_extract_clusters(dataset,
+                                          connectivity,
+                                          cluster_selection_epsilon,
+                                          cluster_selection_method,
+                                          min_samples,
+                                          min_cluster_size,
+                                          max_cluster_size,
+                                          allow_single_cluster):
+
+    X = dataset.data
+
+    cuml_agg = HDBSCAN(verbose=logger.level_info,
+                       allow_single_cluster=allow_single_cluster,
+                       n_neighbors=min_samples,
+                       gen_min_span_tree=True,
+                       min_samples=min_samples,
+                       max_cluster_size=max_cluster_size,
+                       min_cluster_size=min_cluster_size,
+                       cluster_selection_epsilon=cluster_selection_epsilon,
+                       cluster_selection_method=cluster_selection_method)
+
+    sk_agg = hdbscan.HDBSCAN(
+        allow_single_cluster=allow_single_cluster,
+        approx_min_span_tree=False,
+        gen_min_span_tree=True,
+        min_samples=min_samples,
+        min_cluster_size=min_cluster_size,
+        cluster_selection_epsilon=cluster_selection_epsilon,
+        cluster_selection_method=cluster_selection_method,
+        algorithm="generic")
+
+    sk_agg.fit(cp.asnumpy(X))
+
+    cuml_agg._extract_clusters(sk_agg.condensed_tree_)
+
+    assert adjusted_rand_score(cuml_agg.labels_test, sk_agg.labels_) == 1.0
+    assert np.allclose(cp.asnumpy(cuml_agg.probabilities_test),
+                       sk_agg.probabilities_)
+
+
 @pytest.mark.parametrize('nrows', [1000])
 @pytest.mark.parametrize('dataset', dataset_names)
 @pytest.mark.parametrize('min_samples', [15])
 @pytest.mark.parametrize('cluster_selection_epsilon', [0.0])
 @pytest.mark.parametrize('min_cluster_size', [25])
 @pytest.mark.parametrize('allow_single_cluster', [True, False])
 @pytest.mark.parametrize('max_cluster_size', [0])
-# TODO: Need to test leaf selection method
 @pytest.mark.parametrize('cluster_selection_method', ['eom'])
 @pytest.mark.parametrize('connectivity', ['knn'])
 def test_hdbscan_cluster_patterns(dataset, nrows,
@@ -215,6 +262,55 @@ def test_hdbscan_cluster_patterns(dataset, nrows,
            np.sort(cuml_agg.cluster_persistence_), rtol=0.1, atol=0.1)
 
 
+@pytest.mark.parametrize('nrows', [1000])
+@pytest.mark.parametrize('dataset', dataset_names)
+@pytest.mark.parametrize('min_samples', [5, 50, 400, 800])
+@pytest.mark.parametrize('cluster_selection_epsilon', [0.0])
+@pytest.mark.parametrize('min_cluster_size', [10, 25, 100, 350])
+@pytest.mark.parametrize('allow_single_cluster', [True, False])
+@pytest.mark.parametrize('max_cluster_size', [0])
+@pytest.mark.parametrize('cluster_selection_method', ['eom', 'leaf'])
+@pytest.mark.parametrize('connectivity', ['knn'])
+def test_hdbscan_cluster_patterns_extract_clusters(dataset, nrows,
+                                                   connectivity,
+                                                   cluster_selection_epsilon,
+                                                   cluster_selection_method,
+                                                   min_cluster_size,
+                                                   allow_single_cluster,
+                                                   max_cluster_size,
+                                                   min_samples):
+
+    # This also tests duplicate data points
+    X, y = get_pattern(dataset, nrows)[0]
+
+    cuml_agg = HDBSCAN(verbose=logger.level_info,
+                       allow_single_cluster=allow_single_cluster,
+                       n_neighbors=min_samples,
+                       min_samples=min_samples,
+                       max_cluster_size=max_cluster_size,
+                       min_cluster_size=min_cluster_size,
+                       cluster_selection_epsilon=cluster_selection_epsilon,
+                       cluster_selection_method=cluster_selection_method)
+
+    sk_agg = hdbscan.HDBSCAN(
+        allow_single_cluster=allow_single_cluster,
+        approx_min_span_tree=False,
+        gen_min_span_tree=True,
+        min_samples=min_samples,
+        min_cluster_size=min_cluster_size,
+        cluster_selection_epsilon=cluster_selection_epsilon,
+        cluster_selection_method=cluster_selection_method,
+        algorithm="generic")
+
+    sk_agg.fit(cp.asnumpy(X))
+
+    cuml_agg._extract_clusters(sk_agg.condensed_tree_)
+
+    assert adjusted_rand_score(cuml_agg.labels_test, sk_agg.labels_) == 1.0
+    assert np.allclose(cp.asnumpy(cuml_agg.probabilities_test),
+                       sk_agg.probabilities_)
+
+
 def test_hdbscan_plots():
 
     X, y = make_blobs(int(100),