Skip to content

Commit

Permalink
Pickling for HBDSCAN (rapidsai#5102)
Browse files Browse the repository at this point in the history
closes rapidsai#4986

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: rapidsai#5102
  • Loading branch information
divyegala authored Jan 4, 2023
1 parent cc8a0d2 commit d2e7b8e
Show file tree
Hide file tree
Showing 12 changed files with 619 additions and 190 deletions.
98 changes: 64 additions & 34 deletions cpp/include/cuml/cluster/hdbscan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,8 @@ class hdbscan_output : public robust_single_linkage_output<value_idx, value_t> {
handle_, n_leaves_, labels_, children_, sizes_, deltas_, mst_src_, mst_dst_, mst_weights_),
probabilities(probabilities_),
stabilities(0, handle_.get_stream()),
condensed_tree(handle_, n_leaves_)
condensed_tree(handle_, n_leaves_),
inverse_label_map(0, handle_.get_stream())
{
}

Expand All @@ -276,6 +277,9 @@ class hdbscan_output : public robust_single_linkage_output<value_idx, value_t> {
// it much easier to use / debug.
value_t* get_probabilities() { return probabilities; }
value_t* get_stabilities() { return stabilities.data(); }
value_idx* get_inverse_label_map() { return inverse_label_map.data(); }
// internal function
rmm::device_uvector<value_idx>& _get_inverse_label_map() { return inverse_label_map; }

/**
* Once n_clusters is known, the stabilities array
Expand All @@ -293,6 +297,9 @@ class hdbscan_output : public robust_single_linkage_output<value_idx, value_t> {

private:
value_t* probabilities; // size n_leaves
// inversely maps normalized labels to pre-normalized labels
// used for out-of-sample prediction
rmm::device_uvector<value_idx> inverse_label_map; // size n_clusters

// Size not known ahead of time. Initialize
// with `initialize_stabilities()` method.
Expand All @@ -316,14 +323,14 @@ template class CondensedHierarchy<int, float>;
template <typename value_idx, typename value_t>
class PredictionData {
public:
PredictionData(const raft::handle_t& handle_, value_idx m, value_idx n)
PredictionData(const raft::handle_t& handle_, value_idx m, value_idx n, value_t* core_dists_)
: handle(handle_),
exemplar_idx(0, handle.get_stream()),
exemplar_label_offsets(0, handle.get_stream()),
n_selected_clusters(0),
selected_clusters(0, handle.get_stream()),
deaths(0, handle.get_stream()),
core_dists(m, handle.get_stream()),
core_dists(core_dists_),
index_into_children(0, handle.get_stream()),
n_exemplars(0),
n_rows(m),
Expand All @@ -342,7 +349,7 @@ class PredictionData {
value_idx* get_exemplar_label_offsets() { return exemplar_label_offsets.data(); }
value_idx* get_selected_clusters() { return selected_clusters.data(); }
value_t* get_deaths() { return deaths.data(); }
value_t* get_core_dists() { return core_dists.data(); }
value_t* get_core_dists() { return core_dists; }
value_idx* get_index_into_children() { return index_into_children.data(); }

/**
Expand Down Expand Up @@ -376,18 +383,18 @@ class PredictionData {
value_idx n_selected_clusters;
rmm::device_uvector<value_idx> selected_clusters;
rmm::device_uvector<value_t> deaths;
rmm::device_uvector<value_t> core_dists;
value_t* core_dists;
rmm::device_uvector<value_idx> index_into_children;
};

template class PredictionData<int, float>;

void build_prediction_data(const raft::handle_t& handle,
CondensedHierarchy<int, float>& condensed_tree,
int* labels,
int* label_map,
int n_selected_clusters,
PredictionData<int, float>& prediction_data);
void generate_prediction_data(const raft::handle_t& handle,
CondensedHierarchy<int, float>& condensed_tree,
int* labels,
int* inverse_label_map,
int n_selected_clusters,
PredictionData<int, float>& prediction_data);

}; // namespace Common
}; // namespace HDBSCAN
Expand All @@ -413,36 +420,14 @@ void build_prediction_data(const raft::handle_t& handle,
* @param params struct of configuration hyper-parameters
* @param out struct of output data and arrays on device
*/
void hdbscan(const raft::handle_t& handle,
const float* X,
size_t m,
size_t n,
raft::distance::DistanceType metric,
HDBSCAN::Common::HDBSCANParams& params,
HDBSCAN::Common::hdbscan_output<int, float>& out);

/**
* Executes HDBSCAN clustering on an mxn-dimensional input array, X, then builds the PredictionData
* object which computes and stores information needed later for prediction algorithms.
*
* @param[in] handle raft handle for resource reuse
* @param[in] X array (size m, n) on device in row-major format
* @param m number of rows in X
* @param n number of columns in X
* @param metric distance metric to use
* @param params struct of configuration hyper-parameters
* @param out struct of output data and arrays on device
* @param prediction_data_ struct for storing computing and storing information to be used during
* prediction
*/
void hdbscan(const raft::handle_t& handle,
const float* X,
size_t m,
size_t n,
raft::distance::DistanceType metric,
HDBSCAN::Common::HDBSCANParams& params,
HDBSCAN::Common::hdbscan_output<int, float>& out,
HDBSCAN::Common::PredictionData<int, float>& prediction_data_);
float* core_dists);

void build_condensed_hierarchy(const raft::handle_t& handle,
const int* children,
Expand Down Expand Up @@ -485,4 +470,49 @@ void out_of_sample_predict(const raft::handle_t& handle,
int min_samples,
int* out_labels,
float* out_probabilities);

namespace HDBSCAN::HELPER {

/**
* @brief Compute the core distances for each point in the training matrix
*
* @param[in] handle raft handle for resource reuse
* @param[in] X array (size m, n) on device in row-major format
* @param[out] core_dists array (size m, 1) of core distances
* @param m number of rows in X
* @param n number of columns in X
* @param metric distance metric to use
* @param min_samples minimum number of samples to use for computing core distances
*/
void compute_core_dists(const raft::handle_t& handle,
const float* X,
float* core_dists,
size_t m,
size_t n,
raft::distance::DistanceType metric,
int min_samples);

/**
* @brief Compute the map from final, normalize labels to the labels in the CondensedHierarchy
*
* @param[in] handle raft handle for resource reuse
* @param[in] condensed_tree the Condensed Hiearchy object
* @param[in] n_leaves number of leaves in the input data
* @param[in] cluster_selection_method cluster selection method
* @param[out] inverse_label_map rmm::device_uvector of size 0. It will be resized during the
* computation
* @param[in] allow_single_cluster allow single cluster
* @param[in] max_cluster_size max cluster size
* @param[in] cluster_selection_epsilon cluster selection epsilon
*/
void compute_inverse_label_map(const raft::handle_t& handle,
HDBSCAN::Common::CondensedHierarchy<int, float>& condensed_tree,
size_t n_leaves,
HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
rmm::device_uvector<int>& inverse_label_map,
bool allow_single_cluster,
int max_cluster_size,
float cluster_selection_epsilon);

} // namespace HDBSCAN::HELPER
} // END namespace ML
69 changes: 68 additions & 1 deletion cpp/src/hdbscan/detail/extract.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,60 @@ void do_labelling_on_host(const raft::handle_t& handle,
raft::update_device(labels, result.data(), n_leaves, stream);
}

/*
@brief Internal function compute inverse_label_map for CPU/GPU interop
*/
template <typename value_idx, typename value_t>
void _compute_inverse_label_map(const raft::handle_t& handle,
Common::CondensedHierarchy<value_idx, value_t>& condensed_tree,
size_t n_leaves,
Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
rmm::device_uvector<value_idx>& inverse_label_map,
bool allow_single_cluster,
value_idx max_cluster_size,
value_t cluster_selection_epsilon)
{
auto stream = handle.get_stream();
rmm::device_uvector<value_t> tree_stabilities(condensed_tree.get_n_clusters(),
handle.get_stream());
Stability::compute_stabilities(handle, condensed_tree, tree_stabilities.data());
rmm::device_uvector<int> is_cluster(condensed_tree.get_n_clusters(), handle.get_stream());

if (max_cluster_size <= 0) max_cluster_size = n_leaves; // negates the max cluster size

Select::select_clusters(handle,
condensed_tree,
tree_stabilities.data(),
is_cluster.data(),
cluster_selection_method,
allow_single_cluster,
max_cluster_size,
cluster_selection_epsilon);

std::vector<int> is_cluster_h(is_cluster.size());
raft::update_host(is_cluster_h.data(), is_cluster.data(), is_cluster_h.size(), stream);
handle.sync_stream(stream);

std::set<value_idx> clusters;
for (std::size_t i = 0; i < is_cluster_h.size(); i++) {
if (is_cluster_h[i] != 0) { clusters.insert(i + n_leaves); }
}

std::vector<value_idx> inverse_label_map_h(clusters.size(), -1);
value_idx i = 0;
// creating inverse index between
// original and final labels
for (const value_idx cluster : clusters) {
inverse_label_map_h[i] = cluster - n_leaves;
i++;
}

// resizing is to n_clusters
inverse_label_map.resize(clusters.size(), stream);
raft::copy(
inverse_label_map.data(), inverse_label_map_h.data(), inverse_label_map_h.size(), stream);
}

/**
* Compute cluster stabilities, perform cluster selection, and
* label the resulting clusters. In addition, probabilities
Expand All @@ -191,8 +245,11 @@ void do_labelling_on_host(const raft::handle_t& handle,
* @param[out] labels array of labels on device (size n_leaves)
* @param[out] stabilities array of stabilities on device (size n_clusters)
* @param[out] probabilities array of probabilities on device (size n_leaves)
* @param[out] label_map array mapping condensed label ids to selected label ids (size n_leaves)
* @param[out] label_map array mapping condensed label ids to selected label ids (size
* n_condensed_trees)
* @param[in] cluster_selection_method method to use for cluster selection
* @param[out] inverse_label_map array mapping final label ids to condensed label ids, used for
* prediction APIs (size n_clusters)
* @param[in] allow_single_cluster allows a single cluster to be returned (rather than just noise)
* @param[in] max_cluster_size maximium number of points that can be considered in a cluster before
* it is split into multiple sub-clusters.
Expand All @@ -208,6 +265,7 @@ value_idx extract_clusters(const raft::handle_t& handle,
value_t* probabilities,
value_idx* label_map,
Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
rmm::device_uvector<value_idx>& inverse_label_map,
bool allow_single_cluster = false,
value_idx max_cluster_size = 0,
value_t cluster_selection_epsilon = 0.0)
Expand Down Expand Up @@ -239,12 +297,21 @@ value_idx extract_clusters(const raft::handle_t& handle,
}

std::vector<value_idx> label_map_h(condensed_tree.get_n_clusters(), -1);
std::vector<value_idx> inverse_label_map_h(clusters.size(), -1);
value_idx i = 0;
// creating forward and inverse index between
// original and final labels
for (const value_idx cluster : clusters) {
label_map_h[cluster - n_leaves] = i;
inverse_label_map_h[i] = cluster - n_leaves;
i++;
}

// resizing is to n_clusters
inverse_label_map.resize(clusters.size(), stream);
raft::copy(
inverse_label_map.data(), inverse_label_map_h.data(), inverse_label_map_h.size(), stream);

raft::copy(label_map, label_map_h.data(), label_map_h.size(), stream);
do_labelling_on_host<value_idx, value_t>(handle,
condensed_tree,
Expand Down
28 changes: 28 additions & 0 deletions cpp/src/hdbscan/detail/reachability.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,34 @@ void compute_knn(const raft::handle_t& handle,
[] __device__(int64_t in) -> value_idx { return in; });
}

/*
@brief Internal function for CPU->GPU interop
to compute core_dists
*/
template <typename value_idx, typename value_t>
void _compute_core_dists(const raft::handle_t& handle,
const value_t* X,
value_t* core_dists,
size_t m,
size_t n,
raft::distance::DistanceType metric,
int min_samples)
{
RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
"Currently only L2 expanded distance is supported");

auto stream = handle.get_stream();

rmm::device_uvector<value_idx> inds(min_samples * m, stream);
rmm::device_uvector<value_t> dists(min_samples * m, stream);

// perform knn
compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric);

// Slice core distances (distances to kth nearest neighbor)
core_distances<value_idx>(dists.data(), min_samples, min_samples, m, core_dists, stream);
}

/**
* Constructs a mutual reachability graph, which is a k-nearest neighbors
* graph projected into mutual reachability space using the following
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/hdbscan/detail/stabilities.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ void get_stability_scores(const raft::handle_t& handle,
/**
* 1. Populate cluster sizes
*/
rmm::device_uvector<value_idx> cluster_sizes(n_leaves, handle.get_stream());
rmm::device_uvector<value_idx> cluster_sizes(n_condensed_clusters, handle.get_stream());
thrust::fill(exec_policy, cluster_sizes.data(), cluster_sizes.data() + cluster_sizes.size(), 0);

value_idx* sizes = cluster_sizes.data();
Expand Down
Loading

0 comments on commit d2e7b8e

Please sign in to comment.