Merge pull request #1163 from cjnolet/bug-ext-dbscan_batch_fix
[REVIEW] More DBSCAN fixes. Exposing both int and long variants through the API. Will need to find a good way to allow the user to select one through the Python API
cjnolet authored Oct 2, 2019
2 parents 45f554b + d57a667 commit 1ac1f33
Showing 27 changed files with 435 additions and 189 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -25,6 +25,7 @@
- PR #1142: prims: expose separate InType and OutType for unaryOp and binaryOp
- PR #1115: Moving dask_make_blobs to cuml.dask.datasets. Adding conversion to dask.DataFrame
- PR #1136: CUDA 10.1 CI updates
+- PR #1163: Some more correctness improvements. Better verbose printing
- PR #1165: Adding except + in all remaining cython
- PR #1173: Docs: Barnes Hut TSNE documentation
- PR #1176: Use new RMM API based on Cython
6 changes: 3 additions & 3 deletions cpp/examples/dbscan/dbscan_example.cpp
@@ -241,10 +241,10 @@ int main(int argc, char* argv[]) {
cumlHandle.setStream(stream);

std::vector<int> h_labels(nRows);
-long* d_labels = nullptr;
+int* d_labels = nullptr;
float* d_inputData = nullptr;

-CUDA_RT_CALL(cudaMalloc(&d_labels, nRows * sizeof(long)));
+CUDA_RT_CALL(cudaMalloc(&d_labels, nRows * sizeof(int)));
CUDA_RT_CALL(cudaMalloc(&d_inputData, nRows * nCols * sizeof(float)));
CUDA_RT_CALL(cudaMemcpyAsync(d_inputData, h_inputData.data(),
nRows * nCols * sizeof(float),
@@ -259,7 +259,7 @@ int main(int argc, char* argv[]) {

ML::dbscanFit(cumlHandle, d_inputData, nRows, nCols, eps, minPts, d_labels,
max_bytes_per_batch, false);
-CUDA_RT_CALL(cudaMemcpyAsync(h_labels.data(), d_labels, nRows * sizeof(long),
+CUDA_RT_CALL(cudaMemcpyAsync(h_labels.data(), d_labels, nRows * sizeof(int),
cudaMemcpyDeviceToHost, stream));
CUDA_RT_CALL(cudaStreamSynchronize(stream));

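For contrast, here is a minimal sketch (not part of this diff) of the same flow through the new 64-bit overload; it assumes the example's CUDA_RT_CALL macro, stream, and d_inputData setup shown above, with the sizes cast explicitly so overload resolution picks the int64_t variant:

// Hypothetical 64-bit variant of the example above.
std::vector<int64_t> h_labels64(nRows);
int64_t* d_labels64 = nullptr;
CUDA_RT_CALL(cudaMalloc(&d_labels64, nRows * sizeof(int64_t)));
ML::dbscanFit(cumlHandle, d_inputData, (int64_t)nRows, (int64_t)nCols, eps,
              minPts, d_labels64, max_bytes_per_batch, false);
CUDA_RT_CALL(cudaMemcpyAsync(h_labels64.data(), d_labels64,
                             nRows * sizeof(int64_t), cudaMemcpyDeviceToHost,
                             stream));
CUDA_RT_CALL(cudaStreamSynchronize(stream));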
37 changes: 29 additions & 8 deletions cpp/src/datasets/make_blobs.cu
@@ -23,19 +23,19 @@
namespace ML {
namespace Datasets {

-void make_blobs(const cumlHandle& handle, float* out, long* labels, long n_rows,
-long n_cols, long n_clusters, const float* centers,
-const float* cluster_std, const float cluster_std_scalar,
-bool shuffle, float center_box_min, float center_box_max,
-uint64_t seed) {
+void make_blobs(const cumlHandle& handle, float* out, int64_t* labels,
+int64_t n_rows, int64_t n_cols, int64_t n_clusters,
+const float* centers, const float* cluster_std,
+const float cluster_std_scalar, bool shuffle,
+float center_box_min, float center_box_max, uint64_t seed) {
MLCommon::Random::make_blobs(out, labels, n_rows, n_cols, n_clusters,
handle.getDeviceAllocator(), handle.getStream(),
centers, cluster_std, cluster_std_scalar,
shuffle, center_box_min, center_box_max, seed);
}

-void make_blobs(const cumlHandle& handle, double* out, long* labels,
-long n_rows, long n_cols, long n_clusters,
+void make_blobs(const cumlHandle& handle, double* out, int64_t* labels,
+int64_t n_rows, int64_t n_cols, int64_t n_clusters,
const double* centers, const double* cluster_std,
const double cluster_std_scalar, bool shuffle,
double center_box_min, double center_box_max, uint64_t seed) {
@@ -45,5 +45,26 @@ void make_blobs(const cumlHandle& handle, double* out, long* labels,
shuffle, center_box_min, center_box_max, seed);
}

+void make_blobs(const cumlHandle& handle, float* out, int* labels, int n_rows,
+int n_cols, int n_clusters, const float* centers,
+const float* cluster_std, const float cluster_std_scalar,
+bool shuffle, float center_box_min, float center_box_max,
+uint64_t seed) {
+MLCommon::Random::make_blobs(out, labels, n_rows, n_cols, n_clusters,
+handle.getDeviceAllocator(), handle.getStream(),
+centers, cluster_std, cluster_std_scalar,
+shuffle, center_box_min, center_box_max, seed);
+}
+
+void make_blobs(const cumlHandle& handle, double* out, int* labels, int n_rows,
+int n_cols, int n_clusters, const double* centers,
+const double* cluster_std, const double cluster_std_scalar,
+bool shuffle, double center_box_min, double center_box_max,
+uint64_t seed) {
+MLCommon::Random::make_blobs(out, labels, n_rows, n_cols, n_clusters,
+handle.getDeviceAllocator(), handle.getStream(),
+centers, cluster_std, cluster_std_scalar,
+shuffle, center_box_min, center_box_max, seed);
+}
} // namespace Datasets
-} // end namespace ML
+} // namespace ML
22 changes: 17 additions & 5 deletions cpp/src/datasets/make_blobs.hpp
@@ -49,21 +49,33 @@ namespace Datasets {
* centers. Useful only if 'centers' is nullptr
* @param seed seed for the RNG
*/
-void make_blobs(const cumlHandle& handle, float* out, long* labels, long n_rows,
-long n_cols, long n_clusters, const float* centers = nullptr,
+void make_blobs(const cumlHandle& handle, float* out, int64_t* labels,
+int64_t n_rows, int64_t n_cols, int64_t n_clusters,
+const float* centers = nullptr,
const float* cluster_std = nullptr,
const float cluster_std_scalar = 1.f, bool shuffle = true,
float center_box_min = 10.f, float center_box_max = 10.f,
uint64_t seed = 0ULL);

-void make_blobs(const cumlHandle& handle, double* out, long* labels,
-long n_rows, long n_cols, long n_clusters,
+void make_blobs(const cumlHandle& handle, double* out, int64_t* labels,
+int64_t n_rows, int64_t n_cols, int64_t n_clusters,
const double* centers = nullptr,
const double* cluster_std = nullptr,
const double cluster_std_scalar = 1.f, bool shuffle = true,
double center_box_min = 10.f, double center_box_max = 10.f,
uint64_t seed = 0ULL);
/** @} */

+void make_blobs(const cumlHandle& handle, float* out, int* labels, int n_rows,
+int n_cols, int n_clusters, const float* centers,
+const float* cluster_std, const float cluster_std_scalar,
+bool shuffle, float center_box_min, float center_box_max,
+uint64_t seed);
+
+void make_blobs(const cumlHandle& handle, double* out, int* labels, int n_rows,
+int n_cols, int n_clusters, const double* centers,
+const double* cluster_std, const double cluster_std_scalar,
+bool shuffle, double center_box_min, double center_box_max,
+uint64_t seed);

} // namespace Datasets
} // namespace ML
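Note that the new 32-bit declarations carry no default arguments, so every parameter must be spelled out at the call site. A minimal usage sketch with hypothetical values; d_out (n_rows * n_cols floats) and d_labels (n_rows ints) are device buffers assumed to be allocated by the caller:

// Generate 8 Gaussian blobs into a 10000 x 16 float dataset (illustrative values).
int n_rows = 10000, n_cols = 16, n_clusters = 8;
ML::Datasets::make_blobs(handle, d_out, d_labels, n_rows, n_cols, n_clusters,
                         /*centers=*/nullptr, /*cluster_std=*/nullptr,
                         /*cluster_std_scalar=*/1.f, /*shuffle=*/true,
                         /*center_box_min=*/-10.f, /*center_box_max=*/10.f,
                         /*seed=*/1234ULL);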
2 changes: 1 addition & 1 deletion cpp/src/dbscan/adjgraph/algo.h
@@ -40,7 +40,7 @@ static const int TPB_X = 256;
* Takes vertex degree array (vd) and CSR row_ind array (ex_scan) to produce the
* CSR row_ind_ptr array (adj_graph) and filters into a core_pts array based on min_pts.
*/
-template <typename Type, typename Index_ = long>
+template <typename Type, typename Index_ = int>
void launcher(const ML::cumlHandle_impl &handle, Pack<Type, Index_> data,
Index_ batchSize, cudaStream_t stream) {
device_ptr<Index_> dev_vd = device_pointer_cast(data.vd);
2 changes: 1 addition & 1 deletion cpp/src/dbscan/adjgraph/naive.h
@@ -26,7 +26,7 @@ namespace Dbscan {
namespace AdjGraph {
namespace Naive {

-template <typename Type, typename Index_ = long>
+template <typename Type, typename Index_ = int>
void launcher(const ML::cumlHandle_impl& handle, Pack<Type, Index_> data,
Index_ batchSize, cudaStream_t stream) {
Index_ k = 0;
2 changes: 1 addition & 1 deletion cpp/src/dbscan/adjgraph/pack.h
@@ -19,7 +19,7 @@
namespace Dbscan {
namespace AdjGraph {

-template <typename Type, typename Index_ = long>
+template <typename Type, typename Index_ = int>
struct Pack {
/**
* vertex degree array
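These three AdjGraph headers flip the default index type from long to int, making 64-bit indexing opt-in. An illustration against the Pack template above (the type aliases are mine, not from the diff):

using Pack32 = Dbscan::AdjGraph::Pack<float>;           // Index_ now defaults to int
using Pack64 = Dbscan::AdjGraph::Pack<float, int64_t>;  // 64-bit must be requested explicitly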
27 changes: 20 additions & 7 deletions cpp/src/dbscan/dbscan.cu
@@ -24,19 +24,32 @@ namespace ML {

using namespace Dbscan;

-// @todo
-// In the below 2 calls, the Index type has been hard-coded to `int64_t`
-// We should pick the right Index type based on the input dimensions.
-void dbscanFit(const cumlHandle &handle, float *input, long n_rows, long n_cols,
-float eps, int min_pts, long *labels, size_t max_bytes_per_batch,
+void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols,
+float eps, int min_pts, int *labels, size_t max_bytes_per_batch,
bool verbose) {
dbscanFitImpl<float, int>(handle.getImpl(), input, n_rows, n_cols, eps,
min_pts, labels, max_bytes_per_batch,
handle.getStream(), verbose);
}

+void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols,
+double eps, int min_pts, int *labels, size_t max_bytes_per_batch,
+bool verbose) {
+dbscanFitImpl<double, int>(handle.getImpl(), input, n_rows, n_cols, eps,
+min_pts, labels, max_bytes_per_batch,
+handle.getStream(), verbose);
+}
+
+void dbscanFit(const cumlHandle &handle, float *input, int64_t n_rows,
+int64_t n_cols, float eps, int min_pts, int64_t *labels,
+size_t max_bytes_per_batch, bool verbose) {
+dbscanFitImpl<float, int64_t>(handle.getImpl(), input, n_rows, n_cols, eps,
+min_pts, labels, max_bytes_per_batch,
+handle.getStream(), verbose);
+}
+
-void dbscanFit(const cumlHandle &handle, double *input, long n_rows,
-long n_cols, double eps, int min_pts, long *labels,
+void dbscanFit(const cumlHandle &handle, double *input, int64_t n_rows,
+int64_t n_cols, double eps, int min_pts, int64_t *labels,
size_t max_bytes_per_batch, bool verbose) {
dbscanFitImpl<double, int64_t>(handle.getImpl(), input, n_rows, n_cols, eps,
min_pts, labels, max_bytes_per_batch,
30 changes: 20 additions & 10 deletions cpp/src/dbscan/dbscan.h
@@ -23,48 +23,58 @@
namespace ML {

using namespace Dbscan;
-static const size_t DEFAULT_MAX_MEM_BYTES = 13e9;
+static const size_t DEFAULT_MAX_MEM_MBYTES = 13e3;

// Default max mem set to a reasonable value for a 16gb card.

-template <typename T, typename Index_ = long>
-Index_ computeBatchCount(Index_ n_rows, size_t max_bytes_per_batch) {
+template <typename T, typename Index_ = int>
+Index_ computeBatchCount(Index_ n_rows, size_t max_mbytes_per_batch) {
Index_ n_batches = 1;
// There seems to be a weird overflow bug with cutlass gemm kernels
// hence, artificially limiting to a smaller batchsize!
///TODO: in future, when we bump up the underlying cutlass version, this should go away
// paving way to cudaMemGetInfo based workspace allocation

-if (max_bytes_per_batch <= 0) max_bytes_per_batch = DEFAULT_MAX_MEM_BYTES;
+if (max_mbytes_per_batch <= 0) max_mbytes_per_batch = DEFAULT_MAX_MEM_MBYTES;

+Index_ MAX_LABEL = std::numeric_limits<Index_>::max();

while (true) {
size_t batchSize = ceildiv<size_t>(n_rows, n_batches);
-if (batchSize * n_rows * sizeof(T) < max_bytes_per_batch || batchSize == 1)
+if (((batchSize * n_rows * sizeof(T) * 1e-6 < max_mbytes_per_batch) &&
+/**
+* Though single precision can be faster per execution of each kernel,
+* there's a trade-off to be made between using single precision with
+* many more batches (which become smaller as n_rows grows) and using
+* double precision, which will have less batches but could become 8-10x
+* slower per batch.
+*/
+(batchSize * n_rows < MAX_LABEL)) ||
+batchSize == 1)
break;
++n_batches;
}
return n_batches;
}

-template <typename T, typename Index_ = long>
+template <typename T, typename Index_ = int>
void dbscanFitImpl(const ML::cumlHandle_impl &handle, T *input, Index_ n_rows,
Index_ n_cols, T eps, int min_pts, Index_ *labels,
-size_t max_bytes_per_batch, cudaStream_t stream,
+size_t max_mbytes_per_batch, cudaStream_t stream,
bool verbose) {
ML::PUSH_RANGE("ML::Dbscan::Fit");
int algoVd = 1;
int algoAdj = 1;
int algoCcl = 2;

// @todo: Query device for remaining memory
-Index_ n_batches = computeBatchCount<T, Index_>(n_rows, max_bytes_per_batch);
+Index_ n_batches = computeBatchCount<T, Index_>(n_rows, max_mbytes_per_batch);

if (verbose) {
Index_ batchSize = ceildiv<Index_>(n_rows, n_batches);
if (n_batches > 1) {
std::cout << "Running batched training on " << n_batches
<< " batches w/ ";
-std::cout << batchSize * n_rows * sizeof(T) << " bytes." << std::endl;
+std::cout << batchSize * n_rows * sizeof(T) * 1e-6 << " megabytes." << std::endl;
}
}

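The new acceptance test is twofold: a batch of the pairwise-distance matrix must fit under the memory cap (now denominated in megabytes), and batchSize * n_rows must stay below the index type's maximum so the adjacency matrix can be labeled without overflow. A self-contained sketch of that rule as I read it from the loop above (renamed so as not to claim it is the shipped computeBatchCount):

#include <cstddef>
#include <cstdint>
#include <limits>

// Grow the batch count until one (batchSize x n_rows) block of T fits the
// megabyte cap and its element count stays below the Index_ maximum.
template <typename T, typename Index_ = int>
Index_ batch_count_sketch(Index_ n_rows, size_t max_mbytes_per_batch) {
  const size_t MAX_LABEL = (size_t)std::numeric_limits<Index_>::max();
  Index_ n_batches = 1;
  while (true) {
    size_t batchSize = ((size_t)n_rows + n_batches - 1) / n_batches;  // ceildiv
    bool fits_mem = batchSize * (size_t)n_rows * sizeof(T) * 1e-6 <
                    (double)max_mbytes_per_batch;
    bool fits_index = batchSize * (size_t)n_rows < MAX_LABEL;
    if ((fits_mem && fits_index) || batchSize == 1) break;
    ++n_batches;
  }
  return n_batches;
}

Worked example: with Index_ = int and n_rows = 100000, a single float batch spans 1e10 matrix entries, far above INT_MAX (about 2.15e9), so the index test alone forces at least 5 batches (batchSize <= 21474), even though the roughly 8000 MB of distances would fit under the 13000 MB default cap.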
18 changes: 13 additions & 5 deletions cpp/src/dbscan/dbscan.hpp
@@ -29,18 +29,26 @@ namespace ML {
* @param[in] eps the epsilon value to use for epsilon-neighborhood determination
* @param[in] min_pts minimum number of points to determine a cluster
* @param[out] labels (size n_rows) output labels array
-* @param[in] max_mem_bytes: the maximum number of bytes to be used for each batch of
+* @param[in] max_mem_mbytes: the maximum number of megabytes to be used for each batch of
* the pairwise distance calculation. This enables the trade off between
* memory usage and algorithm execution time.
* @param[in] verbose: print useful information as algorithm executes
* @{
*/
-void dbscanFit(const cumlHandle &handle, float *input, long n_rows, long n_cols,
-float eps, int min_pts, long *labels,
+void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols,
+float eps, int min_pts, int *labels,
size_t max_bytes_per_batch = 0, bool verbose = false);
-void dbscanFit(const cumlHandle &handle, double *input, long n_rows,
-long n_cols, double eps, int min_pts, long *labels,
+void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols,
+double eps, int min_pts, int *labels,
size_t max_bytes_per_batch = 0, bool verbose = false);
+
+void dbscanFit(const cumlHandle &handle, float *input, int64_t n_rows,
+int64_t n_cols, float eps, int min_pts, int64_t *labels,
+size_t max_bytes_per_batch = 0, bool verbose = false);
+void dbscanFit(const cumlHandle &handle, double *input, int64_t n_rows,
+int64_t n_cols, double eps, int min_pts, int64_t *labels,
+size_t max_bytes_per_batch = 0, bool verbose = false);
+
/** @} */

} // namespace ML
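From C++, the 32-bit or 64-bit path is picked by ordinary overload resolution on the size and label types; per the commit message, Python will still need an explicit selection mechanism since it has no overloading. A hedged sketch, assuming d_input, d_labels32, and d_labels64 are device buffers already allocated by the caller:

int n_rows = 50000, n_cols = 32;  // hypothetical sizes
// Binds to the int overload:
ML::dbscanFit(handle, d_input, n_rows, n_cols, /*eps=*/0.5f, /*min_pts=*/5,
              d_labels32);
// Binds to the int64_t overload:
ML::dbscanFit(handle, d_input, (int64_t)n_rows, (int64_t)n_cols, /*eps=*/0.5f,
              /*min_pts=*/5, d_labels64);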
8 changes: 4 additions & 4 deletions cpp/src/dbscan/dbscan_api.cpp
@@ -18,8 +18,8 @@
#include "common/cumlHandle.hpp"
#include "dbscan.hpp"

-cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, long n_rows,
-long n_cols, float eps, int min_pts, long *labels,
+cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, int n_rows,
+int n_cols, float eps, int min_pts, int *labels,
size_t max_bytes_per_batch, int verbose) {
cumlError_t status;
ML::cumlHandle *handle_ptr;
@@ -42,8 +42,8 @@ cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, long n_rows,
return status;
}

-cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, long n_rows,
-long n_cols, double eps, int min_pts, long *labels,
+cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, int n_rows,
+int n_cols, double eps, int min_pts, int *labels,
size_t max_bytes_per_batch, int verbose) {
cumlError_t status;
ML::cumlHandle *handle_ptr;
8 changes: 4 additions & 4 deletions cpp/src/dbscan/dbscan_api.h
@@ -39,11 +39,11 @@ extern "C" {
* @return CUML_SUCCESS on success and other corresponding flags upon any failures.
* @{
*/
-cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, long n_rows,
-long n_cols, float eps, int min_pts, long *labels,
+cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, int n_rows,
+int n_cols, float eps, int min_pts, int *labels,
size_t max_bytes_per_batch, int verbose);
-cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, long n_rows,
-long n_cols, double eps, int min_pts, long *labels,
+cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, int n_rows,
+int n_cols, double eps, int min_pts, int *labels,
size_t max_bytes_per_batch, int verbose);
/** @} */

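A short usage sketch of the updated C API, with illustrative values; handle creation, device allocation, and error reporting are assumed to happen elsewhere:

int n_rows = 10000, n_cols = 8;  /* hypothetical sizes */
float *d_input = NULL;           /* assumed: device buffer of n_rows * n_cols floats */
int *d_labels = NULL;            /* assumed: device buffer of n_rows ints */
cumlError_t err = cumlSpDbscanFit(handle, d_input, n_rows, n_cols,
                                  /*eps=*/0.5f, /*min_pts=*/5, d_labels,
                                  /*max_bytes_per_batch=*/0, /*verbose=*/0);
if (err != CUML_SUCCESS) { /* handle the failure */ }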
