From 09a2fcefbc3df358b5e6aff34beff497a63fb5e0 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Tue, 24 Sep 2019 22:57:55 -0700 Subject: [PATCH 01/63] Separate Input and Output type for unaryOp --- cpp/src_prims/linalg/unary_op.h | 80 ++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/cpp/src_prims/linalg/unary_op.h b/cpp/src_prims/linalg/unary_op.h index 24860e6fae..bf06ed76ec 100644 --- a/cpp/src_prims/linalg/unary_op.h +++ b/cpp/src_prims/linalg/unary_op.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2019, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,42 +16,47 @@ #pragma once +#include #include "cuda_utils.h" #include "vectorized.h" namespace MLCommon { namespace LinAlg { -template -__global__ void unaryOpKernel(math_t *out, const math_t *in, IdxType len, +template +__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, Lambda op) { - typedef TxN_t VecType; - VecType a; + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a; + OutVecType b; IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - idx *= VecType::Ratio; + idx *= InVecType::Ratio; if (idx >= len) return; a.load(in, idx); #pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) { - a.val.data[i] = op(a.val.data[i]); + for (int i = 0; i < InVecType::Ratio; ++i) { + b.val.data[i] = op(a.val.data[i]); } - a.store(out, idx); + b.store(out, idx); } -template -void unaryOpImpl(math_t *out, const math_t *in, IdxType len, Lambda op, +template +void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, cudaStream_t stream) { - const IdxType nblks = ceildiv(veclen_ ? len / veclen_ : len, (IdxType)TPB); - unaryOpKernel + const IdxType nblks = ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); + unaryOpKernel <<>>(out, in, len, op); CUDA_CHECK(cudaPeekAtLastError()); } /** * @brief perform element-wise unary operation in the input array - * @tparam math_t data-type upon which the math operation will be performed + * @tparam InType input data-type * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType output data-type * @tparam IdxType Integer type used to for addressing * @tparam TPB threads-per-block in the final kernel launched * @param out the output array @@ -59,36 +64,41 @@ void unaryOpImpl(math_t *out, const math_t *in, IdxType len, Lambda op, * @param len number of elements in the input array * @param op the device-lambda * @param stream cuda stream where to launch work + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val);` */ -template -void unaryOp(math_t *out, const math_t *in, IdxType len, Lambda op, +template +void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op, cudaStream_t stream) { if (len <= 0) return; //silently skip in case of 0 length input - size_t bytes = len * sizeof(math_t); + constexpr auto minSize = + sizeof(InType) < sizeof(OutType) ? 
sizeof(InType) : sizeof(OutType); + size_t bytes = len * minSize; uint64_t inAddr = uint64_t(in); uint64_t outAddr = uint64_t(out); - if (16 / sizeof(math_t) && bytes % 16 == 0 && inAddr % 16 == 0 && + if (16 / minSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { - unaryOpImpl(out, in, len, - op, stream); - } else if (8 / sizeof(math_t) && bytes % 8 == 0 && inAddr % 8 == 0 && + unaryOpImpl( + out, in, len, op, stream); + } else if (8 / minSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { - unaryOpImpl(out, in, len, - op, stream); - } else if (4 / sizeof(math_t) && bytes % 4 == 0 && inAddr % 4 == 0 && + unaryOpImpl( + out, in, len, op, stream); + } else if (4 / minSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { - unaryOpImpl(out, in, len, - op, stream); - } else if (2 / sizeof(math_t) && bytes % 2 == 0 && inAddr % 2 == 0 && + unaryOpImpl( + out, in, len, op, stream); + } else if (2 / minSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { - unaryOpImpl(out, in, len, - op, stream); - } else if (1 / sizeof(math_t)) { - unaryOpImpl(out, in, len, - op, stream); + unaryOpImpl( + out, in, len, op, stream); + } else if (1 / minSize) { + unaryOpImpl( + out, in, len, op, stream); } else { - unaryOpImpl(out, in, len, op, stream); + unaryOpImpl(out, in, len, op, + stream); } } From 8bfd35c29d7426921060b37a5d35e3252f1b1746 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Tue, 24 Sep 2019 23:01:05 -0700 Subject: [PATCH 02/63] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01f4dd36b0..eed5c246ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ - PR #1076: Paying off some UMAP / Spectral tech debt. 
- PR #1086: Ensure RegressorMixin scorer uses device arrays - PR #1114: K-means: Exposing useful params, removing unused params, proxying params in Dask +- PR #1142: prims: expose separate InType and OutType for unaryOp and binaryOp ## Bug Fixes From 832227d283e9d8ebb90143feb1cbb602a6a8be7a Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Wed, 25 Sep 2019 01:28:54 -0700 Subject: [PATCH 03/63] unit-tests to verify different input and output types for unaryOp --- cpp/test/prims/unary_op.cu | 33 ++++++++++++++++++++++++--------- cpp/test/prims/unary_op.h | 23 ++++++++++++----------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/cpp/test/prims/unary_op.cu b/cpp/test/prims/unary_op.cu index 62d925fbc7..ebf7d955c9 100644 --- a/cpp/test/prims/unary_op.cu +++ b/cpp/test/prims/unary_op.cu @@ -26,18 +26,22 @@ namespace LinAlg { // Or else, we get the following compilation error // for an extended __device__ lambda cannot have private or protected access // within its class -template -void unaryOpLaunch(T *out, const T *in, T scalar, IdxType len, +template +void unaryOpLaunch(OutType *out, const InType *in, InType scalar, IdxType len, cudaStream_t stream) { - unaryOp( - out, in, len, [scalar] __device__(T in) { return in * scalar; }, stream); + auto op = [scalar] __device__(InType in) { + return static_cast(in * scalar); + }; + unaryOp(out, in, len, op, stream); } -template -class UnaryOpTest : public ::testing::TestWithParam> { +template +class UnaryOpTest + : public ::testing::TestWithParam> { protected: void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam< + UnaryOpInputs>::GetParam(); Random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -61,8 +65,9 @@ class UnaryOpTest : public ::testing::TestWithParam> { } protected: - UnaryOpInputs params; - T *in, *out_ref, *out; + UnaryOpInputs params; + InType *in; + OutType *out_ref, *out; }; const std::vector> 
inputsf_i32 = { @@ -85,6 +90,16 @@ TEST_P(UnaryOpTestF_i64, Result) { INSTANTIATE_TEST_CASE_P(UnaryOpTests, UnaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); +const std::vector> inputsf_i32_d = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef UnaryOpTest UnaryOpTestF_i32_D; +TEST_P(UnaryOpTestF_i32_D, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(UnaryOpTests, UnaryOpTestF_i32_D, + ::testing::ValuesIn(inputsf_i32_d)); + const std::vector> inputsd_i32 = { {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; typedef UnaryOpTest UnaryOpTestD_i32; diff --git a/cpp/test/prims/unary_op.h b/cpp/test/prims/unary_op.h index bc08a56d59..24e5e2ec1c 100644 --- a/cpp/test/prims/unary_op.h +++ b/cpp/test/prims/unary_op.h @@ -22,35 +22,36 @@ namespace MLCommon { namespace LinAlg { -template -__global__ void naiveScaleKernel(Type *out, const Type *in, Type scalar, +template +__global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, IdxType len) { IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); if (idx < len) { - out[idx] = scalar * in[idx]; + out[idx] = static_cast(scalar * in[idx]); } } -template -void naiveScale(Type *out, const Type *in, Type scalar, int len, +template +void naiveScale(OutType *out, const InType *in, InType scalar, int len, cudaStream_t stream) { static const int TPB = 64; int nblks = ceildiv(len, TPB); - naiveScaleKernel<<>>(out, in, scalar, len); + naiveScaleKernel + <<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } -template +template struct UnaryOpInputs { - T tolerance; + InType tolerance; IdxType len; - T scalar; + InType scalar; unsigned long long int seed; }; -template +template ::std::ostream &operator<<(::std::ostream &os, - const UnaryOpInputs &dims) { + const UnaryOpInputs &d) { return os; } From 6e97fb626e8b5eddf0b9e845c5b58ffb56b37be6 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Wed, 25 Sep 2019 01:39:20 
-0700 Subject: [PATCH 04/63] fixed the max size comparison bug in unaryOp --- cpp/src_prims/linalg/unary_op.h | 27 +++++++++++++-------------- cpp/test/prims/unary_op.cu | 2 +- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/cpp/src_prims/linalg/unary_op.h b/cpp/src_prims/linalg/unary_op.h index bf06ed76ec..35b1fdd9cd 100644 --- a/cpp/src_prims/linalg/unary_op.h +++ b/cpp/src_prims/linalg/unary_op.h @@ -16,7 +16,6 @@ #pragma once -#include #include "cuda_utils.h" #include "vectorized.h" @@ -72,29 +71,29 @@ template = sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; uint64_t inAddr = uint64_t(in); uint64_t outAddr = uint64_t(out); - if (16 / minSize && bytes % 16 == 0 && inAddr % 16 == 0 && + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { - unaryOpImpl( + unaryOpImpl( out, in, len, op, stream); - } else if (8 / minSize && bytes % 8 == 0 && inAddr % 8 == 0 && + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { - unaryOpImpl( + unaryOpImpl( out, in, len, op, stream); - } else if (4 / minSize && bytes % 4 == 0 && inAddr % 4 == 0 && + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { - unaryOpImpl( + unaryOpImpl( out, in, len, op, stream); - } else if (2 / minSize && bytes % 2 == 0 && inAddr % 2 == 0 && + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { - unaryOpImpl( + unaryOpImpl( out, in, len, op, stream); - } else if (1 / minSize) { - unaryOpImpl( + } else if (1 / maxSize) { + unaryOpImpl( out, in, len, op, stream); } else { unaryOpImpl(out, in, len, op, diff --git a/cpp/test/prims/unary_op.cu b/cpp/test/prims/unary_op.cu index ebf7d955c9..e91399c75a 100644 --- a/cpp/test/prims/unary_op.cu +++ b/cpp/test/prims/unary_op.cu @@ -52,7 +52,7 @@ class UnaryOpTest allocate(in, len); allocate(out_ref, len); allocate(out, len); - r.uniform(in, len, T(-1.0), T(1.0), stream); + r.uniform(in, len, 
InType(-1.0), InType(1.0), stream); naiveScale(out_ref, in, scalar, len, stream); unaryOpLaunch(out, in, scalar, len, stream); CUDA_CHECK(cudaStreamDestroy(stream)); From dda78c1a232b295ebd4d2dbfb52ee52facfcdb38 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Thu, 26 Sep 2019 22:05:26 -0700 Subject: [PATCH 05/63] Enable binaryOp to also support different input and output types --- cpp/src_prims/linalg/binary_op.h | 68 +++++++++++++++++--------------- cpp/test/prims/binary_op.cu | 28 +++++++++---- cpp/test/prims/binary_op.h | 22 +++++------ cpp/test/prims/unary_op.cu | 2 - 4 files changed, 68 insertions(+), 52 deletions(-) diff --git a/cpp/src_prims/linalg/binary_op.h b/cpp/src_prims/linalg/binary_op.h index 9f799e076c..3a60519d1c 100644 --- a/cpp/src_prims/linalg/binary_op.h +++ b/cpp/src_prims/linalg/binary_op.h @@ -22,37 +22,41 @@ namespace MLCommon { namespace LinAlg { -template -__global__ void binaryOpKernel(math_t *out, const math_t *in1, - const math_t *in2, IdxType len, Lambda op) { - typedef TxN_t VecType; - VecType a, b; +template +__global__ void binaryOpKernel(OutType *out, const InType *in1, + const InType *in2, IdxType len, Lambda op) { + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a, b; + OutVecType c; IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - idx *= VecType::Ratio; + idx *= InVecType::Ratio; if (idx >= len) return; a.load(in1, idx); b.load(in2, idx); #pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) { - a.val.data[i] = op(a.val.data[i], b.val.data[i]); + for (int i = 0; i < InVecType::Ratio; ++i) { + c.val.data[i] = op(a.val.data[i], b.val.data[i]); } - a.store(out, idx); + c.store(out, idx); } -template -void binaryOpImpl(math_t *out, const math_t *in1, const math_t *in2, +template +void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, IdxType len, Lambda op, cudaStream_t stream) { - const IdxType nblks = ceildiv(veclen_ ? 
len / veclen_ : len, (IdxType)TPB); - binaryOpKernel + const IdxType nblks = ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); + binaryOpKernel <<>>(out, in1, in2, len, op); CUDA_CHECK(cudaPeekAtLastError()); } /** * @brief perform element-wise binary operation on the input arrays - * @tparam math_t data-type upon which the math operation will be performed + * @tparam InType input data-type * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType output data-type * @tparam IdxType Integer type used to for addressing * @tparam TPB threads-per-block in the final kernel launched * @param out the output array @@ -62,29 +66,31 @@ void binaryOpImpl(math_t *out, const math_t *in1, const math_t *in2, * @param op the device-lambda * @param stream cuda stream where to launch work */ -template -void binaryOp(math_t *out, const math_t *in1, const math_t *in2, IdxType len, +template +void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, Lambda op, cudaStream_t stream) { - size_t bytes = len * sizeof(math_t); - if (16 / sizeof(math_t) && bytes % 16 == 0) { - binaryOpImpl( + constexpr auto maxSize = + sizeof(InType) > sizeof(OutType) ? 
sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + if (16 / maxSize && bytes % 16 == 0) { + binaryOpImpl( out, in1, in2, len, op, stream); - } else if (8 / sizeof(math_t) && bytes % 8 == 0) { - binaryOpImpl( + } else if (8 / maxSize && bytes % 8 == 0) { + binaryOpImpl( out, in1, in2, len, op, stream); - } else if (4 / sizeof(math_t) && bytes % 4 == 0) { - binaryOpImpl( + } else if (4 / maxSize && bytes % 4 == 0) { + binaryOpImpl( out, in1, in2, len, op, stream); - } else if (2 / sizeof(math_t) && bytes % 2 == 0) { - binaryOpImpl( + } else if (2 / maxSize && bytes % 2 == 0) { + binaryOpImpl( out, in1, in2, len, op, stream); - } else if (1 / sizeof(math_t)) { - binaryOpImpl( + } else if (1 / maxSize) { + binaryOpImpl( out, in1, in2, len, op, stream); } else { - binaryOpImpl(out, in1, in2, len, op, - stream); + binaryOpImpl(out, in1, in2, len, + op, stream); } } diff --git a/cpp/test/prims/binary_op.cu b/cpp/test/prims/binary_op.cu index 917df565d6..d92bf2fe6e 100644 --- a/cpp/test/prims/binary_op.cu +++ b/cpp/test/prims/binary_op.cu @@ -26,19 +26,20 @@ namespace LinAlg { // Or else, we get the following compilation error // for an extended __device__ lambda cannot have private or protected access // within its class -template -void binaryOpLaunch(T *out, const T *in1, const T *in2, IdxType len, - cudaStream_t stream) { +template +void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2, + IdxType len, cudaStream_t stream) { binaryOp( out, in1, in2, len, [] __device__(T a, T b) { return a + b; }, stream); } -template +template class BinaryOpTest - : public ::testing::TestWithParam> { + : public ::testing::TestWithParam> { protected: void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam< + BinaryOpInputs>::GetParam(); Random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -62,8 +63,9 @@ class BinaryOpTest } protected: - BinaryOpInputs params; - T 
*in1, *in2, *out_ref, *out; + BinaryOpInputs params; + InType *in1, *in2; + OutType *out_ref, *out; }; const std::vector> inputsf_i32 = { @@ -86,6 +88,16 @@ TEST_P(BinaryOpTestF_i64, Result) { INSTANTIATE_TEST_CASE_P(BinaryOpTests, BinaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); +const std::vector> inputsf_i32_d = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +typedef BinaryOpTest BinaryOpTestF_i32_D; +TEST_P(BinaryOpTestF_i32_D, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(BinaryOpTests, BinaryOpTestF_i32_D, + ::testing::ValuesIn(inputsf_i32_d)); + const std::vector> inputsd_i32 = { {0.00000001, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestD_i32; diff --git a/cpp/test/prims/binary_op.h b/cpp/test/prims/binary_op.h index 30d0b3f42b..b1d25d81e2 100644 --- a/cpp/test/prims/binary_op.h +++ b/cpp/test/prims/binary_op.h @@ -22,33 +22,33 @@ namespace MLCommon { namespace LinAlg { -template -__global__ void naiveAddKernel(Type *out, const Type *in1, const Type *in2, - IdxType len) { +template +__global__ void naiveAddKernel(OutType *out, const InType *in1, + const InType *in2, IdxType len) { IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); if (idx < len) { - out[idx] = in1[idx] + in2[idx]; + out[idx] = static_cast(in1[idx] + in2[idx]); } } -template -void naiveAdd(Type *out, const Type *in1, const Type *in2, IdxType len) { +template +void naiveAdd(OutType *out, const InType *in1, const InType *in2, IdxType len) { static const IdxType TPB = 64; IdxType nblks = ceildiv(len, TPB); - naiveAddKernel<<>>(out, in1, in2, len); + naiveAddKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } -template +template struct BinaryOpInputs { - T tolerance; + InType tolerance; IdxType len; unsigned long long int seed; }; -template +template ::std::ostream &operator<<(::std::ostream &os, - const BinaryOpInputs &dims) { + const BinaryOpInputs &d) { return 
os; } diff --git a/cpp/test/prims/unary_op.cu b/cpp/test/prims/unary_op.cu index e91399c75a..ca9d0d25c9 100644 --- a/cpp/test/prims/unary_op.cu +++ b/cpp/test/prims/unary_op.cu @@ -45,10 +45,8 @@ class UnaryOpTest Random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - auto len = params.len; auto scalar = params.scalar; - allocate(in, len); allocate(out_ref, len); allocate(out, len); From 14f466726432ec00485a497e0c6e1bb3de672fdc Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Thu, 26 Sep 2019 22:07:56 -0700 Subject: [PATCH 06/63] fixed minor typos in the previous commit --- cpp/test/prims/binary_op.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/test/prims/binary_op.cu b/cpp/test/prims/binary_op.cu index d92bf2fe6e..c0bac8b438 100644 --- a/cpp/test/prims/binary_op.cu +++ b/cpp/test/prims/binary_op.cu @@ -30,7 +30,8 @@ template void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2, IdxType len, cudaStream_t stream) { binaryOp( - out, in1, in2, len, [] __device__(T a, T b) { return a + b; }, stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, + stream); } template @@ -48,8 +49,8 @@ class BinaryOpTest allocate(in2, len); allocate(out_ref, len); allocate(out, len); - r.uniform(in1, len, T(-1.0), T(1.0), stream); - r.uniform(in2, len, T(-1.0), T(1.0), stream); + r.uniform(in1, len, InType(-1.0), InType(1.0), stream); + r.uniform(in2, len, InType(-1.0), InType(1.0), stream); naiveAdd(out_ref, in1, in2, len); binaryOpLaunch(out, in1, in2, len, stream); CUDA_CHECK(cudaStreamDestroy(stream)); @@ -89,7 +90,7 @@ INSTANTIATE_TEST_CASE_P(BinaryOpTests, BinaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsf_i32_d = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; + {0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i32_D; TEST_P(BinaryOpTestF_i32_D, Result) { ASSERT_TRUE(devArrMatch(out_ref, out, params.len, From 
927b119a4a709a9c4a71cf8f7c683a8634877093 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Thu, 26 Sep 2019 22:10:07 -0700 Subject: [PATCH 07/63] updated doxygen for binaryOp to document the lambda signature --- cpp/src_prims/linalg/binary_op.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src_prims/linalg/binary_op.h b/cpp/src_prims/linalg/binary_op.h index 3a60519d1c..a7ad986b7a 100644 --- a/cpp/src_prims/linalg/binary_op.h +++ b/cpp/src_prims/linalg/binary_op.h @@ -65,6 +65,8 @@ void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, * @param len number of elements in the input array * @param op the device-lambda * @param stream cuda stream where to launch work + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val1, const InType& val2);` */ template From a2ad67f49e51b50a9e9d89b5bb21fa01ac6b85d5 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Fri, 27 Sep 2019 13:36:21 -0400 Subject: [PATCH 08/63] Initial commit for seed bug in dask RF. 
--- cpp/src/randomforest/randomforest.cu | 15 ++++++---- cpp/src/randomforest/randomforest.hpp | 13 ++++++--- cpp/src/randomforest/randomforest_impl.cuh | 29 ++++++++++--------- cpp/src/randomforest/randomforest_impl.h | 2 +- cpp/test/sg/rf_test.cu | 4 +-- cpp/test/sg/rf_treelite_test.cu | 2 +- .../cuml/ensemble/randomforestclassifier.pyx | 7 ++++- .../cuml/ensemble/randomforestregressor.pyx | 6 +++- 8 files changed, 48 insertions(+), 30 deletions(-) diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index 59c8ed9c5c..fa82017d4a 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -150,10 +150,11 @@ void postprocess_labels(int n_rows, std::vector& labels, * @param[in] cfg_n_streams: No of parallel CUDA for training forest */ void set_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap, - float cfg_rows_sample, int cfg_n_streams) { + float cfg_rows_sample, int cfg_seed, int cfg_n_streams) { params.n_trees = cfg_n_trees; params.bootstrap = cfg_bootstrap; params.rows_sample = cfg_rows_sample; + params.seed = cfg_seed; params.n_streams = min(cfg_n_streams, omp_get_max_threads()); if (params.n_streams == cfg_n_streams) { std::cout << "Warning! 
Max setting Max streams to max openmp threads " @@ -173,11 +174,12 @@ void set_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap, * @param[in] cfg_tree_params: tree parameters */ void set_all_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap, - float cfg_rows_sample, int cfg_n_streams, + float cfg_rows_sample, int cfg_seed, int cfg_n_streams, DecisionTree::DecisionTreeParams cfg_tree_params) { params.n_trees = cfg_n_trees; params.bootstrap = cfg_bootstrap; params.rows_sample = cfg_rows_sample; + params.seed = cfg_seed; params.n_streams = min(cfg_n_streams, omp_get_max_threads()); if (cfg_n_trees < params.n_streams) params.n_streams = cfg_n_trees; set_tree_params(params.tree_params); // use input tree params @@ -462,15 +464,16 @@ RF_metrics score(const cumlHandle& user_handle, RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features, int n_bins, int split_algo, int min_rows_per_node, bool bootstrap_features, bool bootstrap, int n_trees, - float rows_sample, CRITERION split_criterion, - bool quantile_per_tree, int cfg_n_streams) { + float rows_sample, int seed, + CRITERION split_criterion, bool quantile_per_tree, + int cfg_n_streams) { DecisionTree::DecisionTreeParams tree_params; DecisionTree::set_tree_params( tree_params, max_depth, max_leaves, max_features, n_bins, split_algo, min_rows_per_node, bootstrap_features, split_criterion, quantile_per_tree); RF_params rf_params; - set_all_rf_params(rf_params, n_trees, bootstrap, rows_sample, cfg_n_streams, - tree_params); + set_all_rf_params(rf_params, n_trees, bootstrap, rows_sample, seed, + cfg_n_streams, tree_params); return rf_params; } diff --git a/cpp/src/randomforest/randomforest.hpp b/cpp/src/randomforest/randomforest.hpp index 75f78e6340..277aee5172 100644 --- a/cpp/src/randomforest/randomforest.hpp +++ b/cpp/src/randomforest/randomforest.hpp @@ -65,6 +65,10 @@ struct RF_params { /** * Decision tree training hyper parameter struct. 
*/ + /** + * random seed + */ + int seed; /** * Number of concurrent GPU streams for parallel tree building. * Each stream is independently managed by CPU thread. @@ -76,9 +80,9 @@ struct RF_params { void set_rf_params(RF_params& params, int cfg_n_trees = 1, bool cfg_bootstrap = true, float cfg_rows_sample = 1.0f, - int cfg_n_streams = 8); + int cfg_seed = -1, int cfg_n_streams = 8); void set_all_rf_params(RF_params& params, int cfg_n_trees, bool cfg_bootstrap, - float cfg_rows_sample, int cfg_n_streams, + float cfg_rows_sample, int cfg_seed, int cfg_n_streams, DecisionTree::DecisionTreeParams cfg_tree_params); void validity_check(const RF_params rf_params); void print(const RF_params rf_params); @@ -154,8 +158,9 @@ RF_metrics score(const cumlHandle& user_handle, RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features, int n_bins, int split_algo, int min_rows_per_node, bool bootstrap_features, bool bootstrap, int n_trees, - float rows_sample, CRITERION split_criterion, - bool quantile_per_tree, int cfg_n_streams); + float rows_sample, int seed, + CRITERION split_criterion, bool quantile_per_tree, + int cfg_n_streams); // ----------------------------- Regression ----------------------------------- // diff --git a/cpp/src/randomforest/randomforest_impl.cuh b/cpp/src/randomforest/randomforest_impl.cuh index a1de148a10..60096c2e12 100644 --- a/cpp/src/randomforest/randomforest_impl.cuh +++ b/cpp/src/randomforest/randomforest_impl.cuh @@ -70,9 +70,12 @@ void random_uniformInt(int treeid, unsigned int* data, int len, int n_rows, template void rf::prepare_fit_per_tree( int tree_id, int n_rows, int n_sampled_rows, unsigned int* selected_rows, - const int num_sms, const cudaStream_t stream, + int seed, const int num_sms, const cudaStream_t stream, const std::shared_ptr device_allocator) { - srand(tree_id * 1000); + int rs = tree_id * 1000; + if (seed != -1) rs = seed * 1000; + + srand(rs * 1000); if (rf_params.bootstrap) { random_uniformInt(tree_id, 
selected_rows, n_sampled_rows, n_rows, num_sms, stream); @@ -221,10 +224,10 @@ void rfClassifier::fit(const cumlHandle& user_handle, const T* input, unsigned int* rowids; rowids = selected_rows[stream_id]->data(); - this->prepare_fit_per_tree(i, n_rows, n_sampled_rows, rowids, - tempmem[stream_id]->num_sms, - tempmem[stream_id]->stream, - handle.getDeviceAllocator()); + this->prepare_fit_per_tree( + i, n_rows, n_sampled_rows, rowids, (this->rf_params.seed + i), + tempmem[stream_id]->num_sms, tempmem[stream_id]->stream, + handle.getDeviceAllocator()); /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. @@ -236,8 +239,7 @@ void rfClassifier::fit(const cumlHandle& user_handle, const T* input, */ DecisionTree::TreeMetaDataNode* tree_ptr = &(forest->trees[i]); tree_ptr->treeid = i; - trees[i].fit(handle.getDeviceAllocator(), - handle.getHostAllocator(), + trees[i].fit(handle.getDeviceAllocator(), handle.getHostAllocator(), tempmem[stream_id]->stream, input, n_cols, n_rows, labels, rowids, n_sampled_rows, n_unique_labels, tree_ptr, this->rf_params.tree_params, tempmem[stream_id]); @@ -485,10 +487,10 @@ void rfRegressor::fit(const cumlHandle& user_handle, const T* input, for (int i = 0; i < this->rf_params.n_trees; i++) { int stream_id = omp_get_thread_num(); unsigned int* rowids = selected_rows[stream_id]->data(); - this->prepare_fit_per_tree(i, n_rows, n_sampled_rows, rowids, - tempmem[stream_id]->num_sms, - tempmem[stream_id]->stream, - handle.getDeviceAllocator()); + this->prepare_fit_per_tree( + i, n_rows, n_sampled_rows, rowids, (this->rf_params.seed + i), + tempmem[stream_id]->num_sms, tempmem[stream_id]->stream, + handle.getDeviceAllocator()); /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. 
@@ -499,8 +501,7 @@ void rfRegressor::fit(const cumlHandle& user_handle, const T* input, */ DecisionTree::TreeMetaDataNode* tree_ptr = &(forest->trees[i]); tree_ptr->treeid = i; - trees[i].fit(handle.getDeviceAllocator(), - handle.getHostAllocator(), + trees[i].fit(handle.getDeviceAllocator(), handle.getHostAllocator(), tempmem[stream_id]->stream, input, n_cols, n_rows, labels, rowids, n_sampled_rows, tree_ptr, this->rf_params.tree_params, tempmem[stream_id]); diff --git a/cpp/src/randomforest/randomforest_impl.h b/cpp/src/randomforest/randomforest_impl.h index 3369c82ae1..cafa675838 100644 --- a/cpp/src/randomforest/randomforest_impl.h +++ b/cpp/src/randomforest/randomforest_impl.h @@ -30,7 +30,7 @@ class rf { virtual ~rf() = default; void prepare_fit_per_tree( int tree_id, int n_rows, int n_sampled_rows, unsigned int* selected_rows, - int num_sms, const cudaStream_t stream, + int seed, int num_sms, const cudaStream_t stream, const std::shared_ptr device_allocator); void error_checking(const T* input, L* predictions, int n_rows, int n_cols, diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu index 512c53c2af..115cbd8e97 100644 --- a/cpp/test/sg/rf_test.cu +++ b/cpp/test/sg/rf_test.cu @@ -61,7 +61,7 @@ class RfClassifierTest : public ::testing::TestWithParam> { params.split_criterion, false); RF_params rf_params; set_all_rf_params(rf_params, params.n_trees, params.bootstrap, - params.rows_sample, params.n_streams, tree_params); + params.rows_sample, -1, params.n_streams, tree_params); //print(rf_params); //-------------------------------------------------------- @@ -161,7 +161,7 @@ class RfRegressorTest : public ::testing::TestWithParam> { params.split_criterion, false); RF_params rf_params; set_all_rf_params(rf_params, params.n_trees, params.bootstrap, - params.rows_sample, params.n_streams, tree_params); + params.rows_sample, -1, params.n_streams, tree_params); //print(rf_params); //-------------------------------------------------------- diff --git 
a/cpp/test/sg/rf_treelite_test.cu b/cpp/test/sg/rf_treelite_test.cu index d772b0c83e..279db9588e 100644 --- a/cpp/test/sg/rf_treelite_test.cu +++ b/cpp/test/sg/rf_treelite_test.cu @@ -181,7 +181,7 @@ class RfTreeliteTestCommon : public ::testing::TestWithParam> { params.min_rows_per_node, params.bootstrap_features, params.split_criterion, false); set_all_rf_params(rf_params, params.n_trees, params.bootstrap, - params.rows_sample, params.n_streams, tree_params); + params.rows_sample, -1, params.n_streams, tree_params); // print(rf_params); handle.reset(new cumlHandle(rf_params.n_streams)); diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 04f848a92d..8dc308bd9d 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -81,6 +81,7 @@ cdef extern from "randomforest/randomforest.hpp" namespace "ML": int n_trees bool bootstrap float rows_sample + int seed pass cdef cppclass RandomForestMetaData[T, L]: @@ -181,6 +182,7 @@ cdef extern from "randomforest/randomforest.hpp" namespace "ML": bool, int, float, + int, CRITERION, bool, int) except + @@ -302,7 +304,8 @@ class RandomForestClassifier(Base): min_samples_leaf=None, min_weight_fraction_leaf=None, max_leaf_nodes=None, min_impurity_decrease=None, min_impurity_split=None, oob_score=None, n_jobs=None, - random_state=None, warm_start=None, class_weight=None): + random_state=None, warm_start=None, class_weight=None, + int seed=-1): sklearn_params = {"criterion": criterion, "min_samples_leaf": min_samples_leaf, @@ -353,6 +356,7 @@ class RandomForestClassifier(Base): self.quantile_per_tree = quantile_per_tree self.n_cols = None self.n_streams = n_streams + self.seed = seed cdef RandomForestMetaData[float, int] *rf_forest = \ new RandomForestMetaData[float, int]() @@ -497,6 +501,7 @@ class RandomForestClassifier(Base): self.bootstrap, self.n_estimators, self.rows_sample, + self.seed, 
self.split_criterion, self.quantile_per_tree, self.n_streams) diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index e6727a47a8..f86ff332ad 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -80,6 +80,7 @@ cdef extern from "randomforest/randomforest.hpp" namespace "ML": int n_trees bool bootstrap float rows_sample + int seed pass cdef cppclass RandomForestMetaData[T, L]: @@ -162,6 +163,7 @@ cdef extern from "randomforest/randomforest.hpp" namespace "ML": bool, int, float, + int, CRITERION, bool, int) except + @@ -286,7 +288,7 @@ class RandomForestRegressor(Base): max_leaf_nodes=None, min_impurity_decrease=None, min_impurity_split=None, oob_score=None, random_state=None, warm_start=None, class_weight=None, - quantile_per_tree=False, criterion=None): + quantile_per_tree=False, criterion=None, int seed=-1): sklearn_params = {"criterion": criterion, "min_samples_leaf": min_samples_leaf, @@ -337,6 +339,7 @@ class RandomForestRegressor(Base): self.accuracy_metric = accuracy_metric self.quantile_per_tree = quantile_per_tree self.n_streams = n_streams + self.seed = seed cdef RandomForestMetaData[float, float] *rf_forest = \ new RandomForestMetaData[float, float]() @@ -461,6 +464,7 @@ class RandomForestRegressor(Base): self.bootstrap, self.n_estimators, self.rows_sample, + self.seed, self.split_criterion, self.quantile_per_tree, self.n_streams) From ebf4233249755361142572129e35d66c186f761c Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Fri, 27 Sep 2019 13:39:24 -0400 Subject: [PATCH 09/63] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b8bc4a6279..85a841dd55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,7 @@ - PR #1106: Pinning Distributed version to match Dask for consistent CI results - PR #1116: TSNE CUDA 10.1 
Bug Fixes - PR #1132: DBSCAN Batching Bug Fix +- PR #1162: DASK RF random seed bug fix # cuML 0.9.0 (21 Aug 2019) From 72e631fc7128e804d97510a8d0fde3ed279f242f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Sep 2019 14:16:20 -0400 Subject: [PATCH 10/63] Exposing both single and double index precision, now that we have full control over the types used in dbscan Found what might be a nasty bug in the vertex degree computation. --- cpp/examples/dbscan/dbscan_example.cpp | 6 ++--- cpp/src/datasets/make_blobs.cu | 23 +++++++++++++++- cpp/src/datasets/make_blobs.hpp | 13 +++++++++- cpp/src/dbscan/dbscan.cu | 28 +++++++++++++++----- cpp/src/dbscan/dbscan.hpp | 8 ++++++ cpp/src/dbscan/dbscan_api.cpp | 8 +++--- cpp/src/dbscan/dbscan_api.h | 8 +++--- cpp/src/dbscan/runner.h | 8 +++--- cpp/src/dbscan/vertexdeg/algo.h | 16 ++++++------ cpp/src/metrics/metrics.cu | 9 +++++++ cpp/src/metrics/metrics.hpp | 5 ++++ cpp/test/sg/dbscan_test.cu | 36 +++++++++++++------------- python/cuml/cluster/dbscan.pyx | 26 +++++++++---------- 13 files changed, 133 insertions(+), 61 deletions(-) diff --git a/cpp/examples/dbscan/dbscan_example.cpp b/cpp/examples/dbscan/dbscan_example.cpp index 8e5b33069e..21f2ab0aae 100644 --- a/cpp/examples/dbscan/dbscan_example.cpp +++ b/cpp/examples/dbscan/dbscan_example.cpp @@ -241,10 +241,10 @@ int main(int argc, char* argv[]) { cumlHandle.setStream(stream); std::vector h_labels(nRows); - long* d_labels = nullptr; + int* d_labels = nullptr; float* d_inputData = nullptr; - CUDA_RT_CALL(cudaMalloc(&d_labels, nRows * sizeof(long))); + CUDA_RT_CALL(cudaMalloc(&d_labels, nRows * sizeof(int))); CUDA_RT_CALL(cudaMalloc(&d_inputData, nRows * nCols * sizeof(float))); CUDA_RT_CALL(cudaMemcpyAsync(d_inputData, h_inputData.data(), nRows * nCols * sizeof(float), @@ -259,7 +259,7 @@ int main(int argc, char* argv[]) { ML::dbscanFit(cumlHandle, d_inputData, nRows, nCols, eps, minPts, d_labels, max_bytes_per_batch, false); - 
CUDA_RT_CALL(cudaMemcpyAsync(h_labels.data(), d_labels, nRows * sizeof(long), + CUDA_RT_CALL(cudaMemcpyAsync(h_labels.data(), d_labels, nRows * sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_RT_CALL(cudaStreamSynchronize(stream)); diff --git a/cpp/src/datasets/make_blobs.cu b/cpp/src/datasets/make_blobs.cu index 703594b982..3b476cd5ca 100644 --- a/cpp/src/datasets/make_blobs.cu +++ b/cpp/src/datasets/make_blobs.cu @@ -45,5 +45,26 @@ void make_blobs(const cumlHandle& handle, double* out, long* labels, shuffle, center_box_min, center_box_max, seed); } +void make_blobs(const cumlHandle& handle, float* out, int* labels, int n_rows, + int n_cols, int n_clusters, const float* centers, + const float* cluster_std, const float cluster_std_scalar, + bool shuffle, float center_box_min, float center_box_max, + uint64_t seed) { + MLCommon::Random::make_blobs(out, labels, n_rows, n_cols, n_clusters, + handle.getDeviceAllocator(), handle.getStream(), + centers, cluster_std, cluster_std_scalar, + shuffle, center_box_min, center_box_max, seed); +} + +void make_blobs(const cumlHandle& handle, double* out, int* labels, int n_rows, + int n_cols, int n_clusters, const double* centers, + const double* cluster_std, const double cluster_std_scalar, + bool shuffle, double center_box_min, double center_box_max, + uint64_t seed) { + MLCommon::Random::make_blobs(out, labels, n_rows, n_cols, n_clusters, + handle.getDeviceAllocator(), handle.getStream(), + centers, cluster_std, cluster_std_scalar, + shuffle, center_box_min, center_box_max, seed); +} } // namespace Datasets -} // end namespace ML +} // namespace ML diff --git a/cpp/src/datasets/make_blobs.hpp b/cpp/src/datasets/make_blobs.hpp index ce08d92062..645f075e38 100644 --- a/cpp/src/datasets/make_blobs.hpp +++ b/cpp/src/datasets/make_blobs.hpp @@ -63,7 +63,18 @@ void make_blobs(const cumlHandle& handle, double* out, long* labels, const double cluster_std_scalar = 1.f, bool shuffle = true, double center_box_min = 10.f, double 
center_box_max = 10.f, uint64_t seed = 0ULL); -/** @} */ + +void make_blobs(const cumlHandle& handle, float* out, int* labels, int n_rows, + int n_cols, int n_clusters, const float* centers, + const float* cluster_std, const float cluster_std_scalar, + bool shuffle, float center_box_min, float center_box_max, + uint64_t seed); + +void make_blobs(const cumlHandle& handle, double* out, int* labels, int n_rows, + int n_cols, int n_clusters, const double* centers, + const double* cluster_std, const double cluster_std_scalar, + bool shuffle, double center_box_min, double center_box_max, + uint64_t seed); } // namespace Datasets } // namespace ML diff --git a/cpp/src/dbscan/dbscan.cu b/cpp/src/dbscan/dbscan.cu index 74419967d3..f94d622584 100644 --- a/cpp/src/dbscan/dbscan.cu +++ b/cpp/src/dbscan/dbscan.cu @@ -27,20 +27,36 @@ using namespace Dbscan; // @todo // In the below 2 calls, the Index type has been hard-coded to `int64_t` // We should pick the right Index type based on the input dimensions. 
+void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, + float eps, int min_pts, int *labels, size_t max_bytes_per_batch, + bool verbose) { + dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, + min_pts, labels, max_bytes_per_batch, + handle.getStream(), verbose); +} + +void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, + double eps, int min_pts, int *labels, size_t max_bytes_per_batch, + bool verbose) { + dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, + min_pts, labels, max_bytes_per_batch, + handle.getStream(), verbose); +} + void dbscanFit(const cumlHandle &handle, float *input, long n_rows, long n_cols, float eps, int min_pts, long *labels, size_t max_bytes_per_batch, bool verbose) { - dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, - min_pts, labels, max_bytes_per_batch, - handle.getStream(), verbose); + dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, + min_pts, labels, max_bytes_per_batch, + handle.getStream(), verbose); } void dbscanFit(const cumlHandle &handle, double *input, long n_rows, long n_cols, double eps, int min_pts, long *labels, size_t max_bytes_per_batch, bool verbose) { - dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, - min_pts, labels, max_bytes_per_batch, - handle.getStream(), verbose); + dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, + min_pts, labels, max_bytes_per_batch, + handle.getStream(), verbose); } }; // end namespace ML diff --git a/cpp/src/dbscan/dbscan.hpp b/cpp/src/dbscan/dbscan.hpp index abe03ab280..8f1e5ae4c8 100644 --- a/cpp/src/dbscan/dbscan.hpp +++ b/cpp/src/dbscan/dbscan.hpp @@ -35,12 +35,20 @@ namespace ML { * @param[in] verbose: print useful information as algorithm executes * @{ */ +void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, + float eps, int min_pts, int *labels, + size_t max_bytes_per_batch = 0, bool verbose = false); +void dbscanFit(const cumlHandle &handle, 
double *input, int n_rows, int n_cols, + double eps, int min_pts, int *labels, + size_t max_bytes_per_batch = 0, bool verbose = false); + void dbscanFit(const cumlHandle &handle, float *input, long n_rows, long n_cols, float eps, int min_pts, long *labels, size_t max_bytes_per_batch = 0, bool verbose = false); void dbscanFit(const cumlHandle &handle, double *input, long n_rows, long n_cols, double eps, int min_pts, long *labels, size_t max_bytes_per_batch = 0, bool verbose = false); + /** @} */ } // namespace ML diff --git a/cpp/src/dbscan/dbscan_api.cpp b/cpp/src/dbscan/dbscan_api.cpp index 0c0272ab3d..a70a1a9bb9 100644 --- a/cpp/src/dbscan/dbscan_api.cpp +++ b/cpp/src/dbscan/dbscan_api.cpp @@ -18,8 +18,8 @@ #include "common/cumlHandle.hpp" #include "dbscan.hpp" -cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, long n_rows, - long n_cols, float eps, int min_pts, long *labels, +cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, int n_rows, + int n_cols, float eps, int min_pts, int *labels, size_t max_bytes_per_batch, int verbose) { cumlError_t status; ML::cumlHandle *handle_ptr; @@ -42,8 +42,8 @@ cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, long n_rows, return status; } -cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, long n_rows, - long n_cols, double eps, int min_pts, long *labels, +cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, int n_rows, + int n_cols, double eps, int min_pts, int *labels, size_t max_bytes_per_batch, int verbose) { cumlError_t status; ML::cumlHandle *handle_ptr; diff --git a/cpp/src/dbscan/dbscan_api.h b/cpp/src/dbscan/dbscan_api.h index 5563d01330..67112b131e 100644 --- a/cpp/src/dbscan/dbscan_api.h +++ b/cpp/src/dbscan/dbscan_api.h @@ -39,11 +39,11 @@ extern "C" { * @return CUML_SUCCESS on success and other corresponding flags upon any failures. 
* @{ */ -cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, long n_rows, - long n_cols, float eps, int min_pts, long *labels, +cumlError_t cumlSpDbscanFit(cumlHandle_t handle, float *input, int n_rows, + int n_cols, float eps, int min_pts, int *labels, size_t max_bytes_per_batch, int verbose); -cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, long n_rows, - long n_cols, double eps, int min_pts, long *labels, +cumlError_t cumlDpDbscanFit(cumlHandle_t handle, double *input, int n_rows, + int n_cols, double eps, int min_pts, int *labels, size_t max_bytes_per_batch, int verbose); /** @} */ diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index df8faabb12..c8160677ac 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -88,6 +88,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, adjSize + corePtsSize + 2 * xaSize + mSize + vdSize + exScanSize; return size; } + // partition the temporary workspace needed for different stages of dbscan Index_ adjlen = 0; Index_ curradjlen = 0; @@ -120,7 +121,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, if (nPoints <= 0) continue; if (verbose) - std::cout << "- Iteration " << i + 1 << " / " << nBatches + std::cout << "- Iteration " << i + 1 << " " << nBatches << ". Batch size is " << nPoints << " samples." 
<< std::endl; if (verbose) std::cout << "Computing vertex degrees" << std::endl; @@ -130,6 +131,9 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, CUDA_CHECK(cudaStreamSynchronize(stream)); ML::POP_RANGE(); + if (verbose) + std::cout << "Computing adjacency graph of size " << curradjlen + << std::endl; // Running AdjGraph ML::PUSH_RANGE("Trace::Dbscan::AdjGraph"); if (curradjlen > adjlen || adj_graph.data() == NULL) { @@ -137,8 +141,6 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, adj_graph.resize(adjlen, stream); } - if (verbose) std::cout << "Computing adjacency graph" << std::endl; - AdjGraph::run(handle, adj, vd, adj_graph.data(), adjlen, ex_scan, N, minPts, core_pts, algoAdj, nPoints, stream); diff --git a/cpp/src/dbscan/vertexdeg/algo.h b/cpp/src/dbscan/vertexdeg/algo.h index 6bc5398bd0..3f8fdaca51 100644 --- a/cpp/src/dbscan/vertexdeg/algo.h +++ b/cpp/src/dbscan/vertexdeg/algo.h @@ -41,7 +41,7 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, "index_t should be 4 or 8 bytes"); index_t m = data.N; - index_t n = min(data.N - startVertexId, batchSize); + index_t n = batchSize; index_t k = data.D; index_t *vd = data.vd; @@ -60,17 +60,17 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, if (workspaceSize != 0) workspace.resize(workspaceSize, stream); - auto fused_op = [vd, n] __device__(index_t global_c_idx, bool in_neigh) { + auto fused_op = [vd, n, batchSize] __device__(index_t global_c_idx, + bool in_neigh) { // fused construction of vertex degree - index_t batch_vertex = global_c_idx - (n * (global_c_idx / n)); + index_t batch_vertex = fmod(global_c_idx, n); if (sizeof(index_t) == 4) { - atomicAdd((int *)(vd + batch_vertex), (int)in_neigh); - atomicAdd((int *)(vd + n), (int)in_neigh); + atomicAdd((unsigned int *)(vd + batch_vertex), in_neigh); + atomicAdd((unsigned int *)(vd + n), in_neigh); } else if (sizeof(index_t) == 8) { - atomicAdd((unsigned long long *)(vd + 
batch_vertex), - (unsigned long long)in_neigh); - atomicAdd((unsigned long long *)(vd + n), (unsigned long long)in_neigh); + atomicAdd((unsigned long long *)(vd + batch_vertex), in_neigh); + atomicAdd((unsigned long long *)(vd + n), in_neigh); } }; diff --git a/cpp/src/metrics/metrics.cu b/cpp/src/metrics/metrics.cu index 0a6b51ffbc..79007262cb 100644 --- a/cpp/src/metrics/metrics.cu +++ b/cpp/src/metrics/metrics.cu @@ -59,6 +59,15 @@ double adjustedRandIndex(const cumlHandle &handle, const long *y, handle.getDeviceAllocator(), handle.getStream()); } +double adjustedRandIndex(const cumlHandle &handle, const int *y, + const int *y_hat, const int n, + const int lower_class_range, + const int upper_class_range) { + return MLCommon::Metrics::computeAdjustedRandIndex( + y, y_hat, n, lower_class_range, upper_class_range, + handle.getDeviceAllocator(), handle.getStream()); +} + double klDivergence(const cumlHandle &handle, const double *y, const double *y_hat, int n) { return MLCommon::Metrics::klDivergence( diff --git a/cpp/src/metrics/metrics.hpp b/cpp/src/metrics/metrics.hpp index aa4736b3ac..881d5a9aab 100644 --- a/cpp/src/metrics/metrics.hpp +++ b/cpp/src/metrics/metrics.hpp @@ -111,6 +111,11 @@ double adjustedRandIndex(const cumlHandle &handle, const long *y, const long lower_class_range, const long upper_class_range); +double adjustedRandIndex(const cumlHandle &handle, const int *y, + const int *y_hat, const int n, + const int lower_class_range, + const int upper_class_range); + /** * Calculates the "Kullback-Leibler Divergence" * diff --git a/cpp/test/sg/dbscan_test.cu b/cpp/test/sg/dbscan_test.cu index 075f02fbe7..9518b466d2 100644 --- a/cpp/test/sg/dbscan_test.cu +++ b/cpp/test/sg/dbscan_test.cu @@ -42,9 +42,9 @@ using namespace std; template struct DbscanInputs { - long n_row; - long n_col; - long n_centers; + int n_row; + int n_col; + int n_centers; T cluster_std; T eps; int min_pts; @@ -67,8 +67,8 @@ class DbscanTest : public ::testing::TestWithParam> { 
device_buffer out(handle.getDeviceAllocator(), handle.getStream(), params.n_row * params.n_col); - device_buffer l(handle.getDeviceAllocator(), handle.getStream(), - params.n_row); + device_buffer l(handle.getDeviceAllocator(), handle.getStream(), + params.n_row); make_blobs(handle, out.data(), l.data(), params.n_row, params.n_col, params.n_centers, nullptr, nullptr, params.cluster_std, true, @@ -108,26 +108,26 @@ class DbscanTest : public ::testing::TestWithParam> { protected: DbscanInputs params; - long *labels, *labels_ref; + int *labels, *labels_ref; double score; }; const std::vector> inputsf2 = { - {50000l, 16l, 5l, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {500l, 16l, 5l, 0.01, 2, 2, (size_t)100, 1234ULL}, - {1000l, 1000l, 10l, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {50000l, 16l, 5l, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {20000l, 10000l, 10l, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {20000l, 100l, 5000l, 0.01, 2, 2, (size_t)13e8, 1234ULL}}; + {50000, 16, 5, 0.01, 2, 2, (size_t)13e8, 1234ULL}, + {500, 16, 5, 0.01, 2, 2, (size_t)100, 1234ULL}, + {1000, 1000, 10, 0.01, 2, 2, (size_t)13e8, 1234ULL}, + {50000, 16, 5l, 0.01, 2, 2, (size_t)13e8, 1234ULL}, + {20000, 10000, 10, 0.01, 2, 2, (size_t)13e8, 1234ULL}, + {20000, 100, 5000, 0.01, 2, 2, (size_t)13e8, 1234ULL}}; const std::vector> inputsd2 = { - {50000l, 16l, 5l, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {500l, 16l, 5l, 0.01, 2, 2, (size_t)100, 1234ULL}, - {1000l, 1000l, 10l, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {100l, 10000l, 10l, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {20000l, 10000l, 10l, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {20000l, 100l, 5000l, 0.01, 2, 2, (size_t)13e9, 1234ULL}}; + {50000, 16, 5, 0.01, 2, 2, (size_t)13e9, 1234ULL}, + {500, 16, 5, 0.01, 2, 2, (size_t)100, 1234ULL}, + {1000, 1000, 10, 0.01, 2, 2, (size_t)13e9, 1234ULL}, + {100, 10000, 10, 0.01, 2, 2, (size_t)13e9, 1234ULL}, + {20000, 10000, 10, 0.01, 2, 2, (size_t)13e9, 1234ULL}, + {20000, 100, 5000, 0.01, 2, 2, (size_t)13e9, 1234ULL}}; typedef 
DbscanTest DbscanTestF; TEST_P(DbscanTestF, Result) { ASSERT_TRUE(score == 1.0); } diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 1cb1660119..fa582d7bc2 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -38,21 +38,21 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": cdef void dbscanFit(cumlHandle& handle, float *input, - long n_rows, - long n_cols, + int n_rows, + int n_cols, float eps, int min_pts, - long *labels, + int *labels, size_t max_bytes_per_batch, bool verbose) except + cdef void dbscanFit(cumlHandle& handle, double *input, - long n_rows, - long n_cols, + int n_rows, + int n_cols, double eps, int min_pts, - long *labels, + int *labels, size_t max_bytes_per_batch, bool verbose) except + @@ -186,27 +186,27 @@ class DBSCAN(Base): check_dtype=[np.float32, np.float64]) cdef cumlHandle* handle_ = self.handle.getHandle() - self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int64)) + self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int32)) cdef uintptr_t labels_ptr = get_cudf_column_ptr(self.labels_) if self.dtype == np.float32: dbscanFit(handle_[0], input_ptr, - n_rows, - n_cols, + n_rows, + n_cols, self.eps, self.min_samples, - labels_ptr, + labels_ptr, self.max_bytes_per_batch, self.verbose) else: dbscanFit(handle_[0], input_ptr, - n_rows, - n_cols, + n_rows, + n_cols, self.eps, self.min_samples, - labels_ptr, + labels_ptr, self.max_bytes_per_batch, self.verbose) # make sure that the `dbscanFit` is complete before the following From a9889c7e1efc9b3d90cbbc6f9bba8018f91f112d Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 27 Sep 2019 14:19:48 -0400 Subject: [PATCH 11/63] More fixes --- cpp/src/dbscan/dbscan.cu | 3 --- cpp/src_prims/sparse/csr.h | 11 +++++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cpp/src/dbscan/dbscan.cu b/cpp/src/dbscan/dbscan.cu index f94d622584..055c234847 100644 --- a/cpp/src/dbscan/dbscan.cu +++ b/cpp/src/dbscan/dbscan.cu @@ -24,9 +24,6 @@ namespace ML { using namespace Dbscan; -// @todo -// In the below 2 calls, the Index type has been hard-coded to `int64_t` -// We should pick the right Index type based on the input dimensions. void dbscanFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, float eps, int min_pts, int *labels, size_t max_bytes_per_batch, bool verbose) { diff --git a/cpp/src_prims/sparse/csr.h b/cpp/src_prims/sparse/csr.h index 790ad2eac5..03cbd8a0e8 100644 --- a/cpp/src_prims/sparse/csr.h +++ b/cpp/src_prims/sparse/csr.h @@ -708,10 +708,9 @@ __global__ void weak_cc_label_device(Index_ *labels, const Index_ *row_ind, cj = labels[j_ind]; if (ci < cj) { if (sizeof(Index_) == 4) - atomicMin((int *)(labels + j_ind), (int)ci); + atomicMin((unsigned int *)(labels + j_ind), ci); else if (sizeof(Index_) == 8) - atomicMin((unsigned long long int *)(labels + j_ind), - (unsigned long long int)ci); + atomicMin((unsigned long long int *)(labels + j_ind), ci); xa[j_ind] = true; m[0] = true; } else if (ci > cj) { @@ -721,10 +720,10 @@ __global__ void weak_cc_label_device(Index_ *labels, const Index_ *row_ind, } if (ci_mod) { if (sizeof(Index_) == 4) - atomicMin((int *)(labels + startVertexId + tid), (int)ci); + atomicMin((unsigned int *)(labels + startVertexId + tid), ci); else if (sizeof(Index_) == 8) atomicMin((unsigned long long int *)(labels + startVertexId + tid), - (unsigned long long int)ci); + ci); xa[startVertexId + tid] = true; m[0] = true; @@ -742,7 +741,7 @@ __global__ void weak_cc_init_label_kernel(Index_ *labels, Index_ startVertexId, Index_ tid = threadIdx.x + blockIdx.x * TPB_X; if 
(tid < batchSize) { if (filter_op(tid) && labels[tid + startVertexId] == MAX_LABEL) - labels[startVertexId + tid] = Index_(startVertexId + tid + 1); + labels[startVertexId + tid] = startVertexId + tid + 1; } } From 503753a8d7bbe4ad20a21f77387a69b0ae26c1d1 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Sep 2019 14:24:26 -0400 Subject: [PATCH 12/63] Couple single and double precision across floating point and int variants for now --- python/cuml/cluster/dbscan.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index fa582d7bc2..2633dbae7f 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -48,11 +48,11 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": cdef void dbscanFit(cumlHandle& handle, double *input, - int n_rows, - int n_cols, + long n_rows, + long n_cols, double eps, int min_pts, - int *labels, + long *labels, size_t max_bytes_per_batch, bool verbose) except + @@ -202,11 +202,11 @@ class DBSCAN(Base): else: dbscanFit(handle_[0], input_ptr, - n_rows, - n_cols, + n_rows, + n_cols, self.eps, self.min_samples, - labels_ptr, + labels_ptr, self.max_bytes_per_batch, self.verbose) # make sure that the `dbscanFit` is complete before the following From 2273d2ab9fac36ff4b68b0a454c5f85f940abcd4 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 27 Sep 2019 16:20:57 -0400 Subject: [PATCH 13/63] Still getting to the bottom of unfortunate single precision instability --- cpp/src/dbscan/runner.h | 2 +- cpp/src/dbscan/vertexdeg/algo.h | 16 ++++++++-------- cpp/src_prims/distance/distance.h | 2 +- python/cuml/test/test_dbscan.py | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index c8160677ac..1c9ddd3f69 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -124,7 +124,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, std::cout << "- Iteration " << i + 1 << " " << nBatches << ". Batch size is " << nPoints << " samples." << std::endl; - if (verbose) std::cout << "Computing vertex degrees" << std::endl; + if (verbose) std::cout << "Compting vertex degrees" << std::endl; VertexDeg::run(handle, adj, vd, x, eps, N, D, algoVd, startVertexId, nPoints, stream); MLCommon::updateHost(&curradjlen, vd + nPoints, 1, stream); diff --git a/cpp/src/dbscan/vertexdeg/algo.h b/cpp/src/dbscan/vertexdeg/algo.h index 3f8fdaca51..6bc5398bd0 100644 --- a/cpp/src/dbscan/vertexdeg/algo.h +++ b/cpp/src/dbscan/vertexdeg/algo.h @@ -41,7 +41,7 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, "index_t should be 4 or 8 bytes"); index_t m = data.N; - index_t n = batchSize; + index_t n = min(data.N - startVertexId, batchSize); index_t k = data.D; index_t *vd = data.vd; @@ -60,17 +60,17 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, if (workspaceSize != 0) workspace.resize(workspaceSize, stream); - auto fused_op = [vd, n, batchSize] __device__(index_t global_c_idx, - bool in_neigh) { + auto fused_op = [vd, n] __device__(index_t global_c_idx, bool in_neigh) { // fused construction of vertex degree - index_t batch_vertex = fmod(global_c_idx, n); + index_t batch_vertex = global_c_idx - (n * (global_c_idx / n)); if (sizeof(index_t) == 4) { - atomicAdd((unsigned int *)(vd + 
batch_vertex), in_neigh); - atomicAdd((unsigned int *)(vd + n), in_neigh); + atomicAdd((int *)(vd + batch_vertex), (int)in_neigh); + atomicAdd((int *)(vd + n), (int)in_neigh); } else if (sizeof(index_t) == 8) { - atomicAdd((unsigned long long *)(vd + batch_vertex), in_neigh); - atomicAdd((unsigned long long *)(vd + n), in_neigh); + atomicAdd((unsigned long long *)(vd + batch_vertex), + (unsigned long long)in_neigh); + atomicAdd((unsigned long long *)(vd + n), (unsigned long long)in_neigh); } }; diff --git a/cpp/src_prims/distance/distance.h b/cpp/src_prims/distance/distance.h index 19f9d2c77a..97bdbdc7af 100644 --- a/cpp/src_prims/distance/distance.h +++ b/cpp/src_prims/distance/distance.h @@ -326,7 +326,7 @@ void pairwiseDistance(const Type *x, const Type *y, Type *dist, Index_ m, * the epsilon neighborhood. */ template + typename Index_ = long, typename OutputTile_ = OutputTile_8x128x128> size_t epsilon_neighborhood(const T *a, const T *b, bool *adj, Index_ m, Index_ n, Index_ k, T eps, void *workspace, size_t worksize, cudaStream_t stream, diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py index 795805465f..8768e6161a 100644 --- a/python/cuml/test/test_dbscan.py +++ b/python/cuml/test/test_dbscan.py @@ -73,8 +73,8 @@ def test_dbscan(datatype, input_type, use_handle, if nrows < 500000: skdbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute") sk_labels = skdbscan.fit_predict(X) - for i in range(X.shape[0]): - assert cu_labels[i] == sk_labels[i] + score = adjusted_rand_score(cu_labels, sk_labels) + assert score == 1 @pytest.mark.parametrize("name", [ From 9057d94fb61c1df24a9a27121c9bc2ed01a788b3 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 27 Sep 2019 16:27:07 -0400 Subject: [PATCH 14/63] Computing proper vertex degrees now --- cpp/src/dbscan/runner.h | 2 +- cpp/src/dbscan/vertexdeg/algo.h | 3 ++- python/cuml/cluster/dbscan.pyx | 7 +++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 1c9ddd3f69..350f6b7a7d 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -149,7 +149,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, ML::PUSH_RANGE("Trace::Dbscan::WeakCC"); - if (verbose) std::cout << "Computing connected components" << std::endl; + if (verbose) std::cout << "Compuing connected components" << std::endl; MLCommon::Sparse::weak_cc_batched( labels, ex_scan, adj_graph.data(), adjlen, N, startVertexId, nPoints, diff --git a/cpp/src/dbscan/vertexdeg/algo.h b/cpp/src/dbscan/vertexdeg/algo.h index 6bc5398bd0..06502ba6e0 100644 --- a/cpp/src/dbscan/vertexdeg/algo.h +++ b/cpp/src/dbscan/vertexdeg/algo.h @@ -62,7 +62,8 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, auto fused_op = [vd, n] __device__(index_t global_c_idx, bool in_neigh) { // fused construction of vertex degree - index_t batch_vertex = global_c_idx - (n * (global_c_idx / n)); + index_t batch_vertex = + fmod(global_c_idx, n); //global_c_idx - (n * (global_c_idx / n)); if (sizeof(index_t) == 4) { atomicAdd((int *)(vd + batch_vertex), (int)in_neigh); diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 2633dbae7f..1a7d4dfd11 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -186,10 +186,11 @@ class DBSCAN(Base): check_dtype=[np.float32, np.float64]) cdef cumlHandle* handle_ = self.handle.getHandle() - self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int32)) - cdef uintptr_t labels_ptr = get_cudf_column_ptr(self.labels_) + cdef uintptr_t labels_ptr if self.dtype == np.float32: + self.labels_ = cudf.Series(zeros(n_rows, 
dtype=np.int32)) + labels_ptr = get_cudf_column_ptr(self.labels_) dbscanFit(handle_[0], input_ptr, n_rows, @@ -200,6 +201,8 @@ class DBSCAN(Base): self.max_bytes_per_batch, self.verbose) else: + self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int64)) + labels_ptr = get_cudf_column_ptr(self.labels_) dbscanFit(handle_[0], input_ptr, n_rows, From 1797923bf894916e90c47d76d893cfe91b6a1044 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Sep 2019 16:42:40 -0400 Subject: [PATCH 15/63] Light cleanup --- cpp/src/dbscan/runner.h | 14 +++++++++----- cpp/src/dbscan/vertexdeg/algo.h | 3 +-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 350f6b7a7d..077c47cada 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -121,10 +121,10 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, if (nPoints <= 0) continue; if (verbose) - std::cout << "- Iteration " << i + 1 << " " << nBatches + std::cout << "- Iteration " << i + 1 << " out of " << nBatches << ". Batch size is " << nPoints << " samples." << std::endl; - if (verbose) std::cout << "Compting vertex degrees" << std::endl; + if (verbose) std::cout << "--> Computing vertex degrees" << std::endl; VertexDeg::run(handle, adj, vd, x, eps, N, D, algoVd, startVertexId, nPoints, stream); MLCommon::updateHost(&curradjlen, vd + nPoints, 1, stream); @@ -132,8 +132,8 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, ML::POP_RANGE(); if (verbose) - std::cout << "Computing adjacency graph of size " << curradjlen - << std::endl; + std::cout << "--> Computing adjacency graph of size " << curradjlen + << " samples." 
<< std::endl; // Running AdjGraph ML::PUSH_RANGE("Trace::Dbscan::AdjGraph"); if (curradjlen > adjlen || adj_graph.data() == NULL) { @@ -149,13 +149,15 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, ML::PUSH_RANGE("Trace::Dbscan::WeakCC"); - if (verbose) std::cout << "Compuing connected components" << std::endl; + if (verbose) std::cout << "--> Compuing connected components" << std::endl; MLCommon::Sparse::weak_cc_batched( labels, ex_scan, adj_graph.data(), adjlen, N, startVertexId, nPoints, &state, stream, [core_pts] __device__(Index_ tid) { return core_pts[tid]; }); ML::POP_RANGE(); + + if (verbose) std::cout << " " << std::endl; } ML::PUSH_RANGE("Trace::Dbscan::FinalRelabel"); @@ -165,6 +167,8 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, relabelForSkl<<>>(labels, N, MAX_LABEL); CUDA_CHECK(cudaPeekAtLastError()); ML::POP_RANGE(); + + if (verbose) std::cout << "Done." << std::endl; return (size_t)0; } } // namespace Dbscan diff --git a/cpp/src/dbscan/vertexdeg/algo.h b/cpp/src/dbscan/vertexdeg/algo.h index 06502ba6e0..abefb26d0a 100644 --- a/cpp/src/dbscan/vertexdeg/algo.h +++ b/cpp/src/dbscan/vertexdeg/algo.h @@ -62,8 +62,7 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, auto fused_op = [vd, n] __device__(index_t global_c_idx, bool in_neigh) { // fused construction of vertex degree - index_t batch_vertex = - fmod(global_c_idx, n); //global_c_idx - (n * (global_c_idx / n)); + index_t batch_vertex = fmod(global_c_idx, n); if (sizeof(index_t) == 4) { atomicAdd((int *)(vd + batch_vertex), (int)in_neigh); From e8e3d2661cc653655c8bbbd62cc0beaa91901545 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 27 Sep 2019 16:43:05 -0400 Subject: [PATCH 16/63] Fixing typos --- cpp/src/dbscan/runner.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 077c47cada..8bd97a008b 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -121,7 +121,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, if (nPoints <= 0) continue; if (verbose) - std::cout << "- Iteration " << i + 1 << " out of " << nBatches + std::cout << "- Iteration " << i + 1 << " / " << nBatches << ". Batch size is " << nPoints << " samples." << std::endl; if (verbose) std::cout << "--> Computing vertex degrees" << std::endl; @@ -149,7 +149,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, ML::PUSH_RANGE("Trace::Dbscan::WeakCC"); - if (verbose) std::cout << "--> Compuing connected components" << std::endl; + if (verbose) std::cout << "--> Computing connected components" << std::endl; MLCommon::Sparse::weak_cc_batched( labels, ex_scan, adj_graph.data(), adjlen, N, startVertexId, nPoints, From 8a0122bdc46724c9857d88391c5d6fec259528eb Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 27 Sep 2019 16:51:17 -0400 Subject: [PATCH 17/63] Updating changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b25cf8c994..0d9c26f8ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ - PR #1086: Ensure RegressorMixin scorer uses device arrays - PR #1114: K-means: Exposing useful params, removing unused params, proxying params in Dask - PR #1136: CUDA 10.1 CI updates +- PR #1163: Some more correctness improvements. Better verbose printing ## Bug Fixes From 0b6db552fcadfe139cab44280bc9ab20e6877e92 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Sat, 28 Sep 2019 11:56:13 -0400 Subject: [PATCH 18/63] Dask RF updated to fix the random seed bug. 
--- cpp/src/randomforest/randomforest_impl.cuh | 16 +++++++--------- cpp/src/randomforest/randomforest_impl.h | 2 +- .../cuml/dask/ensemble/randomforestclassifier.py | 9 +++++++++ .../cuml/dask/ensemble/randomforestregressor.py | 9 +++++++++ 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/cpp/src/randomforest/randomforest_impl.cuh b/cpp/src/randomforest/randomforest_impl.cuh index 60096c2e12..471e6da754 100644 --- a/cpp/src/randomforest/randomforest_impl.cuh +++ b/cpp/src/randomforest/randomforest_impl.cuh @@ -70,10 +70,10 @@ void random_uniformInt(int treeid, unsigned int* data, int len, int n_rows, template void rf::prepare_fit_per_tree( int tree_id, int n_rows, int n_sampled_rows, unsigned int* selected_rows, - int seed, const int num_sms, const cudaStream_t stream, + const int num_sms, const cudaStream_t stream, const std::shared_ptr device_allocator) { - int rs = tree_id * 1000; - if (seed != -1) rs = seed * 1000; + int rs = tree_id; + if (rf_params.seed > -1) rs = rf_params.seed + tree_id; srand(rs * 1000); if (rf_params.bootstrap) { @@ -225,9 +225,8 @@ void rfClassifier::fit(const cumlHandle& user_handle, const T* input, rowids = selected_rows[stream_id]->data(); this->prepare_fit_per_tree( - i, n_rows, n_sampled_rows, rowids, (this->rf_params.seed + i), - tempmem[stream_id]->num_sms, tempmem[stream_id]->stream, - handle.getDeviceAllocator()); + i, n_rows, n_sampled_rows, rowids, tempmem[stream_id]->num_sms, + tempmem[stream_id]->stream, handle.getDeviceAllocator()); /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. 
@@ -488,9 +487,8 @@ void rfRegressor::fit(const cumlHandle& user_handle, const T* input, int stream_id = omp_get_thread_num(); unsigned int* rowids = selected_rows[stream_id]->data(); this->prepare_fit_per_tree( - i, n_rows, n_sampled_rows, rowids, (this->rf_params.seed + i), - tempmem[stream_id]->num_sms, tempmem[stream_id]->stream, - handle.getDeviceAllocator()); + i, n_rows, n_sampled_rows, rowids, tempmem[stream_id]->num_sms, + tempmem[stream_id]->stream, handle.getDeviceAllocator()); /* Build individual tree in the forest. - input is a pointer to orig data that have n_cols features and n_rows rows. diff --git a/cpp/src/randomforest/randomforest_impl.h b/cpp/src/randomforest/randomforest_impl.h index cafa675838..3369c82ae1 100644 --- a/cpp/src/randomforest/randomforest_impl.h +++ b/cpp/src/randomforest/randomforest_impl.h @@ -30,7 +30,7 @@ class rf { virtual ~rf() = default; void prepare_fit_per_tree( int tree_id, int n_rows, int n_sampled_rows, unsigned int* selected_rows, - int seed, int num_sms, const cudaStream_t stream, + int num_sms, const cudaStream_t stream, const std::shared_ptr device_allocator); void error_checking(const T* input, L* predictions, int n_rows, int n_cols, diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py index 9e063e18a4..8187af137a 100755 --- a/python/cuml/dask/ensemble/randomforestclassifier.py +++ b/python/cuml/dask/ensemble/randomforestclassifier.py @@ -194,6 +194,12 @@ def __init__( self.n_estimators_per_worker[i] + 1 ) + seeds = list() + seeds.append(0) + for i in range(1, len(self.n_estimators_per_worker)): + sd = self.n_estimators_per_worker[i-1] + seeds[i-1] + seeds.append(sd) + key = str(uuid1()) self.rfs = { worker: c.submit( @@ -213,6 +219,7 @@ def __init__( rows_sample, max_leaves, quantile_per_tree, + seeds[n], dtype, key="%s-%s" % (key, n), workers=[worker], @@ -243,6 +250,7 @@ def _func_build_rf( rows_sample, max_leaves, quantile_per_tree, + seed, 
dtype, ): return cuRFC( @@ -262,6 +270,7 @@ def _func_build_rf( max_leaves=max_leaves, n_streams=n_streams, quantile_per_tree=quantile_per_tree, + seed=seed, gdf_datatype=dtype, ) diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py index b86e1a9269..e12bb75056 100755 --- a/python/cuml/dask/ensemble/randomforestregressor.py +++ b/python/cuml/dask/ensemble/randomforestregressor.py @@ -197,6 +197,12 @@ def __init__( self.n_estimators_per_worker[i] + 1 ) + seeds = list() + seeds.append(0) + for i in range(1, len(self.n_estimators_per_worker)): + sd = self.n_estimators_per_worker[i-1] + seeds[i-1] + seeds.append(sd) + key = str(uuid1()) self.rfs = { worker: c.submit( @@ -216,6 +222,7 @@ def __init__( max_leaves, accuracy_metric, quantile_per_tree, + seeds[n], key="%s-%s" % (key, n), workers=[worker], ) @@ -245,6 +252,7 @@ def _func_build_rf( max_leaves, accuracy_metric, quantile_per_tree, + seed, ): return cuRFR( @@ -264,6 +272,7 @@ def _func_build_rf( n_streams=n_streams, accuracy_metric=accuracy_metric, quantile_per_tree=quantile_per_tree, + seed=seed, ) @staticmethod From 0eacd14a382869208ebc04d30dda61a2c0dc0d53 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Sat, 28 Sep 2019 12:02:49 -0400 Subject: [PATCH 19/63] Fixed the formatting issues. 
--- python/cuml/ensemble/randomforestclassifier.pyx | 4 ++-- python/cuml/ensemble/randomforestregressor.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 8dc308bd9d..5d7aadb656 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -304,8 +304,8 @@ class RandomForestClassifier(Base): min_samples_leaf=None, min_weight_fraction_leaf=None, max_leaf_nodes=None, min_impurity_decrease=None, min_impurity_split=None, oob_score=None, n_jobs=None, - random_state=None, warm_start=None, class_weight=None, - int seed=-1): + random_state=None, warm_start=None, class_weight=None, + seed=-1): sklearn_params = {"criterion": criterion, "min_samples_leaf": min_samples_leaf, diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index f86ff332ad..b3cd2f2bf6 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -288,7 +288,7 @@ class RandomForestRegressor(Base): max_leaf_nodes=None, min_impurity_decrease=None, min_impurity_split=None, oob_score=None, random_state=None, warm_start=None, class_weight=None, - quantile_per_tree=False, criterion=None, int seed=-1): + quantile_per_tree=False, criterion=None, seed=-1): sklearn_params = {"criterion": criterion, "min_samples_leaf": min_samples_leaf, From c9b3f3d48cd224f852c86bf80e1616b1b91ef1c7 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 13:06:03 -0400 Subject: [PATCH 20/63] Increasing number of batches when single precision isn't large enough to store the batch size * N (for adjacency graph & CSR row in array. Will eventually expose an argument through Python to allow the user to control the precision of the outputs (and thus the precision of the internal structures.). 
It's important to allow the user to set this explicitly because there's a huge tradeoff to be made between a larger number of much faster batches and a smaller number of much slower batches. --- cpp/src/dbscan/adjgraph/naive.h | 2 +- cpp/src/dbscan/adjgraph/pack.h | 2 +- cpp/src/dbscan/dbscan.h | 17 ++++++++++++--- cpp/src/dbscan/runner.h | 35 ++++++++++++++++++++++++------- cpp/src/dbscan/vertexdeg/algo.h | 2 +- cpp/src/dbscan/vertexdeg/naive.h | 2 +- cpp/src/dbscan/vertexdeg/runner.h | 2 +- cpp/src_prims/distance/distance.h | 2 +- python/cuml/cluster/dbscan.pyx | 20 ++++++++---------- 9 files changed, 57 insertions(+), 27 deletions(-) diff --git a/cpp/src/dbscan/adjgraph/naive.h b/cpp/src/dbscan/adjgraph/naive.h index 41d10b2558..ff211ad46c 100644 --- a/cpp/src/dbscan/adjgraph/naive.h +++ b/cpp/src/dbscan/adjgraph/naive.h @@ -26,7 +26,7 @@ namespace Dbscan { namespace AdjGraph { namespace Naive { -template +template void launcher(const ML::cumlHandle_impl& handle, Pack data, Index_ batchSize, cudaStream_t stream) { Index_ k = 0; diff --git a/cpp/src/dbscan/adjgraph/pack.h b/cpp/src/dbscan/adjgraph/pack.h index f1d028a2e6..fd002363cd 100644 --- a/cpp/src/dbscan/adjgraph/pack.h +++ b/cpp/src/dbscan/adjgraph/pack.h @@ -19,7 +19,7 @@ namespace Dbscan { namespace AdjGraph { -template +template struct Pack { /** * vertex degree array diff --git a/cpp/src/dbscan/dbscan.h b/cpp/src/dbscan/dbscan.h index b38793215d..b0dd3d5f9c 100644 --- a/cpp/src/dbscan/dbscan.h +++ b/cpp/src/dbscan/dbscan.h @@ -27,7 +27,7 @@ static const size_t DEFAULT_MAX_MEM_BYTES = 13e9; // Default max mem set to a reasonable value for a 16gb card. 
-template +template Index_ computeBatchCount(Index_ n_rows, size_t max_bytes_per_batch) { Index_ n_batches = 1; // There seems to be a weird overflow bug with cutlass gemm kernels @@ -37,16 +37,27 @@ Index_ computeBatchCount(Index_ n_rows, size_t max_bytes_per_batch) { if (max_bytes_per_batch <= 0) max_bytes_per_batch = DEFAULT_MAX_MEM_BYTES; + Index_ MAX_LABEL = std::numeric_limits::max(); + while (true) { size_t batchSize = ceildiv(n_rows, n_batches); - if (batchSize * n_rows * sizeof(T) < max_bytes_per_batch || batchSize == 1) + if (((batchSize * n_rows * sizeof(T) < max_bytes_per_batch) && + /** + * Though single precision can be faster per execution of each kernel, + * there's a trade-off to be made between using single precision with + * many more batches (which become smaller as n_rows grows) and using + * double precision, which will have less batches but could become 8-10x + * slower per batch. + */ + (batchSize * n_rows < MAX_LABEL)) || + batchSize == 1) break; ++n_batches; } return n_batches; } -template +template void dbscanFitImpl(const ML::cumlHandle_impl &handle, T *input, Index_ n_rows, Index_ n_cols, T eps, int min_pts, Index_ *labels, size_t max_bytes_per_batch, cudaStream_t stream, diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 8bd97a008b..0e2136c1b6 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -36,7 +36,7 @@ static const int TPB = 256; * 1. Turn any labels matching MAX_LABEL into -1 * 2. Subtract 1 from all other labels. */ -template +template __global__ void relabelForSkl(Index_* labels, Index_ N, Index_ MAX_LABEL) { Index_ tid = threadIdx.x + blockDim.x * blockIdx.x; if (labels[tid] == MAX_LABEL) @@ -49,7 +49,7 @@ __global__ void relabelForSkl(Index_* labels, Index_ N, Index_ MAX_LABEL) { * Turn the non-monotonic labels from weak_cc primitive into * an array of labels drawn from a monotonically increasing set. 
*/ -template +template void final_relabel(Index_* db_cluster, Index_ N, cudaStream_t stream) { Index_ MAX_LABEL = std::numeric_limits::max(); MLCommon::Label::make_monotonic( @@ -69,13 +69,25 @@ void final_relabel(Index_* db_cluster, Index_ N, cudaStream_t stream) { * @param stream the cudaStream where to launch the kernels * @return in case the temp buffer is null, this returns the size needed. */ -template +template size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, Type_f eps, Type minPts, Index_* labels, int algoVd, int algoAdj, int algoCcl, void* workspace, Index_ nBatches, cudaStream_t stream, bool verbose = false) { const size_t align = 256; - Index_ batchSize = ceildiv(N, nBatches); + size_t batchSize = ceildiv(N, nBatches); + + /** + * Note on coupling between data types: + * - adjacency graph has a worst case size of N * batchSize elements. Thus, + * if N is very close to being greater than the maximum 32-bit IdxType type used, a + * 64-bit IdxType should probably be used instead. + * - exclusive scan is the CSR row index for the adjacency graph and its values have a + * risk of overflowing when N * batchSize becomes larger what can be stored in IdxType + * - the vertex degree array has a worst case of each element having all other + * elements in their neighborhood, so any IdxType can be safely used, so long as N doesn't + * overflow. 
+ */ size_t adjSize = alignTo(sizeof(bool) * N * batchSize, align); size_t corePtsSize = alignTo(sizeof(bool) * batchSize, align); size_t xaSize = alignTo(sizeof(bool) * N, align); @@ -83,13 +95,23 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, size_t vdSize = alignTo(sizeof(Index_) * (batchSize + 1), align); size_t exScanSize = alignTo(sizeof(Index_) * batchSize, align); + // TODO: We should ASSERT that N * batchSize is greater than the maximum value used + Index_ MAX_LABEL = std::numeric_limits::max(); + + ASSERT( + N * batchSize < MAX_LABEL, + "An overflow occurred with the current choice of precision " + "and the number of samples. (Max allowed batch size is %d, but was %d)", + MAX_LABEL / N, batchSize); + if (workspace == NULL) { auto size = adjSize + corePtsSize + 2 * xaSize + mSize + vdSize + exScanSize; return size; } - // partition the temporary workspace needed for different stages of dbscan + // partition the temporary workspace needed for different stages of dbscan. 
+ Index_ adjlen = 0; Index_ curradjlen = 0; char* temp = (char*)workspace; @@ -117,7 +139,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, stream); Index_ startVertexId = i * batchSize; - Index_ nPoints = min(N - startVertexId, batchSize); + Index_ nPoints = min(size_t(N - startVertexId), batchSize); if (nPoints <= 0) continue; if (verbose) @@ -162,7 +184,6 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, ML::PUSH_RANGE("Trace::Dbscan::FinalRelabel"); if (algoCcl == 2) final_relabel(labels, N, stream); - Index_ MAX_LABEL = std::numeric_limits::max(); size_t nblks = ceildiv(N, TPB); relabelForSkl<<>>(labels, N, MAX_LABEL); CUDA_CHECK(cudaPeekAtLastError()); diff --git a/cpp/src/dbscan/vertexdeg/algo.h b/cpp/src/dbscan/vertexdeg/algo.h index abefb26d0a..6b1bf29b24 100644 --- a/cpp/src/dbscan/vertexdeg/algo.h +++ b/cpp/src/dbscan/vertexdeg/algo.h @@ -32,7 +32,7 @@ namespace Algo { /** * Calculates the vertex degree array and the epsilon neighborhood adjacency matrix for the batch. 
*/ -template +template void launcher(const ML::cumlHandle_impl &handle, Pack data, index_t startVertexId, index_t batchSize, cudaStream_t stream) { data.resetArray(stream, batchSize + 1); diff --git a/cpp/src/dbscan/vertexdeg/naive.h b/cpp/src/dbscan/vertexdeg/naive.h index d5f8ba67f8..f2a9cf4b13 100644 --- a/cpp/src/dbscan/vertexdeg/naive.h +++ b/cpp/src/dbscan/vertexdeg/naive.h @@ -38,7 +38,7 @@ static const int TPB_Y = 8; * @param N number of rows * @param D number of columns */ -template +template __global__ void vertex_degree_kernel(Pack data, Index_ startVertexId, Index_ batchSize) { const Type Zero = (Type)0; diff --git a/cpp/src/dbscan/vertexdeg/runner.h b/cpp/src/dbscan/vertexdeg/runner.h index ebfb23c350..b24e808053 100644 --- a/cpp/src/dbscan/vertexdeg/runner.h +++ b/cpp/src/dbscan/vertexdeg/runner.h @@ -24,7 +24,7 @@ namespace Dbscan { namespace VertexDeg { -template +template void run(const ML::cumlHandle_impl& handle, bool* adj, Index_* vd, Type_f* x, Type_f eps, Index_ N, Index_ D, int algo, Index_ startVertexId, Index_ batchSize, cudaStream_t stream) { diff --git a/cpp/src_prims/distance/distance.h b/cpp/src_prims/distance/distance.h index 97bdbdc7af..19f9d2c77a 100644 --- a/cpp/src_prims/distance/distance.h +++ b/cpp/src_prims/distance/distance.h @@ -326,7 +326,7 @@ void pairwiseDistance(const Type *x, const Type *y, Type *dist, Index_ m, * the epsilon neighborhood. 
*/ template + typename Index_ = int, typename OutputTile_ = OutputTile_8x128x128> size_t epsilon_neighborhood(const T *a, const T *b, bool *adj, Index_ m, Index_ n, Index_ k, T eps, void *workspace, size_t worksize, cudaStream_t stream, diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 1a7d4dfd11..5fad5228fe 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -48,11 +48,11 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": cdef void dbscanFit(cumlHandle& handle, double *input, - long n_rows, - long n_cols, + int n_rows, + int n_cols, double eps, int min_pts, - long *labels, + int *labels, size_t max_bytes_per_batch, bool verbose) except + @@ -187,10 +187,10 @@ class DBSCAN(Base): cdef cumlHandle* handle_ = self.handle.getHandle() - cdef uintptr_t labels_ptr + self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int32)) + cdef uintptr_t labels_ptr = get_cudf_column_ptr(self.labels_) + if self.dtype == np.float32: - self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int32)) - labels_ptr = get_cudf_column_ptr(self.labels_) dbscanFit(handle_[0], input_ptr, n_rows, @@ -201,15 +201,13 @@ class DBSCAN(Base): self.max_bytes_per_batch, self.verbose) else: - self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int64)) - labels_ptr = get_cudf_column_ptr(self.labels_) dbscanFit(handle_[0], input_ptr, - n_rows, - n_cols, + n_rows, + n_cols, self.eps, self.min_samples, - labels_ptr, + labels_ptr, self.max_bytes_per_batch, self.verbose) # make sure that the `dbscanFit` is complete before the following From 4c1b4a8ffd905493fa40fd4ee18ebb0f684a34cc Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 28 Sep 2019 21:33:43 -0400 Subject: [PATCH 21/63] Adding `out_dtype` argument to dbscan fit() and tests for valid output --- python/cuml/cluster/dbscan.pyx | 101 +++++++++++++++++++++++++------- python/cuml/test/test_dbscan.py | 18 +++++- 2 files changed, 94 insertions(+), 25 deletions(-) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 5fad5228fe..2f002e5c36 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -56,6 +56,27 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": size_t max_bytes_per_batch, bool verbose) except + + cdef void dbscanFit(cumlHandle& handle, + float *input, + long n_rows, + long n_cols, + double eps, + int min_pts, + long *labels, + size_t max_bytes_per_batch, + bool verbose) except + + + cdef void dbscanFit(cumlHandle& handle, + double *input, + long n_rows, + long n_cols, + double eps, + int min_pts, + long *labels, + size_t max_bytes_per_batch, + bool verbose) except + + + class DBSCAN(Base): """ @@ -164,7 +185,7 @@ class DBSCAN(Base): if attr == 'labels_array': return self.labels_._column._data.mem - def fit(self, X): + def fit(self, X, out_dtype="auto"): """ Perform DBSCAN clustering from features. @@ -174,11 +195,23 @@ class DBSCAN(Base): Dense matrix (floats or doubles) of shape (n_samples, n_features). Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy + out_dtype: dtype Determines the precision of the output labels array. + default: "auto". Valid values are { "auto", "int32", + np.int32, "int64", np.int64} """ if self.labels_ is not None: del self.labels_ + if out_dtype == "auto": + out_dtype = np.int32 if X.shape[0] < 1e6 else np.int64 + elif out_dtype not in ["int32", np.int32, "int64", np.int64]: + raise ValueError("Invalid value for out_dtype. 
" + "Valid values are {'auto', 'int32', 'int64', " + "np.int32, np.int64}") + + + cdef uintptr_t input_ptr X_m, input_ptr, n_rows, n_cols, self.dtype = \ @@ -187,36 +220,60 @@ class DBSCAN(Base): cdef cumlHandle* handle_ = self.handle.getHandle() - self.labels_ = cudf.Series(zeros(n_rows, dtype=np.int32)) + self.labels_ = cudf.Series(zeros(n_rows, dtype=out_dtype)) cdef uintptr_t labels_ptr = get_cudf_column_ptr(self.labels_) if self.dtype == np.float32: - dbscanFit(handle_[0], - input_ptr, - n_rows, - n_cols, - self.eps, - self.min_samples, - labels_ptr, - self.max_bytes_per_batch, - self.verbose) + if out_dtype is "int32" or out_dtype is np.int32: + dbscanFit(handle_[0], + input_ptr, + n_rows, + n_cols, + self.eps, + self.min_samples, + labels_ptr, + self.max_bytes_per_batch, + self.verbose) + else: + dbscanFit(handle_[0], + input_ptr, + n_rows, + n_cols, + self.eps, + self.min_samples, + labels_ptr, + self.max_bytes_per_batch, + self.verbose) + else: - dbscanFit(handle_[0], - input_ptr, - n_rows, - n_cols, - self.eps, - self.min_samples, - labels_ptr, - self.max_bytes_per_batch, - self.verbose) + if out_dtype is "int32" or out_dtype is np.int32: + dbscanFit(handle_[0], + input_ptr, + n_rows, + n_cols, + self.eps, + self.min_samples, + labels_ptr, + self.max_bytes_per_batch, + self.verbose) + else: + dbscanFit(handle_[0], + input_ptr, + n_rows, + n_cols, + self.eps, + self.min_samples, + labels_ptr, + self.max_bytes_per_batch, + self.verbose) + # make sure that the `dbscanFit` is complete before the following # delete call happens self.handle.sync() del(X_m) return self - def fit_predict(self, X): + def fit_predict(self, X, out_dtype="auto"): """ Performs clustering on input_gdf and returns cluster labels. 
@@ -232,7 +289,7 @@ class DBSCAN(Base): y : cuDF Series, shape (n_samples) cluster labels """ - self.fit(X) + self.fit(X, out_dtype) return self.labels_ def get_param_names(self): diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py index 8768e6161a..7783b0f2b3 100644 --- a/python/cuml/test/test_dbscan.py +++ b/python/cuml/test/test_dbscan.py @@ -51,8 +51,13 @@ def stress_param(*args, **kwargs): stress_param(500000)]) @pytest.mark.parametrize('ncols', [unit_param(3), quality_param(100), stress_param(1000)]) +@pytest.mark.parametrize('out_dtype', [unit_param("int32"), + unit_param(np.int32), + unit_param("int64"), + unit_param(np.int64), unit_param("auto"), + quality_param("auto"), stress_param("auto")]) def test_dbscan(datatype, input_type, use_handle, - nrows, ncols, max_bytes_per_batch): + nrows, ncols, max_bytes_per_batch, out_dtype): n_samples = nrows n_feats = ncols X, y = make_blobs(n_samples=n_samples, cluster_std=0.01, @@ -66,9 +71,9 @@ def test_dbscan(datatype, input_type, use_handle, X = pd.DataFrame( {'fea%d' % i: X[0:, i] for i in range(X.shape[1])}) X_cudf = cudf.DataFrame.from_pandas(X) - cu_labels = cudbscan.fit_predict(X_cudf) + cu_labels = cudbscan.fit_predict(X_cudf, out_dtype=out_dtype) else: - cu_labels = cudbscan.fit_predict(X) + cu_labels = cudbscan.fit_predict(X, out_dtype=out_dtype) if nrows < 500000: skdbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute") @@ -109,3 +114,10 @@ def test_dbscan_sklearn_comparison(name, nrows): score = adjusted_rand_score(sk_y_pred, cu_y_pred) assert(score == 1.0) + +@pytest.mark.xfail(strict=True, raises=ValueError) +def test_dbscan_out_dtype_fails_invalid_input(): + X, _ = make_blobs(n_samples=100) + + cudbscan = cuDBSCAN() + cudbscan.fit_predict(X, out_dtype="bad_input") From 6836727e7098df06f9f4aad9d3ae19678769e489 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 28 Sep 2019 21:38:11 -0400 Subject: [PATCH 22/63] Adding more explicit asserts for output label dtypes --- python/cuml/test/test_dbscan.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py index 7783b0f2b3..28e2052dfc 100644 --- a/python/cuml/test/test_dbscan.py +++ b/python/cuml/test/test_dbscan.py @@ -54,7 +54,8 @@ def stress_param(*args, **kwargs): @pytest.mark.parametrize('out_dtype', [unit_param("int32"), unit_param(np.int32), unit_param("int64"), - unit_param(np.int64), unit_param("auto"), + unit_param(np.int64), + unit_param("auto"), quality_param("auto"), stress_param("auto")]) def test_dbscan(datatype, input_type, use_handle, nrows, ncols, max_bytes_per_batch, out_dtype): @@ -81,6 +82,13 @@ def test_dbscan(datatype, input_type, use_handle, score = adjusted_rand_score(cu_labels, sk_labels) assert score == 1 + if out_dtype is "int32" or out_dtype is np.int32: + assert cu_labels.dtype == np.int32 + elif out_dtype is "int64" or out_dtype is np.int64: + assert cu_labels.dtype == np.int64 + else: # out_dtype was "auto" and we shouldn't have any inputs > threshold + assert cu_labels.dtype == np.int32 + @pytest.mark.parametrize("name", [ 'noisy_moons', From f46cdaae40a659d71ee69bd6c6bfb3fab583ee82 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 22:24:40 -0400 Subject: [PATCH 23/63] Changing max_bytes_per_batch to max_mbytes_per_batch to avoid potential overflow and allow increased overall batch size. 
--- cpp/src/dbscan/dbscan.h | 15 +++++++-------- python/cuml/cluster/dbscan.pyx | 33 ++++++++++++++++----------------- python/cuml/test/test_dbscan.py | 6 +++--- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/cpp/src/dbscan/dbscan.h b/cpp/src/dbscan/dbscan.h index b0dd3d5f9c..6eabc54418 100644 --- a/cpp/src/dbscan/dbscan.h +++ b/cpp/src/dbscan/dbscan.h @@ -23,25 +23,24 @@ namespace ML { using namespace Dbscan; -static const size_t DEFAULT_MAX_MEM_BYTES = 13e9; +static const size_t DEFAULT_MAX_MEM_MBYTES = 13e3; // Default max mem set to a reasonable value for a 16gb card. - template -Index_ computeBatchCount(Index_ n_rows, size_t max_bytes_per_batch) { +Index_ computeBatchCount(Index_ n_rows, size_t max_mbytes_per_batch) { Index_ n_batches = 1; // There seems to be a weird overflow bug with cutlass gemm kernels // hence, artifically limiting to a smaller batchsize! ///TODO: in future, when we bump up the underlying cutlass version, this should go away // paving way to cudaMemGetInfo based workspace allocation - if (max_bytes_per_batch <= 0) max_bytes_per_batch = DEFAULT_MAX_MEM_BYTES; + if (max_mbytes_per_batch <= 0) max_mbytes_per_batch = DEFAULT_MAX_MEM_MBYTES; Index_ MAX_LABEL = std::numeric_limits::max(); while (true) { size_t batchSize = ceildiv(n_rows, n_batches); - if (((batchSize * n_rows * sizeof(T) < max_bytes_per_batch) && + if (((batchSize * n_rows * sizeof(T) * 1e-6 < max_mbytes_per_batch) && /** * Though single precision can be faster per execution of each kernel, * there's a trade-off to be made between using single precision with @@ -60,7 +59,7 @@ Index_ computeBatchCount(Index_ n_rows, size_t max_bytes_per_batch) { template void dbscanFitImpl(const ML::cumlHandle_impl &handle, T *input, Index_ n_rows, Index_ n_cols, T eps, int min_pts, Index_ *labels, - size_t max_bytes_per_batch, cudaStream_t stream, + size_t max_mbytes_per_batch, cudaStream_t stream, bool verbose) { ML::PUSH_RANGE("ML::Dbscan::Fit"); int algoVd = 1; @@ -68,14 
+67,14 @@ void dbscanFitImpl(const ML::cumlHandle_impl &handle, T *input, Index_ n_rows, int algoCcl = 2; // @todo: Query device for remaining memory - Index_ n_batches = computeBatchCount(n_rows, max_bytes_per_batch); + Index_ n_batches = computeBatchCount(n_rows, max_mbytes_per_batch); if (verbose) { Index_ batchSize = ceildiv(n_rows, n_batches); if (n_batches > 1) { std::cout << "Running batched training on " << n_batches << " batches w/ "; - std::cout << batchSize * n_rows * sizeof(T) << " bytes." << std::endl; + std::cout << batchSize * n_rows * sizeof(T) * 1e-6 << " megabytes." << std::endl; } } diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 2f002e5c36..5b3717fc83 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -43,7 +43,7 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": float eps, int min_pts, int *labels, - size_t max_bytes_per_batch, + size_t max_mbytes_per_batch, bool verbose) except + cdef void dbscanFit(cumlHandle& handle, @@ -53,7 +53,7 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": double eps, int min_pts, int *labels, - size_t max_bytes_per_batch, + size_t max_mbytes_per_batch, bool verbose) except + cdef void dbscanFit(cumlHandle& handle, @@ -63,7 +63,7 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": double eps, int min_pts, long *labels, - size_t max_bytes_per_batch, + size_t max_mbytes_per_batch, bool verbose) except + cdef void dbscanFit(cumlHandle& handle, @@ -73,7 +73,7 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": double eps, int min_pts, long *labels, - size_t max_bytes_per_batch, + size_t max_mbytes_per_batch, bool verbose) except + @@ -129,7 +129,7 @@ class DBSCAN(Base): considered as an important core point (including the point itself). 
verbose : bool Whether to print debug spews - max_bytes_per_batch : (optional) int64 + max_mbytes_per_batch : (optional) int64 Calculate batch size using no more than this number of bytes for the pairwise distance computation. This enables the trade-off between runtime and memory usage for making the N^2 pairwise distance @@ -169,17 +169,17 @@ class DBSCAN(Base): """ def __init__(self, eps=0.5, handle=None, min_samples=5, verbose=False, - max_bytes_per_batch=None): + max_mbytes_per_batch=None): super(DBSCAN, self).__init__(handle, verbose) self.eps = eps self.min_samples = min_samples self.labels_ = None - self.max_bytes_per_batch = max_bytes_per_batch + self.max_mbytes_per_batch = max_mbytes_per_batch self.verbose = verbose # C++ API expects this to be numeric. - if self.max_bytes_per_batch is None: - self.max_bytes_per_batch = 0 + if self.max_mbytes_per_batch is None: + self.max_mbytes_per_batch = 0 def __getattr__(self, attr): if attr == 'labels_array': @@ -196,8 +196,9 @@ class DBSCAN(Base): Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy out_dtype: dtype Determines the precision of the output labels array. - default: "auto". Valid values are { "auto", "int32", - np.int32, "int64", np.int64} + default: "auto". Valid values are { "auto", "int32", np.int32, + "int64", np.int64}. 
When the number of samples exceed + """ if self.labels_ is not None: @@ -210,8 +211,6 @@ class DBSCAN(Base): "Valid values are {'auto', 'int32', 'int64', " "np.int32, np.int64}") - - cdef uintptr_t input_ptr X_m, input_ptr, n_rows, n_cols, self.dtype = \ @@ -232,7 +231,7 @@ class DBSCAN(Base): self.eps, self.min_samples, labels_ptr, - self.max_bytes_per_batch, + self.max_mbytes_per_batch, self.verbose) else: dbscanFit(handle_[0], @@ -242,7 +241,7 @@ class DBSCAN(Base): self.eps, self.min_samples, labels_ptr, - self.max_bytes_per_batch, + self.max_mbytes_per_batch, self.verbose) else: @@ -254,7 +253,7 @@ class DBSCAN(Base): self.eps, self.min_samples, labels_ptr, - self.max_bytes_per_batch, + self.max_mbytes_per_batch, self.verbose) else: dbscanFit(handle_[0], @@ -264,7 +263,7 @@ class DBSCAN(Base): self.eps, self.min_samples, labels_ptr, - self.max_bytes_per_batch, + self.max_mbytes_per_batch, self.verbose) # make sure that the `dbscanFit` is complete before the following diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py index 28e2052dfc..c930a64b02 100644 --- a/python/cuml/test/test_dbscan.py +++ b/python/cuml/test/test_dbscan.py @@ -43,7 +43,7 @@ def stress_param(*args, **kwargs): 'noisy_circles', 'no_structure'] -@pytest.mark.parametrize('max_bytes_per_batch', [1e9, 5e9]) +@pytest.mark.parametrize('max_mbytes_per_batch', [1e9, 5e9]) @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('input_type', ['ndarray']) @pytest.mark.parametrize('use_handle', [True, False]) @@ -58,7 +58,7 @@ def stress_param(*args, **kwargs): unit_param("auto"), quality_param("auto"), stress_param("auto")]) def test_dbscan(datatype, input_type, use_handle, - nrows, ncols, max_bytes_per_batch, out_dtype): + nrows, ncols, max_mbytes_per_batch, out_dtype): n_samples = nrows n_feats = ncols X, y = make_blobs(n_samples=n_samples, cluster_std=0.01, @@ -66,7 +66,7 @@ def test_dbscan(datatype, input_type, use_handle, handle, stream = 
get_handle(use_handle) cudbscan = cuDBSCAN(handle=handle, eps=1, min_samples=2, - max_bytes_per_batch=max_bytes_per_batch) + max_mbytes_per_batch=max_mbytes_per_batch) if input_type == 'dataframe': X = pd.DataFrame( From 2f36ec1b75b2ccb687656e98b7643b273d26f9c3 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 22:31:38 -0400 Subject: [PATCH 24/63] Changing scale of max_bytes to max_mbytes. This allows us to no longer need to worry about overflows from bytes being larger than int. Now there's a hard-set rule for when long needs to be used- if N * batchSize > max_integer, an explicit exception will be thrown on the C++ side. --- python/cuml/cluster/dbscan.pyx | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 5b3717fc83..0a3d76097b 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -185,7 +185,7 @@ class DBSCAN(Base): if attr == 'labels_array': return self.labels_._column._data.mem - def fit(self, X, out_dtype="auto"): + def fit(self, X, out_dtype="int32"): """ Perform DBSCAN clustering from features. @@ -196,17 +196,14 @@ class DBSCAN(Base): Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy out_dtype: dtype Determines the precision of the output labels array. - default: "auto". Valid values are { "auto", "int32", np.int32, + default: "int32". Valid values are { "int32", np.int32, "int64", np.int64}. When the number of samples exceed - """ if self.labels_ is not None: del self.labels_ - if out_dtype == "auto": - out_dtype = np.int32 if X.shape[0] < 1e6 else np.int64 - elif out_dtype not in ["int32", np.int32, "int64", np.int64]: + if out_dtype not in ["int32", np.int32, "int64", np.int64]: raise ValueError("Invalid value for out_dtype. 
" "Valid values are {'auto', 'int32', 'int64', " "np.int32, np.int64}") From ff6464335e2b35bd12a0f5b8b803deaea9b00897 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 22:36:55 -0400 Subject: [PATCH 25/63] Making style checker happy --- python/cuml/test/test_dbscan.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py index c930a64b02..9d15043f67 100644 --- a/python/cuml/test/test_dbscan.py +++ b/python/cuml/test/test_dbscan.py @@ -56,7 +56,8 @@ def stress_param(*args, **kwargs): unit_param("int64"), unit_param(np.int64), unit_param("auto"), - quality_param("auto"), stress_param("auto")]) + quality_param("auto"), + stress_param("auto")]) def test_dbscan(datatype, input_type, use_handle, nrows, ncols, max_mbytes_per_batch, out_dtype): n_samples = nrows @@ -82,12 +83,10 @@ def test_dbscan(datatype, input_type, use_handle, score = adjusted_rand_score(cu_labels, sk_labels) assert score == 1 - if out_dtype is "int32" or out_dtype is np.int32: + if out_dtype == "int32" or out_dtype == np.int32: assert cu_labels.dtype == np.int32 - elif out_dtype is "int64" or out_dtype is np.int64: + elif out_dtype == "int64" or out_dtype == np.int64: assert cu_labels.dtype == np.int64 - else: # out_dtype was "auto" and we shouldn't have any inputs > threshold - assert cu_labels.dtype == np.int32 @pytest.mark.parametrize("name", [ @@ -123,6 +122,7 @@ def test_dbscan_sklearn_comparison(name, nrows): score = adjusted_rand_score(sk_y_pred, cu_y_pred) assert(score == 1.0) + @pytest.mark.xfail(strict=True, raises=ValueError) def test_dbscan_out_dtype_fails_invalid_input(): X, _ = make_blobs(n_samples=100) From d81bd803b969f3cf05cfeffe85eba387b36b16e6 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 28 Sep 2019 22:39:40 -0400 Subject: [PATCH 26/63] Better assertion statement for users --- cpp/src/dbscan/runner.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 0e2136c1b6..4cba104d14 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -95,13 +95,13 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, size_t vdSize = alignTo(sizeof(Index_) * (batchSize + 1), align); size_t exScanSize = alignTo(sizeof(Index_) * batchSize, align); - // TODO: We should ASSERT that N * batchSize is greater than the maximum value used Index_ MAX_LABEL = std::numeric_limits::max(); ASSERT( N * batchSize < MAX_LABEL, "An overflow occurred with the current choice of precision " - "and the number of samples. (Max allowed batch size is %d, but was %d)", + "and the number of samples. (Max allowed batch size is %d, but was %d). " + "Consider using double precision for the output labels.", MAX_LABEL / N, batchSize); if (workspace == NULL) { From c6e98bc3f8f8a16496e682a67939203a608b911d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 22:43:59 -0400 Subject: [PATCH 27/63] Making style checker happy --- python/cuml/cluster/dbscan.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 0a3d76097b..275974ee30 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -76,8 +76,6 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": size_t max_mbytes_per_batch, bool verbose) except + - - class DBSCAN(Base): """ DBSCAN is a very powerful yet fast clustering technique that finds clusters From f2e0a2d6bbe3ffa5438bea949ae3a222b083c8aa Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 28 Sep 2019 22:46:25 -0400 Subject: [PATCH 28/63] Updating docs to use megabytes instead of bytes --- cpp/src/dbscan/dbscan.hpp | 2 +- python/cuml/cluster/dbscan.pyx | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/dbscan/dbscan.hpp b/cpp/src/dbscan/dbscan.hpp index 8f1e5ae4c8..364eae5efe 100644 --- a/cpp/src/dbscan/dbscan.hpp +++ b/cpp/src/dbscan/dbscan.hpp @@ -29,7 +29,7 @@ namespace ML { * @param[in] eps the epsilon value to use for epsilon-neighborhood determination * @param[in] min_pts minimum number of points to determine a cluster * @param[out] labels (size n_rows) output labels array - * @param[in] max_mem_bytes: the maximum number of bytes to be used for each batch of + * @param[in] max_mem_mbytes: the maximum number of megabytes to be used for each batch of * the pairwise distance calculation. This enables the trade off between * memory usage and algorithm execution time. * @param[in] verbose: print useful information as algorithm executes diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 275974ee30..cba90fa0be 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -128,15 +128,15 @@ class DBSCAN(Base): verbose : bool Whether to print debug spews max_mbytes_per_batch : (optional) int64 - Calculate batch size using no more than this number of bytes for the - pairwise distance computation. This enables the trade-off between + Calculate batch size using no more than this number of megabytes for + the pairwise distance computation. This enables the trade-off between runtime and memory usage for making the N^2 pairwise distance computations more tractable for large numbers of samples. If you are experiencing out of memory errors when running DBSCAN, you can set this value based on the memory size of your device. 
Note: this option does not set the maximum total memory used in the - DBSCAN computation and so this value will not - be able to be set to the total memory available on the device. + DBSCAN computation and so this value will not be able to be set to + the total memory available on the device. Attributes ----------- From 70aedbc9133c51f51769301b60c71629f0d133af Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 23:55:46 -0400 Subject: [PATCH 29/63] Style checker fixes --- python/cuml/cluster/dbscan.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index cba90fa0be..29687f0883 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -76,6 +76,7 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": size_t max_mbytes_per_batch, bool verbose) except + + class DBSCAN(Base): """ DBSCAN is a very powerful yet fast clustering technique that finds clusters From d0a9d9a64c1217416d2376716e3f5a776047c7f5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 28 Sep 2019 23:57:00 -0400 Subject: [PATCH 30/63] Fixing dbscan test --- python/cuml/cluster/dbscan.pyx | 2 +- python/cuml/test/test_dbscan.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 29687f0883..8e7a5215fa 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -204,7 +204,7 @@ class DBSCAN(Base): if out_dtype not in ["int32", np.int32, "int64", np.int64]: raise ValueError("Invalid value for out_dtype. 
" - "Valid values are {'auto', 'int32', 'int64', " + "Valid values are {'int32', 'int64', " "np.int32, np.int64}") cdef uintptr_t input_ptr diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py index 9d15043f67..48d4b5641a 100644 --- a/python/cuml/test/test_dbscan.py +++ b/python/cuml/test/test_dbscan.py @@ -55,9 +55,8 @@ def stress_param(*args, **kwargs): unit_param(np.int32), unit_param("int64"), unit_param(np.int64), - unit_param("auto"), - quality_param("auto"), - stress_param("auto")]) + quality_param("int32"), + stress_param("int32")]) def test_dbscan(datatype, input_type, use_handle, nrows, ncols, max_mbytes_per_batch, out_dtype): n_samples = nrows From f0884eed92368a882d4d8e13c33ae81968d4050a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sun, 29 Sep 2019 00:22:08 -0400 Subject: [PATCH 31/63] Adding double precision outputs to dbscan gtests. Removing long and using int64_t for portability --- cpp/src/datasets/make_blobs.cu | 14 ++--- cpp/src/datasets/make_blobs.hpp | 9 ++-- cpp/src/dbscan/dbscan.cu | 22 ++++---- cpp/src/dbscan/dbscan.hpp | 8 +-- cpp/src/metrics/metrics.cu | 8 +-- cpp/src/metrics/metrics.hpp | 8 +-- cpp/test/sg/dbscan_test.cu | 93 ++++++++++++++++++++++----------- python/cuml/cluster/dbscan.pyx | 12 ++--- 8 files changed, 103 insertions(+), 71 deletions(-) diff --git a/cpp/src/datasets/make_blobs.cu b/cpp/src/datasets/make_blobs.cu index 3b476cd5ca..3c173fc4d7 100644 --- a/cpp/src/datasets/make_blobs.cu +++ b/cpp/src/datasets/make_blobs.cu @@ -23,19 +23,19 @@ namespace ML { namespace Datasets { -void make_blobs(const cumlHandle& handle, float* out, long* labels, long n_rows, - long n_cols, long n_clusters, const float* centers, - const float* cluster_std, const float cluster_std_scalar, - bool shuffle, float center_box_min, float center_box_max, - uint64_t seed) { +void make_blobs(const cumlHandle& handle, float* out, int64_t* labels, + int64_t n_rows, int64_t n_cols, int64_t n_clusters, + const float* 
centers, const float* cluster_std, + const float cluster_std_scalar, bool shuffle, + float center_box_min, float center_box_max, uint64_t seed) { MLCommon::Random::make_blobs(out, labels, n_rows, n_cols, n_clusters, handle.getDeviceAllocator(), handle.getStream(), centers, cluster_std, cluster_std_scalar, shuffle, center_box_min, center_box_max, seed); } -void make_blobs(const cumlHandle& handle, double* out, long* labels, - long n_rows, long n_cols, long n_clusters, +void make_blobs(const cumlHandle& handle, double* out, int64_t* labels, + int64_t n_rows, long n_cols, int64_t n_clusters, const double* centers, const double* cluster_std, const double cluster_std_scalar, bool shuffle, double center_box_min, double center_box_max, uint64_t seed) { diff --git a/cpp/src/datasets/make_blobs.hpp b/cpp/src/datasets/make_blobs.hpp index 645f075e38..ca2ed1611a 100644 --- a/cpp/src/datasets/make_blobs.hpp +++ b/cpp/src/datasets/make_blobs.hpp @@ -49,15 +49,16 @@ namespace Datasets { * centers. Useful only if 'centers' is nullptr * @param seed seed for the RNG */ -void make_blobs(const cumlHandle& handle, float* out, long* labels, long n_rows, - long n_cols, long n_clusters, const float* centers = nullptr, +void make_blobs(const cumlHandle& handle, float* out, int64_t* labels, + int64_t n_rows, int64_t n_cols, int64_t n_clusters, + const float* centers = nullptr, const float* cluster_std = nullptr, const float cluster_std_scalar = 1.f, bool shuffle = true, float center_box_min = 10.f, float center_box_max = 10.f, uint64_t seed = 0ULL); -void make_blobs(const cumlHandle& handle, double* out, long* labels, - long n_rows, long n_cols, long n_clusters, +void make_blobs(const cumlHandle& handle, double* out, int64_t* labels, + int64_t n_rows, int64_t n_cols, int64_t n_clusters, const double* centers = nullptr, const double* cluster_std = nullptr, const double cluster_std_scalar = 1.f, bool shuffle = true, diff --git a/cpp/src/dbscan/dbscan.cu b/cpp/src/dbscan/dbscan.cu index 
055c234847..833d6ef701 100644 --- a/cpp/src/dbscan/dbscan.cu +++ b/cpp/src/dbscan/dbscan.cu @@ -40,20 +40,20 @@ void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, handle.getStream(), verbose); } -void dbscanFit(const cumlHandle &handle, float *input, long n_rows, long n_cols, - float eps, int min_pts, long *labels, size_t max_bytes_per_batch, - bool verbose) { - dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, - min_pts, labels, max_bytes_per_batch, - handle.getStream(), verbose); +void dbscanFit(const cumlHandle &handle, float *input, int64_t n_rows, + int64_t n_cols, float eps, int min_pts, int64_t *labels, + size_t max_bytes_per_batch, bool verbose) { + dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, + min_pts, labels, max_bytes_per_batch, + handle.getStream(), verbose); } -void dbscanFit(const cumlHandle &handle, double *input, long n_rows, - long n_cols, double eps, int min_pts, long *labels, +void dbscanFit(const cumlHandle &handle, double *input, int64_t n_rows, + int64_t n_cols, double eps, int min_pts, int64_t *labels, size_t max_bytes_per_batch, bool verbose) { - dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, - min_pts, labels, max_bytes_per_batch, - handle.getStream(), verbose); + dbscanFitImpl(handle.getImpl(), input, n_rows, n_cols, eps, + min_pts, labels, max_bytes_per_batch, + handle.getStream(), verbose); } }; // end namespace ML diff --git a/cpp/src/dbscan/dbscan.hpp b/cpp/src/dbscan/dbscan.hpp index 364eae5efe..aebe3ad8f4 100644 --- a/cpp/src/dbscan/dbscan.hpp +++ b/cpp/src/dbscan/dbscan.hpp @@ -42,11 +42,11 @@ void dbscanFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, double eps, int min_pts, int *labels, size_t max_bytes_per_batch = 0, bool verbose = false); -void dbscanFit(const cumlHandle &handle, float *input, long n_rows, long n_cols, - float eps, int min_pts, long *labels, +void dbscanFit(const cumlHandle &handle, float *input, int64_t n_rows, + int64_t 
n_cols, float eps, int min_pts, int64_t *labels, size_t max_bytes_per_batch = 0, bool verbose = false); -void dbscanFit(const cumlHandle &handle, double *input, long n_rows, - long n_cols, double eps, int min_pts, long *labels, +void dbscanFit(const cumlHandle &handle, double *input, int64_t n_rows, + int64_t n_cols, double eps, int min_pts, int64_t *labels, size_t max_bytes_per_batch = 0, bool verbose = false); /** @} */ diff --git a/cpp/src/metrics/metrics.cu b/cpp/src/metrics/metrics.cu index 79007262cb..0a1faed817 100644 --- a/cpp/src/metrics/metrics.cu +++ b/cpp/src/metrics/metrics.cu @@ -50,10 +50,10 @@ double silhouetteScore(const cumlHandle &handle, double *y, int nRows, handle.getStream(), metric); } -double adjustedRandIndex(const cumlHandle &handle, const long *y, - const long *y_hat, const long n, - const long lower_class_range, - const long upper_class_range) { +double adjustedRandIndex(const cumlHandle &handle, const int64_t *y, + const int64_t *y_hat, const int64_t n, + const int64_t lower_class_range, + const int64_t upper_class_range) { return MLCommon::Metrics::computeAdjustedRandIndex( y, y_hat, n, lower_class_range, upper_class_range, handle.getDeviceAllocator(), handle.getStream()); diff --git a/cpp/src/metrics/metrics.hpp b/cpp/src/metrics/metrics.hpp index 881d5a9aab..cc1184ecac 100644 --- a/cpp/src/metrics/metrics.hpp +++ b/cpp/src/metrics/metrics.hpp @@ -106,10 +106,10 @@ double silhouetteScore(const cumlHandle &handle, double *y, int nRows, * @param upper_class_range: the highest value in the range of classes * @return: The adjusted rand index value */ -double adjustedRandIndex(const cumlHandle &handle, const long *y, - const long *y_hat, const long n, - const long lower_class_range, - const long upper_class_range); +double adjustedRandIndex(const cumlHandle &handle, const int64_t *y, + const int64_t *y_hat, const int64_t n, + const int64_t lower_class_range, + const int64_t upper_class_range); double adjustedRandIndex(const cumlHandle 
&handle, const int *y, const int *y_hat, const int n, diff --git a/cpp/test/sg/dbscan_test.cu b/cpp/test/sg/dbscan_test.cu index 9518b466d2..bc58d89887 100644 --- a/cpp/test/sg/dbscan_test.cu +++ b/cpp/test/sg/dbscan_test.cu @@ -40,11 +40,11 @@ using namespace Datasets; using namespace Metrics; using namespace std; -template +template struct DbscanInputs { - int n_row; - int n_col; - int n_centers; + IdxT n_row; + IdxT n_col; + IdxT n_centers; T cluster_std; T eps; int min_pts; @@ -52,23 +52,24 @@ struct DbscanInputs { unsigned long long int seed; }; -template -::std::ostream& operator<<(::std::ostream& os, const DbscanInputs& dims) { +template +::std::ostream& operator<<(::std::ostream& os, + const DbscanInputs& dims) { return os; } -template -class DbscanTest : public ::testing::TestWithParam> { +template +class DbscanTest : public ::testing::TestWithParam> { protected: void basicTest() { cumlHandle handle; - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); device_buffer out(handle.getDeviceAllocator(), handle.getStream(), params.n_row * params.n_col); - device_buffer l(handle.getDeviceAllocator(), handle.getStream(), - params.n_row); + device_buffer l(handle.getDeviceAllocator(), handle.getStream(), + params.n_row); make_blobs(handle, out.data(), l.data(), params.n_row, params.n_col, params.n_centers, nullptr, nullptr, params.cluster_std, true, @@ -96,6 +97,8 @@ class DbscanTest : public ::testing::TestWithParam> { std::cout << "y_hat: " << arr2Str(labels, 25, "labels", handle.getStream()) << std::endl; + + std::cout << "Score = " << score << std::endl; } } @@ -107,38 +110,66 @@ class DbscanTest : public ::testing::TestWithParam> { } protected: - DbscanInputs params; - int *labels, *labels_ref; + DbscanInputs params; + IdxT *labels, *labels_ref; double score; }; -const std::vector> inputsf2 = { - {50000, 16, 5, 0.01, 2, 2, (size_t)13e8, 1234ULL}, +const std::vector> inputsf2 = { + {50000, 16, 5, 0.01, 2, 2, 
(size_t)13e3, 1234ULL}, + {500, 16, 5, 0.01, 2, 2, (size_t)100, 1234ULL}, + {1000, 1000, 10, 0.01, 2, 2, (size_t)13e3, 1234ULL}, + {50000, 16, 5l, 0.01, 2, 2, (size_t)13e3, 1234ULL}, + {20000, 10000, 10, 0.01, 2, 2, (size_t)13e3, 1234ULL}, + {20000, 100, 5000, 0.01, 2, 2, (size_t)13e3, 1234ULL}}; + +const std::vector> inputsf3 = { + {50000, 16, 5, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {500, 16, 5, 0.01, 2, 2, (size_t)100, 1234ULL}, + {1000, 1000, 10, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {50000, 16, 5l, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {20000, 10000, 10, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {20000, 100, 5000, 0.01, 2, 2, (size_t)9e3, 1234ULL}}; + +const std::vector> inputsd2 = { + {50000, 16, 5, 0.01, 2, 2, (size_t)13e3, 1234ULL}, {500, 16, 5, 0.01, 2, 2, (size_t)100, 1234ULL}, - {1000, 1000, 10, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {50000, 16, 5l, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {20000, 10000, 10, 0.01, 2, 2, (size_t)13e8, 1234ULL}, - {20000, 100, 5000, 0.01, 2, 2, (size_t)13e8, 1234ULL}}; + {1000, 1000, 10, 0.01, 2, 2, (size_t)13e3, 1234ULL}, + {100, 10000, 10, 0.01, 2, 2, (size_t)13e3, 1234ULL}, + {20000, 10000, 10, 0.01, 2, 2, (size_t)13e3, 1234ULL}, + {20000, 100, 5000, 0.01, 2, 2, (size_t)13e3, 1234ULL}}; -const std::vector> inputsd2 = { - {50000, 16, 5, 0.01, 2, 2, (size_t)13e9, 1234ULL}, +const std::vector> inputsd3 = { + {50000, 16, 5, 0.01, 2, 2, (size_t)9e3, 1234ULL}, {500, 16, 5, 0.01, 2, 2, (size_t)100, 1234ULL}, - {1000, 1000, 10, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {100, 10000, 10, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {20000, 10000, 10, 0.01, 2, 2, (size_t)13e9, 1234ULL}, - {20000, 100, 5000, 0.01, 2, 2, (size_t)13e9, 1234ULL}}; + {1000, 1000, 10, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {100, 10000, 10, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {20000, 10000, 10, 0.01, 2, 2, (size_t)9e3, 1234ULL}, + {20000, 100, 5000, 0.01, 2, 2, (size_t)9e3, 1234ULL}}; -typedef DbscanTest DbscanTestF; -TEST_P(DbscanTestF, Result) { ASSERT_TRUE(score == 1.0); } 
+typedef DbscanTest DbscanTestF_Int; +TEST_P(DbscanTestF_Int, Result) { ASSERT_TRUE(score == 1.0); } -typedef DbscanTest DbscanTestD; -TEST_P(DbscanTestD, Result) { ASSERT_TRUE(score == 1.0); } +typedef DbscanTest DbscanTestF_Int64; +TEST_P(DbscanTestF_Int64, Result) { ASSERT_TRUE(score == 1.0); } -INSTANTIATE_TEST_CASE_P(DbscanTests, DbscanTestF, +typedef DbscanTest DbscanTestD_Int; +TEST_P(DbscanTestD_Int, Result) { ASSERT_TRUE(score == 1.0); } + +typedef DbscanTest DbscanTestD_Int64; +TEST_P(DbscanTestD_Int64, Result) { ASSERT_TRUE(score == 1.0); } + +INSTANTIATE_TEST_CASE_P(DbscanTests, DbscanTestF_Int, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_CASE_P(DbscanTests, DbscanTestD, +INSTANTIATE_TEST_CASE_P(DbscanTests, DbscanTestF_Int64, + ::testing::ValuesIn(inputsf3)); + +INSTANTIATE_TEST_CASE_P(DbscanTests, DbscanTestD_Int, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_CASE_P(DbscanTests, DbscanTestD_Int64, + ::testing::ValuesIn(inputsd3)); + } // end namespace ML diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 8e7a5215fa..059944144d 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -232,11 +232,11 @@ class DBSCAN(Base): else: dbscanFit(handle_[0], input_ptr, - n_rows, - n_cols, + n_rows, + n_cols, self.eps, self.min_samples, - labels_ptr, + labels_ptr, self.max_mbytes_per_batch, self.verbose) @@ -254,11 +254,11 @@ class DBSCAN(Base): else: dbscanFit(handle_[0], input_ptr, - n_rows, - n_cols, + n_rows, + n_cols, self.eps, self.min_samples, - labels_ptr, + labels_ptr, self.max_mbytes_per_batch, self.verbose) From 2e775769c4ae1cc4bf3666ba134d186eaf8f1924 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sun, 29 Sep 2019 00:33:56 -0400 Subject: [PATCH 32/63] Catching any missing longs in make_blobs and dbscan --- cpp/src/datasets/make_blobs.cu | 2 +- cpp/src/dbscan/adjgraph/algo.h | 2 +- cpp/src/dbscan/vertexdeg/naive.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/datasets/make_blobs.cu b/cpp/src/datasets/make_blobs.cu index 3c173fc4d7..f9bf1d8d99 100644 --- a/cpp/src/datasets/make_blobs.cu +++ b/cpp/src/datasets/make_blobs.cu @@ -35,7 +35,7 @@ void make_blobs(const cumlHandle& handle, float* out, int64_t* labels, } void make_blobs(const cumlHandle& handle, double* out, int64_t* labels, - int64_t n_rows, long n_cols, int64_t n_clusters, + int64_t n_rows, int64_t n_cols, int64_t n_clusters, const double* centers, const double* cluster_std, const double cluster_std_scalar, bool shuffle, double center_box_min, double center_box_max, uint64_t seed) { diff --git a/cpp/src/dbscan/adjgraph/algo.h b/cpp/src/dbscan/adjgraph/algo.h index 0793ede1b0..1df95dfa21 100644 --- a/cpp/src/dbscan/adjgraph/algo.h +++ b/cpp/src/dbscan/adjgraph/algo.h @@ -40,7 +40,7 @@ static const int TPB_X = 256; * Takes vertex degree array (vd) and CSR row_ind array (ex_scan) to produce the * CSR row_ind_ptr array (adj_graph) and filters into a core_pts array based on min_pts. */ -template +template void launcher(const ML::cumlHandle_impl &handle, Pack data, Index_ batchSize, cudaStream_t stream) { device_ptr dev_vd = device_pointer_cast(data.vd); diff --git a/cpp/src/dbscan/vertexdeg/naive.h b/cpp/src/dbscan/vertexdeg/naive.h index f2a9cf4b13..8710640295 100644 --- a/cpp/src/dbscan/vertexdeg/naive.h +++ b/cpp/src/dbscan/vertexdeg/naive.h @@ -71,7 +71,7 @@ __global__ void vertex_degree_kernel(Pack data, } } -template +template void launcher(Pack data, Index_ startVertexId, Index_ batchSize, cudaStream_t stream) { ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, From 36cd51a9ef055a489b774e68682feec91ded8715 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sun, 29 Sep 2019 08:25:29 -0400 Subject: [PATCH 33/63] Importing int64_t --- python/cuml/cluster/kmeans.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/cluster/kmeans.pyx b/python/cuml/cluster/kmeans.pyx index dca1316e21..fcfa070de4 100644 --- a/python/cuml/cluster/kmeans.pyx +++ b/python/cuml/cluster/kmeans.pyx @@ -27,7 +27,7 @@ import warnings from librmm_cffi import librmm as rmm from libcpp cimport bool -from libc.stdint cimport uintptr_t +from libc.stdint cimport uintptr_t, int64_t from libc.stdlib cimport calloc, malloc, free from cuml.common.base import Base From 2503d4fdb91e9131cdc3b280810818f185ff1fd6 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 01:51:37 +1000 Subject: [PATCH 34/63] Updated Barnes Hut TSNE documentation --- python/cuml/manifold/t_sne.pyx | 35 +++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 453fa60a42..2196fbba4f 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -71,24 +71,21 @@ cdef extern from "tsne/tsne.h" namespace "ML" nogil: class TSNE(Base): - """ +""" TSNE (T-Distributed Stochastic Neighbor Embedding) is an extremely powerful dimensionality reduction technique that aims to maintain local distances between data points. It is extremely robust to whatever dataset you give it, and is used in many areas including cancer research, music analysis and neural network weight visualizations. - The current cuML TSNE implementation is a first experimental release. It - defaults to use the 'exact' fitting algorithm, which is signficantly slower - then the Barnes-Hut algorithm as data sizes grow. 
A preview implementation - of Barnes-Hut (derived from CannyLabs' BH open source CUDA code) is also - available for problems with n_components = 2, though this implementation - currently has outstanding issues that can lead to crashes in rare - scenarios. Future releases of TSNE will fix these issues (tracked as cuML - Issue #1002) and switch Barnes-Hut to be the default. + Currently, cuML's TSNE supports the fast Barnes Hut O(NlogN) TSNE + approximation (derived from CannyLabs' BH open source CUDA code). This + allows TSNE to produce extremely fast embeddings when n_components = 2. + cuML defaults to this algorithm. A slower but more accurate Exact + algorithm is also provided. Parameters - ---------- + ----------- n_components : int (default 2) The output dimensionality size. Currently only size=2 is tested, but the 'exact' algorithm will support greater dimensionality in future. @@ -110,13 +107,18 @@ class TSNE(Base): metric : str 'euclidean' only (default 'euclidean') Currently only supports euclidean distance. Will support cosine in a future release. - init : str 'random' only (default 'random') - Currently only supports random intialization. Will support PCA - intialization in a future release. + init : str 'random' (default 'random') + Currently supports random intialization. verbose : int (default 0) Level of verbosity. If > 0, prints all help messages and warnings. + Most messages will be printed inside the Python Console. random_state : int (default None) - Setting this can allow future runs of TSNE to look the same. + Setting this can allow future runs of TSNE to look mostly the same. + It is known that TSNE tends to have vastly different outputs on + many runs. Try using PCA intialization (in future release) to + possibly counteract this problem. + It is known that small perturbations can directly + change the result of the embedding for parallel TSNE implementations. 
method : str 'barnes_hut' or 'exact' (default 'barnes_hut') Options are either barnes_hut or exact. It is recommend that you use the barnes hut approximation for superior O(nlogn) complexity. @@ -147,7 +149,7 @@ class TSNE(Base): one for you anew! References - ---------- + ----------- * van der Maaten, L.J.P. t-Distributed Stochastic Neighbor Embedding https://lvdmaaten.github.io/tsne/ @@ -169,6 +171,9 @@ class TSNE(Base): specifying random_state and fixing it across runs can help, but TSNE does not guarantee similar results each time. + As suggested, PCA (in future release) can also help to alleviate this + issue. + Reference Implementation ------------------------- The CUDA implementation is derived from the excellent CannyLabs open source From 4aa11d8f8d717f36cf895093ef474228d1eb6e85 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 01:53:10 +1000 Subject: [PATCH 35/63] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b471839e0..51c3016454 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ - PR #1115: Moving dask_make_blobs to cuml.dask.datasets. Adding conversion to dask.DataFrame - PR #1136: CUDA 10.1 CI updates - PR #1165: Adding except + in all remaining cython +- PR #1173: Docs: Barnes Hut TSNE documentation ## Bug Fixes From 1da825e6fae4011b3631aee237c209328c6d2516 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 01:55:42 +1000 Subject: [PATCH 36/63] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fa8a4dfe22..d2ff95409a 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ repo](https://github.com/rapidsai/notebooks-contrib). 
| | Truncated Singular Value Decomposition (tSVD) | Multi-GPU version available (CUDA 10 only) | | | Uniform Manifold Approximation and Projection (UMAP) | | | | Random Projection | | -| | t-Distributed Stochastic Neighbor Embedding (TSNE) | (Experimental) | +| | t-Distributed Stochastic Neighbor Embedding (TSNE) | Barnes-Hut O(NlogN) and Exact O(N^2) | | **Linear Models for Regression or Classification** | Linear Regression (OLS) | Multi-GPU available in conda CUDA 10 package | | | Linear Regression with Lasso or Ridge Regularization | | | | ElasticNet Regression | | From b739ed0bafa9ce0eec13aee47c4a7d22ea0e7338 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 01:56:46 +1000 Subject: [PATCH 37/63] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d2ff95409a..4a2958b7da 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ repo](https://github.com/rapidsai/notebooks-contrib). | | Truncated Singular Value Decomposition (tSVD) | Multi-GPU version available (CUDA 10 only) | | | Uniform Manifold Approximation and Projection (UMAP) | | | | Random Projection | | -| | t-Distributed Stochastic Neighbor Embedding (TSNE) | Barnes-Hut O(NlogN) and Exact O(N^2) | +| | t-Distributed Stochastic Neighbor Embedding (TSNE) | Both Barnes-Hut O(NlogN) and Exact O(N^2) are provided | | **Linear Models for Regression or Classification** | Linear Regression (OLS) | Multi-GPU available in conda CUDA 10 package | | | Linear Regression with Lasso or Ridge Regularization | | | | ElasticNet Regression | | From 166160d025d308b44370c9853da44bede6ac9641 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 01:58:50 +1000 Subject: [PATCH 38/63] Update t_sne.pyx --- python/cuml/manifold/t_sne.pyx | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 2196fbba4f..55149da812 100644 --- 
a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -71,7 +71,7 @@ cdef extern from "tsne/tsne.h" namespace "ML" nogil: class TSNE(Base): -""" + """ TSNE (T-Distributed Stochastic Neighbor Embedding) is an extremely powerful dimensionality reduction technique that aims to maintain local distances between data points. It is extremely robust to whatever @@ -306,9 +306,10 @@ class TSNE(Base): return def fit(self, X): - """Fit X into an embedded space. + """ + Fit X into an embedded space. Parameters - ---------- + ----------- X : array-like (device or host) shape = (n_samples, n_features) X contains a sample per row. Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device @@ -418,15 +419,16 @@ class TSNE(Base): self.Y = None def fit_transform(self, X): - """Fit X into an embedded space and return that transformed output. + """ + Fit X into an embedded space and return that transformed output. Parameters - ---------- + ----------- X : array-like (device or host) shape = (n_samples, n_features) X contains a sample per row. Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy Returns - ------- + -------- X_new : array, shape (n_samples, n_components) Embedding of the training data in low-dimensional space. """ From 9f37cba9551f0f65a9a12e3974b72ea7737663d7 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 02:00:45 +1000 Subject: [PATCH 39/63] Update t_sne.pyx --- python/cuml/manifold/t_sne.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 55149da812..339c3e24c7 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -115,7 +115,7 @@ class TSNE(Base): random_state : int (default None) Setting this can allow future runs of TSNE to look mostly the same. It is known that TSNE tends to have vastly different outputs on - many runs. 
Try using PCA intialization (in future release) to + many runs. Try using PCA intialization (in future release) to possibly counteract this problem. It is known that small perturbations can directly change the result of the embedding for parallel TSNE implementations. From d4df9b743f1d39155882fe29f61b71645d6f28c9 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 02:09:48 +1000 Subject: [PATCH 40/63] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a2958b7da..1ff2549f7f 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ repo](https://github.com/rapidsai/notebooks-contrib). | | Truncated Singular Value Decomposition (tSVD) | Multi-GPU version available (CUDA 10 only) | | | Uniform Manifold Approximation and Projection (UMAP) | | | | Random Projection | | -| | t-Distributed Stochastic Neighbor Embedding (TSNE) | Both Barnes-Hut O(NlogN) and Exact O(N^2) are provided | +| | t-Distributed Stochastic Neighbor Embedding (TSNE) | Both Barnes-Hut and Exact algorithms are provided | | **Linear Models for Regression or Classification** | Linear Regression (OLS) | Multi-GPU available in conda CUDA 10 package | | | Linear Regression with Lasso or Ridge Regularization | | | | ElasticNet Regression | | From d3954f3078efe23d7a947a0c6a0761188ba8c49d Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 03:49:39 +1000 Subject: [PATCH 41/63] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1ff2549f7f..ec7c281282 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ repo](https://github.com/rapidsai/notebooks-contrib). 
| | Truncated Singular Value Decomposition (tSVD) | Multi-GPU version available (CUDA 10 only) | | | Uniform Manifold Approximation and Projection (UMAP) | | | | Random Projection | | -| | t-Distributed Stochastic Neighbor Embedding (TSNE) | Both Barnes-Hut and Exact algorithms are provided | +| | t-Distributed Stochastic Neighbor Embedding (TSNE) | | | **Linear Models for Regression or Classification** | Linear Regression (OLS) | Multi-GPU available in conda CUDA 10 package | | | Linear Regression with Lasso or Ridge Regularization | | | | ElasticNet Regression | | From 421c9c3ff5f67c1a96eadf3c878476dc15fb2501 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 03:51:05 +1000 Subject: [PATCH 42/63] Update t_sne.pyx --- python/cuml/manifold/t_sne.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 339c3e24c7..20fea2e2b6 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -115,8 +115,8 @@ class TSNE(Base): random_state : int (default None) Setting this can allow future runs of TSNE to look mostly the same. It is known that TSNE tends to have vastly different outputs on - many runs. Try using PCA intialization (in future release) to - possibly counteract this problem. + many runs. Try using PCA intialization (upcoming with change #1098) + to possibly counteract this problem. It is known that small perturbations can directly change the result of the embedding for parallel TSNE implementations. method : str 'barnes_hut' or 'exact' (default 'barnes_hut') @@ -171,8 +171,8 @@ class TSNE(Base): specifying random_state and fixing it across runs can help, but TSNE does not guarantee similar results each time. - As suggested, PCA (in future release) can also help to alleviate this - issue. + As suggested, PCA (upcoming with change #1098) can also help to alleviate + this issue. 
Reference Implementation ------------------------- From 24ae3d724ec731e8862cefddff535fe89f5e141b Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 03:51:54 +1000 Subject: [PATCH 43/63] Update t_sne.pyx --- python/cuml/manifold/t_sne.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 20fea2e2b6..17ba9765b8 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -419,8 +419,8 @@ class TSNE(Base): self.Y = None def fit_transform(self, X): - """ - Fit X into an embedded space and return that transformed output. + """Fit X into an embedded space and return that transformed output. + Parameters ----------- X : array-like (device or host) shape = (n_samples, n_features) From 2a66c48837c08a13a3d15a69fd0a43219781db55 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 30 Sep 2019 14:34:21 -0400 Subject: [PATCH 44/63] Adding timing information to verbose printing --- cpp/src/dbscan/runner.h | 29 ++++++++++++++++++++++++++++- cpp/src/dbscan/vertexdeg/algo.h | 9 ++++----- cpp/src/dbscan/vertexdeg/naive.h | 4 ++-- cpp/src_prims/sparse/csr.h | 19 +++++++++++++------ 4 files changed, 47 insertions(+), 14 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 4cba104d14..112d4ded9d 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -25,6 +25,8 @@ #include "sparse/csr.h" #include "vertexdeg/runner.h" +#include + namespace Dbscan { using namespace MLCommon; @@ -57,6 +59,12 @@ void final_relabel(Index_* db_cluster, Index_ N, cudaStream_t stream) { [MAX_LABEL] __device__(Index_ val) { return val == MAX_LABEL; }); } +int64_t curTimeMillis() { + struct timeval tp; + gettimeofday(&tp, NULL); + return tp.tv_sec * 1000 + tp.tv_usec / 1000; +} + /* @param N number of points * @param D dimensionality of the points * @param eps epsilon neighborhood criterion @@ -146,6 +154,8 @@ size_t run(const 
ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, std::cout << "- Iteration " << i + 1 << " / " << nBatches << ". Batch size is " << nPoints << " samples." << std::endl; + int64_t start_time = curTimeMillis(); + if (verbose) std::cout << "--> Computing vertex degrees" << std::endl; VertexDeg::run(handle, adj, vd, x, eps, N, D, algoVd, startVertexId, nPoints, stream); @@ -153,9 +163,15 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, CUDA_CHECK(cudaStreamSynchronize(stream)); ML::POP_RANGE(); + int64_t cur_time = curTimeMillis(); + if (verbose) + std::cout << " |-> Took " << (cur_time - start_time) << "ms." + << std::endl; + if (verbose) std::cout << "--> Computing adjacency graph of size " << curradjlen << " samples." << std::endl; + start_time = curTimeMillis(); // Running AdjGraph ML::PUSH_RANGE("Trace::Dbscan::AdjGraph"); if (curradjlen > adjlen || adj_graph.data() == NULL) { @@ -171,14 +187,25 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Index_ N, Index_ D, ML::PUSH_RANGE("Trace::Dbscan::WeakCC"); + cur_time = curTimeMillis(); + if (verbose) + std::cout << " |-> Took " << (cur_time - start_time) << "ms." + << std::endl; + if (verbose) std::cout << "--> Computing connected components" << std::endl; - MLCommon::Sparse::weak_cc_batched( + start_time = curTimeMillis(); + MLCommon::Sparse::weak_cc_batched( labels, ex_scan, adj_graph.data(), adjlen, N, startVertexId, nPoints, &state, stream, [core_pts] __device__(Index_ tid) { return core_pts[tid]; }); ML::POP_RANGE(); + cur_time = curTimeMillis(); + if (verbose) + std::cout << " |-> Took " << (cur_time - start_time) << "ms." 
+ << std::endl; + if (verbose) std::cout << " " << std::endl; } diff --git a/cpp/src/dbscan/vertexdeg/algo.h b/cpp/src/dbscan/vertexdeg/algo.h index 6b1bf29b24..160281d697 100644 --- a/cpp/src/dbscan/vertexdeg/algo.h +++ b/cpp/src/dbscan/vertexdeg/algo.h @@ -65,12 +65,11 @@ void launcher(const ML::cumlHandle_impl &handle, Pack data, index_t batch_vertex = fmod(global_c_idx, n); if (sizeof(index_t) == 4) { - atomicAdd((int *)(vd + batch_vertex), (int)in_neigh); - atomicAdd((int *)(vd + n), (int)in_neigh); + atomicAdd((unsigned int *)(vd + batch_vertex), in_neigh); + atomicAdd((unsigned int *)(vd + n), in_neigh); } else if (sizeof(index_t) == 8) { - atomicAdd((unsigned long long *)(vd + batch_vertex), - (unsigned long long)in_neigh); - atomicAdd((unsigned long long *)(vd + n), (unsigned long long)in_neigh); + atomicAdd((unsigned long long int *)(vd + batch_vertex), in_neigh); + atomicAdd((unsigned long long int *)(vd + n), in_neigh); } }; diff --git a/cpp/src/dbscan/vertexdeg/naive.h b/cpp/src/dbscan/vertexdeg/naive.h index 8710640295..db17d0fe34 100644 --- a/cpp/src/dbscan/vertexdeg/naive.h +++ b/cpp/src/dbscan/vertexdeg/naive.h @@ -66,8 +66,8 @@ __global__ void vertex_degree_kernel(Pack data, atomicAdd((int *)(vd + row), (int)res); atomicAdd((int *)(vd + batchSize), (int)res); } else if (sizeof(Index_) == 8) { - atomicAdd((unsigned long long *)(vd + row), (unsigned long long)res); - atomicAdd((unsigned long long *)(vd + batchSize), (unsigned long long)res); + atomicAdd((unsigned long long *)(vd + row), res); + atomicAdd((unsigned long long *)(vd + batchSize), res); } } diff --git a/cpp/src_prims/sparse/csr.h b/cpp/src_prims/sparse/csr.h index 03cbd8a0e8..f35f8268c2 100644 --- a/cpp/src_prims/sparse/csr.h +++ b/cpp/src_prims/sparse/csr.h @@ -708,9 +708,9 @@ __global__ void weak_cc_label_device(Index_ *labels, const Index_ *row_ind, cj = labels[j_ind]; if (ci < cj) { if (sizeof(Index_) == 4) - atomicMin((unsigned int *)(labels + j_ind), ci); + atomicMin((int 
*)(labels + j_ind), ci); else if (sizeof(Index_) == 8) - atomicMin((unsigned long long int *)(labels + j_ind), ci); + atomicMin((long long int *)(labels + j_ind), ci); xa[j_ind] = true; m[0] = true; } else if (ci > cj) { @@ -720,11 +720,9 @@ __global__ void weak_cc_label_device(Index_ *labels, const Index_ *row_ind, } if (ci_mod) { if (sizeof(Index_) == 4) - atomicMin((unsigned int *)(labels + startVertexId + tid), ci); + atomicMin((int *)(labels + startVertexId + tid), ci); else if (sizeof(Index_) == 8) - atomicMin((unsigned long long int *)(labels + startVertexId + tid), - ci); - + atomicMin((long long int *)(labels + startVertexId + tid), ci); xa[startVertexId + tid] = true; m[0] = true; } @@ -777,6 +775,7 @@ void weak_cc_label_batched(Index_ *labels, const Index_ *row_ind, labels, startVertexId, batchSize, MAX_LABEL, filter_op); CUDA_CHECK(cudaPeekAtLastError()); + int n_iters = 0; do { CUDA_CHECK(cudaMemsetAsync(state->m, false, sizeof(bool), stream)); @@ -784,6 +783,8 @@ void weak_cc_label_batched(Index_ *labels, const Index_ *row_ind, labels, row_ind, row_ind_ptr, nnz, state->fa, state->xa, state->m, startVertexId, batchSize); CUDA_CHECK(cudaPeekAtLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + //** swapping F1 and F2 MLCommon::updateHost(host_fa, state->fa, N, stream); @@ -794,7 +795,12 @@ void weak_cc_label_batched(Index_ *labels, const Index_ *row_ind, //** Updating m * MLCommon::updateHost(&host_m, state->m, 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); + + n_iters++; } while (host_m); + + free(host_fa); + free(host_xa); } /** @@ -833,6 +839,7 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, dim3 threads(TPB_X); Index_ MAX_LABEL = std::numeric_limits::max(); + if (startVertexId == 0) { weak_cc_init_all_kernel<<>>( labels, state->fa, state->xa, N, MAX_LABEL); From 4014df40e4f17d759542ff0ff1513adc3bcd47db Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 05:00:04 +1000 Subject: [PATCH 45/63] Fix style 
--- python/cuml/manifold/t_sne.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 17ba9765b8..98c72b1cca 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -420,7 +420,6 @@ class TSNE(Base): def fit_transform(self, X): """Fit X into an embedded space and return that transformed output. - Parameters ----------- X : array-like (device or host) shape = (n_samples, n_features) From c00beb564d4ac955c8cbe032ed3ed64fa24a515f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 30 Sep 2019 15:34:52 -0400 Subject: [PATCH 46/63] Fixing Python to use int64_t --- python/cuml/cluster/dbscan.pyx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 059944144d..02eaafd66c 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -24,7 +24,7 @@ import cudf import numpy as np from libcpp cimport bool -from libc.stdint cimport uintptr_t +from libc.stdint cimport uintptr_t, int64_t from libc.stdlib cimport calloc, malloc, free from cuml.common.base import Base @@ -58,21 +58,21 @@ cdef extern from "dbscan/dbscan.hpp" namespace "ML": cdef void dbscanFit(cumlHandle& handle, float *input, - long n_rows, - long n_cols, + int64_t n_rows, + int64_t n_cols, double eps, int min_pts, - long *labels, + int64_t *labels, size_t max_mbytes_per_batch, bool verbose) except + cdef void dbscanFit(cumlHandle& handle, double *input, - long n_rows, - long n_cols, + int64_t n_rows, + int64_t n_cols, double eps, int min_pts, - long *labels, + int64_t *labels, size_t max_mbytes_per_batch, bool verbose) except + From f7407fa9e8f905f1d5473cad981b1381c5297fd1 Mon Sep 17 00:00:00 2001 From: John Zedlewski Date: Mon, 30 Sep 2019 15:20:40 -0700 Subject: [PATCH 47/63] Add metrics, utils, SVM to docs --- README.md | 1 + docs/source/api.rst | 32 +++++++++++++++++-- 
.../cuml/ensemble/randomforestclassifier.pyx | 19 ++++++++--- python/cuml/metrics/accuracy.pyx | 13 +++++--- python/cuml/svm/svm.pyx | 5 +-- python/cuml/utils/input_utils.py | 21 +++++++++--- 6 files changed, 72 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index fa8a4dfe22..9bd6206bcf 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,7 @@ repo](https://github.com/rapidsai/notebooks-contrib). | **Nonlinear Models for Regression or Classification** | Random Forest (RF) Classification | Experimental multi-node, multi-GPU version available via Dask integration | | | Random Forest (RF) Regression | Experimental multi-node, multi-GPU version available via Dask integration | | | K-Nearest Neighbors (KNN) | Multi-GPU
Uses [Faiss](https://github.com/facebookresearch/faiss) | +| | Support Vector Machine Classifier (SVC) | | | **Time Series** | Linear Kalman Filter | | | | Holt-Winters Exponential Smoothing | | --- diff --git a/docs/source/api.rst b/docs/source/api.rst index fce956891b..1d1cc5f069 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -4,8 +4,8 @@ cuML API Reference -Preprocessing -============== +Preprocessing, Metrics, and Utilities +===================================== Model Selection and Data Splitting ---------------------------------- @@ -24,6 +24,28 @@ Dataset Generation .. automethod:: cuml.datasets.make_blobs +Metrics +--------- + + .. automodule:: cuml.metrics.regression + :members: + + .. automodule:: cuml.metrics.accuracy + :members: + + .. automodule:: cuml.metrics.trustworthiness + :members: + + +Utilities for I/O and Numba +--------------------------- + + .. automodule:: cuml.utils.input_utils + :members: + + .. automodule:: cuml.utils.numba_utils + :members: + Regression and Classification ============================= @@ -84,6 +106,12 @@ Quasi-Newton .. autoclass:: cuml.QN :members: +Support Vector Machines +------------------------ + +.. autoclass:: cuml.svm.SVC + :members: + Clustering ========== diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 04f848a92d..f5dc6c616f 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -608,6 +608,7 @@ class RandomForestClassifier(Base): num_classes=2): """ Predicts the labels for X. 
+ Parameters ---------- X : array-like (device or host) shape = (n_samples, n_features) @@ -639,9 +640,10 @@ class RandomForestClassifier(Base): It is applied if output_class == True, else it is ignored num_classes : integer number of different classes present in the dataset + Returns ---------- - y: NumPy + y : NumPy Dense vector (int) of shape (n_samples, 1) """ if self.dtype == np.float64: @@ -662,15 +664,17 @@ class RandomForestClassifier(Base): def _predict_get_all(self, X): """ Predicts the labels for X. + Parameters ---------- X : array-like (device or host) shape = (n_samples, n_features) Dense matrix (floats or doubles) of shape (n_samples, n_features). Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy + Returns ---------- - y: NumPy + y : NumPy Dense vector (int) of shape (n_samples, 1) """ cdef uintptr_t X_ptr @@ -724,17 +728,20 @@ class RandomForestClassifier(Base): def score(self, X, y): """ Calculates the accuracy metric score of the model for X. + Parameters ---------- X : array-like (device or host) shape = (n_samples, n_features) Dense matrix (floats or doubles) of shape (n_samples, n_features). Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy - y: NumPy + y : NumPy Dense vector (int) of shape (n_samples, 1) + Returns - ---------- - accuracy of the model + ------- + float + Accuracy of the model [0.0 - 1.0] """ cdef uintptr_t X_ptr, y_ptr X_m, X_ptr, n_rows, n_cols, _ = \ @@ -795,6 +802,7 @@ class RandomForestClassifier(Base): """ Returns the value of all parameters required to configure this estimator as a dictionary. + Parameters ----------- deep : boolean (default = True) @@ -811,6 +819,7 @@ class RandomForestClassifier(Base): Sets the value of parameters required to configure this estimator, it functions similar to the sklearn set_params. 
+ Parameters ----------- params : dict of new params diff --git a/python/cuml/metrics/accuracy.pyx b/python/cuml/metrics/accuracy.pyx index dfebe1910f..5ce62bb591 100644 --- a/python/cuml/metrics/accuracy.pyx +++ b/python/cuml/metrics/accuracy.pyx @@ -42,13 +42,16 @@ def accuracy_score(ground_truth, predictions, handle=None): Parameters ---------- - handle : cuml.Handle - prediction : The lables predicted by the model - for the test dataset - ground_truth : The ground truth labels of the test dataset + handle : cuml.Handle + prediction : NumPy ndarray or Numba device + The lables predicted by the model for the test dataset + ground_truth : NumPy ndarray, Numba device + The ground truth labels of the test dataset + Returns ------- - The accuracy of the model used for prediction + float + The accuracy of the model used for prediction """ handle = cuml.common.handle.Handle() \ if handle is None else handle diff --git a/python/cuml/svm/svm.pyx b/python/cuml/svm/svm.pyx index 141e8a5002..8ebf541f49 100644 --- a/python/cuml/svm/svm.pyx +++ b/python/cuml/svm/svm.pyx @@ -442,9 +442,10 @@ class SVC(Base): Dense matrix (floats or doubles) of shape (n_samples, n_features). Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy + Returns - ---------- - y: cuDF Series + ------- + y : cuDF Series Dense vector (floats or doubles) of shape (n_samples, 1) """ diff --git a/python/cuml/utils/input_utils.py b/python/cuml/utils/input_utils.py index 6170ab4223..282a0d4bea 100644 --- a/python/cuml/utils/input_utils.py +++ b/python/cuml/utils/input_utils.py @@ -71,14 +71,20 @@ def input_to_dev_array(X, order='F', deepcopy=False, check_cols=False, check_rows=False, fail_on_order=False): """ - Convert input X to device array suitable for C++ methods + Convert input X to device array suitable for C++ methods. 
+ Acceptable input formats: + * cuDF Dataframe - returns a deep copy always + * cuDF Series - returns by reference or a deep copy depending on `deepcopy` + * Numpy array - returns a copy in device always + * cuda array interface compliant array (like Cupy) - returns a - reference unless deepcopy=True + reference unless `deepcopy`=True + * numba device array - returns a reference unless deepcopy=True Parameters @@ -309,13 +315,18 @@ def input_to_host_array(X, order='F', deepcopy=False, """ Convert input X to host array (NumPy) suitable for C++ methods that accept host arrays. + Acceptable input formats: + * Numpy array - returns a pointer to the original input + * cuDF Dataframe - returns a deep copy always - * cuDF Series - returns by reference or a deep copy depending on - `deepcopy` - * cuda array interface compliant array (like Cupy) - returns a + + * cuDF Series - returns by reference or a deep copy depending on `deepcopy` + + * cuda array interface compliant array (like Cupy) - returns a \ reference unless deepcopy=True + * numba device array - returns a reference unless deepcopy=True Parameters From af4fe5a02a8411cd70ae2dec8b2adc734ba9826e Mon Sep 17 00:00:00 2001 From: John Zedlewski Date: Mon, 30 Sep 2019 15:27:30 -0700 Subject: [PATCH 48/63] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b471839e0..4f72f7b6aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -104,6 +104,7 @@ - PR #978: Update README for 0.9 - PR #1009: Fix references to notebooks-contrib - PR #1015: Ability to control the number of internal streams in cumlHandle_impl via cumlHandle +- PR #1175: Add more modules to docs ToC ## Bug Fixes From 39ede99cc64a410ae2b1cd7c91c0b966c030bfed Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 30 Sep 2019 21:09:00 -0400 Subject: [PATCH 49/63] Fixing out_dtype for fit_predict --- python/cuml/benchmark/algorithms.py | 3 ++- python/cuml/benchmark/runners.py | 16 +++++++++++++--- python/cuml/cluster/dbscan.pyx | 2 +- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py index 088e0e8910..63e7686455 100644 --- a/python/cuml/benchmark/algorithms.py +++ b/python/cuml/benchmark/algorithms.py @@ -109,6 +109,7 @@ def run_cuml(self, data, **override_args): all_args = {**all_args, **override_args} cuml_obj = self.cuml_class(**all_args) + print(str(cuml_obj)) if self.data_prep_hook: data = self.data_prep_hook(data) if self.accepts_labels: @@ -159,7 +160,7 @@ def all_algorithms(): AlgorithmPair( sklearn.neighbors.NearestNeighbors, cuml.neighbors.NearestNeighbors, - shared_args=dict(n_neighbors=1024), + shared_args=dict(n_neighbors=1000000), cpu_args=dict(algorithm="brute"), cuml_args={}, name="NearestNeighbors", diff --git a/python/cuml/benchmark/runners.py b/python/cuml/benchmark/runners.py index ca3f4d9bbe..b7a3d9b15e 100644 --- a/python/cuml/benchmark/runners.py +++ b/python/cuml/benchmark/runners.py @@ -42,11 +42,11 @@ def _run_one_size( cuml_param_overrides={}, cpu_param_overrides={}, run_cpu=True, + verbose=False ): data = datagen.gen_data( self.dataset_name, self.input_type, n_samples, n_features ) - print("data type: ", data[0].__class__) cu_start = time.time() algo_pair.run_cuml(data, **param_overrides, **cuml_param_overrides) @@ -59,10 +59,17 @@ def _run_one_size( else: cpu_elapsed = 0.0 + speedup = cpu_elapsed / float(cu_elapsed) + + if verbose is True: + print("Benchmark [n_samples=%d, " + "n_features=%d with datatype=%s] = %f speedup." 
% + (n_samples, n_features, data[0].__class__, speedup)) + return dict( cu_time=cu_elapsed, cpu_time=cpu_elapsed, - speedup=cpu_elapsed / float(cu_elapsed), + speedup=speedup, n_samples=n_samples, n_features=n_features, **param_overrides, @@ -77,7 +84,9 @@ def run( cpu_param_overrides={}, *, run_cpu=True, - raise_on_error=False + raise_on_error=False, + verbose=False + ): all_results = [] for ns in self.bench_rows: @@ -92,6 +101,7 @@ def run( cuml_param_overrides, cpu_param_overrides, run_cpu, + verbose ) ) except Exception as e: diff --git a/python/cuml/cluster/dbscan.pyx b/python/cuml/cluster/dbscan.pyx index 02eaafd66c..26be6f54ea 100644 --- a/python/cuml/cluster/dbscan.pyx +++ b/python/cuml/cluster/dbscan.pyx @@ -268,7 +268,7 @@ class DBSCAN(Base): del(X_m) return self - def fit_predict(self, X, out_dtype="auto"): + def fit_predict(self, X, out_dtype="int32"): """ Performs clustering on input_gdf and returns cluster labels. From a4673e9dbb264ab679e49d0f14d18fb6e97aeb9b Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 30 Sep 2019 20:16:59 -0500 Subject: [PATCH 50/63] FIX Change import of RMM from cffi to cython --- python/cuml/cluster/kmeans.pyx | 2 +- python/cuml/cluster/kmeans_mg.pyx | 2 +- python/cuml/dask/linear_model/linear_regression.py | 2 +- python/cuml/dask/neighbors/nearest_neighbors.py | 2 +- python/cuml/decomposition/pca.pyx | 2 +- python/cuml/decomposition/tsvd.pyx | 2 +- python/cuml/fil/fil.pyx | 2 +- python/cuml/filter/kalman_filter.pyx | 2 +- python/cuml/manifold/t_sne.pyx | 2 +- python/cuml/manifold/umap.pyx | 2 +- python/cuml/neighbors/nearest_neighbors.pyx | 4 ++-- python/cuml/random_projection/random_projection.pyx | 2 +- python/cuml/solvers/qn.pyx | 2 +- python/cuml/utils/input_utils.py | 2 +- python/cuml/utils/numba_utils.py | 2 +- 15 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/cuml/cluster/kmeans.pyx b/python/cuml/cluster/kmeans.pyx index 802acdc907..aa1cd53fab 100644 --- 
a/python/cuml/cluster/kmeans.pyx +++ b/python/cuml/cluster/kmeans.pyx @@ -24,7 +24,7 @@ import cudf import numpy as np import warnings -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/cluster/kmeans_mg.pyx b/python/cuml/cluster/kmeans_mg.pyx index d04d448f7a..57785a4a5d 100644 --- a/python/cuml/cluster/kmeans_mg.pyx +++ b/python/cuml/cluster/kmeans_mg.pyx @@ -24,7 +24,7 @@ import cudf import numpy as np import warnings -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/dask/linear_model/linear_regression.py b/python/cuml/dask/linear_model/linear_regression.py index 1034af5489..a860c00b6f 100644 --- a/python/cuml/dask/linear_model/linear_regression.py +++ b/python/cuml/dask/linear_model/linear_regression.py @@ -26,7 +26,7 @@ from dask import delayed from dask.distributed import wait, default_client from math import ceil -from librmm_cffi import librmm as rmm +import rmm from toolz import first from tornado import gen diff --git a/python/cuml/dask/neighbors/nearest_neighbors.py b/python/cuml/dask/neighbors/nearest_neighbors.py index b37ee0568c..7128eb2dd1 100644 --- a/python/cuml/dask/neighbors/nearest_neighbors.py +++ b/python/cuml/dask/neighbors/nearest_neighbors.py @@ -24,7 +24,7 @@ import random from cuml.utils import numba_utils -from librmm_cffi import librmm as rmm +import rmm from dask import delayed from collections import defaultdict diff --git a/python/cuml/decomposition/pca.pyx b/python/cuml/decomposition/pca.pyx index 78aab743ad..cd28e929ed 100644 --- a/python/cuml/decomposition/pca.pyx +++ b/python/cuml/decomposition/pca.pyx @@ -23,7 +23,7 @@ import ctypes import cudf import numpy as np -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/decomposition/tsvd.pyx b/python/cuml/decomposition/tsvd.pyx index 
eb10bf9565..66e4866048 100644 --- a/python/cuml/decomposition/tsvd.pyx +++ b/python/cuml/decomposition/tsvd.pyx @@ -23,7 +23,7 @@ import ctypes import cudf import numpy as np -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx index 58ecb9645a..d55983abda 100644 --- a/python/cuml/fil/fil.pyx +++ b/python/cuml/fil/fil.pyx @@ -26,7 +26,7 @@ import math import numpy as np import warnings -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/filter/kalman_filter.pyx b/python/cuml/filter/kalman_filter.pyx index 7308fa14ab..22bf510a33 100644 --- a/python/cuml/filter/kalman_filter.pyx +++ b/python/cuml/filter/kalman_filter.pyx @@ -24,7 +24,7 @@ import numpy as np from numba import cuda from cuml.utils import numba_utils -from librmm_cffi import librmm as rmm +import rmm from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 453fa60a42..8e9a0ce10f 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -32,7 +32,7 @@ from cuml.common.base import Base from cuml.common.handle cimport cumlHandle from cuml.utils import input_to_dev_array as to_cuda -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/manifold/umap.pyx b/python/cuml/manifold/umap.pyx index 1f120c0685..25a42669a5 100644 --- a/python/cuml/manifold/umap.pyx +++ b/python/cuml/manifold/umap.pyx @@ -31,7 +31,7 @@ from cuml.common.handle cimport cumlHandle from cuml.utils import get_cudf_column_ptr, get_dev_array_ptr, \ input_to_dev_array, zeros, row_matrix -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git 
a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx index 7e21a1aa87..49812e76f4 100644 --- a/python/cuml/neighbors/nearest_neighbors.pyx +++ b/python/cuml/neighbors/nearest_neighbors.pyx @@ -35,14 +35,14 @@ from cython.operator cimport dereference as deref from libcpp cimport bool from libcpp.memory cimport shared_ptr -from librmm_cffi import librmm as rmm +import rmm from libc.stdlib cimport malloc, free from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free from numba import cuda -from librmm_cffi import librmm as rmm +import rmm cimport cuml.common.handle cimport cuml.common.cuda diff --git a/python/cuml/random_projection/random_projection.pyx b/python/cuml/random_projection/random_projection.pyx index 7897eecc61..efa6ba8193 100644 --- a/python/cuml/random_projection/random_projection.pyx +++ b/python/cuml/random_projection/random_projection.pyx @@ -22,7 +22,7 @@ import cudf import numpy as np -from librmm_cffi import librmm as rmm +import rmm from libc.stdint cimport uintptr_t from libcpp cimport bool diff --git a/python/cuml/solvers/qn.pyx b/python/cuml/solvers/qn.pyx index f783590dfe..393ab2b379 100644 --- a/python/cuml/solvers/qn.pyx +++ b/python/cuml/solvers/qn.pyx @@ -23,7 +23,7 @@ import cudf import numpy as np import warnings -from librmm_cffi import librmm as rmm +import rmm from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cuml/utils/input_utils.py b/python/cuml/utils/input_utils.py index 6170ab4223..51fea46e49 100644 --- a/python/cuml/utils/input_utils.py +++ b/python/cuml/utils/input_utils.py @@ -26,7 +26,7 @@ from collections.abc import Collection from numba import cuda -from librmm_cffi import librmm as rmm +import rmm inp_array = namedtuple('inp_array', 'array pointer n_rows n_cols dtype') diff --git a/python/cuml/utils/numba_utils.py b/python/cuml/utils/numba_utils.py index 288063b165..dc6b2d8c00 100644 --- a/python/cuml/utils/numba_utils.py +++ 
b/python/cuml/utils/numba_utils.py @@ -18,7 +18,7 @@ from numba import cuda from numba.cuda.cudadrv.driver import driver -from librmm_cffi import librmm as rmm +import rmm import numpy as np From 13f8962127cdbbaa2e0367f0b3277a3dce7ca254 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 30 Sep 2019 20:17:33 -0500 Subject: [PATCH 51/63] FIX Change import of RMM from cffi to cython --- python/cuml/preprocessing/LabelEncoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/preprocessing/LabelEncoder.py b/python/cuml/preprocessing/LabelEncoder.py index a62765b4c5..f5f28a0f27 100644 --- a/python/cuml/preprocessing/LabelEncoder.py +++ b/python/cuml/preprocessing/LabelEncoder.py @@ -16,7 +16,7 @@ import cudf import nvcategory -from librmm_cffi import librmm +import rmm import numpy as np @@ -194,7 +194,7 @@ def fit_transform(self, y: cudf.Series) -> cudf.Series: self._cats = nvcategory.from_strings(y.data) self._fitted = True - arr: librmm.device_array = librmm.device_array( + arr: rmm.device_array = rmm.device_array( y.data.size(), dtype=np.int32 ) self._cats.values(devptr=arr.device_ctypes_pointer.value) From 951cc4fabfad50e701c7a99a4ebc7de9678b732c Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 30 Sep 2019 20:18:53 -0500 Subject: [PATCH 52/63] DOC Add entry to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b471839e0..91d8feba0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ - PR #1115: Moving dask_make_blobs to cuml.dask.datasets. 
Adding conversion to dask.DataFrame - PR #1136: CUDA 10.1 CI updates - PR #1165: Adding except + in all remaining cython +- PR #1176: Use new RMM API based on Cython ## Bug Fixes From 14ec7f55f8738f100445e112aca0c73955a504d2 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 30 Sep 2019 22:43:54 -0500 Subject: [PATCH 53/63] FIX Updated dask and distributed versions --- ci/gpu/build.sh | 4 ++-- conda/environments/cuml_dev_cuda10.0.yml | 4 ++-- conda/environments/cuml_dev_cuda10.1.yml | 4 ++-- conda/environments/cuml_dev_cuda9.2.yml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 73f77369cf..8e9f3091e7 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -53,8 +53,8 @@ conda install -c conda-forge -c rapidsai -c rapidsai-nightly -c rapidsai/label/x "cmake==3.14.3" \ "umap-learn" \ "nccl>=2.4" \ - "dask=2.3.0" \ - "distributed=2.3.0" \ + "dask=2.5.0" \ + "distributed=2.5.1" \ "dask-ml" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ diff --git a/conda/environments/cuml_dev_cuda10.0.yml b/conda/environments/cuml_dev_cuda10.0.yml index 63159abc29..362dda8be3 100644 --- a/conda/environments/cuml_dev_cuda10.0.yml +++ b/conda/environments/cuml_dev_cuda10.0.yml @@ -17,8 +17,8 @@ dependencies: - scikit-learn>=0.21 - umap-learn>=0.3.9 - scikit-learn>=0.21 -- dask=2.3.0 -- distributed=2.3.0 +- dask=2.5.0 +- distributed=2.5.1 - dask-ml - dask-cuda=0.9* - dask-cudf=0.10* diff --git a/conda/environments/cuml_dev_cuda10.1.yml b/conda/environments/cuml_dev_cuda10.1.yml index 999aebcc20..d4a97d2a64 100644 --- a/conda/environments/cuml_dev_cuda10.1.yml +++ b/conda/environments/cuml_dev_cuda10.1.yml @@ -17,8 +17,8 @@ dependencies: - scikit-learn>=0.21 - umap-learn>=0.3.9 - scikit-learn>=0.21 -- dask=2.3.0 -- distributed=2.3.0 +- dask=2.5.0 +- distributed=2.5.1 - dask-ml - dask-cuda=0.9* - dask-cudf=0.10* diff --git a/conda/environments/cuml_dev_cuda9.2.yml 
b/conda/environments/cuml_dev_cuda9.2.yml index 40f928e884..6fd25a9bef 100644 --- a/conda/environments/cuml_dev_cuda9.2.yml +++ b/conda/environments/cuml_dev_cuda9.2.yml @@ -17,8 +17,8 @@ dependencies: - scikit-learn>=0.21 - umap-learn>=0.3.9 - scikit-learn>=0.21 -- dask=2.3.0 -- distributed=2.3.0 +- dask=2.5.0 +- distributed=2.5.1 - dask-ml - dask-cuda=0.9* - dask-cudf=0.10* From c0b8cacd730c0135d50d9c7e37989dd544ee6113 Mon Sep 17 00:00:00 2001 From: Daniel Han-Chen Date: Tue, 1 Oct 2019 14:34:50 +1000 Subject: [PATCH 54/63] Add blank lines --- python/cuml/manifold/t_sne.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index 98c72b1cca..9ee124b97f 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -306,8 +306,8 @@ class TSNE(Base): return def fit(self, X): - """ - Fit X into an embedded space. + """Fit X into an embedded space. + Parameters ----------- X : array-like (device or host) shape = (n_samples, n_features) @@ -420,12 +420,14 @@ class TSNE(Base): def fit_transform(self, X): """Fit X into an embedded space and return that transformed output. + Parameters ----------- X : array-like (device or host) shape = (n_samples, n_features) X contains a sample per row. Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy + Returns -------- X_new : array, shape (n_samples, n_components) From 6f789ad5cadf1aaff0614c17cb43f4f649ecff6b Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Oct 2019 08:38:37 -0400 Subject: [PATCH 55/63] Adding utility in prims for getting the current time milliseconds --- cpp/src/dbscan/runner.h | 8 ++------ cpp/src_prims/utils.h | 9 +++++++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cpp/src/dbscan/runner.h b/cpp/src/dbscan/runner.h index 112d4ded9d..c9cc0ad925 100644 --- a/cpp/src/dbscan/runner.h +++ b/cpp/src/dbscan/runner.h @@ -25,6 +25,8 @@ #include "sparse/csr.h" #include "vertexdeg/runner.h" +#include "utils.h" + #include namespace Dbscan { @@ -59,12 +61,6 @@ void final_relabel(Index_* db_cluster, Index_ N, cudaStream_t stream) { [MAX_LABEL] __device__(Index_ val) { return val == MAX_LABEL; }); } -int64_t curTimeMillis() { - struct timeval tp; - gettimeofday(&tp, NULL); - return tp.tv_sec * 1000 + tp.tv_usec / 1000; -} - /* @param N number of points * @param D dimensionality of the points * @param eps epsilon neighborhood criterion diff --git a/cpp/src_prims/utils.h b/cpp/src_prims/utils.h index eb2b2bd59f..05a55cc0e9 100644 --- a/cpp/src_prims/utils.h +++ b/cpp/src_prims/utils.h @@ -18,6 +18,7 @@ #include #include +#include <chrono> #include #include #include @@ -177,6 +178,14 @@ void copyAsync(Type* dPtr1, const Type* dPtr2, size_t len, CUDA_CHECK(cudaMemcpyAsync(dPtr1, dPtr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } + +inline uint32_t curTimeMillis() { + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + return std::chrono::duration_cast<std::chrono::milliseconds>(duration) + .count(); +} + /** @} */ /** Helper function to calculate need memory for allocate to store dense matrix. 
From 648b70279ab9363867edcb4568c6d536b146a771 Mon Sep 17 00:00:00 2001 From: John Zedlewski Date: Tue, 1 Oct 2019 09:03:24 -0700 Subject: [PATCH 56/63] Add benchmarking and cluster metrics to docs --- docs/source/api.rst | 16 ++++++++++++++++ python/cuml/benchmark/datagen.py | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 1d1cc5f069..ce0fb8d750 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -36,6 +36,22 @@ Metrics .. automodule:: cuml.metrics.trustworthiness :members: + .. automodule:: cuml.metrics.cluster + :members: + +Benchmarking +------------- + + .. automodule:: cuml.benchmark.algorithms + :members: + + .. automodule:: cuml.benchmark.runners + :members: + + .. automodule:: cuml.benchmark.datagen + :members: + + Utilities for I/O and Numba --------------------------- diff --git a/python/cuml/benchmark/datagen.py b/python/cuml/benchmark/datagen.py index 8a881687ad..203d6a6759 100644 --- a/python/cuml/benchmark/datagen.py +++ b/python/cuml/benchmark/datagen.py @@ -22,7 +22,7 @@ * n_samples (set to 0 for 'default') * n_features (set to 0 for 'default') * random_state - * .. and optional generator-specific parameters + * (and optional generator-specific parameters) The function should return a 2-tuple (X, y), where X is a Pandas dataframe and y is a Pandas series. If the generator does not produce From 3d47170bb4bb1b30e159578f800bcaae858b791a Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Oct 2019 12:39:33 -0400 Subject: [PATCH 57/63] Undoing accidentally checked in benchmark changes --- python/cuml/benchmark/algorithms.py | 3 +-- python/cuml/benchmark/runners.py | 16 +++------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py index 63e7686455..088e0e8910 100644 --- a/python/cuml/benchmark/algorithms.py +++ b/python/cuml/benchmark/algorithms.py @@ -109,7 +109,6 @@ def run_cuml(self, data, **override_args): all_args = {**all_args, **override_args} cuml_obj = self.cuml_class(**all_args) - print(str(cuml_obj)) if self.data_prep_hook: data = self.data_prep_hook(data) if self.accepts_labels: @@ -160,7 +159,7 @@ def all_algorithms(): AlgorithmPair( sklearn.neighbors.NearestNeighbors, cuml.neighbors.NearestNeighbors, - shared_args=dict(n_neighbors=1000000), + shared_args=dict(n_neighbors=1024), cpu_args=dict(algorithm="brute"), cuml_args={}, name="NearestNeighbors", diff --git a/python/cuml/benchmark/runners.py b/python/cuml/benchmark/runners.py index b7a3d9b15e..ca3f4d9bbe 100644 --- a/python/cuml/benchmark/runners.py +++ b/python/cuml/benchmark/runners.py @@ -42,11 +42,11 @@ def _run_one_size( cuml_param_overrides={}, cpu_param_overrides={}, run_cpu=True, - verbose=False ): data = datagen.gen_data( self.dataset_name, self.input_type, n_samples, n_features ) + print("data type: ", data[0].__class__) cu_start = time.time() algo_pair.run_cuml(data, **param_overrides, **cuml_param_overrides) @@ -59,17 +59,10 @@ def _run_one_size( else: cpu_elapsed = 0.0 - speedup = cpu_elapsed / float(cu_elapsed) - - if verbose is True: - print("Benchmark [n_samples=%d, " - "n_features=%d with datatype=%s] = %f speedup." 
% - (n_samples, n_features, data[0].__class__, speedup)) - return dict( cu_time=cu_elapsed, cpu_time=cpu_elapsed, - speedup=speedup, + speedup=cpu_elapsed / float(cu_elapsed), n_samples=n_samples, n_features=n_features, **param_overrides, @@ -84,9 +77,7 @@ def run( cpu_param_overrides={}, *, run_cpu=True, - raise_on_error=False, - verbose=False - + raise_on_error=False ): all_results = [] for ns in self.bench_rows: @@ -101,7 +92,6 @@ def run( cuml_param_overrides, cpu_param_overrides, run_cpu, - verbose ) ) except Exception as e: From 779f6f803dad12d38f658c685dc011bc3272a29c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Oct 2019 12:52:12 -0400 Subject: [PATCH 58/63] Making style checker happy --- python/cuml/metrics/accuracy.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/metrics/accuracy.pyx b/python/cuml/metrics/accuracy.pyx index 5ce62bb591..7520178424 100644 --- a/python/cuml/metrics/accuracy.pyx +++ b/python/cuml/metrics/accuracy.pyx @@ -43,7 +43,7 @@ def accuracy_score(ground_truth, predictions, handle=None): Parameters ---------- handle : cuml.Handle - prediction : NumPy ndarray or Numba device + prediction : NumPy ndarray or Numba device The lables predicted by the model for the test dataset ground_truth : NumPy ndarray, Numba device The ground truth labels of the test dataset From a87e0642d77a1bdefc08cf57b078c72a3d1516e3 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 1 Oct 2019 12:32:47 -0500 Subject: [PATCH 59/63] DOC Add entry to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b81f80dff..be72cd2f59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,6 +47,7 @@ - PR #1132: DBSCAN Batching Bug Fix - PR #1162: DASK RF random seed bug fix - PR #1164: Fix check_dtype arg handling for input_to_dev_array +- PR #1177: Update dask and distributed to 2.5 # cuML 0.9.0 (21 Aug 2019) From 079579bdf15843710f7b06d56aea442ea10efaf4 Mon Sep 17 
00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 1 Oct 2019 12:34:20 -0500 Subject: [PATCH 60/63] FIX PEP8 fixes --- python/cuml/metrics/accuracy.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/metrics/accuracy.pyx b/python/cuml/metrics/accuracy.pyx index 5ce62bb591..0dea749494 100644 --- a/python/cuml/metrics/accuracy.pyx +++ b/python/cuml/metrics/accuracy.pyx @@ -43,8 +43,8 @@ def accuracy_score(ground_truth, predictions, handle=None): Parameters ---------- handle : cuml.Handle - prediction : NumPy ndarray or Numba device - The lables predicted by the model for the test dataset + prediction : NumPy ndarray or Numba device + The labels predicted by the model for the test dataset ground_truth : NumPy ndarray, Numba device The ground truth labels of the test dataset From 0f6d4a623ea1fc28bb45bcec9d328dfa5b225595 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Oct 2019 16:06:12 -0400 Subject: [PATCH 61/63] Fixing adjusted rand score python to use int again --- .../metrics/cluster/adjustedrandindex.pyx | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/cuml/metrics/cluster/adjustedrandindex.pyx b/python/cuml/metrics/cluster/adjustedrandindex.pyx index 971763e664..2a1c4933db 100644 --- a/python/cuml/metrics/cluster/adjustedrandindex.pyx +++ b/python/cuml/metrics/cluster/adjustedrandindex.pyx @@ -32,11 +32,11 @@ cimport cuml.common.cuda cdef extern from "metrics/metrics.hpp" namespace "ML::Metrics": double adjustedRandIndex(cumlHandle &handle, - long *y, - long *y_hat, - long n, - long lower_class_range, - long upper_class_range) + int *y, + int *y_hat, + int n, + int lower_class_range, + int upper_class_range) def adjusted_rand_score(labels_true, @@ -85,10 +85,10 @@ def adjusted_rand_score(labels_true, input_to_dev_array(labels_pred) rand_score = adjustedRandIndex(handle_[0], - y_ptr, - y_hat_ptr, - 
n_rows, + lower_class_range, + upper_class_range) return rand_score From 5190df2765cdd1d6662ea87471cfd05b6f1e30a7 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 1 Oct 2019 16:12:26 -0400 Subject: [PATCH 62/63] Fixing style checker issues from t-sne PR --- python/cuml/manifold/t_sne.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuml/manifold/t_sne.pyx b/python/cuml/manifold/t_sne.pyx index cb27bbf9af..7499df0c7d 100644 --- a/python/cuml/manifold/t_sne.pyx +++ b/python/cuml/manifold/t_sne.pyx @@ -307,7 +307,7 @@ class TSNE(Base): def fit(self, X): """Fit X into an embedded space. - + Parameters ----------- X : array-like (device or host) shape = (n_samples, n_features) @@ -420,14 +420,14 @@ class TSNE(Base): def fit_transform(self, X): """Fit X into an embedded space and return that transformed output. - + Parameters ----------- X : array-like (device or host) shape = (n_samples, n_features) X contains a sample per row. Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy - + Returns -------- X_new : array, shape (n_samples, n_components) From 455741682f47ce095b7af4e85a22961b6efef89c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 1 Oct 2019 21:01:04 -0400 Subject: [PATCH 63/63] Casting labels to int32 in adjusted rand score --- python/cuml/metrics/cluster/adjustedrandindex.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/metrics/cluster/adjustedrandindex.pyx b/python/cuml/metrics/cluster/adjustedrandindex.pyx index 2a1c4933db..e0c20538a3 100644 --- a/python/cuml/metrics/cluster/adjustedrandindex.pyx +++ b/python/cuml/metrics/cluster/adjustedrandindex.pyx @@ -65,11 +65,11 @@ def adjusted_rand_score(labels_true, if labels_true.astype != np.int64: warnings.warn(" The dtype of ground truth is not int32" " converting the ground truth to int32") - labels_true = labels_true.astype(np.int64) + labels_true = labels_true.astype(np.int32) if labels_pred.astype != np.int32: warnings.warn(" The dtype of predicted labels is not int32" " converting the predicted labels to int32") - labels_pred = labels_pred.astype(np.int64) + labels_pred = labels_pred.astype(np.int32) min_val_y = np.nanmin(labels_true) lower_class_range = np.nanmin(labels_pred) \