diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index b0a053b582..50845ec4b8 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -58,4 +58,4 @@ set(CUML_BRANCH_VERSION_raft "${CUML_VERSION_MAJOR}.${CUML_VERSION_MINOR}")
 find_and_configure_raft(VERSION    ${CUML_MIN_VERSION_raft}
                         FORK       rapidsai
                         PINNED_TAG branch-${CUML_BRANCH_VERSION_raft}
-                        )
\ No newline at end of file
+                        )
diff --git a/cpp/include/cuml/decomposition/params.hpp b/cpp/include/cuml/decomposition/params.hpp
index 0a86db5d71..101f1b6df0 100644
--- a/cpp/include/cuml/decomposition/params.hpp
+++ b/cpp/include/cuml/decomposition/params.hpp
@@ -31,24 +31,24 @@ enum class solver : int {
 class params {
  public:
-  int n_rows;
-  int n_cols;
+  std::size_t n_rows;
+  std::size_t n_cols;
   int gpu_id = 0;
 };
 
 class paramsSolver : public params {
  public:
   // math_t tol = 0.0;
-  float tol        = 0.0;
-  int n_iterations = 15;
-  int verbose      = 0;
+  float tol                  = 0.0;
+  std::uint32_t n_iterations = 15;
+  int verbose                = 0;
 };
 
 template <typename enum_solver = solver>
 class paramsTSVDTemplate : public paramsSolver {
  public:
-  int n_components      = 1;
-  enum_solver algorithm = enum_solver::COV_EIG_DQ;
+  std::size_t n_components = 1;
+  enum_solver algorithm    = enum_solver::COV_EIG_DQ;
 };
 
 /**
diff --git a/cpp/include/cuml/decomposition/pca_mg.hpp b/cpp/include/cuml/decomposition/pca_mg.hpp
index f5c2434820..651e5e7695 100644
--- a/cpp/include/cuml/decomposition/pca_mg.hpp
+++ b/cpp/include/cuml/decomposition/pca_mg.hpp
@@ -86,7 +86,7 @@ void fit(raft::handle_t& handle,
  */
 void fit_transform(raft::handle_t& handle,
                    MLCommon::Matrix::RankSizePair** rank_sizes,
-                   size_t n_parts,
+                   std::uint32_t n_parts,
                    MLCommon::Matrix::floatData_t** input,
                    MLCommon::Matrix::floatData_t** trans_input,
                    float* components,
@@ -100,7 +100,7 @@ void fit_transform(raft::handle_t& handle,
 
 void fit_transform(raft::handle_t& handle,
                    MLCommon::Matrix::RankSizePair** rank_sizes,
-                   size_t n_parts,
+                   std::uint32_t n_parts,
                    MLCommon::Matrix::doubleData_t** input,
                    MLCommon::Matrix::doubleData_t** trans_input,
                    double* components,
@@ -127,7 +127,7 @@ void fit_transform(raft::handle_t& handle,
  */
 void transform(raft::handle_t& handle,
                MLCommon::Matrix::RankSizePair** rank_sizes,
-               size_t n_parts,
+               std::uint32_t n_parts,
               MLCommon::Matrix::Data<float>** input,
               float* components,
               MLCommon::Matrix::Data<float>** trans_input,
@@ -138,7 +138,7 @@ void transform(raft::handle_t& handle,
 
 void transform(raft::handle_t& handle,
                MLCommon::Matrix::RankSizePair** rank_sizes,
-               size_t n_parts,
+               std::uint32_t n_parts,
               MLCommon::Matrix::Data<double>** input,
               double* components,
               MLCommon::Matrix::Data<double>** trans_input,
@@ -162,7 +162,7 @@ void transform(raft::handle_t& handle,
  */
 void inverse_transform(raft::handle_t& handle,
                        MLCommon::Matrix::RankSizePair** rank_sizes,
-                       size_t n_parts,
+                       std::uint32_t n_parts,
                        MLCommon::Matrix::Data<float>** trans_input,
                        float* components,
                        MLCommon::Matrix::Data<float>** input,
@@ -173,7 +173,7 @@ void inverse_transform(raft::handle_t& handle,
 
 void inverse_transform(raft::handle_t& handle,
                        MLCommon::Matrix::RankSizePair** rank_sizes,
-                       size_t n_parts,
+                       std::uint32_t n_parts,
                        MLCommon::Matrix::Data<double>** trans_input,
                        double* components,
                        MLCommon::Matrix::Data<double>** input,
diff --git a/cpp/include/cuml/decomposition/sign_flip_mg.hpp b/cpp/include/cuml/decomposition/sign_flip_mg.hpp
index 9775547794..e88b7ce47d 100644
--- a/cpp/include/cuml/decomposition/sign_flip_mg.hpp
+++ b/cpp/include/cuml/decomposition/sign_flip_mg.hpp
@@ -40,17 +40,17 @@ void sign_flip(raft::handle_t& handle,
                std::vector<MLCommon::Matrix::Data<float>*>& input_data,
                MLCommon::Matrix::PartDescriptor& input_desc,
                float* components,
-               int n_components,
+               std::size_t n_components,
                cudaStream_t* streams,
-               int n_stream);
+               std::uint32_t n_stream);
 
 void sign_flip(raft::handle_t& handle,
                std::vector<MLCommon::Matrix::Data<double>*>& input_data,
                MLCommon::Matrix::PartDescriptor& input_desc,
                double* components,
-               int n_components,
+               std::size_t n_components,
                cudaStream_t* streams,
-               int n_stream);
+               std::uint32_t n_stream);
 
 };  // end namespace opg
 };  // end namespace PCA
diff --git a/cpp/include/cuml/decomposition/tsvd_mg.hpp b/cpp/include/cuml/decomposition/tsvd_mg.hpp
index 6d8001d810..549b918098 100644
--- a/cpp/include/cuml/decomposition/tsvd_mg.hpp
+++ b/cpp/include/cuml/decomposition/tsvd_mg.hpp
@@ -37,7 +37,7 @@ namespace opg {
  */
 void fit(raft::handle_t& handle,
          MLCommon::Matrix::RankSizePair** rank_sizes,
-         size_t n_parts,
+         std::uint32_t n_parts,
          MLCommon::Matrix::floatData_t** input,
          float* components,
         float* singular_vals,
@@ -46,7 +46,7 @@ void fit(raft::handle_t& handle,
 
 void fit(raft::handle_t& handle,
          MLCommon::Matrix::RankSizePair** rank_sizes,
-         size_t n_parts,
+         std::uint32_t n_parts,
          MLCommon::Matrix::doubleData_t** input,
          double* components,
          double* singular_vals,
@@ -104,7 +104,7 @@ void fit_transform(raft::handle_t& handle,
  */
 void transform(raft::handle_t& handle,
                MLCommon::Matrix::RankSizePair** rank_sizes,
-               size_t n_parts,
+               std::uint32_t n_parts,
               MLCommon::Matrix::Data<float>** input,
               float* components,
               MLCommon::Matrix::Data<float>** trans_input,
@@ -113,7 +113,7 @@ void transform(raft::handle_t& handle,
 
 void transform(raft::handle_t& handle,
                MLCommon::Matrix::RankSizePair** rank_sizes,
-               size_t n_parts,
+               std::uint32_t n_parts,
               MLCommon::Matrix::Data<double>** input,
               double* components,
               MLCommon::Matrix::Data<double>** trans_input,
@@ -133,7 +133,7 @@ void transform(raft::handle_t& handle,
  */
 void inverse_transform(raft::handle_t& handle,
                        MLCommon::Matrix::RankSizePair** rank_sizes,
-                       size_t n_parts,
+                       std::uint32_t n_parts,
                        MLCommon::Matrix::Data<float>** trans_input,
                        float* components,
                        MLCommon::Matrix::Data<float>** input,
@@ -142,7 +142,7 @@ void inverse_transform(raft::handle_t& handle,
 
 void inverse_transform(raft::handle_t& handle,
                        MLCommon::Matrix::RankSizePair** rank_sizes,
-                       size_t n_parts,
+                       std::uint32_t n_parts,
                        MLCommon::Matrix::Data<double>** trans_input,
                        double* components,
                        MLCommon::Matrix::Data<double>** input,
diff --git a/cpp/src/pca/pca.cuh b/cpp/src/pca/pca.cuh
index 9feb344996..bb7c3be9d9 100644
--- a/cpp/src/pca/pca.cuh
+++ b/cpp/src/pca/pca.cuh
@@ -41,10 +41,10 @@ void truncCompExpVars(const raft::handle_t& handle,
                       math_t* components,
                       math_t* explained_var,
                       math_t* explained_var_ratio,
-                      const paramsTSVDTemplate<enum_solver> prms,
+                      const paramsTSVDTemplate<enum_solver>& prms,
                       cudaStream_t stream)
 {
-  size_t len = prms.n_cols * prms.n_cols;
+  auto len = prms.n_cols * prms.n_cols;
   rmm::device_uvector<math_t> components_all(len, stream);
   rmm::device_uvector<math_t> explained_var_all(prms.n_cols, stream);
   rmm::device_uvector<math_t> explained_var_ratio_all(prms.n_cols, stream);
@@ -55,10 +55,18 @@ void truncCompExpVars(const raft::handle_t& handle,
     components_all.data(), prms.n_cols, components, prms.n_components, prms.n_cols, stream);
   raft::matrix::ratio(
     handle, explained_var_all.data(), explained_var_ratio_all.data(), prms.n_cols, stream);
-  raft::matrix::truncZeroOrigin(
-    explained_var_all.data(), prms.n_cols, explained_var, prms.n_components, 1, stream);
-  raft::matrix::truncZeroOrigin(
-    explained_var_ratio_all.data(), prms.n_cols, explained_var_ratio, prms.n_components, 1, stream);
+  raft::matrix::truncZeroOrigin(explained_var_all.data(),
+                                prms.n_cols,
+                                explained_var,
+                                prms.n_components,
+                                std::size_t(1),
+                                stream);
+  raft::matrix::truncZeroOrigin(explained_var_ratio_all.data(),
+                                prms.n_cols,
+                                explained_var_ratio,
+                                prms.n_components,
+                                std::size_t(1),
+                                stream);
 }
 
 /**
@@ -97,12 +105,12 @@ void pcaFit(const raft::handle_t& handle,
   ASSERT(prms.n_components > 0,
          "Parameter n_components: number of components cannot be less than one");
 
-  int n_components = prms.n_components;
+  auto n_components = prms.n_components;
   if (n_components > prms.n_cols) n_components = prms.n_cols;
 
   raft::stats::mean(mu, input, prms.n_cols, prms.n_rows, true, false, stream);
 
-  size_t len = prms.n_cols * prms.n_cols;
+  auto len = prms.n_cols * prms.n_cols;
   rmm::device_uvector<math_t> cov(len, stream);
   Stats::cov(handle, cov.data(), input, mu, prms.n_cols, prms.n_rows, true, false, true, stream);
@@ -202,7 +210,7 @@ void pcaInverseTransform(const raft::handle_t& handle,
   ASSERT(prms.n_components > 0,
          "Parameter n_components: number of components cannot be less than one");
 
-  std::size_t components_len = static_cast<std::size_t>(prms.n_cols * prms.n_components);
+  auto components_len = prms.n_cols * prms.n_components;
   rmm::device_uvector<math_t> components_copy{components_len, stream};
   raft::copy(components_copy.data(), components, prms.n_cols * prms.n_components, stream);
@@ -262,7 +270,7 @@ void pcaTransform(const raft::handle_t& handle,
   ASSERT(prms.n_components > 0,
          "Parameter n_components: number of components cannot be less than one");
 
-  std::size_t components_len = static_cast<std::size_t>(prms.n_cols * prms.n_components);
+  auto components_len = prms.n_cols * prms.n_components;
   rmm::device_uvector<math_t> components_copy{components_len, stream};
   raft::copy(components_copy.data(), components, prms.n_cols * prms.n_components, stream);
diff --git a/cpp/src/pca/pca_mg.cu b/cpp/src/pca/pca_mg.cu
index bd147b3343..10a727df07 100644
--- a/cpp/src/pca/pca_mg.cu
+++ b/cpp/src/pca/pca_mg.cu
@@ -53,17 +53,17 @@ void fit_impl(raft::handle_t& handle,
               T* noise_vars,
               paramsPCAMG prms,
               cudaStream_t* streams,
-              int n_streams,
+              std::uint32_t n_streams,
               bool verbose)
 {
   const auto& comm = handle.get_comms();
 
-  Matrix::Data<T> mu_data{mu, size_t(prms.n_cols)};
+  Matrix::Data<T> mu_data{mu, prms.n_cols};
 
   Stats::opg::mean(handle, mu_data, input_data, input_desc, streams, n_streams);
 
   rmm::device_uvector<T> cov_data(prms.n_cols * prms.n_cols, streams[0]);
-  size_t cov_data_size = cov_data.size();
+  auto cov_data_size = cov_data.size();
   Matrix::Data<T> cov{cov_data.data(), cov_data_size};
 
   Stats::opg::cov(handle, cov, input_data, input_desc, mu_data, true, streams, n_streams);
@@ -108,9 +108,9 @@ void fit_impl(raft::handle_t& handle,
   // TODO: These streams should come from raft::handle_t
   // Reference issue https://github.com/rapidsai/cuml/issues/2470
-  int n_streams = input_desc.blocksOwnedBy(rank).size();
+  auto n_streams = input_desc.blocksOwnedBy(rank).size();
   cudaStream_t streams[n_streams];
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamCreate(&streams[i]));
   }
@@ -128,7 +128,7 @@ void fit_impl(raft::handle_t& handle,
             streams,
             n_streams,
             verbose);
-    for (int i = 0; i < n_streams; i++) {
+    for (std::uint32_t i = 0; i < n_streams; i++) {
       CUDA_CHECK(cudaStreamSynchronize(streams[i]));
     }
   } else if (prms.algorithm == mg_solver::QR) {
@@ -137,10 +137,10 @@ void fit_impl(raft::handle_t& handle,
     const auto& comm = h.get_comms();
 
     // Center the data
-    Matrix::Data<T> mu_data{mu, size_t(prms.n_cols)};
+    Matrix::Data<T> mu_data{mu, prms.n_cols};
     Stats::opg::mean(handle, mu_data, input_data, input_desc, streams, n_streams);
     Stats::opg::mean_center(input_data, input_desc, mu_data, comm, streams, n_streams);
-    for (int i = 0; i < n_streams; i++) {
+    for (std::uint32_t i = 0; i < n_streams; i++) {
       CUDA_CHECK(cudaStreamSynchronize(streams[i]));
     }
@@ -179,15 +179,19 @@ void fit_impl(raft::handle_t& handle,
       handle, explained_var_all.data(), explained_var_ratio_all.data(), prms.n_cols, stream);
 
     raft::matrix::truncZeroOrigin(
-      sVector.data(), prms.n_cols, singular_vals, prms.n_components, 1, stream);
+      sVector.data(), prms.n_cols, singular_vals, prms.n_components, std::size_t(1), stream);
 
-    raft::matrix::truncZeroOrigin(
-      explained_var_all.data(), prms.n_cols, explained_var, prms.n_components, 1, stream);
+    raft::matrix::truncZeroOrigin(explained_var_all.data(),
+                                  prms.n_cols,
+                                  explained_var,
+                                  prms.n_components,
+                                  std::size_t(1),
+                                  stream);
 
     raft::matrix::truncZeroOrigin(explained_var_ratio_all.data(),
                                   prms.n_cols,
                                   explained_var_ratio,
                                   prms.n_components,
-                                  1,
+                                  std::size_t(1),
                                   stream);
 
     raft::linalg::transpose(vMatrix.data(), prms.n_cols, stream);
@@ -200,11 +204,11 @@ void fit_impl(raft::handle_t& handle,
     Stats::opg::mean_add(input_data, input_desc, mu_data, comm, streams, n_streams);
   }
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamDestroy(streams[i]));
   }
 }
@@ -219,7 +223,7 @@ void transform_impl(raft::handle_t& handle,
                     T* mu,
                     const paramsPCAMG prms,
                     cudaStream_t* streams,
-                    int n_streams,
+                    std::uint32_t n_streams,
                     bool verbose)
 {
   std::vector<Matrix::RankSizePair*> local_blocks = input_desc.partsToRanks;
@@ -233,12 +237,12 @@ void transform_impl(raft::handle_t& handle,
   }
 
   for (std::size_t i = 0; i < input.size(); i++) {
-    int si = i % n_streams;
+    auto si = i % n_streams;
 
     raft::stats::meanCenter(input[i]->ptr,
                             input[i]->ptr,
                             mu,
-                            size_t(prms.n_cols),
+                            prms.n_cols,
                             local_blocks[i]->size,
                             false,
                             true,
@@ -249,11 +253,11 @@ void transform_impl(raft::handle_t& handle,
     raft::linalg::gemm(handle,
                        input[i]->ptr,
                        local_blocks[i]->size,
-                       size_t(prms.n_cols),
+                       prms.n_cols,
                        components,
                        trans_input[i]->ptr,
                        local_blocks[i]->size,
-                       int(prms.n_components),
+                       prms.n_components,
                        CUBLAS_OP_N,
                        CUBLAS_OP_T,
                        alpha,
@@ -263,7 +267,7 @@ void transform_impl(raft::handle_t& handle,
     raft::stats::meanAdd(input[i]->ptr,
                          input[i]->ptr,
                          mu,
-                         size_t(prms.n_cols),
+                         prms.n_cols,
                          local_blocks[i]->size,
                          false,
                          true,
@@ -278,7 +282,7 @@ void transform_impl(raft::handle_t& handle,
       components, components, scalar, prms.n_cols * prms.n_components, streams[0]);
   }
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
 }
@@ -299,7 +303,7 @@ void transform_impl(raft::handle_t& handle,
 template <typename T>
 void transform_impl(raft::handle_t& handle,
                     Matrix::RankSizePair** rank_sizes,
-                    size_t n_parts,
+                    std::uint32_t n_parts,
                     Matrix::Data<T>** input,
                     T* components,
                     Matrix::Data<T>** trans_input,
@@ -319,9 +323,9 @@ void transform_impl(raft::handle_t& handle,
   std::vector<Matrix::Data<T>*> trans_data(trans_input, trans_input + n_parts);
 
   // TODO: These streams should come from raft::handle_t
-  int n_streams = n_parts;
+  auto n_streams = n_parts;
   cudaStream_t streams[n_streams];
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamCreate(&streams[i]));
   }
@@ -337,11 +341,11 @@ void transform_impl(raft::handle_t& handle,
                n_streams,
                verbose);
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamDestroy(streams[i]));
   }
 }
@@ -356,7 +360,7 @@ void inverse_transform_impl(raft::handle_t& handle,
                             T* mu,
                             paramsPCAMG prms,
                             cudaStream_t* streams,
-                            int n_streams,
+                            std::uint32_t n_streams,
                             bool verbose)
 {
   std::vector<Matrix::RankSizePair*> local_blocks = trans_input_desc.partsToRanks;
@@ -370,14 +374,14 @@ void inverse_transform_impl(raft::handle_t& handle,
   }
 
   for (std::size_t i = 0; i < local_blocks.size(); i++) {
-    int si  = i % n_streams;
+    auto si = i % n_streams;
     T alpha = T(1);
     T beta  = T(0);
 
     raft::linalg::gemm(handle,
                        trans_input[i]->ptr,
                        local_blocks[i]->size,
-                       size_t(prms.n_components),
+                       prms.n_components,
                        components,
                        input[i]->ptr,
                        local_blocks[i]->size,
@@ -391,7 +395,7 @@ void inverse_transform_impl(raft::handle_t& handle,
     raft::stats::meanAdd(input[i]->ptr,
                          input[i]->ptr,
                          mu,
-                         size_t(prms.n_cols),
+                         prms.n_cols,
                          local_blocks[i]->size,
                          false,
                          true,
@@ -406,7 +410,7 @@ void inverse_transform_impl(raft::handle_t& handle,
       components, components, scalar, prms.n_rows * prms.n_components, streams[0]);
   }
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
 }
@@ -427,7 +431,7 @@ void inverse_transform_impl(raft::handle_t& handle,
 template <typename T>
 void inverse_transform_impl(raft::handle_t& handle,
                             Matrix::RankSizePair** rank_sizes,
-                            size_t n_parts,
+                            std::uint32_t n_parts,
                             Matrix::Data<T>** trans_input,
                             T* components,
                             Matrix::Data<T>** input,
@@ -445,9 +449,9 @@ void inverse_transform_impl(raft::handle_t& handle,
   std::vector<Matrix::Data<T>*> input_data(input, input + n_parts);
 
   // TODO: These streams should come from raft::handle_t
-  int n_streams = n_parts;
+  auto n_streams = n_parts;
   cudaStream_t streams[n_streams];
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamCreate(&streams[i]));
   }
@@ -463,11 +467,11 @@ void inverse_transform_impl(raft::handle_t& handle,
                             n_streams,
                             verbose);
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamDestroy(streams[i]));
   }
 }
@@ -491,7 +495,7 @@ void inverse_transform_impl(raft::handle_t& handle,
 template <typename T>
 void fit_transform_impl(raft::handle_t& handle,
                         Matrix::RankSizePair** rank_sizes,
-                        size_t n_parts,
+                        std::uint32_t n_parts,
                         Matrix::Data<T>** input,
                         Matrix::Data<T>** trans_input,
                         T* components,
@@ -511,9 +515,9 @@ void fit_transform_impl(raft::handle_t& handle,
   std::vector<Matrix::Data<T>*> trans_data(trans_input, trans_input + n_parts);
 
   // TODO: These streams should come from raft::handle_t
-  int n_streams = n_parts;
+  auto n_streams = n_parts;
   cudaStream_t streams[n_streams];
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamCreate(&streams[i]));
   }
@@ -545,11 +549,11 @@ void fit_transform_impl(raft::handle_t& handle,
 
   sign_flip(handle, trans_data, input_desc, components, prms.n_components, streams, n_streams);
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamDestroy(streams[i]));
   }
 }
@@ -606,7 +610,7 @@ void fit(raft::handle_t& handle,
 
 void fit_transform(raft::handle_t& handle,
                    Matrix::RankSizePair** rank_sizes,
-                   size_t n_parts,
+                   std::uint32_t n_parts,
                    Matrix::floatData_t** input,
                    Matrix::floatData_t** trans_input,
                    float* components,
@@ -635,7 +639,7 @@ void fit_transform(raft::handle_t& handle,
 
 void fit_transform(raft::handle_t& handle,
                    Matrix::RankSizePair** rank_sizes,
-                   size_t n_parts,
+                   std::uint32_t n_parts,
                    Matrix::doubleData_t** input,
                    Matrix::doubleData_t** trans_input,
                    double* components,
@@ -664,7 +668,7 @@ void fit_transform(raft::handle_t& handle,
 
 void transform(raft::handle_t& handle,
                Matrix::RankSizePair** rank_sizes,
-               size_t n_parts,
+               std::uint32_t n_parts,
               Matrix::Data<float>** input,
               float* components,
               Matrix::Data<float>** trans_input,
@@ -679,7 +683,7 @@ void transform(raft::handle_t& handle,
 
 void transform(raft::handle_t& handle,
                Matrix::RankSizePair** rank_sizes,
-               size_t n_parts,
+               std::uint32_t n_parts,
               Matrix::Data<double>** input,
               double* components,
               Matrix::Data<double>** trans_input,
@@ -694,7 +698,7 @@ void transform(raft::handle_t& handle,
 
 void inverse_transform(raft::handle_t& handle,
                        Matrix::RankSizePair** rank_sizes,
-                       size_t n_parts,
+                       std::uint32_t n_parts,
                        Matrix::Data<float>** trans_input,
                        float* components,
                        Matrix::Data<float>** input,
@@ -709,7 +713,7 @@ void inverse_transform(raft::handle_t& handle,
 
 void inverse_transform(raft::handle_t& handle,
                        Matrix::RankSizePair** rank_sizes,
-                       size_t n_parts,
+                       std::uint32_t n_parts,
                        Matrix::Data<double>** trans_input,
                        double* components,
                        Matrix::Data<double>** input,
diff --git a/cpp/src/pca/sign_flip_mg.cu b/cpp/src/pca/sign_flip_mg.cu
index 64a1497ae6..9529a59078 100644
--- a/cpp/src/pca/sign_flip_mg.cu
+++ b/cpp/src/pca/sign_flip_mg.cu
@@ -40,8 +40,12 @@ namespace opg {
 
 // TODO: replace these thrust code with cuda kernels or prims
 template <typename T>
-void findMaxAbsOfColumns(
-  T* input, int n_rows, int n_cols, T* max_vals, cudaStream_t stream, bool row_major = false)
+void findMaxAbsOfColumns(T* input,
+                         std::size_t n_rows,
+                         std::size_t n_cols,
+                         T* max_vals,
+                         cudaStream_t stream,
+                         bool row_major = false)
 {
   auto counting = thrust::make_counting_iterator(0);
   auto m        = n_rows;
@@ -50,59 +54,62 @@ void findMaxAbsOfColumns(
   auto execution_policy = rmm::exec_policy(stream);
 
   if (row_major) {
-    thrust::for_each(execution_policy, counting, counting + n_rows, [=] __device__(int idx) {
-      T max         = 0.0;
-      int max_index = 0;
-      int d_i       = idx;
-      int end       = d_i + (m * n);
-
-      for (int i = d_i; i < end; i = i + m) {
-        T val = input[i];
-        if (val < 0.0) { val = -val; }
-        if (val > max) {
-          max       = val;
-          max_index = i;
-        }
-      }
-      max_vals[idx] = input[max_index];
-    });
+    thrust::for_each(
+      execution_policy, counting, counting + n_rows, [=] __device__(std::size_t idx) {
+        T max                 = 0.0;
+        std::size_t max_index = 0;
+        std::size_t d_i       = idx;
+        std::size_t end       = d_i + (m * n);
+
+        for (auto i = d_i; i < end; i = i + m) {
+          T val = input[i];
+          if (val < 0.0) { val = -val; }
+          if (val > max) {
+            max       = val;
+            max_index = i;
+          }
+        }
+        max_vals[idx] = input[max_index];
+      });
   } else {
-    thrust::for_each(execution_policy, counting, counting + n_cols, [=] __device__(int idx) {
-      T max         = 0.0;
-      int max_index = 0;
-      int d_i       = idx * m;
-      int end       = d_i + m;
-
-      for (int i = d_i; i < end; i++) {
-        T val = input[i];
-        if (val < 0.0) { val = -val; }
-        if (val > max) {
-          max       = val;
-          max_index = i;
-        }
-      }
-      max_vals[idx] = input[max_index];
-    });
+    thrust::for_each(
+      execution_policy, counting, counting + n_cols, [=] __device__(std::size_t idx) {
+        T max                 = 0.0;
+        std::size_t max_index = 0;
+        std::size_t d_i       = idx * m;
+        std::size_t end       = d_i + m;
+
+        for (auto i = d_i; i < end; i++) {
+          T val = input[i];
+          if (val < 0.0) { val = -val; }
+          if (val > max) {
+            max       = val;
+            max_index = i;
+          }
+        }
+        max_vals[idx] = input[max_index];
+      });
   }
 }
 
 // TODO: replace these thrust code with cuda kernels or prims
 template <typename T>
-void flip(T* input, int n_rows, int n_cols, T* max_vals, cudaStream_t stream)
+void flip(T* input, std::size_t n_rows, std::size_t n_cols, T* max_vals, cudaStream_t stream)
 {
   auto counting = thrust::make_counting_iterator(0);
   auto m        = n_rows;
 
-  thrust::for_each(rmm::exec_policy(stream), counting, counting + n_cols, [=] __device__(int idx) {
-    int d_i = idx * m;
-    int end = d_i + m;
-
-    if (max_vals[idx] < 0.0) {
-      for (int i = d_i; i < end; i++) {
-        input[i] = -input[i];
-      }
-    }
-  });
+  thrust::for_each(
+    rmm::exec_policy(stream), counting, counting + n_cols, [=] __device__(std::size_t idx) {
+      auto d_i = idx * m;
+      auto end = d_i + m;
+
+      if (max_vals[idx] < 0.0) {
+        for (auto i = d_i; i < end; i++) {
+          input[i] = -input[i];
+        }
+      }
+    });
 }
 
 /**
@@ -122,9 +129,9 @@ void sign_flip_imp(raft::handle_t& handle,
                    std::vector<Matrix::Data<T>*>& input,
                    Matrix::PartDescriptor& input_desc,
                    T* components,
-                   int n_components,
+                   std::size_t n_components,
                    cudaStream_t* streams,
-                   int n_stream)
+                   std::uint32_t n_stream)
 {
   int rank = handle.get_comms().get_rank();
@@ -140,7 +147,7 @@ void sign_flip_imp(raft::handle_t& handle,
       input[i]->ptr, local_blocks[i]->size, n_components, mv_loc, streams[i % n_stream]);
   }
 
-  for (int i = 0; i < n_stream; i++) {
+  for (std::uint32_t i = 0; i < n_stream; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
@@ -158,7 +165,7 @@ void sign_flip_imp(raft::handle_t& handle,
       input[i]->ptr, local_blocks[i]->size, n_components, max_vals.data(), streams[i % n_stream]);
   }
 
-  for (int i = 0; i < n_stream; i++) {
+  for (std::uint32_t i = 0; i < n_stream; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
@@ -169,9 +176,9 @@ void sign_flip(raft::handle_t& handle,
                std::vector<Matrix::Data<float>*>& input_data,
                Matrix::PartDescriptor& input_desc,
                float* components,
-               int n_components,
+               std::size_t n_components,
                cudaStream_t* streams,
-               int n_stream)
+               std::uint32_t n_stream)
 {
   sign_flip_imp(handle, input_data, input_desc, components, n_components, streams, n_stream);
 }
@@ -180,9 +187,9 @@ void sign_flip(raft::handle_t& handle,
                std::vector<Matrix::Data<double>*>& input_data,
                Matrix::PartDescriptor& input_desc,
                double* components,
-               int n_components,
+               std::size_t n_components,
                cudaStream_t* streams,
-               int n_stream)
+               std::uint32_t n_stream)
 {
   sign_flip_imp(handle, input_data, input_desc, components, n_components, streams, n_stream);
 }
diff --git a/cpp/src/tsvd/tsvd.cuh b/cpp/src/tsvd/tsvd.cuh
index 0e1d156dd3..638ce758d7 100644
--- a/cpp/src/tsvd/tsvd.cuh
+++ b/cpp/src/tsvd/tsvd.cuh
@@ -54,17 +54,17 @@ void calCompExpVarsSvd(const raft::handle_t& handle,
   auto cusolver_handle = handle.get_cusolver_dn_handle();
   auto cublas_handle   = handle.get_cublas_handle();
 
-  int diff     = prms.n_cols - prms.n_components;
+  auto diff    = prms.n_cols - prms.n_components;
   math_t ratio = math_t(diff) / math_t(prms.n_cols);
   ASSERT(ratio >= math_t(0.2),
          "Number of components should be less than at least 80 percent of the "
          "number of features");
 
-  int p = int(math_t(0.1) * math_t(prms.n_cols));
+  std::size_t p = static_cast<std::size_t>(math_t(0.1) * math_t(prms.n_cols));
   // int p = int(math_t(prms.n_cols) / math_t(4));
   ASSERT(p >= 5, "RSVD should be used where the number of columns are at least 50");
at least 50"); - int total_random_vecs = prms.n_components + p; + auto total_random_vecs = prms.n_components + p; ASSERT(total_random_vecs < prms.n_cols, "RSVD should be used where the number of columns are at least 50"); @@ -120,7 +120,7 @@ void calEig(const raft::handle_t& handle, raft::matrix::colReverse(components, prms.n_cols, prms.n_cols, stream); raft::linalg::transpose(components, prms.n_cols, stream); - raft::matrix::rowReverse(explained_var, prms.n_cols, 1, stream); + raft::matrix::rowReverse(explained_var, prms.n_cols, std::size_t(1), stream); } /** @@ -135,38 +135,43 @@ void calEig(const raft::handle_t& handle, * @{ */ template -void signFlip( - math_t* input, int n_rows, int n_cols, math_t* components, int n_cols_comp, cudaStream_t stream) +void signFlip(math_t* input, + std::size_t n_rows, + std::size_t n_cols, + math_t* components, + std::size_t n_cols_comp, + cudaStream_t stream) { auto counting = thrust::make_counting_iterator(0); auto m = n_rows; - thrust::for_each(rmm::exec_policy(stream), counting, counting + n_cols, [=] __device__(int idx) { - int d_i = idx * m; - int end = d_i + m; - - math_t max = 0.0; - int max_index = 0; - for (int i = d_i; i < end; i++) { - math_t val = input[i]; - if (val < 0.0) { val = -val; } - if (val > max) { - max = val; - max_index = i; + thrust::for_each( + rmm::exec_policy(stream), counting, counting + n_cols, [=] __device__(std::size_t idx) { + auto d_i = idx * m; + auto end = d_i + m; + + math_t max = 0.0; + std::size_t max_index = 0; + for (auto i = d_i; i < end; i++) { + math_t val = input[i]; + if (val < 0.0) { val = -val; } + if (val > max) { + max = val; + max_index = i; + } } - } - if (input[max_index] < 0.0) { - for (int i = d_i; i < end; i++) { - input[i] = -input[i]; - } + if (input[max_index] < 0.0) { + for (auto i = d_i; i < end; i++) { + input[i] = -input[i]; + } - int len = n_cols * n_cols_comp; - for (int i = idx; i < len; i = i + n_cols) { - components[i] = -components[i]; + auto len = n_cols * n_cols_comp; + for (auto i = idx; i < len; i = i + n_cols) { + components[i] = -components[i]; + } } - } - }); + }); } /** @@ -195,7 +200,7 @@ void tsvdFit(const raft::handle_t& handle, ASSERT(prms.n_components > 0, "Parameter n_components: number of components cannot be less than one"); - int n_components = prms.n_components; + auto n_components = prms.n_components; if (prms.n_components > prms.n_cols) n_components = prms.n_cols; size_t len = prms.n_cols * prms.n_cols; @@ -281,7 +286,7 @@ void tsvdFitTransform(const raft::handle_t& handle, raft::stats::vars(vars.data(), input, mu.data(), prms.n_cols, prms.n_rows, true, false, stream); rmm::device_scalar total_vars(stream); - raft::stats::sum(total_vars.data(), vars.data(), 1, prms.n_cols, false, stream); + raft::stats::sum(total_vars.data(), vars.data(), std::size_t(1), prms.n_cols, false, stream); math_t total_vars_h; raft::update_host(&total_vars_h, total_vars.data(), 1, stream); diff --git a/cpp/src/tsvd/tsvd_mg.cu b/cpp/src/tsvd/tsvd_mg.cu index 96bb7a943c..bd93daf223 100644 --- a/cpp/src/tsvd/tsvd_mg.cu +++ b/cpp/src/tsvd/tsvd_mg.cu @@ -48,18 +48,16 @@ void fit_impl(raft::handle_t& handle, T* singular_vals, paramsTSVD prms, cudaStream_t* streams, - int n_streams, + std::uint32_t n_streams, bool verbose) { const auto& comm = handle.get_comms(); cublasHandle_t cublas_handle = handle.get_cublas_handle(); - // This variable should be updated to use `size_t` - // Reference issue https://github.com/rapidsai/cuml/issues/2459 - int len = prms.n_cols * prms.n_cols; + auto len = 
   rmm::device_uvector<T> cov_data(len, streams[0]);
-  size_t cov_data_size = cov_data.size();
+  auto cov_data_size = cov_data.size();
   Matrix::Data<T> cov{cov_data.data(), cov_data_size};
 
   LinAlg::opg::mm_aTa(handle, cov, input_data, input_desc, streams, n_streams);
@@ -91,7 +89,7 @@ void fit_impl(raft::handle_t& handle,
 template <typename T>
 void fit_impl(raft::handle_t& handle,
               Matrix::RankSizePair** rank_sizes,
-              size_t n_parts,
+              std::uint32_t n_parts,
               Matrix::Data<T>** input,
               T* components,
               T* singular_vals,
@@ -106,20 +104,20 @@ void fit_impl(raft::handle_t& handle,
   Matrix::PartDescriptor input_desc(prms.n_rows, prms.n_cols, ranksAndSizes, rank);
 
   // TODO: These streams should come from raft::handle_t
-  int n_streams = n_parts;
+  auto n_streams = n_parts;
   cudaStream_t streams[n_streams];
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamCreate(&streams[i]));
   }
 
   fit_impl(
     handle, input_data, input_desc, components, singular_vals, prms, streams, n_streams, verbose);
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamDestroy(streams[i]));
   }
 }
@@ -132,7 +130,7 @@ void transform_impl(raft::handle_t& handle,
                     std::vector<Matrix::Data<T>*>& trans_input,
                     paramsTSVD prms,
                     cudaStream_t* streams,
-                    int n_streams,
+                    std::uint32_t n_streams,
                     bool verbose)
 {
   int rank = handle.get_comms().get_rank();
@@ -140,18 +138,18 @@ void transform_impl(raft::handle_t& handle,
   std::vector<Matrix::RankSizePair*> local_blocks = input_desc.blocksOwnedBy(rank);
 
   for (std::size_t i = 0; i < input.size(); i++) {
-    int si  = i % n_streams;
+    auto si = i % n_streams;
     T alpha = T(1);
     T beta  = T(0);
     raft::linalg::gemm(handle,
                        input[i]->ptr,
                        local_blocks[i]->size,
-                       size_t(prms.n_cols),
+                       prms.n_cols,
                        components,
                        trans_input[i]->ptr,
                        local_blocks[i]->size,
-                       int(prms.n_components),
+                       prms.n_components,
                        CUBLAS_OP_N,
                        CUBLAS_OP_T,
                        alpha,
@@ -159,7 +157,7 @@ void transform_impl(raft::handle_t& handle,
                        streams[si]);
   }
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
 }
@@ -178,7 +176,7 @@ void transform_impl(raft::handle_t& handle,
 template <typename T>
 void transform_impl(raft::handle_t& handle,
                     Matrix::RankSizePair** rank_sizes,
-                    size_t n_parts,
+                    std::uint32_t n_parts,
                     Matrix::Data<T>** input,
                     T* components,
                     Matrix::Data<T>** trans_input,
@@ -193,20 +191,20 @@ void transform_impl(raft::handle_t& handle,
   std::vector<Matrix::Data<T>*> trans_data(trans_input, trans_input + n_parts);
 
   // TODO: These streams should come from raft::handle_t
-  int n_streams = n_parts;
+  auto n_streams = n_parts;
   cudaStream_t streams[n_streams];
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamCreate(&streams[i]));
   }
 
   transform_impl(
     handle, input_data, input_desc, components, trans_data, prms, streams, n_streams, verbose);
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamDestroy(streams[i]));
   }
 }
@@ -219,20 +217,20 @@ void inverse_transform_impl(raft::handle_t& handle,
                             std::vector<Matrix::Data<T>*>& input,
                             paramsTSVD prms,
                             cudaStream_t* streams,
-                            int n_streams,
+                            std::uint32_t n_streams,
                             bool verbose)
 {
   std::vector<Matrix::RankSizePair*> local_blocks = trans_input_desc.partsToRanks;
 
   for (std::size_t i = 0; i < local_blocks.size(); i++) {
-    int si  = i % n_streams;
+    auto si = i % n_streams;
     T alpha = T(1);
     T beta  = T(0);
     raft::linalg::gemm(handle,
                        trans_input[i]->ptr,
                        local_blocks[i]->size,
-                       size_t(prms.n_components),
+                       prms.n_components,
                        components,
                        input[i]->ptr,
                        local_blocks[i]->size,
@@ -244,7 +242,7 @@ void inverse_transform_impl(raft::handle_t& handle,
                        streams[si]);
   }
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
 }
@@ -263,7 +261,7 @@ void inverse_transform_impl(raft::handle_t& handle,
 template <typename T>
 void inverse_transform_impl(raft::handle_t& handle,
                             Matrix::RankSizePair** rank_sizes,
-                            size_t n_parts,
+                            std::uint32_t n_parts,
                             Matrix::Data<T>** trans_input,
                             T* components,
                             Matrix::Data<T>** input,
@@ -279,20 +277,19 @@ void inverse_transform_impl(raft::handle_t& handle,
   std::vector<Matrix::Data<T>*> input_data(input, input + n_parts);
 
   // TODO: These streams should come from raft::handle_t
-  int n_streams = n_parts;
+  auto n_streams = n_parts;
   cudaStream_t streams[n_streams];
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamCreate(&streams[i]));
   }
 
   inverse_transform_impl(
     handle, trans_data, trans_desc, components, input_data, prms, streams, n_streams, verbose);
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
-
-  for (int i = 0; i < n_streams; i++) {
+  for (std::uint32_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamDestroy(streams[i]));
   }
 }
@@ -327,10 +324,10 @@ void fit_transform_impl(raft::handle_t& handle,
   int rank = handle.get_comms().get_rank();
 
   // TODO: These streams should come from raft::handle_t
-  int n_streams = input_desc.blocksOwnedBy(rank).size();
+  auto n_streams = input_desc.blocksOwnedBy(rank).size();
   ;
   cudaStream_t streams[n_streams];
-  for (int i = 0; i < n_streams; i++) {
+  for (std::size_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamCreate(&streams[i]));
   }
@@ -344,48 +341,48 @@ void fit_transform_impl(raft::handle_t& handle,
     handle, trans_data, input_desc, components, prms.n_components, streams, n_streams);
 
   rmm::device_uvector<T> mu_trans(prms.n_components, streams[0]);
-  Matrix::Data<T> mu_trans_data{mu_trans.data(), size_t(prms.n_components)};
+  Matrix::Data<T> mu_trans_data{mu_trans.data(), prms.n_components};
 
   Stats::opg::mean(handle, mu_trans_data, trans_data, trans_desc, streams, n_streams);
 
-  Matrix::Data<T> explained_var_data{explained_var, size_t(prms.n_components)};
+  Matrix::Data<T> explained_var_data{explained_var, prms.n_components};
 
   Stats::opg::var(
     handle, explained_var_data, trans_data, trans_desc, mu_trans_data.ptr, streams, n_streams);
 
   rmm::device_uvector<T> mu(prms.n_rows, streams[0]);
-  Matrix::Data<T> mu_data{mu.data(), size_t(prms.n_rows)};
+  Matrix::Data<T> mu_data{mu.data(), prms.n_rows};
 
   Stats::opg::mean(handle, mu_data, input_data, input_desc, streams, n_streams);
 
   rmm::device_uvector<T> var_input(prms.n_rows, streams[0]);
-  Matrix::Data<T> var_input_data{var_input.data(), size_t(prms.n_rows)};
+  Matrix::Data<T> var_input_data{var_input.data(), prms.n_rows};
 
   Stats::opg::var(handle, var_input_data, input_data, input_desc, mu_data.ptr, streams, n_streams);
 
   rmm::device_uvector<T> total_vars(1, streams[0]);
-  raft::stats::sum(total_vars.data(), var_input_data.ptr, 1, prms.n_cols, false, streams[0]);
+  raft::stats::sum(
+    total_vars.data(), var_input_data.ptr, std::size_t(1), prms.n_cols, false, streams[0]);
 
   T total_vars_h;
-  raft::update_host(&total_vars_h, total_vars.data(), 1, streams[0]);
+  raft::update_host(&total_vars_h, total_vars.data(), std::size_t(1), streams[0]);
   CUDA_CHECK(cudaStreamSynchronize(streams[0]));
   T scalar = T(1) / total_vars_h;
 
   raft::linalg::scalarMultiply(
     explained_var_ratio, explained_var, scalar, prms.n_components, streams[0]);
 
-  for (int i = 0; i < n_streams; i++) {
+  for (std::size_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamSynchronize(streams[i]));
   }
-
-  for (int i = 0; i < n_streams; i++) {
+  for (std::size_t i = 0; i < n_streams; i++) {
     CUDA_CHECK(cudaStreamDestroy(streams[i]));
   }
 }
 
 void fit(raft::handle_t& handle,
          Matrix::RankSizePair** rank_sizes,
-         size_t n_parts,
+         std::uint32_t n_parts,
          Matrix::floatData_t** input,
          float* components,
          float* singular_vals,
@@ -397,7 +394,7 @@ void fit(raft::handle_t& handle,
 
 void fit(raft::handle_t& handle,
          Matrix::RankSizePair** rank_sizes,
-         size_t n_parts,
+         std::uint32_t n_parts,
          Matrix::doubleData_t** input,
          double* components,
          double* singular_vals,
@@ -459,7 +456,7 @@ void fit_transform(raft::handle_t& handle,
 
 void transform(raft::handle_t& handle,
                Matrix::RankSizePair** rank_sizes,
-               size_t n_parts,
+               std::uint32_t n_parts,
               Matrix::Data<float>** input,
               float* components,
               Matrix::Data<float>** trans_input,
@@ -471,7 +468,7 @@ void transform(raft::handle_t& handle,
 
 void transform(raft::handle_t& handle,
                Matrix::RankSizePair** rank_sizes,
-               size_t n_parts,
+               std::uint32_t n_parts,
               Matrix::Data<double>** input,
              double* components,
              Matrix::Data<double>** trans_input,
@@ -483,7 +480,7 @@ void transform(raft::handle_t& handle,
 
 void inverse_transform(raft::handle_t& handle,
                        Matrix::RankSizePair** rank_sizes,
-                       size_t n_parts,
+                       std::uint32_t n_parts,
                        Matrix::Data<float>** trans_input,
                        float* components,
                        Matrix::Data<float>** input,
@@ -496,7 +493,7 @@ void inverse_transform(raft::handle_t& handle,
 
 void inverse_transform(raft::handle_t& handle,
                        Matrix::RankSizePair** rank_sizes,
-                       size_t n_parts,
+                       std::uint32_t n_parts,
                        Matrix::Data<double>** trans_input,
                        double* components,
                        Matrix::Data<double>** input,
diff --git a/cpp/src_prims/stats/cov.cuh b/cpp/src_prims/stats/cov.cuh
index a6e1dec2d2..e1e740c7c6 100644
--- a/cpp/src_prims/stats/cov.cuh
+++ b/cpp/src_prims/stats/cov.cuh
@@ -49,8 +49,8 @@ void cov(const raft::handle_t& handle,
          Type* covar,
          Type* data,
          const Type* mu,
-         int D,
-         int N,
+         std::size_t D,
+         std::size_t N,
          bool sample,
          bool rowMajor,
          bool stable,
diff --git a/python/cuml/decomposition/pca_mg.pyx b/python/cuml/decomposition/pca_mg.pyx
index e3ee6197d6..6365477355 100644
--- a/python/cuml/decomposition/pca_mg.pyx
+++ b/python/cuml/decomposition/pca_mg.pyx
@@ -53,10 +53,8 @@ cdef extern from "cuml/decomposition/pca_mg.hpp" namespace "ML":
         QR "ML::mg_solver::QR"
 
     cdef cppclass paramsTSVDMG(paramsSolver):
-        int n_components
-        int max_sweeps
+        size_t n_components
         mg_solver algorithm  # = solver::COV_EIG_DQ
-        bool trans_input
 
     cdef cppclass paramsPCAMG(paramsTSVDMG):
         bool copy
diff --git a/python/cuml/decomposition/utils.pxd b/python/cuml/decomposition/utils.pxd
index b33c654b85..bd6ce0b8a0 100644
--- a/python/cuml/decomposition/utils.pxd
+++ b/python/cuml/decomposition/utils.pxd
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,23 +25,18 @@ cdef extern from "cuml/decomposition/params.hpp" namespace "ML" nogil:
         COV_EIG_JACOBI "ML::solver::COV_EIG_JACOBI"
 
     cdef cppclass params:
-        int n_rows
-        int n_cols
+        size_t n_rows
+        size_t n_cols
         int gpu_id
 
     cdef cppclass paramsSolver(params):
-        int n_rows
-        int n_cols
         float tol
-        int n_iterations
-        int random_state
+        unsigned n_iterations
         int verbose
 
     cdef cppclass paramsTSVD(paramsSolver):
-        int n_components
-        int max_sweeps
+        size_t n_components
         solver algorithm  # = solver::COV_EIG_DQ
-        bool trans_input
 
     cdef cppclass paramsPCA(paramsTSVD):
         bool copy
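Note on the int -> std::size_t / std::uint32_t changes above: element counts such as
prms.n_cols * prms.n_cols overflow a 32-bit int for wide inputs, which is what the
removed TODO in tsvd_mg.cu (referencing
https://github.com/rapidsai/cuml/issues/2459) pointed at. A minimal standalone sketch
of the hazard, illustrative only and not cuML code, assuming a typical LP64 platform
where std::size_t is 64 bits:

    #include <cstddef>
    #include <iostream>
    #include <limits>

    int main()
    {
      // A covariance buffer holds n_cols * n_cols elements; with 50,000
      // features that is 2.5e9 elements, more than a 32-bit int can represent.
      std::size_t n_cols = 50000;
      std::size_t len    = n_cols * n_cols;  // unsigned 64-bit arithmetic: no overflow

      std::cout << "elements needed: " << len << '\n';  // prints 2500000000
      std::cout << "fits in int: "
                << (len <= static_cast<std::size_t>(std::numeric_limits<int>::max()))
                << '\n';  // prints 0: `int len = n_cols * n_cols` would be signed overflow (UB)
      return 0;
    }

Widening after the multiply would not help: the operands themselves must be wide before
the multiplication, which is what switching the n_rows/n_cols fields to std::size_t
achieves.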