From f2cc9c061ed96e44a510bd84a27113c9d60bcc58 Mon Sep 17 00:00:00 2001 From: Anirban Das Date: Fri, 28 May 2021 13:40:52 -0700 Subject: [PATCH 1/3] Fixed multi GPU transform and added error handling when n_components is not passed --- python/cuml/decomposition/base_mg.pyx | 19 +++++++++++++--- python/cuml/decomposition/pca_mg.pyx | 2 +- python/cuml/test/dask/test_pca.py | 32 +++++++++++++++++++++++++-- 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/python/cuml/decomposition/base_mg.pyx b/python/cuml/decomposition/base_mg.pyx index 8615034c88..43c91ff5f3 100644 --- a/python/cuml/decomposition/base_mg.pyx +++ b/python/cuml/decomposition/base_mg.pyx @@ -30,7 +30,7 @@ from cython.operator cimport dereference as deref from cuml.common.array import CumlArray import cuml.common.opg_data_utils_mg as opg - +import cuml.common.logger as logger import cuml.internals from cuml.common.base import Base from cuml.raft.common.handle cimport handle_t @@ -59,6 +59,19 @@ class BaseDecompositionMG(object): self._set_output_type(X[0]) self._set_n_features_in(n_cols) + if self.n_components is None: + # logger.warn( + # 'Warning(`fit`): As of v0.16, PCA invoked without an' + # ' n_components argument defauts to using' + # ' min(n_samples, n_features) rather than 1' + # ) + # n_rows = total_rows + # n_cols = n_cols + # self._n_components = min(n_rows, n_cols) + self._n_components = 1 + else: + self._n_components = self.n_components + X_arys = [] for i in range(len(X)): if i == 0: @@ -90,11 +103,11 @@ class BaseDecompositionMG(object): trans_arg = opg.build_data_t(trans_arys) trans_part_desc = opg.build_part_descriptor(total_rows, - self.n_components, + self._n_components, rank_to_sizes, rank) - self._initialize_arrays(self.n_components, total_rows, n_cols) + self._initialize_arrays(self._n_components, total_rows, n_cols) decomp_params = self._build_params(total_rows, n_cols) if _transform: diff --git a/python/cuml/decomposition/pca_mg.pyx b/python/cuml/decomposition/pca_mg.pyx index 175ffdd7e0..8d8a383e32 100644 --- a/python/cuml/decomposition/pca_mg.pyx +++ b/python/cuml/decomposition/pca_mg.pyx @@ -116,7 +116,7 @@ class PCAMG(BaseDecompositionMG, PCA): def _build_params(self, n_rows, n_cols): cpdef paramsPCAMG *params = new paramsPCAMG() - params.n_components = self.n_components + params.n_components = self._n_components params.n_rows = n_rows params.n_cols = n_cols params.whiten = self.whiten diff --git a/python/cuml/test/dask/test_pca.py b/python/cuml/test/dask/test_pca.py index 85f0cc6588..d7bc7a7ecc 100644 --- a/python/cuml/test/dask/test_pca.py +++ b/python/cuml/test/dask/test_pca.py @@ -87,7 +87,10 @@ def test_pca_fit_transform_fp32(nrows, ncols, n_parts, client): random_state=10, dtype=np.float32) cupca = daskPCA(n_components=20, whiten=True) - cupca.fit_transform(X_cudf) + res = cupca.fit_transform(X_cudf) + res = res.compute() + assert res.shape[0] == nrows and res.shape[1]==20 + @pytest.mark.mg @@ -107,4 +110,29 @@ def test_pca_fit_transform_fp64(nrows, ncols, n_parts, client): random_state=10, dtype=np.float64) cupca = daskPCA(n_components=30, whiten=False) - cupca.fit_transform(X_cudf) + res = cupca.fit_transform(X_cudf) + res = res.compute() + assert res.shape[0] == nrows and res.shape[1]==30 + + +@pytest.mark.mg +@pytest.mark.parametrize("nrows", [1000]) +@pytest.mark.parametrize("ncols", [20]) +@pytest.mark.parametrize("n_parts", [28]) +def test_pca_fit_transform_fp32_noncomponents(nrows, ncols, n_parts, client): + # Tests the case when n_components is not passed for MG scenarios + from cuml.dask.decomposition import PCA as daskPCA + from cuml.dask.datasets import make_blobs + + X_cudf, _ = make_blobs(n_samples=nrows, + n_features=ncols, + centers=1, + n_parts=n_parts, + cluster_std=1.5, + random_state=10, dtype=np.float32) + + cupca = daskPCA(whiten=False) + res = cupca.fit_transform(X_cudf) + res = res.compute() + assert res.shape[0] == nrows and res.shape[1]==1 #20 + From 097a65f0ae12b1a6be8b757a7f8a8c42e3f2f9db Mon Sep 17 00:00:00 2001 From: Anirban Das Date: Fri, 28 May 2021 14:33:32 -0700 Subject: [PATCH 2/3] Fixed style check and Copyrights --- python/cuml/decomposition/base_mg.pyx | 4 ++-- python/cuml/decomposition/pca_mg.pyx | 2 +- python/cuml/test/dask/test_pca.py | 10 ++++------ 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/python/cuml/decomposition/base_mg.pyx b/python/cuml/decomposition/base_mg.pyx index 43c91ff5f3..6668c27501 100644 --- a/python/cuml/decomposition/base_mg.pyx +++ b/python/cuml/decomposition/base_mg.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -71,7 +71,7 @@ class BaseDecompositionMG(object): self._n_components = 1 else: self._n_components = self.n_components - + X_arys = [] for i in range(len(X)): if i == 0: diff --git a/python/cuml/decomposition/pca_mg.pyx b/python/cuml/decomposition/pca_mg.pyx index 8d8a383e32..e3ee6197d6 100644 --- a/python/cuml/decomposition/pca_mg.pyx +++ b/python/cuml/decomposition/pca_mg.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/cuml/test/dask/test_pca.py b/python/cuml/test/dask/test_pca.py index d7bc7a7ecc..c44a59c186 100644 --- a/python/cuml/test/dask/test_pca.py +++ b/python/cuml/test/dask/test_pca.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -89,8 +89,7 @@ def test_pca_fit_transform_fp32(nrows, ncols, n_parts, client): cupca = daskPCA(n_components=20, whiten=True) res = cupca.fit_transform(X_cudf) res = res.compute() - assert res.shape[0] == nrows and res.shape[1]==20 - + assert res.shape[0] == nrows and res.shape[1] == 20 @pytest.mark.mg @@ -112,7 +111,7 @@ def test_pca_fit_transform_fp64(nrows, ncols, n_parts, client): cupca = daskPCA(n_components=30, whiten=False) res = cupca.fit_transform(X_cudf) res = res.compute() - assert res.shape[0] == nrows and res.shape[1]==30 + assert res.shape[0] == nrows and res.shape[1] == 30 @pytest.mark.mg @@ -134,5 +133,4 @@ def test_pca_fit_transform_fp32_noncomponents(nrows, ncols, n_parts, client): cupca = daskPCA(whiten=False) res = cupca.fit_transform(X_cudf) res = res.compute() - assert res.shape[0] == nrows and res.shape[1]==1 #20 - + assert res.shape[0] == nrows and res.shape[1] == 1 # 20 \ No newline at end of file From 84eb007946f39cf8ac5c5461a14ff2948a617fd1 Mon Sep 17 00:00:00 2001 From: Anirban Das Date: Mon, 31 May 2021 12:56:16 -0700 Subject: [PATCH 3/3] Changed _n_components calculation to make it consistent with single GPU version. Fixed styling. --- python/cuml/decomposition/base_mg.pyx | 11 +---------- python/cuml/test/dask/test_pca.py | 2 +- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/python/cuml/decomposition/base_mg.pyx b/python/cuml/decomposition/base_mg.pyx index 6668c27501..620b190fbb 100644 --- a/python/cuml/decomposition/base_mg.pyx +++ b/python/cuml/decomposition/base_mg.pyx @@ -30,7 +30,6 @@ from cython.operator cimport dereference as deref from cuml.common.array import CumlArray import cuml.common.opg_data_utils_mg as opg -import cuml.common.logger as logger import cuml.internals from cuml.common.base import Base from cuml.raft.common.handle cimport handle_t @@ -60,15 +59,7 @@ class BaseDecompositionMG(object): self._set_n_features_in(n_cols) if self.n_components is None: - # logger.warn( - # 'Warning(`fit`): As of v0.16, PCA invoked without an' - # ' n_components argument defauts to using' - # ' min(n_samples, n_features) rather than 1' - # ) - # n_rows = total_rows - # n_cols = n_cols - # self._n_components = min(n_rows, n_cols) - self._n_components = 1 + self._n_components = min(total_rows, n_cols) else: self._n_components = self.n_components diff --git a/python/cuml/test/dask/test_pca.py b/python/cuml/test/dask/test_pca.py index c44a59c186..6c21844517 100644 --- a/python/cuml/test/dask/test_pca.py +++ b/python/cuml/test/dask/test_pca.py @@ -133,4 +133,4 @@ def test_pca_fit_transform_fp32_noncomponents(nrows, ncols, n_parts, client): cupca = daskPCA(whiten=False) res = cupca.fit_transform(X_cudf) res = res.compute() - assert res.shape[0] == nrows and res.shape[1] == 1 # 20 \ No newline at end of file + assert res.shape[0] == nrows and res.shape[1] == 20