Skip to content

Commit

Permalink
Fix for multi GPU PCA compute failing bug after transform and added e…
Browse files Browse the repository at this point in the history
…rror handling when n_components is not passed (#3912)

This PR fixes #3913, where multi-GPU PCA fails when `compute()` is called after `transform()`.

This fix is similar to what was done in the previous PR #3320. The current PR:

1. Fixes the problem of `AttributeError` when calling `compute()` after `transform()` while using `cuml.dask.decomposition.PCA`
2. Adds setting of `n_components=1` when `n_components` is not set in the multi-GPU case. (The tests were failing when I did not set `n_components`; however, I could not find a place where `n_components` is set to 1, so I may be wrong. PR #3320 seems to set `n_components=min(n_cols, n_rows)` as of 0.16 for single GPU. I am not entirely sure whether that should also be the case for MG.)
3. Added test cases for the above two changes.

Authors:
  - Anirban Das (https://github.com/akaanirban)

Approvers:
  - Micka (https://github.com/lowener)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: #3912
  • Loading branch information
akaanirban authored Jun 16, 2021
1 parent 1be5e3a commit 87f8e90
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 9 deletions.
12 changes: 8 additions & 4 deletions python/cuml/decomposition/base_mg.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -30,7 +30,6 @@ from cython.operator cimport dereference as deref

from cuml.common.array import CumlArray
import cuml.common.opg_data_utils_mg as opg

import cuml.internals
from cuml.common.base import Base
from cuml.raft.common.handle cimport handle_t
Expand Down Expand Up @@ -59,6 +58,11 @@ class BaseDecompositionMG(object):
self._set_output_type(X[0])
self._set_n_features_in(n_cols)

if self.n_components is None:
self._n_components = min(total_rows, n_cols)
else:
self._n_components = self.n_components

X_arys = []
for i in range(len(X)):
if i == 0:
Expand Down Expand Up @@ -90,11 +94,11 @@ class BaseDecompositionMG(object):
trans_arg = opg.build_data_t(trans_arys)

trans_part_desc = opg.build_part_descriptor(total_rows,
self.n_components,
self._n_components,
rank_to_sizes,
rank)

self._initialize_arrays(self.n_components, total_rows, n_cols)
self._initialize_arrays(self._n_components, total_rows, n_cols)
decomp_params = self._build_params(total_rows, n_cols)

if _transform:
Expand Down
4 changes: 2 additions & 2 deletions python/cuml/decomposition/pca_mg.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -116,7 +116,7 @@ class PCAMG(BaseDecompositionMG, PCA):

def _build_params(self, n_rows, n_cols):
cpdef paramsPCAMG *params = new paramsPCAMG()
params.n_components = self.n_components
params.n_components = self._n_components
params.n_rows = n_rows
params.n_cols = n_cols
params.whiten = self.whiten
Expand Down
32 changes: 29 additions & 3 deletions python/cuml/test/dask/test_pca.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -87,7 +87,9 @@ def test_pca_fit_transform_fp32(nrows, ncols, n_parts, client):
random_state=10, dtype=np.float32)

cupca = daskPCA(n_components=20, whiten=True)
cupca.fit_transform(X_cudf)
res = cupca.fit_transform(X_cudf)
res = res.compute()
assert res.shape[0] == nrows and res.shape[1] == 20


@pytest.mark.mg
Expand All @@ -107,4 +109,28 @@ def test_pca_fit_transform_fp64(nrows, ncols, n_parts, client):
random_state=10, dtype=np.float64)

cupca = daskPCA(n_components=30, whiten=False)
cupca.fit_transform(X_cudf)
res = cupca.fit_transform(X_cudf)
res = res.compute()
assert res.shape[0] == nrows and res.shape[1] == 30


@pytest.mark.mg
@pytest.mark.parametrize("nrows", [1000])
@pytest.mark.parametrize("ncols", [20])
@pytest.mark.parametrize("n_parts", [28])
def test_pca_fit_transform_fp32_noncomponents(nrows, ncols, n_parts, client):
    """Check multi-GPU PCA ``fit_transform`` when ``n_components`` is omitted.

    The estimator is built without ``n_components``; the transformed result
    is expected to keep every row and to have ``min(nrows, ncols)`` columns
    (20 for the current parametrization).
    """
    from cuml.dask.decomposition import PCA as daskPCA
    from cuml.dask.datasets import make_blobs

    X_cudf, _ = make_blobs(n_samples=nrows,
                           n_features=ncols,
                           centers=1,
                           n_parts=n_parts,
                           cluster_std=1.5,
                           random_state=10, dtype=np.float32)

    # n_components is deliberately left out to exercise the default path.
    cupca = daskPCA(whiten=False)
    res = cupca.fit_transform(X_cudf).compute()

    # Derive the expectation from the parameters instead of hard-coding 20,
    # so the test stays valid if the parametrization changes.
    expected_components = min(nrows, ncols)

    # Separate asserts so a failure reports which dimension is wrong.
    assert res.shape[0] == nrows
    assert res.shape[1] == expected_components

0 comments on commit 87f8e90

Please sign in to comment.