Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix multi-GPU PCA compute failure after transform and add handling for when n_components is not passed #3912

Merged
merged 3 commits into from
Jun 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions python/cuml/decomposition/base_mg.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -30,7 +30,6 @@ from cython.operator cimport dereference as deref

from cuml.common.array import CumlArray
import cuml.common.opg_data_utils_mg as opg

import cuml.internals
from cuml.common.base import Base
from cuml.raft.common.handle cimport handle_t
Expand Down Expand Up @@ -59,6 +58,11 @@ class BaseDecompositionMG(object):
self._set_output_type(X[0])
self._set_n_features_in(n_cols)

if self.n_components is None:
self._n_components = min(total_rows, n_cols)
else:
self._n_components = self.n_components

X_arys = []
for i in range(len(X)):
if i == 0:
Expand Down Expand Up @@ -90,11 +94,11 @@ class BaseDecompositionMG(object):
trans_arg = opg.build_data_t(trans_arys)

trans_part_desc = opg.build_part_descriptor(total_rows,
self.n_components,
self._n_components,
rank_to_sizes,
rank)

self._initialize_arrays(self.n_components, total_rows, n_cols)
self._initialize_arrays(self._n_components, total_rows, n_cols)
decomp_params = self._build_params(total_rows, n_cols)

if _transform:
Expand Down
4 changes: 2 additions & 2 deletions python/cuml/decomposition/pca_mg.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -116,7 +116,7 @@ class PCAMG(BaseDecompositionMG, PCA):

def _build_params(self, n_rows, n_cols):
cpdef paramsPCAMG *params = new paramsPCAMG()
params.n_components = self.n_components
params.n_components = self._n_components
params.n_rows = n_rows
params.n_cols = n_cols
params.whiten = self.whiten
Expand Down
32 changes: 29 additions & 3 deletions python/cuml/test/dask/test_pca.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -87,7 +87,9 @@ def test_pca_fit_transform_fp32(nrows, ncols, n_parts, client):
random_state=10, dtype=np.float32)

cupca = daskPCA(n_components=20, whiten=True)
cupca.fit_transform(X_cudf)
res = cupca.fit_transform(X_cudf)
res = res.compute()
assert res.shape[0] == nrows and res.shape[1] == 20


@pytest.mark.mg
Expand All @@ -107,4 +109,28 @@ def test_pca_fit_transform_fp64(nrows, ncols, n_parts, client):
random_state=10, dtype=np.float64)

cupca = daskPCA(n_components=30, whiten=False)
cupca.fit_transform(X_cudf)
res = cupca.fit_transform(X_cudf)
res = res.compute()
assert res.shape[0] == nrows and res.shape[1] == 30


@pytest.mark.mg
@pytest.mark.parametrize("nrows", [1000])
@pytest.mark.parametrize("ncols", [20])
@pytest.mark.parametrize("n_parts", [28])
def test_pca_fit_transform_fp32_noncomponents(nrows, ncols, n_parts, client):
    """Check multi-GPU PCA ``fit_transform`` when ``n_components`` is omitted.

    The estimator is built without ``n_components``; the transformed output
    is then expected to have one row per input sample and
    ``min(nrows, ncols)`` columns.

    Parameters
    ----------
    nrows : int
        Number of samples generated by ``make_blobs``.
    ncols : int
        Number of features generated by ``make_blobs``.
    n_parts : int
        Number of partitions requested from ``make_blobs``.
    client
        Not used directly in the body; presumably the Dask client fixture
        that keeps a cluster alive for the test -- confirm in conftest.
    """
    from cuml.dask.decomposition import PCA as daskPCA
    from cuml.dask.datasets import make_blobs

    X_cudf, _ = make_blobs(
        n_samples=nrows,
        n_features=ncols,
        centers=1,
        n_parts=n_parts,
        cluster_std=1.5,
        random_state=10,
        dtype=np.float32,
    )

    # n_components is deliberately not passed.
    cupca = daskPCA(whiten=False)
    res = cupca.fit_transform(X_cudf).compute()

    # Derive the expected width from the parametrized sizes instead of
    # hard-coding it, so the check stays valid if the parameters change.
    expected_components = min(nrows, ncols)
    assert res.shape[0] == nrows
    assert res.shape[1] == expected_components