From 87f8e907c9b7cb151b5824b1ebe67e34079c42a3 Mon Sep 17 00:00:00 2001
From: Anirban Das <akaanirban@users.noreply.github.com>
Date: Wed, 16 Jun 2021 13:44:52 -0400
Subject: [PATCH] Fix multi-GPU PCA compute() failing after transform and
 default n_components when it is not passed (#3912)

This PR fixes #3913, where multi-GPU PCA fails when `compute()` is called after `transform()`.

This fix is similar to what was done in the earlier PR #3320. The current PR:

1. Fixes the `AttributeError` raised when calling `compute()` after `transform()` while using `cuml.dask.decomposition.PCA`.
2. Sets a default for `n_components` when it is not passed in the multi-GPU case: the code in this patch uses `min(total_rows, n_cols)`. (The tests were failing when `n_components` was not set, and I could not find a place where `n_components` defaults to 1, though I may be wrong. PR #3320 appears to have set `n_components=min(n_cols, n_rows)` as of 0.16 for single GPU; I am not entirely sure whether the same should apply to MG.)
3. Adds test cases for the above two changes.

Authors:
  - Anirban Das (https://github.com/akaanirban)

Approvers:
  - Micka (https://github.com/lowener)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/cuml/pull/3912
---
 python/cuml/decomposition/base_mg.pyx | 12 ++++++----
 python/cuml/decomposition/pca_mg.pyx  |  4 ++--
 python/cuml/test/dask/test_pca.py     | 32 ++++++++++++++++++++++++---
 3 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/python/cuml/decomposition/base_mg.pyx b/python/cuml/decomposition/base_mg.pyx
index 8615034c88..620b190fbb 100644
--- a/python/cuml/decomposition/base_mg.pyx
+++ b/python/cuml/decomposition/base_mg.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -30,7 +30,6 @@ from cython.operator cimport dereference as deref
 
 from cuml.common.array import CumlArray
 import cuml.common.opg_data_utils_mg as opg
-
 import cuml.internals
 from cuml.common.base import Base
 from cuml.raft.common.handle cimport handle_t
@@ -59,6 +58,11 @@ class BaseDecompositionMG(object):
         self._set_output_type(X[0])
         self._set_n_features_in(n_cols)
 
+        if self.n_components is None:
+            self._n_components = min(total_rows, n_cols)
+        else:
+            self._n_components = self.n_components
+
         X_arys = []
         for i in range(len(X)):
             if i == 0:
@@ -90,11 +94,11 @@ class BaseDecompositionMG(object):
             trans_arg = opg.build_data_t(trans_arys)
 
             trans_part_desc = opg.build_part_descriptor(total_rows,
-                                                        self.n_components,
+                                                        self._n_components,
                                                         rank_to_sizes,
                                                         rank)
 
-        self._initialize_arrays(self.n_components, total_rows, n_cols)
+        self._initialize_arrays(self._n_components, total_rows, n_cols)
         decomp_params = self._build_params(total_rows, n_cols)
 
         if _transform:
diff --git a/python/cuml/decomposition/pca_mg.pyx b/python/cuml/decomposition/pca_mg.pyx
index 175ffdd7e0..e3ee6197d6 100644
--- a/python/cuml/decomposition/pca_mg.pyx
+++ b/python/cuml/decomposition/pca_mg.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -116,7 +116,7 @@ class PCAMG(BaseDecompositionMG, PCA):
 
     def _build_params(self, n_rows, n_cols):
         cpdef paramsPCAMG *params = new paramsPCAMG()
-        params.n_components = self.n_components
+        params.n_components = self._n_components
         params.n_rows = n_rows
         params.n_cols = n_cols
         params.whiten = self.whiten
diff --git a/python/cuml/test/dask/test_pca.py b/python/cuml/test/dask/test_pca.py
index 85f0cc6588..6c21844517 100644
--- a/python/cuml/test/dask/test_pca.py
+++ b/python/cuml/test/dask/test_pca.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -87,7 +87,9 @@ def test_pca_fit_transform_fp32(nrows, ncols, n_parts, client):
                            random_state=10, dtype=np.float32)
 
     cupca = daskPCA(n_components=20, whiten=True)
-    cupca.fit_transform(X_cudf)
+    res = cupca.fit_transform(X_cudf)
+    res = res.compute()
+    assert res.shape[0] == nrows and res.shape[1] == 20
 
 
 @pytest.mark.mg
@@ -107,4 +109,28 @@ def test_pca_fit_transform_fp64(nrows, ncols, n_parts, client):
                            random_state=10, dtype=np.float64)
 
     cupca = daskPCA(n_components=30, whiten=False)
-    cupca.fit_transform(X_cudf)
+    res = cupca.fit_transform(X_cudf)
+    res = res.compute()
+    assert res.shape[0] == nrows and res.shape[1] == 30
+
+
+@pytest.mark.mg
+@pytest.mark.parametrize("nrows", [1000])
+@pytest.mark.parametrize("ncols", [20])
+@pytest.mark.parametrize("n_parts", [28])
+def test_pca_fit_transform_fp32_noncomponents(nrows, ncols, n_parts, client):
+    # n_components is omitted: MG PCA should default to min(nrows, ncols)
+    from cuml.dask.decomposition import PCA as daskPCA
+    from cuml.dask.datasets import make_blobs
+
+    X_cudf, _ = make_blobs(n_samples=nrows,
+                           n_features=ncols,
+                           centers=1,
+                           n_parts=n_parts,
+                           cluster_std=1.5,
+                           random_state=10, dtype=np.float32)
+
+    cupca = daskPCA(whiten=False)
+    res = cupca.fit_transform(X_cudf)
+    res = res.compute()
+    assert res.shape[0] == nrows and res.shape[1] == 20