Preserve indexing in methods when applied to DataFrame and Series objects (rapidsai#4317)

Closes rapidsai#4037.

Adds an index field to CumlArray so that outputs can be constructed with the correct index taken from the inputs.
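
For illustration only (not part of the PR), a sketch of the behaviour this enables, using DBSCAN with made-up data and index values; it assumes the estimator is constructed with output_type='cudf' so that fitted attributes come back as cuDF objects:

```python
import cudf
from cuml.cluster import DBSCAN

# A DataFrame with a non-default index
X = cudf.DataFrame({"x": [0.0, 0.1, 5.0, 5.1]},
                   index=[101, 102, 103, 104])

db = DBSCAN(eps=0.5, min_samples=1, output_type="cudf")
db.fit(X)

# labels_ is a cudf.Series; with this change its index matches X.index
# ([101, 102, 103, 104]) instead of a fresh default RangeIndex.
print(db.labels_.index)
```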

Authors:
  - Dante Gama Dessavre (https://github.com/dantegd)

Approvers:
  - William Hicks (https://github.com/wphicks)

URL: rapidsai#4317
dantegd authored Nov 13, 2021
1 parent 0d72c5a commit 9f00a11
Showing 23 changed files with 172 additions and 62 deletions.

python/cuml/cluster/dbscan.pyx (3 changes: 2 additions & 1 deletion)
@@ -259,7 +259,8 @@ class DBSCAN(Base,

cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()

self.labels_ = CumlArray.empty(n_rows, dtype=out_dtype)
self.labels_ = CumlArray.empty(n_rows, dtype=out_dtype,
index=X_m.index)
cdef uintptr_t labels_ptr = self.labels_.ptr

cdef uintptr_t core_sample_indices_ptr = <uintptr_t> NULL

python/cuml/cluster/hdbscan.pyx (2 changes: 1 addition & 1 deletion)
@@ -580,7 +580,7 @@ class HDBSCAN(Base, ClusterMixin, CMajorInputTagMixin):
self.n_connected_components_ = 1
self.n_leaves_ = n_rows

self.labels_ = CumlArray.empty(n_rows, dtype="int32")
self.labels_ = CumlArray.empty(n_rows, dtype="int32", index=X_m.index)
self.children_ = CumlArray.empty((2, n_rows), dtype="int32")
self.probabilities_ = CumlArray.empty(n_rows, dtype="float32")
self.sizes_ = CumlArray.empty(n_rows, dtype="int32")

python/cuml/cluster/kmeans.pyx (3 changes: 2 additions & 1 deletion)
@@ -476,7 +476,8 @@ class KMeans(Base,

cdef uintptr_t cluster_centers_ptr = self.cluster_centers_.ptr

self.labels_ = CumlArray.zeros(shape=n_rows, dtype=np.int32)
self.labels_ = CumlArray.zeros(shape=n_rows, dtype=np.int32,
index=X_m.index)
cdef uintptr_t labels_ptr = self.labels_.ptr

# Sum of squared distances of samples to their closest cluster center.

python/cuml/common/array.py (60 changes: 49 additions & 11 deletions)
@@ -98,7 +98,12 @@ class CumlArray(Buffer):

@nvtx.annotate(message="common.CumlArray.__init__", category="utils",
domain="cuml_python")
def __init__(self, data=None, owner=None, dtype=None, shape=None,
def __init__(self,
data=None,
index=None,
owner=None,
dtype=None,
shape=None,
order=None):

# Checks of parameters
@@ -148,6 +153,7 @@ def __init__(self, data=None, owner=None, dtype=None, shape=None,
else:
flattened_data = data

self._index = index
super().__init__(data=flattened_data,
owner=owner,
size=size)
@@ -179,6 +185,16 @@ def __init__(self, data=None, owner=None, dtype=None, shape=None,
self.strides = ary_interface['strides']
self.order = _strides_to_order(self.strides, self.dtype)

# We use the index as a property to allow for validation/processing
# in the future if needed
@property
def index(self):
return self._index

@index.setter
def index(self, index):
self._index = index

@with_cupy_rmm
def __getitem__(self, slice):
return CumlArray(data=cp.asarray(self).__getitem__(slice))
@@ -267,7 +283,8 @@ def to_output(self, output_type='cupy', output_dtype=None):
mat = cp.asarray(self, dtype=output_dtype)
if len(mat.shape) == 1:
mat = mat.reshape(mat.shape[0], 1)
return DataFrame(mat)
return DataFrame(mat,
index=self.index)
else:
raise ValueError('cuDF unsupported Array dtype')

@@ -277,7 +294,9 @@ def to_output(self, output_type='cupy', output_dtype=None):
if len(self.shape) == 1:
if self.dtype not in [np.uint8, np.uint16, np.uint32,
np.uint64, np.float16]:
return Series(self, dtype=output_dtype)
return Series(self,
dtype=output_dtype,
index=self.index)
else:
raise ValueError('cuDF unsupported Array dtype')
elif self.shape[1] > 1:
@@ -307,7 +326,11 @@ def serialize(self):
@classmethod
@nvtx.annotate(message="common.CumlArray.empty", category="utils",
domain="cuml_python")
def empty(cls, shape, dtype, order='F'):
def empty(cls,
shape,
dtype,
order='F',
index=None):
"""
Create an empty Array with an allocated but uninitialized DeviceBuffer
@@ -321,12 +344,17 @@ def empty(cls, shape, dtype, order='F'):
Whether to create a F-major or C-major array.
"""

return CumlArray(cp.empty(shape, dtype, order))
return CumlArray(cp.empty(shape, dtype, order), index=index)

@classmethod
@nvtx.annotate(message="common.CumlArray.full", category="utils",
domain="cuml_python")
def full(cls, shape, value, dtype, order='F'):
def full(cls,
shape,
value,
dtype,
order='F',
index=None):
"""
Create an Array with an allocated DeviceBuffer initialized to value.
@@ -340,12 +368,16 @@ def full(cls, shape, value, dtype, order='F'):
Whether to create a F-major or C-major array.
"""

return CumlArray(cp.full(shape, value, dtype, order))
return CumlArray(cp.full(shape, value, dtype, order), index=index)

@classmethod
@nvtx.annotate(message="common.CumlArray.zeros", category="utils",
domain="cuml_python")
def zeros(cls, shape, dtype='float32', order='F'):
def zeros(cls,
shape,
dtype='float32',
order='F',
index=None):
"""
Create an Array with an allocated DeviceBuffer initialized to zeros.
@@ -358,12 +390,17 @@ def zeros(cls, shape, dtype='float32', order='F'):
order: string, optional
Whether to create a F-major or C-major array.
"""
return CumlArray.full(value=0, shape=shape, dtype=dtype, order=order)
return CumlArray.full(value=0, shape=shape, dtype=dtype, order=order,
index=index)

@classmethod
@nvtx.annotate(message="common.CumlArray.ones", category="utils",
domain="cuml_python")
def ones(cls, shape, dtype='float32', order='F'):
def ones(cls,
shape,
dtype='float32',
order='F',
index=None):
"""
Create an Array with an allocated DeviceBuffer initialized to ones.
@@ -376,7 +413,8 @@ def ones(cls, shape, dtype='float32', order='F'):
order: string, optional
Whether to create a F-major or C-major array.
"""
return CumlArray.full(value=1, shape=shape, dtype=dtype, order=order)
return CumlArray.full(value=1, shape=shape, dtype=dtype, order=order,
index=index)


def _check_low_level_type(data):
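
The array.py changes above are the core of the PR. Below is a minimal sketch (not part of the commit) of how the new index argument and property behave; it assumes the cuml.common.array import path shown in this diff, cuDF's RangeIndex constructor, and cuML-21.12-era APIs:

```python
import cudf
import numpy as np
from cuml.common.array import CumlArray

idx = cudf.RangeIndex(start=100, stop=103)    # any non-default index

# The factory methods now accept and store an index
arr = CumlArray.zeros(3, dtype=np.float32, index=idx)
print(arr.index)                               # the stored index, via the new property

# to_output passes the stored index to the cuDF constructor
ser = arr.to_output(output_type="cudf")        # 1-D array -> cudf.Series
print(ser.index)                               # RangeIndex(start=100, stop=103, step=1)
```

Keeping the index on the array itself, rather than on each estimator, means every to_output call (Series or DataFrame) picks it up automatically.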

python/cuml/common/array_sparse.py (1 change: 1 addition & 0 deletions)
@@ -119,6 +119,7 @@ def __init__(self, data=None,
self.shape = data.shape
self.dtype = self.data.dtype
self.nnz = data.nnz
self.index = None

@nvtx.annotate(message="common.SparseCumlArray.to_output",
category="utils", domain="cuml_python")

python/cuml/common/input_utils.py (20 changes: 14 additions & 6 deletions)
@@ -308,6 +308,8 @@ def check_order(arr_order):
safe_dtype=safe_dtype_conversion)
check_dtype = False

index = getattr(X, 'index', None)

# format conversion

if (isinstance(X, cudf.Series)):
@@ -322,9 +324,11 @@ def check_order(arr_order):

if isinstance(X, cudf.DataFrame):
if order == 'K':
X_m = CumlArray(data=X.as_gpu_matrix(order='F'))
X_m = CumlArray(data=X.as_gpu_matrix(order='F'),
index=index)
else:
X_m = CumlArray(data=X.as_gpu_matrix(order=order))
X_m = CumlArray(data=X.as_gpu_matrix(order=order),
index=index)

elif isinstance(X, CumlArray):
X_m = X
@@ -349,7 +353,6 @@ def check_order(arr_order):
if not _check_array_contiguity(X):
debug("Non contiguous array or view detected, a "
"contiguous copy of the data will be done.")
# X = cp.array(X, order=order, copy=True)
make_copy = True

# If we have a host array, we copy it first before changing order
@@ -359,7 +362,8 @@ def check_order(arr_order):

cp_arr = cp.array(X, copy=make_copy, order=order)

X_m = CumlArray(data=cp_arr)
X_m = CumlArray(data=cp_arr,
index=index)

if deepcopy:
X_m = copy.deepcopy(X_m)
@@ -404,9 +408,13 @@ def check_order(arr_order):

if (check_order(X_m.order)):
X_m = cp.array(X_m, copy=False, order=order)
X_m = CumlArray(data=X_m)
X_m = CumlArray(data=X_m,
index=index)

return cuml_array(array=X_m, n_rows=n_rows, n_cols=n_cols, dtype=X_m.dtype)
return cuml_array(array=X_m,
n_rows=n_rows,
n_cols=n_cols,
dtype=X_m.dtype)


@nvtx.annotate(message="common.input_utils.input_to_cupy_array",
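
For context, a hedged sketch of what capturing index = getattr(X, 'index', None) in input_to_cuml_array means for callers; the import path and the (array, n_rows, n_cols, dtype) unpacking below are assumptions based on this diff:

```python
import cudf
import numpy as np
from cuml.common import input_to_cuml_array

# cuDF inputs: the DataFrame's index is recorded on the returned CumlArray
df = cudf.DataFrame({"a": [1.0, 2.0, 3.0]}, index=[7, 8, 9])
X_m, n_rows, n_cols, dtype = input_to_cuml_array(df)
print(X_m.index)    # same labels as df.index

# Host arrays have no .index attribute, so getattr(X, 'index', None) yields None
X_np, *_ = input_to_cuml_array(np.ones((3, 2), dtype=np.float32))
print(X_np.index)   # None
```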

python/cuml/decomposition/pca.pyx (2 changes: 1 addition & 1 deletion)
@@ -693,7 +693,7 @@ class PCA(Base,

t_input_data = \
CumlArray.zeros((params.n_rows, params.n_components),
dtype=dtype.type)
dtype=dtype.type, index=X_m.index)

cdef uintptr_t _trans_input_ptr = t_input_data.ptr
cdef uintptr_t components_ptr = self.components_.ptr

python/cuml/decomposition/tsvd.pyx (14 changes: 7 additions & 7 deletions)
@@ -346,7 +346,7 @@ class TruncatedSVD(Base,
self.singular_values_.ptr

_trans_input_ = CumlArray.zeros((params.n_rows, params.n_components),
dtype=self.dtype)
dtype=self.dtype, index=X_m.index)
cdef uintptr_t t_input_ptr = _trans_input_.ptr

if self.n_components> self.n_cols:
@@ -389,7 +389,7 @@ class TruncatedSVD(Base,

"""

trans_input, n_rows, _, dtype = \
X_m, n_rows, _, dtype = \
input_to_cuml_array(X, check_dtype=self.dtype,
convert_to_dtype=(self.dtype if convert_dtype
else None))
@@ -400,9 +400,9 @@ class TruncatedSVD(Base,
params.n_cols = self.n_cols

input_data = CumlArray.zeros((params.n_rows, params.n_cols),
dtype=self.dtype)
dtype=self.dtype, index=X_m.index)

cdef uintptr_t trans_input_ptr = trans_input.ptr
cdef uintptr_t trans_input_ptr = X_m.ptr
cdef uintptr_t input_ptr = input_data.ptr
cdef uintptr_t components_ptr = self.components_.ptr

@@ -436,7 +436,7 @@ class TruncatedSVD(Base,
Perform dimensionality reduction on X.

"""
input, n_rows, _, dtype = \
X_m, n_rows, _, dtype = \
input_to_cuml_array(X, check_dtype=self.dtype,
convert_to_dtype=(self.dtype if convert_dtype
else None),
@@ -449,9 +449,9 @@ class TruncatedSVD(Base,

t_input_data = \
CumlArray.zeros((params.n_rows, params.n_components),
dtype=self.dtype)
dtype=self.dtype, index=X_m.index)

cdef uintptr_t input_ptr = input.ptr
cdef uintptr_t input_ptr = X_m.ptr
cdef uintptr_t trans_input_ptr = t_input_data.ptr
cdef uintptr_t components_ptr = self.components_.ptr


python/cuml/experimental/linear_model/lars.pyx (2 changes: 1 addition & 1 deletion)
@@ -365,7 +365,7 @@ class Lars(Base, RegressorMixin):
cdef uintptr_t active_idx_ptr = \
input_to_cuml_array(self.active_).array.ptr

preds = CumlArray.zeros(n_rows, dtype=self.dtype)
preds = CumlArray.zeros(n_rows, dtype=self.dtype, index=X_m.index)

if self.dtype == np.float32:
larsPredict(handle_[0], <float*> X_ptr, <int> n_rows,

python/cuml/fil/fil.pyx (12 changes: 7 additions & 5 deletions)
@@ -329,11 +329,13 @@ cdef class ForestInference_impl():
shape += (2,)
else:
shape += (self.num_class,)
preds = CumlArray.empty(shape=shape, dtype=np.float32, order='C')
elif (not isinstance(preds, cudf.Series) and
not rmm.is_cuda_array(preds)):
raise ValueError("Invalid type for output preds,"
" need GPU array")
preds = CumlArray.empty(shape=shape, dtype=np.float32, order='C',
index=X_m.index)
else:
if not hasattr(preds, "__cuda_array_interface__"):
raise ValueError("Invalid type for output preds,"
" need GPU array")
preds.index = X_m.index

cdef uintptr_t preds_ptr
preds_ptr = preds.ptr

python/cuml/linear_model/base.pyx (6 changes: 2 additions & 4 deletions)
@@ -67,17 +67,15 @@ class LinearPredictMixin:
Predicts `y` values for `X`.

"""
cdef uintptr_t X_ptr
X_m, n_rows, n_cols, dtype = \
input_to_cuml_array(X, check_dtype=self.dtype,
convert_to_dtype=(self.dtype if convert_dtype
else None),
check_cols=self.n_cols)
X_ptr = X_m.ptr

cdef uintptr_t X_ptr = X_m.ptr
cdef uintptr_t coef_ptr = self.coef_.ptr

preds = CumlArray.zeros(n_rows, dtype=dtype)
preds = CumlArray.zeros(n_rows, dtype=dtype, index=X_m.index)
cdef uintptr_t preds_ptr = preds.ptr

cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()

python/cuml/manifold/t_sne.pyx (4 changes: 2 additions & 2 deletions)
@@ -422,7 +422,6 @@ class TSNE(Base,
convert_format=False)
n, p = self.X_m.shape
self.sparse_fit = True

# Handle dense inputs
else:
self.X_m, n, p, _ = \
@@ -451,7 +450,8 @@ class TSNE(Base,
self.embedding_ = CumlArray.zeros(
(n, self.n_components),
order="F",
dtype=np.float32)
dtype=np.float32,
index=self.X_m.index)

cdef uintptr_t embed_ptr = self.embedding_.ptr


python/cuml/manifold/umap.pyx (8 changes: 6 additions & 2 deletions)
@@ -560,7 +560,8 @@ class UMAP(Base,

self.embedding_ = CumlArray.zeros((self.n_rows,
self.n_components),
order="C", dtype=np.float32)
order="C", dtype=np.float32,
index=self.X_m.index)

if self.hash_input:
with using_output_type("numpy"):
@@ -720,12 +721,14 @@ class UMAP(Base,
if is_sparse(X):
X_m = SparseCumlArray(X, convert_to_dtype=cupy.float32,
convert_format=False)
index = None
else:
X_m, n_rows, n_cols, dtype = \
input_to_cuml_array(X, order='C', check_dtype=np.float32,
convert_to_dtype=(np.float32
if convert_dtype
else None))
index = X_m.index
n_rows = X_m.shape[0]
n_cols = X_m.shape[1]

@@ -745,7 +748,8 @@ class UMAP(Base,

embedding = CumlArray.zeros((X_m.shape[0],
self.n_components),
order="C", dtype=np.float32)
order="C", dtype=np.float32,
index=index)
cdef uintptr_t xformed_ptr = embedding.ptr

(knn_indices_m, knn_indices_ctype), (knn_dists_m, knn_dists_ctype) =\