Merge pull request #33 from dantegd/fea-numpy-input

[REVIEW] Added ability to call cuML with numpy arrays and general test cleaning
rapidsai · Nov 13, 2018 · 7a02237 · 7a02237
2 parents 655dbeb + e28ab6d
commit 7a02237
Show file tree

Hide file tree

Showing 17 changed files with 433 additions and 1,874 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,20 @@
+## common
+__pycache__
+*.pyc
+*.o
+*.so
+*.dylib
+.cache
+.coverage
+.vscode
 *.swp
-__pycache__/
+*.pytest_cache
+htmlcov
 build/
 cuml.egg-info/
 dist/
 python/cuML/cuml.cpp
+log
+
+## eclipse
+.project
diff --git a/README.md b/README.md
@@ -94,6 +94,12 @@ $ make -j
 $ ./ml_test
 ```
 
+To test the Python package:
+
+```
+$ py.test python/cuML/test
+```
+
 ### Python Notebooks
 
 Demo notebooks can be found in the python/notebooks folder.

diff --git a/python/cuML/dbscan/dbscan_wrapper.pyx b/python/cuML/dbscan/dbscan_wrapper.pyx
@@ -1,18 +1,18 @@
 #
- # Copyright (c) 2018, NVIDIA CORPORATION.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
+# Copyright (c) 2018, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 
 cimport c_dbscan
 import numpy as np
@@ -79,17 +79,24 @@ class DBSCAN:
                Dense matrix (floats or doubles) of shape (n_samples, n_features)
         """
 
-        x = []
-        for col in X.columns:
-            x.append(X[col]._column.dtype)
-            break
+        cdef uintptr_t input_ptr
+        if (isinstance(X, cudf.DataFrame)):
+            self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
+            X_m = X.as_gpu_matrix(order='C')
+            self.n_rows = len(X)
+            self.n_cols = len(X._cols)
+
+        elif (isinstance(X, np.ndarray)):
+            self.gdf_datatype = X.dtype
+            X_m = cuda.to_device(X)
+            self.n_rows = X.shape[0]
+            self.n_cols = X.shape[1]
 
-        self.gdf_datatype = np.dtype(x[0])
-        self.n_rows = len(X)
-        self.n_cols = len(X._cols)
+        else:
+            msg = "X matrix format  not supported"
+            raise TypeError(msg)
 
-        X_m = X.as_gpu_matrix()
-        cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
+        input_ptr = self._get_ctype_ptr(X_m)
 
         self.labels_ = cudf.Series(np.zeros(self.n_rows, dtype=np.int32))
         cdef uintptr_t labels_ptr = self._get_column_ptr(self.labels_)
@@ -100,15 +107,16 @@ class DBSCAN:
                                <int> self.n_cols,
                                <float> self.eps,
                                <int> self.min_samples,
-		               <int*> labels_ptr)
+		                       <int*> labels_ptr)
         else:
             c_dbscan.dbscanFit(<double*>input_ptr,
                                <int> self.n_rows,
                                <int> self.n_cols,
                                <double> self.eps,
                                <int> self.min_samples,
-		               <int*> labels_ptr)
+		                       <int*> labels_ptr)
         del(X_m)
+        return self
 
     def fit_predict(self, X):
         """

diff --git a/python/cuML/kmeans/kmeans_wrapper.pyx b/python/cuML/kmeans/kmeans_wrapper.pyx
@@ -1,3 +1,19 @@
+#
+# Copyright (c) 2018, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
 cimport c_kmeans
 import numpy as np
 from numba import cuda
@@ -106,7 +122,6 @@ class KMeans:
         c = gdf.as_gpu_matrix(order='C').shape
         return self._get_ctype_ptr(gdf.as_gpu_matrix(order='C'))
 
-
     def fit(self, X):
         """
         Compute k-means clustering with X.
@@ -119,13 +134,25 @@ class KMeans:
         """
 
         self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
-        self.n_rows = len(X)
-        self.n_cols = len(X._cols)
 
-        # cdef np.ndarray[np.float32_t, ndim=2, mode = 'c', cast=True] host_ary = input_gdf.as_gpu_matrix(order='C').copy_to_host()
+        cdef uintptr_t input_ptr
+        if (isinstance(X, cudf.DataFrame)):
+            self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
+            X_m = X.as_gpu_matrix(order='C')
+            self.n_rows = len(X)
+            self.n_cols = len(X._cols)
+
+        elif (isinstance(X, np.ndarray)):
+            self.gdf_datatype = X.dtype
+            X_m = cuda.to_device(X)
+            self.n_rows = X.shape[0]
+            self.n_cols = X.shape[1]
 
-        X_m = X.as_gpu_matrix()
-        cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
+        else:
+            msg = "X matrix format  not supported"
+            raise TypeError(msg)
+
+        input_ptr = self._get_ctype_ptr(X_m)
 
         self.labels_ = cudf.Series(np.zeros(self.n_rows, dtype=np.int32))
         cdef uintptr_t labels_ptr = self._get_column_ptr(self.labels_)
@@ -174,7 +201,6 @@ class KMeans:
                 <double*> cluster_centers_ptr, # pred_centroids
                 <int*> labels_ptr)          # pred_labels
 
-
         cluster_centers_gdf = cudf.DataFrame()
         for i in range(0, self.n_cols):
             cluster_centers_gdf[str(i)] = self.cluster_centers_[i:self.n_clusters*self.n_cols:self.n_cols]
@@ -184,7 +210,6 @@ class KMeans:
 
         return self
 
-
     def fit_predict(self, X):
         """
         Compute cluster centers and predict cluster index for each sample.
@@ -197,8 +222,6 @@ class KMeans:
         """
         return self.fit(X).labels_
 
-
-
     def predict(self, X):
         """
         Predict the closest cluster each sample in X belongs to.
@@ -210,11 +233,25 @@ class KMeans:
 
         """
         self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
-        self.n_rows = len(X)
-        self.n_cols = len(X._cols)
 
-        X_m = X.as_gpu_matrix()
-        cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
+        cdef uintptr_t input_ptr
+        if (isinstance(X, cudf.DataFrame)):
+            self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
+            X_m = X.as_gpu_matrix(order='C')
+            self.n_rows = len(X)
+            self.n_cols = len(X._cols)
+
+        elif (isinstance(X, np.ndarray)):
+            self.gdf_datatype = X.dtype
+            X_m = cuda.to_device(X)
+            self.n_rows = X.shape[0]
+            self.n_cols = X.shape[1]
+
+        else:
+            msg = "X matrix format  not supported"
+            raise TypeError(msg)
+
+        input_ptr = self._get_ctype_ptr(X_m)
 
         clust_mat = self.cluster_centers_.as_gpu_matrix(order='C')
         cdef uintptr_t cluster_centers_ptr = self._get_ctype_ptr(clust_mat)
@@ -267,8 +304,6 @@ class KMeans:
         del(clust_mat)
         return self.labels_
 
-
-
     def transform(self, X):
         """
         Transform X to a cluster-distance space.
@@ -280,31 +315,33 @@ class KMeans:
 
         """
 
-        self.n_rows = len(X)
-        self.n_cols = len(X._cols)
-
-
-        # cdef np.ndarray[np.float32_t, ndim=2, mode = 'c', cast=True] host_ary = input_gdf.as_gpu_matrix(order='C').copy_to_host()
+        cdef uintptr_t input_ptr
+        if (isinstance(X, cudf.DataFrame)):
+            self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
+            X_m = X.as_gpu_matrix(order='C')
+            self.n_rows = len(X)
+            self.n_cols = len(X._cols)
 
-        # cdef np.ndarray[np.float32_t, ndim=2, mode = 'c', cast=True] cluster_centers_ptr = self.cluster_centers_.as_gpu_matrix(order='C').copy_to_host()
+        elif (isinstance(X, np.ndarray)):
+            self.gdf_datatype = X.dtype
+            X_m = cuda.to_device(X)
+            self.n_rows = X.shape[0]
+            self.n_cols = X.shape[1]
 
+        else:
+            msg = "X matrix format  not supported"
+            raise TypeError(msg)
 
-        X_m = X.as_gpu_matrix()
-        cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
+        input_ptr = self._get_ctype_ptr(X_m)
 
         clust_mat = self.cluster_centers_.as_gpu_matrix(order='C')
         cdef uintptr_t cluster_centers_ptr = self._get_ctype_ptr(clust_mat)
 
         preds_data = cuda.to_device(np.zeros(self.n_clusters*self.n_rows,
-                                       dtype=self.gdf_datatype.type))
+                                    dtype=self.gdf_datatype.type))
 
         cdef uintptr_t preds_ptr = self._get_ctype_ptr(preds_data)
 
-
-        ary=np.array([1.0,1.5,3.5,2.5],dtype=np.float32)
-        dary=cuda.to_device(ary)
-        cdef uintptr_t ptr2 = dary.device_ctypes_pointer.value
-
         if self.gdf_datatype.type == np.float32:
             c_kmeans.kmeans_transform(
                 <int> self.verbose,                    # verbose
@@ -330,7 +367,6 @@ class KMeans:
                 <double*> cluster_centers_ptr,    # centroids
                 <double*> preds_ptr)          # preds
 
-
         preds_gdf = cudf.DataFrame()
         for i in range(0, self.n_clusters):
             preds_gdf[str(i)] = preds_data[i*self.n_rows:(i+1)*self.n_rows]
@@ -339,7 +375,6 @@ class KMeans:
         del(clust_mat)
         return preds_gdf
 
-
     def fit_transform(self, input_gdf):
         """
         Compute clustering and transform input_gdf to cluster-distance space.