Skip to content

Commit

Permalink
Merge pull request #33 from dantegd/fea-numpy-input
Browse files Browse the repository at this point in the history
[REVIEW] Added ability to call cuML with numpy arrays and general test cleaning
  • Loading branch information
oyilmaz-nvidia authored Nov 13, 2018
2 parents 655dbeb + e28ab6d commit 7a02237
Show file tree
Hide file tree
Showing 17 changed files with 433 additions and 1,874 deletions.
16 changes: 15 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
## common
__pycache__
*.pyc
*.o
*.so
*.dylib
.cache
.coverage
.vscode
*.swp
__pycache__/
*.pytest_cache
htmlcov
build/
cuml.egg-info/
dist/
python/cuML/cuml.cpp
log

## eclipse
.project
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,12 @@ $ make -j
$ ./ml_test
```

To test the Python package:

```
$ py.test python/cuML/test
```

### Python Notebooks

Demo notebooks can be found in the python/notebooks folder.
Expand Down
58 changes: 33 additions & 25 deletions python/cuML/dbscan/dbscan_wrapper.pyx
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
#
# Copyright (c) 2018, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright (c) 2018, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

cimport c_dbscan
import numpy as np
Expand Down Expand Up @@ -79,17 +79,24 @@ class DBSCAN:
Dense matrix (floats or doubles) of shape (n_samples, n_features)
"""

x = []
for col in X.columns:
x.append(X[col]._column.dtype)
break
cdef uintptr_t input_ptr
if (isinstance(X, cudf.DataFrame)):
self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
X_m = X.as_gpu_matrix(order='C')
self.n_rows = len(X)
self.n_cols = len(X._cols)

elif (isinstance(X, np.ndarray)):
self.gdf_datatype = X.dtype
X_m = cuda.to_device(X)
self.n_rows = X.shape[0]
self.n_cols = X.shape[1]

self.gdf_datatype = np.dtype(x[0])
self.n_rows = len(X)
self.n_cols = len(X._cols)
else:
msg = "X matrix format not supported"
raise TypeError(msg)

X_m = X.as_gpu_matrix()
cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
input_ptr = self._get_ctype_ptr(X_m)

self.labels_ = cudf.Series(np.zeros(self.n_rows, dtype=np.int32))
cdef uintptr_t labels_ptr = self._get_column_ptr(self.labels_)
Expand All @@ -100,15 +107,16 @@ class DBSCAN:
<int> self.n_cols,
<float> self.eps,
<int> self.min_samples,
<int*> labels_ptr)
<int*> labels_ptr)
else:
c_dbscan.dbscanFit(<double*>input_ptr,
<int> self.n_rows,
<int> self.n_cols,
<double> self.eps,
<int> self.min_samples,
<int*> labels_ptr)
<int*> labels_ptr)
del(X_m)
return self

def fit_predict(self, X):
"""
Expand Down
99 changes: 67 additions & 32 deletions python/cuML/kmeans/kmeans_wrapper.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
#
# Copyright (c) 2018, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

cimport c_kmeans
import numpy as np
from numba import cuda
Expand Down Expand Up @@ -106,7 +122,6 @@ class KMeans:
c = gdf.as_gpu_matrix(order='C').shape
return self._get_ctype_ptr(gdf.as_gpu_matrix(order='C'))


def fit(self, X):
"""
Compute k-means clustering with X.
Expand All @@ -119,13 +134,25 @@ class KMeans:
"""

self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
self.n_rows = len(X)
self.n_cols = len(X._cols)

# cdef np.ndarray[np.float32_t, ndim=2, mode = 'c', cast=True] host_ary = input_gdf.as_gpu_matrix(order='C').copy_to_host()
cdef uintptr_t input_ptr
if (isinstance(X, cudf.DataFrame)):
self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
X_m = X.as_gpu_matrix(order='C')
self.n_rows = len(X)
self.n_cols = len(X._cols)

elif (isinstance(X, np.ndarray)):
self.gdf_datatype = X.dtype
X_m = cuda.to_device(X)
self.n_rows = X.shape[0]
self.n_cols = X.shape[1]

X_m = X.as_gpu_matrix()
cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
else:
msg = "X matrix format not supported"
raise TypeError(msg)

input_ptr = self._get_ctype_ptr(X_m)

self.labels_ = cudf.Series(np.zeros(self.n_rows, dtype=np.int32))
cdef uintptr_t labels_ptr = self._get_column_ptr(self.labels_)
Expand Down Expand Up @@ -174,7 +201,6 @@ class KMeans:
<double*> cluster_centers_ptr, # pred_centroids
<int*> labels_ptr) # pred_labels


cluster_centers_gdf = cudf.DataFrame()
for i in range(0, self.n_cols):
cluster_centers_gdf[str(i)] = self.cluster_centers_[i:self.n_clusters*self.n_cols:self.n_cols]
Expand All @@ -184,7 +210,6 @@ class KMeans:

return self


def fit_predict(self, X):
"""
Compute cluster centers and predict cluster index for each sample.
Expand All @@ -197,8 +222,6 @@ class KMeans:
"""
return self.fit(X).labels_



def predict(self, X):
"""
Predict the closest cluster each sample in X belongs to.
Expand All @@ -210,11 +233,25 @@ class KMeans:
"""
self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
self.n_rows = len(X)
self.n_cols = len(X._cols)

X_m = X.as_gpu_matrix()
cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
cdef uintptr_t input_ptr
if (isinstance(X, cudf.DataFrame)):
self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
X_m = X.as_gpu_matrix(order='C')
self.n_rows = len(X)
self.n_cols = len(X._cols)

elif (isinstance(X, np.ndarray)):
self.gdf_datatype = X.dtype
X_m = cuda.to_device(X)
self.n_rows = X.shape[0]
self.n_cols = X.shape[1]

else:
msg = "X matrix format not supported"
raise TypeError(msg)

input_ptr = self._get_ctype_ptr(X_m)

clust_mat = self.cluster_centers_.as_gpu_matrix(order='C')
cdef uintptr_t cluster_centers_ptr = self._get_ctype_ptr(clust_mat)
Expand Down Expand Up @@ -267,8 +304,6 @@ class KMeans:
del(clust_mat)
return self.labels_



def transform(self, X):
"""
Transform X to a cluster-distance space.
Expand All @@ -280,31 +315,33 @@ class KMeans:
"""

self.n_rows = len(X)
self.n_cols = len(X._cols)


# cdef np.ndarray[np.float32_t, ndim=2, mode = 'c', cast=True] host_ary = input_gdf.as_gpu_matrix(order='C').copy_to_host()
cdef uintptr_t input_ptr
if (isinstance(X, cudf.DataFrame)):
self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
X_m = X.as_gpu_matrix(order='C')
self.n_rows = len(X)
self.n_cols = len(X._cols)

# cdef np.ndarray[np.float32_t, ndim=2, mode = 'c', cast=True] cluster_centers_ptr = self.cluster_centers_.as_gpu_matrix(order='C').copy_to_host()
elif (isinstance(X, np.ndarray)):
self.gdf_datatype = X.dtype
X_m = cuda.to_device(X)
self.n_rows = X.shape[0]
self.n_cols = X.shape[1]

else:
msg = "X matrix format not supported"
raise TypeError(msg)

X_m = X.as_gpu_matrix()
cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
input_ptr = self._get_ctype_ptr(X_m)

clust_mat = self.cluster_centers_.as_gpu_matrix(order='C')
cdef uintptr_t cluster_centers_ptr = self._get_ctype_ptr(clust_mat)

preds_data = cuda.to_device(np.zeros(self.n_clusters*self.n_rows,
dtype=self.gdf_datatype.type))
dtype=self.gdf_datatype.type))

cdef uintptr_t preds_ptr = self._get_ctype_ptr(preds_data)


ary=np.array([1.0,1.5,3.5,2.5],dtype=np.float32)
dary=cuda.to_device(ary)
cdef uintptr_t ptr2 = dary.device_ctypes_pointer.value

if self.gdf_datatype.type == np.float32:
c_kmeans.kmeans_transform(
<int> self.verbose, # verbose
Expand All @@ -330,7 +367,6 @@ class KMeans:
<double*> cluster_centers_ptr, # centroids
<double*> preds_ptr) # preds


preds_gdf = cudf.DataFrame()
for i in range(0, self.n_clusters):
preds_gdf[str(i)] = preds_data[i*self.n_rows:(i+1)*self.n_rows]
Expand All @@ -339,7 +375,6 @@ class KMeans:
del(clust_mat)
return preds_gdf


def fit_transform(self, input_gdf):
"""
Compute clustering and transform input_gdf to cluster-distance space.
Expand Down
Loading

0 comments on commit 7a02237

Please sign in to comment.