Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Added ability to call cuML with numpy arrays and general test cleaning #33

Merged
merged 10 commits into from
Nov 13, 2018
16 changes: 15 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
## common
__pycache__
*.pyc
*.o
*.so
*.dylib
.cache
.coverage
.vscode
*.swp
__pycache__/
*.pytest_cache
htmlcov
build/
cuml.egg-info/
dist/
python/cuML/cuml.cpp
log

## eclipse
.project
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,12 @@ $ make -j
$ ./ml_test
```

To test the Python package:

```
$ py.test python/cuML/test
```

### Python Notebooks

Demo notebooks can be found in the python/notebooks folder.
Expand Down
58 changes: 33 additions & 25 deletions python/cuML/dbscan/dbscan_wrapper.pyx
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
#
# Copyright (c) 2018, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright (c) 2018, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

cimport c_dbscan
import numpy as np
Expand Down Expand Up @@ -79,17 +79,24 @@ class DBSCAN:
Dense matrix (floats or doubles) of shape (n_samples, n_features)
"""

x = []
for col in X.columns:
x.append(X[col]._column.dtype)
break
cdef uintptr_t input_ptr
if (isinstance(X, cudf.DataFrame)):
self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
X_m = X.as_gpu_matrix(order='C')
self.n_rows = len(X)
self.n_cols = len(X._cols)

elif (isinstance(X, np.ndarray)):
self.gdf_datatype = X.dtype
X_m = cuda.to_device(X)
self.n_rows = X.shape[0]
self.n_cols = X.shape[1]

self.gdf_datatype = np.dtype(x[0])
self.n_rows = len(X)
self.n_cols = len(X._cols)
else:
msg = "X matrix format not supported"
raise TypeError(msg)

X_m = X.as_gpu_matrix()
cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
input_ptr = self._get_ctype_ptr(X_m)

self.labels_ = cudf.Series(np.zeros(self.n_rows, dtype=np.int32))
cdef uintptr_t labels_ptr = self._get_column_ptr(self.labels_)
Expand All @@ -100,15 +107,16 @@ class DBSCAN:
<int> self.n_cols,
<float> self.eps,
<int> self.min_samples,
<int*> labels_ptr)
<int*> labels_ptr)
else:
c_dbscan.dbscanFit(<double*>input_ptr,
<int> self.n_rows,
<int> self.n_cols,
<double> self.eps,
<int> self.min_samples,
<int*> labels_ptr)
<int*> labels_ptr)
del(X_m)
return self

def fit_predict(self, X):
"""
Expand Down
99 changes: 67 additions & 32 deletions python/cuML/kmeans/kmeans_wrapper.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
#
# Copyright (c) 2018, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

cimport c_kmeans
import numpy as np
from numba import cuda
Expand Down Expand Up @@ -106,7 +122,6 @@ class KMeans:
c = gdf.as_gpu_matrix(order='C').shape
return self._get_ctype_ptr(gdf.as_gpu_matrix(order='C'))


def fit(self, X):
"""
Compute k-means clustering with X.
Expand All @@ -119,13 +134,25 @@ class KMeans:
"""

self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
self.n_rows = len(X)
self.n_cols = len(X._cols)

# cdef np.ndarray[np.float32_t, ndim=2, mode = 'c', cast=True] host_ary = input_gdf.as_gpu_matrix(order='C').copy_to_host()
cdef uintptr_t input_ptr
if (isinstance(X, cudf.DataFrame)):
self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
X_m = X.as_gpu_matrix(order='C')
self.n_rows = len(X)
self.n_cols = len(X._cols)

elif (isinstance(X, np.ndarray)):
self.gdf_datatype = X.dtype
X_m = cuda.to_device(X)
self.n_rows = X.shape[0]
self.n_cols = X.shape[1]

X_m = X.as_gpu_matrix()
cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
else:
msg = "X matrix format not supported"
raise TypeError(msg)

input_ptr = self._get_ctype_ptr(X_m)

self.labels_ = cudf.Series(np.zeros(self.n_rows, dtype=np.int32))
cdef uintptr_t labels_ptr = self._get_column_ptr(self.labels_)
Expand Down Expand Up @@ -174,7 +201,6 @@ class KMeans:
<double*> cluster_centers_ptr, # pred_centroids
<int*> labels_ptr) # pred_labels


cluster_centers_gdf = cudf.DataFrame()
for i in range(0, self.n_cols):
cluster_centers_gdf[str(i)] = self.cluster_centers_[i:self.n_clusters*self.n_cols:self.n_cols]
Expand All @@ -184,7 +210,6 @@ class KMeans:

return self


def fit_predict(self, X):
"""
Compute cluster centers and predict cluster index for each sample.
Expand All @@ -197,8 +222,6 @@ class KMeans:
"""
return self.fit(X).labels_



def predict(self, X):
"""
Predict the closest cluster each sample in X belongs to.
Expand All @@ -210,11 +233,25 @@ class KMeans:

"""
self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
self.n_rows = len(X)
self.n_cols = len(X._cols)

X_m = X.as_gpu_matrix()
cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
cdef uintptr_t input_ptr
if (isinstance(X, cudf.DataFrame)):
self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
X_m = X.as_gpu_matrix(order='C')
self.n_rows = len(X)
self.n_cols = len(X._cols)

elif (isinstance(X, np.ndarray)):
self.gdf_datatype = X.dtype
X_m = cuda.to_device(X)
self.n_rows = X.shape[0]
self.n_cols = X.shape[1]

else:
msg = "X matrix format not supported"
raise TypeError(msg)

input_ptr = self._get_ctype_ptr(X_m)

clust_mat = self.cluster_centers_.as_gpu_matrix(order='C')
cdef uintptr_t cluster_centers_ptr = self._get_ctype_ptr(clust_mat)
Expand Down Expand Up @@ -267,8 +304,6 @@ class KMeans:
del(clust_mat)
return self.labels_



def transform(self, X):
"""
Transform X to a cluster-distance space.
Expand All @@ -280,31 +315,33 @@ class KMeans:

"""

self.n_rows = len(X)
self.n_cols = len(X._cols)


# cdef np.ndarray[np.float32_t, ndim=2, mode = 'c', cast=True] host_ary = input_gdf.as_gpu_matrix(order='C').copy_to_host()
cdef uintptr_t input_ptr
if (isinstance(X, cudf.DataFrame)):
self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
X_m = X.as_gpu_matrix(order='C')
self.n_rows = len(X)
self.n_cols = len(X._cols)

# cdef np.ndarray[np.float32_t, ndim=2, mode = 'c', cast=True] cluster_centers_ptr = self.cluster_centers_.as_gpu_matrix(order='C').copy_to_host()
elif (isinstance(X, np.ndarray)):
self.gdf_datatype = X.dtype
X_m = cuda.to_device(X)
self.n_rows = X.shape[0]
self.n_cols = X.shape[1]

else:
msg = "X matrix format not supported"
raise TypeError(msg)

X_m = X.as_gpu_matrix()
cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
input_ptr = self._get_ctype_ptr(X_m)

clust_mat = self.cluster_centers_.as_gpu_matrix(order='C')
cdef uintptr_t cluster_centers_ptr = self._get_ctype_ptr(clust_mat)

preds_data = cuda.to_device(np.zeros(self.n_clusters*self.n_rows,
dtype=self.gdf_datatype.type))
dtype=self.gdf_datatype.type))

cdef uintptr_t preds_ptr = self._get_ctype_ptr(preds_data)


ary=np.array([1.0,1.5,3.5,2.5],dtype=np.float32)
dary=cuda.to_device(ary)
cdef uintptr_t ptr2 = dary.device_ctypes_pointer.value

if self.gdf_datatype.type == np.float32:
c_kmeans.kmeans_transform(
<int> self.verbose, # verbose
Expand All @@ -330,7 +367,6 @@ class KMeans:
<double*> cluster_centers_ptr, # centroids
<double*> preds_ptr) # preds


preds_gdf = cudf.DataFrame()
for i in range(0, self.n_clusters):
preds_gdf[str(i)] = preds_data[i*self.n_rows:(i+1)*self.n_rows]
Expand All @@ -339,7 +375,6 @@ class KMeans:
del(clust_mat)
return preds_gdf


def fit_transform(self, input_gdf):
"""
Compute clustering and transform input_gdf to cluster-distance space.
Expand Down
Loading