From 4cf413cc5ae08920effabfefddf76c33897c1e16 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Wed, 17 Oct 2018 18:13:42 -0700 Subject: [PATCH 1/3] add knn wrapper and notebook --- python/cuML/cuml.pyx | 1 + python/cuML/knn/knn_wrapper.py | 50 ++++++ python/notebooks/knn_demo.ipynb | 264 ++++++++++++++++++++++++++++++++ 3 files changed, 315 insertions(+) create mode 100644 python/cuML/knn/knn_wrapper.py create mode 100644 python/notebooks/knn_demo.ipynb diff --git a/python/cuML/cuml.pyx b/python/cuML/cuml.pyx index 6262110e2a..ba7944af3f 100644 --- a/python/cuML/cuml.pyx +++ b/python/cuML/cuml.pyx @@ -1,3 +1,4 @@ include "pca/pca_wrapper.pyx" include "tsvd/tsvd_wrapper.pyx" include "dbscan/dbscan_wrapper.pyx" +include "knn/knn_wrapper.py" diff --git a/python/cuML/knn/knn_wrapper.py b/python/cuML/knn/knn_wrapper.py new file mode 100644 index 0000000000..0d9ba9f28c --- /dev/null +++ b/python/cuML/knn/knn_wrapper.py @@ -0,0 +1,50 @@ +import faiss +import numpy as np +import pandas as pd +import pygdf + +class KNNparams: + def __init__(self,n_gpus): + self.n_gpus = n_gpus + +class KNN: + + def __init__(self, n_gpus=-1): # -1 means using all gpus + self.params = KNNparams(n_gpus) + + def fit(self,X): + X = self.to_nparray(X) + assert len(X.shape)==2, 'data should be two dimensional' + n_dims = X.shape[1] + cpu_index = faiss.IndexFlatL2(n_dims) # build a flat (CPU) index + if self.params.n_gpus==1: + res = faiss.StandardGpuResources() # use a single GPU + # make it a flat GPU index + gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index) + else: + gpu_index = faiss.index_cpu_to_all_gpus(cpu_index,ngpu=self.params.n_gpus) + gpu_index.add(X) + self.gpu_index = gpu_index + + def query(self,X,k): + X = self.to_nparray(X) + D,I = self.gpu_index.search(X, k) + D = self.to_pygdf(D,col='distance') + I = self.to_pygdf(I,col='index') + return D,I + + def to_nparray(self,x): + if isinstance(x,pd.DataFrame): + x = x.values + elif isinstance(x,pygdf.DataFrame): + x = x.to_pandas().values + return np.ascontiguousarray(x) + + def to_pygdf(self,df,col=''): + # convert pandas dataframe to pygdf dataframe + if isinstance(df,np.ndarray): + df = pd.DataFrame({'%s_neighbor_%d'%(col,i):df[:,i] for i in range(df.shape[1])}) + pdf = pygdf.DataFrame() + for c,column in enumerate(df): + pdf[c] = df[column] + return pdf diff --git a/python/notebooks/knn_demo.ipynb b/python/notebooks/knn_demo.ipynb new file mode 100644 index 0000000000..4cdcbb6638 --- /dev/null +++ b/python/notebooks/knn_demo.ipynb @@ -0,0 +1,264 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.neighbors import KDTree as skKNN\n", + "from cuML import KNN as cumlKNN\n", + "import pygdf\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Helper Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from timeit import default_timer\n", + "\n", + "class Timer(object):\n", + " def __init__(self):\n", + " self._timer = default_timer\n", + " \n", + " def __enter__(self):\n", + " self.start()\n", + " return self\n", + "\n", + " def __exit__(self, *args):\n", + " self.stop()\n", + "\n", + " def start(self):\n", + " \"\"\"Start the timer.\"\"\"\n", + " self.start = self._timer()\n", + "\n", + " def stop(self):\n", + " \"\"\"Stop the timer. 
Calculate the interval in seconds.\"\"\"\n", + " self.end = self._timer()\n", + " self.interval = self.end - self.start" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import gzip\n", + "def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz',source='mortgage'):\n", + " if os.path.exists(cached) and source=='mortgage':\n", + " print('use mortgage data')\n", + " with gzip.open(cached) as f:\n", + " X = np.load(f)\n", + " X = X[np.random.randint(0,X.shape[0]-1,nrows),:ncols]\n", + " else:\n", + " print('use random data')\n", + " X = np.random.rand(nrows,ncols)\n", + " df = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])}).fillna(0)\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def pd2pygdf(df):\n", + " # convert pandas dataframe to pygdf dataframe\n", + " if isinstance(df,np.ndarray):\n", + " df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})\n", + " pdf = pygdf.DataFrame()\n", + " for c,column in enumerate(df):\n", + " pdf[c] = df[column]\n", + " return pdf" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "def array_equal(a,b,threshold=1e-2,with_sign=True,metric='mse'):\n", + " a = to_nparray(a)\n", + " b = to_nparray(b)\n", + " if with_sign == False:\n", + " a,b = np.abs(a),np.abs(b)\n", + " if metric=='mse':\n", + " error = mean_squared_error(a,b)\n", + " else:\n", + " error = np.sum(a!=b)/(a.shape[0]*a.shape[1])\n", + " res = error Date: Wed, 17 Oct 2018 19:23:29 -0700 Subject: [PATCH 2/3] update knn wrapper and all notebooks with from_pandas --- python/cuML/knn/knn_wrapper.py | 4 +-- python/notebooks/dbscan_demo.ipynb | 47 +++++++++----------------- python/notebooks/knn_demo.ipynb | 46 +++++++++---------------- python/notebooks/pca_demo.ipynb | 48 +++++++++----------------- python/notebooks/tsvd_demo.ipynb | 54 +++++++++++------------------- 5 files changed, 67 insertions(+), 132 deletions(-) diff --git a/python/cuML/knn/knn_wrapper.py b/python/cuML/knn/knn_wrapper.py index 0d9ba9f28c..8449200485 100644 --- a/python/cuML/knn/knn_wrapper.py +++ b/python/cuML/knn/knn_wrapper.py @@ -44,7 +44,5 @@ def to_pygdf(self,df,col=''): # convert pandas dataframe to pygdf dataframe if isinstance(df,np.ndarray): df = pd.DataFrame({'%s_neighbor_%d'%(col,i):df[:,i] for i in range(df.shape[1])}) - pdf = pygdf.DataFrame() - for c,column in enumerate(df): - pdf[c] = df[column] + pdf = pygdf.DataFrame.from_pandas(df) return pdf diff --git a/python/notebooks/dbscan_demo.ipynb b/python/notebooks/dbscan_demo.ipynb index 57b13d4d9c..271b21f567 100644 --- a/python/notebooks/dbscan_demo.ipynb +++ b/python/notebooks/dbscan_demo.ipynb @@ -75,22 +75,6 @@ "execution_count": 4, "metadata": {}, "outputs": [], - "source": [ - "def pd2pygdf(df):\n", - " # convert pandas dataframe to pygdf dataframe\n", - " if isinstance(df,np.ndarray):\n", - " df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})\n", - " pdf = pygdf.DataFrame()\n", - " for c,column in enumerate(df):\n", - " pdf[c] = df[column]\n", - " return pdf" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "def array_equal(a,b,threshold=5e-3,with_sign=True):\n", @@ -120,16 +104,17 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 
5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "use mortgage data\n", "data (10000, 128)\n", - "CPU times: user 24 ms, sys: 4 ms, total: 28 ms\n", - "Wall time: 26.9 ms\n" + "CPU times: user 4.58 s, sys: 1.43 s, total: 6 s\n", + "Wall time: 5.05 s\n" ] } ], @@ -144,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -154,15 +139,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 27.2 s, sys: 72 ms, total: 27.3 s\n", - "Wall time: 27 s\n" + "CPU times: user 26.7 s, sys: 724 ms, total: 27.4 s\n", + "Wall time: 26.8 s\n" ] } ], @@ -174,34 +159,34 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 112 ms, sys: 436 ms, total: 548 ms\n", - "Wall time: 547 ms\n" + "CPU times: user 5.42 s, sys: 680 ms, total: 6.1 s\n", + "Wall time: 867 ms\n" ] } ], "source": [ "%%time\n", - "X = pd2pygdf(X)" + "X = pygdf.DataFrame.from_pandas(X)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 7.66 s, sys: 108 ms, total: 7.77 s\n", - "Wall time: 7.79 s\n" + "CPU times: user 7.62 s, sys: 100 ms, total: 7.72 s\n", + "Wall time: 7.8 s\n" ] } ], @@ -213,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { diff --git a/python/notebooks/knn_demo.ipynb b/python/notebooks/knn_demo.ipynb index 4cdcbb6638..de964e4f46 100644 --- a/python/notebooks/knn_demo.ipynb +++ b/python/notebooks/knn_demo.ipynb @@ -75,22 +75,6 @@ "execution_count": 4, "metadata": {}, "outputs": [], - "source": [ - "def pd2pygdf(df):\n", - " # convert pandas dataframe to pygdf dataframe\n", - " if isinstance(df,np.ndarray):\n", - " df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})\n", - " pdf = pygdf.DataFrame()\n", - " for c,column in enumerate(df):\n", - " pdf[c] = df[column]\n", - " return pdf" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "def array_equal(a,b,threshold=1e-2,with_sign=True,metric='mse'):\n", @@ -124,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -133,8 +117,8 @@ "text": [ "use mortgage data\n", "data (65536, 40)\n", - "CPU times: user 4.68 s, sys: 696 ms, total: 5.38 s\n", - "Wall time: 5.09 s\n" + "CPU times: user 4.42 s, sys: 784 ms, total: 5.2 s\n", + "Wall time: 5.04 s\n" ] } ], @@ -149,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -158,15 +142,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3min 18s, sys: 820 ms, total: 3min 19s\n", - "Wall time: 3min 11s\n" + "CPU times: user 2min 13s, sys: 888 ms, total: 2min 14s\n", + "Wall time: 1min 59s\n" ] } ], @@ -178,34 +162,34 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: 
user 6.88 s, sys: 1.04 s, total: 7.92 s\n", - "Wall time: 793 ms\n" + "CPU times: user 6.91 s, sys: 1.11 s, total: 8.02 s\n", + "Wall time: 899 ms\n" ] } ], "source": [ "%%time\n", - "X = pd2pygdf(X)" + "X = pygdf.DataFrame.from_pandas(X)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2.28 s, sys: 348 ms, total: 2.62 s\n", - "Wall time: 2.71 s\n" + "CPU times: user 2.27 s, sys: 368 ms, total: 2.64 s\n", + "Wall time: 2.72 s\n" ] } ], @@ -218,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { diff --git a/python/notebooks/pca_demo.ipynb b/python/notebooks/pca_demo.ipynb index f846388df5..cdbcab5d5c 100644 --- a/python/notebooks/pca_demo.ipynb +++ b/python/notebooks/pca_demo.ipynb @@ -75,22 +75,6 @@ "execution_count": 4, "metadata": {}, "outputs": [], - "source": [ - "def pd2pygdf(df):\n", - " # convert pandas dataframe to pygdf dataframe\n", - " if isinstance(df,np.ndarray):\n", - " df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})\n", - " pdf = pygdf.DataFrame()\n", - " for c,column in enumerate(df):\n", - " pdf[c] = df[column]\n", - " return pdf" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "def array_equal(a,b,threshold=2e-3,with_sign=True):\n", @@ -121,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -130,8 +114,8 @@ "text": [ "use mortgage data\n", "data (1048576, 400)\n", - "CPU times: user 16.4 s, sys: 2.32 s, total: 18.7 s\n", - "Wall time: 18.7 s\n" + "CPU times: user 16.7 s, sys: 3.76 s, total: 20.4 s\n", + "Wall time: 18.2 s\n" ] } ], @@ -146,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -158,15 +142,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 6min 7s, sys: 1min 37s, total: 7min 44s\n", - "Wall time: 21.3 s\n" + "CPU times: user 6min 10s, sys: 1min 47s, total: 7min 58s\n", + "Wall time: 21 s\n" ] } ], @@ -179,34 +163,34 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 8.03 s, sys: 2.4 s, total: 10.4 s\n", - "Wall time: 3.3 s\n" + "CPU times: user 8.8 s, sys: 2.24 s, total: 11 s\n", + "Wall time: 3.58 s\n" ] } ], "source": [ "%%time\n", - "X = pd2pygdf(X)" + "X = pygdf.DataFrame.from_pandas(X)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.37 s, sys: 496 ms, total: 3.87 s\n", - "Wall time: 3.89 s\n" + "CPU times: user 3.38 s, sys: 400 ms, total: 3.78 s\n", + "Wall time: 3.87 s\n" ] } ], @@ -219,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -243,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { diff --git a/python/notebooks/tsvd_demo.ipynb b/python/notebooks/tsvd_demo.ipynb index 7bafcce1f7..89eb6704fb 100644 --- a/python/notebooks/tsvd_demo.ipynb +++ 
b/python/notebooks/tsvd_demo.ipynb @@ -75,22 +75,6 @@ "execution_count": 4, "metadata": {}, "outputs": [], - "source": [ - "def pd2pygdf(df):\n", - " # convert pandas dataframe to pygdf dataframe\n", - " if isinstance(df,np.ndarray):\n", - " df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})\n", - " pdf = pygdf.DataFrame()\n", - " for c,column in enumerate(df):\n", - " pdf[c] = df[column]\n", - " return pdf" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "def array_equal(a,b,threshold=5e-3,with_sign=True):\n", @@ -121,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -129,15 +113,15 @@ "output_type": "stream", "text": [ "use mortgage data\n", - "data (2097152, 40)\n", - "CPU times: user 6.21 s, sys: 1.05 s, total: 7.26 s\n", - "Wall time: 7.25 s\n" + "data (4194304, 40)\n", + "CPU times: user 9.03 s, sys: 2.86 s, total: 11.9 s\n", + "Wall time: 9.72 s\n" ] } ], "source": [ "%%time\n", - "nrows = 2**21\n", + "nrows = 2**22\n", "ncols = 40\n", "\n", "X = load_data(nrows,ncols)\n", @@ -146,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -156,15 +140,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 35.6 s, sys: 3.52 s, total: 39.1 s\n", - "Wall time: 1.39 s\n" + "CPU times: user 1min 5s, sys: 4.15 s, total: 1min 9s\n", + "Wall time: 2.74 s\n" ] } ], @@ -178,34 +162,34 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5 s, sys: 768 ms, total: 5.77 s\n", - "Wall time: 729 ms\n" + "CPU times: user 2.89 s, sys: 760 ms, total: 3.65 s\n", + "Wall time: 1.11 s\n" ] } ], "source": [ "%%time\n", - "X = pd2pygdf(X)" + "X = pygdf.DataFrame.from_pandas(X)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.17 s, sys: 412 ms, total: 1.58 s\n", - "Wall time: 1.59 s\n" + "CPU times: user 1.39 s, sys: 480 ms, total: 1.87 s\n", + "Wall time: 1.96 s\n" ] } ], @@ -219,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -241,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -283,7 +267,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.5.5" } }, "nbformat": 4, From f11c336c50a066e58413fd243e20c26e132c3ea7 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Thu, 18 Oct 2018 11:38:58 -0700 Subject: [PATCH 3/3] add doc string and fix a notebook bug --- python/cuML/knn/knn_wrapper.py | 65 +++++++++++++++++++++++++++++++-- python/notebooks/knn_demo.ipynb | 25 ++++++++----- 2 files changed, 77 insertions(+), 13 deletions(-) diff --git a/python/cuML/knn/knn_wrapper.py b/python/cuML/knn/knn_wrapper.py index 8449200485..6eb4b6e35f 100644 --- a/python/cuML/knn/knn_wrapper.py +++ b/python/cuML/knn/knn_wrapper.py @@ -1,3 +1,18 @@ + # Copyright (c) 2018, NVIDIA CORPORATION. 
+ # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # + import faiss import numpy as np import pandas as pd @@ -8,7 +23,51 @@ def __init__(self,n_gpus): self.n_gpus = n_gpus class KNN: + """ + Create a DataFrame, fill it with data, and compute KNN: + .. code-block:: python + import pygdf + from cuML import KNN + import numpy as np + np_float = np.array([ + [1.,2.,3.], # 1st point + [1.,2.,4.], # 2nd point + [2.,2.,4.] # 3rd point + ]).astype('float32') + gdf_float = pygdf.DataFrame() + gdf_float['dim_0'] = np.ascontiguousarray(np_float[:,0]) + gdf_float['dim_1'] = np.ascontiguousarray(np_float[:,1]) + gdf_float['dim_2'] = np.ascontiguousarray(np_float[:,2]) + print('n_samples = 3, n_dims = 3') + print(gdf_float) + knn_float = KNN(n_gpus=1) + knn_float.fit(gdf_float) + Distance,Index = knn_float.query(gdf_float,k=3) #get 3 nearest neighbors + print("Index:") + print(Index) + print("Distance:") + print(Distance) + + Output: + .. code-block:: python + n_samples = 3, n_dims = 3 + dim_0 dim_1 dim_2 + 0 1.0 2.0 3.0 + 1 1.0 2.0 4.0 + 2 2.0 2.0 4.0 + Index: + index_neighbor_0 index_neighbor_1 index_neighbor_2 + 0 0 1 2 + 1 1 0 2 + 2 2 1 0 + Distance: + distance_neighbor_0 distance_neighbor_1 distance_neighbor_2 + 0 0.0 1.0 2.0 + 1 0.0 1.0 1.0 + 2 0.0 1.0 2.0 + For an additional example see `the KNN notebook `_. For additional docs, see `scikitlearn's KDtree `_. 
+ """ def __init__(self, n_gpus=-1): # -1 means using all gpus self.params = KNNparams(n_gpus) @@ -34,10 +93,8 @@ def query(self,X,k): return D,I def to_nparray(self,x): - if isinstance(x,pd.DataFrame): - x = x.values - elif isinstance(x,pygdf.DataFrame): - x = x.to_pandas().values + if isinstance(x,pygdf.DataFrame): + x = x.to_pandas() return np.ascontiguousarray(x) def to_pygdf(self,df,col=''): diff --git a/python/notebooks/knn_demo.ipynb b/python/notebooks/knn_demo.ipynb index de964e4f46..5374cc7c23 100644 --- a/python/notebooks/knn_demo.ipynb +++ b/python/notebooks/knn_demo.ipynb @@ -65,7 +65,7 @@ " X = X[np.random.randint(0,X.shape[0]-1,nrows),:ncols]\n", " else:\n", " print('use random data')\n", - " X = np.random.rand(nrows,ncols)\n", + " X = np.random.random((nrows,ncols)).astype('float32')\n", " df = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])}).fillna(0)\n", " return df" ] @@ -117,8 +117,8 @@ "text": [ "use mortgage data\n", "data (65536, 40)\n", - "CPU times: user 4.42 s, sys: 784 ms, total: 5.2 s\n", - "Wall time: 5.04 s\n" + "CPU times: user 4.4 s, sys: 776 ms, total: 5.17 s\n", + "Wall time: 4.83 s\n" ] } ], @@ -149,8 +149,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2min 13s, sys: 888 ms, total: 2min 14s\n", - "Wall time: 1min 59s\n" + "CPU times: user 3min 23s, sys: 1.15 s, total: 3min 24s\n", + "Wall time: 3min 10s\n" ] } ], @@ -169,8 +169,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 6.91 s, sys: 1.11 s, total: 8.02 s\n", - "Wall time: 899 ms\n" + "CPU times: user 7.12 s, sys: 1.06 s, total: 8.18 s\n", + "Wall time: 908 ms\n" ] } ], @@ -188,8 +188,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2.27 s, sys: 368 ms, total: 2.64 s\n", - "Wall time: 2.72 s\n" + "CPU times: user 600 ms, sys: 320 ms, total: 920 ms\n", + "Wall time: 983 ms\n" ] } ], @@ -222,6 +222,13 @@ "message = 'compare knn: cuml vs sklearn indexes %s'%('equal'if passed else 'NOT equal')\n", "print(message)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {