rapidsai · dantegd · Oct 18, 2018 · Oct 18, 2018 · Oct 18, 2018 · Oct 18, 2018
diff --git a/python/cuML/cuml.pyx b/python/cuML/cuml.pyx
@@ -1,3 +1,4 @@
 include "pca/pca_wrapper.pyx"
 include "tsvd/tsvd_wrapper.pyx"
 include "dbscan/dbscan_wrapper.pyx"
+include "knn/knn_wrapper.py"
diff --git a/python/cuML/knn/knn_wrapper.py b/python/cuML/knn/knn_wrapper.py
@@ -0,0 +1,105 @@
+ # Copyright (c) 2018, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+import faiss
+import numpy as np
+import pandas as pd
+import pygdf
+
+class KNNparams:
+    """Plain holder for the settings a ``KNN`` object is created with."""
+
+    def __init__(self, n_gpus):
+        # Stored exactly as given; no validation or conversion happens here.
+        self.n_gpus = n_gpus
+
+class KNN:
+    """
+    Exact k-nearest-neighbors search on GPU, backed by a FAISS flat L2 index.
+
+    Parameters
+    ----------
+    n_gpus : int, default -1
+        ``1`` builds the index on GPU 0 only. Any other value is forwarded
+        as ``ngpu`` to ``faiss.index_cpu_to_all_gpus``; ``-1`` means using
+        all GPUs.
+
+    Notes
+    -----
+    Input data is converted to a C-contiguous ``float32`` numpy array before
+    it is handed to FAISS. The distances returned by :meth:`query` are
+    squared Euclidean distances (see the example output below, where the
+    points ``[1, 2, 3]`` and ``[2, 2, 4]`` are at distance ``2.0``).
+
+    Examples
+    --------
+    Create a DataFrame, fill it with data, and compute KNN:
+
+    .. code-block:: python
+
+        import pygdf
+        from cuML import KNN
+        import numpy as np
+        np_float = np.array([
+                [1.,2.,3.], # 1st point
+                [1.,2.,4.], # 2nd point
+                [2.,2.,4.]  # 3rd point
+            ]).astype('float32')
+        gdf_float = pygdf.DataFrame()
+        gdf_float['dim_0'] = np.ascontiguousarray(np_float[:,0])
+        gdf_float['dim_1'] = np.ascontiguousarray(np_float[:,1])
+        gdf_float['dim_2'] = np.ascontiguousarray(np_float[:,2])
+        print('n_samples = 3, n_dims = 3')
+        print(gdf_float)
+        knn_float = KNN(n_gpus=1)
+        knn_float.fit(gdf_float)
+        Distance,Index = knn_float.query(gdf_float,k=3) #get 3 nearest neighbors
+        print("Index:")
+        print(Index)
+        print("Distance:")
+        print(Distance)
+
+    Output:
+
+    .. code-block:: python
+
+        n_samples = 3, n_dims = 3
+           dim_0 dim_1 dim_2
+        0   1.0   2.0   3.0
+        1   1.0   2.0   4.0
+        2   2.0   2.0   4.0
+        Index:
+                 index_neighbor_0 index_neighbor_1 index_neighbor_2
+        0                0                1                2
+        1                1                0                2
+        2                2                1                0
+        Distance:
+                 distance_neighbor_0 distance_neighbor_1 distance_neighbor_2
+        0                 0.0                 1.0                 2.0
+        1                 0.0                 1.0                 1.0
+        2                 0.0                 1.0                 2.0
+
+    For an additional example see `the KNN notebook
+    <https://github.com/rapidsai/cuml/blob/master/python/notebooks/knn_demo.ipynb>`_.
+    For additional docs, see `scikit-learn's KDTree
+    <http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree>`_.
+    """
+
+    def __init__(self, n_gpus=-1):  # -1 means using all gpus
+        self.params = KNNparams(n_gpus)
+
+    def fit(self, X):
+        """
+        Build the GPU index from the rows of ``X``.
+
+        Parameters
+        ----------
+        X : pygdf.DataFrame or array-like
+            Two-dimensional data, one sample per row.
+
+        Raises
+        ------
+        ValueError
+            If ``X`` is not two-dimensional.
+        """
+        X = self.to_nparray(X)
+        # Raise explicitly instead of using ``assert``: assertions are
+        # stripped when Python runs with -O, which would let bad input
+        # reach FAISS.
+        if X.ndim != 2:
+            raise ValueError('data should be two dimensional')
+        n_dims = X.shape[1]
+        cpu_index = faiss.IndexFlatL2(n_dims)  # build a flat (CPU) index
+        if self.params.n_gpus == 1:
+            res = faiss.StandardGpuResources()  # use a single GPU
+            # Hold a reference on the instance so the resources object is
+            # not garbage-collected while the index may still rely on it.
+            self._gpu_resources = res
+            # make it a flat GPU index
+            gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)
+        else:
+            gpu_index = faiss.index_cpu_to_all_gpus(
+                cpu_index, ngpu=self.params.n_gpus)
+        gpu_index.add(X)
+        self.gpu_index = gpu_index
+
+    def query(self, X, k):
+        """
+        Search the index built by :meth:`fit` for the ``k`` nearest
+        neighbors of every row of ``X``.
+
+        Parameters
+        ----------
+        X : pygdf.DataFrame or array-like
+            Query points, one per row.
+        k : int
+            Number of neighbors to return per query point.
+
+        Returns
+        -------
+        (D, I) : tuple of pygdf.DataFrame
+            ``D`` has columns ``distance_neighbor_<i>`` and ``I`` has
+            columns ``index_neighbor_<i>`` for ``i`` in ``range(k)``.
+        """
+        X = self.to_nparray(X)
+        D, I = self.gpu_index.search(X, k)
+        D = self.to_pygdf(D, col='distance')
+        I = self.to_pygdf(I, col='index')
+        return D, I
+
+    def to_nparray(self, x):
+        """
+        Return ``x`` as a C-contiguous ``float32`` numpy array.
+
+        A ``pygdf.DataFrame`` is first converted to pandas. The cast to
+        ``float32`` is applied because FAISS indexes work on 32-bit floats.
+        """
+        if isinstance(x, pygdf.DataFrame):
+            x = x.to_pandas()
+        return np.ascontiguousarray(x, dtype=np.float32)
+
+    def to_pygdf(self, df, col=''):
+        """
+        Convert a 2-D numpy array or a pandas DataFrame to a pygdf DataFrame.
+
+        A numpy array yields one column per array column, named
+        ``'<col>_neighbor_<i>'``; a pandas DataFrame keeps its own columns.
+        """
+        if isinstance(df, np.ndarray):
+            df = pd.DataFrame({'%s_neighbor_%d' % (col, i): df[:, i]
+                               for i in range(df.shape[1])})
+        pdf = pygdf.DataFrame.from_pandas(df)
+        return pdf
diff --git a/python/notebooks/dbscan_demo.ipynb b/python/notebooks/dbscan_demo.ipynb
@@ -75,22 +75,6 @@
    "execution_count": 4,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "def pd2pygdf(df):\n",
-    "    # convert pandas dataframe to pygdf dataframe\n",
-    "    if isinstance(df,np.ndarray):\n",
-    "        df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})\n",
-    "    pdf = pygdf.DataFrame()\n",
-    "    for c,column in enumerate(df):\n",
-    "        pdf[c] = df[column]\n",
-    "    return pdf"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
    "source": [
     "from sklearn.metrics import mean_squared_error\n",
     "def array_equal(a,b,threshold=5e-3,with_sign=True):\n",
@@ -120,16 +104,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "use mortgage data\n",
       "data (10000, 128)\n",
-      "CPU times: user 24 ms, sys: 4 ms, total: 28 ms\n",
-      "Wall time: 26.9 ms\n"
+      "CPU times: user 4.58 s, sys: 1.43 s, total: 6 s\n",
+      "Wall time: 5.05 s\n"
      ]
     }
    ],
@@ -144,7 +129,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -154,15 +139,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 27.2 s, sys: 72 ms, total: 27.3 s\n",
-      "Wall time: 27 s\n"
+      "CPU times: user 26.7 s, sys: 724 ms, total: 27.4 s\n",
+      "Wall time: 26.8 s\n"
      ]
     }
    ],
@@ -174,34 +159,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 112 ms, sys: 436 ms, total: 548 ms\n",
-      "Wall time: 547 ms\n"
+      "CPU times: user 5.42 s, sys: 680 ms, total: 6.1 s\n",
+      "Wall time: 867 ms\n"
      ]
     }
    ],
    "source": [
     "%%time\n",
-    "X = pd2pygdf(X)"
+    "X = pygdf.DataFrame.from_pandas(X)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 7.66 s, sys: 108 ms, total: 7.77 s\n",
-      "Wall time: 7.79 s\n"
+      "CPU times: user 7.62 s, sys: 100 ms, total: 7.72 s\n",
+      "Wall time: 7.8 s\n"
      ]
     }
    ],
@@ -213,7 +198,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {