From 4cf413cc5ae08920effabfefddf76c33897c1e16 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Wed, 17 Oct 2018 18:13:42 -0700 Subject: [PATCH 1/3] add knn wrapper and notebook --- python/cuML/cuml.pyx | 1 + python/cuML/knn/knn_wrapper.py | 50 ++++++ python/notebooks/knn_demo.ipynb | 264 ++++++++++++++++++++++++++++++++ 3 files changed, 315 insertions(+) create mode 100644 python/cuML/knn/knn_wrapper.py create mode 100644 python/notebooks/knn_demo.ipynb diff --git a/python/cuML/cuml.pyx b/python/cuML/cuml.pyx index 6262110e2a..ba7944af3f 100644 --- a/python/cuML/cuml.pyx +++ b/python/cuML/cuml.pyx @@ -1,3 +1,4 @@ include "pca/pca_wrapper.pyx" include "tsvd/tsvd_wrapper.pyx" include "dbscan/dbscan_wrapper.pyx" +include "knn/knn_wrapper.py" diff --git a/python/cuML/knn/knn_wrapper.py b/python/cuML/knn/knn_wrapper.py new file mode 100644 index 0000000000..0d9ba9f28c --- /dev/null +++ b/python/cuML/knn/knn_wrapper.py @@ -0,0 +1,50 @@ +import faiss +import numpy as np +import pandas as pd +import pygdf + +class KNNparams: + def __init__(self,n_gpus): + self.n_gpus = n_gpus + +class KNN: + + def __init__(self, n_gpus=-1): # -1 means using all gpus + self.params = KNNparams(n_gpus) + + def fit(self,X): + X = self.to_nparray(X) + assert len(X.shape)==2, 'data should be two dimensional' + n_dims = X.shape[1] + cpu_index = faiss.IndexFlatL2(n_dims) # build a flat (CPU) index + if self.params.n_gpus==1: + res = faiss.StandardGpuResources() # use a single GPU + # make it a flat GPU index + gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index) + else: + gpu_index = faiss.index_cpu_to_all_gpus(cpu_index,ngpu=self.params.n_gpus) + gpu_index.add(X) + self.gpu_index = gpu_index + + def query(self,X,k): + X = self.to_nparray(X) + D,I = self.gpu_index.search(X, k) + D = self.to_pygdf(D,col='distance') + I = self.to_pygdf(I,col='index') + return D,I + + def to_nparray(self,x): + if isinstance(x,pd.DataFrame): + x = x.values + elif isinstance(x,pygdf.DataFrame): + x = x.to_pandas().values + return np.ascontiguousarray(x) + + def to_pygdf(self,df,col=''): + # convert pandas dataframe to pygdf dataframe + if isinstance(df,np.ndarray): + df = pd.DataFrame({'%s_neighbor_%d'%(col,i):df[:,i] for i in range(df.shape[1])}) + pdf = pygdf.DataFrame() + for c,column in enumerate(df): + pdf[c] = df[column] + return pdf diff --git a/python/notebooks/knn_demo.ipynb b/python/notebooks/knn_demo.ipynb new file mode 100644 index 0000000000..4cdcbb6638 --- /dev/null +++ b/python/notebooks/knn_demo.ipynb @@ -0,0 +1,264 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.neighbors import KDTree as skKNN\n", + "from cuML import KNN as cumlKNN\n", + "import pygdf\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Helper Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from timeit import default_timer\n", + "\n", + "class Timer(object):\n", + " def __init__(self):\n", + " self._timer = default_timer\n", + " \n", + " def __enter__(self):\n", + " self.start()\n", + " return self\n", + "\n", + " def __exit__(self, *args):\n", + " self.stop()\n", + "\n", + " def start(self):\n", + " \"\"\"Start the timer.\"\"\"\n", + " self.start = self._timer()\n", + "\n", + " def stop(self):\n", + " \"\"\"Stop the timer. 
Calculate the interval in seconds.\"\"\"\n", + " self.end = self._timer()\n", + " self.interval = self.end - self.start" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import gzip\n", + "def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz',source='mortgage'):\n", + " if os.path.exists(cached) and source=='mortgage':\n", + " print('use mortgage data')\n", + " with gzip.open(cached) as f:\n", + " X = np.load(f)\n", + " X = X[np.random.randint(0,X.shape[0]-1,nrows),:ncols]\n", + " else:\n", + " print('use random data')\n", + " X = np.random.rand(nrows,ncols)\n", + " df = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])}).fillna(0)\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def pd2pygdf(df):\n", + " # convert pandas dataframe to pygdf dataframe\n", + " if isinstance(df,np.ndarray):\n", + " df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})\n", + " pdf = pygdf.DataFrame()\n", + " for c,column in enumerate(df):\n", + " pdf[c] = df[column]\n", + " return pdf" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "def array_equal(a,b,threshold=1e-2,with_sign=True,metric='mse'):\n", + " a = to_nparray(a)\n", + " b = to_nparray(b)\n", + " if with_sign == False:\n", + " a,b = np.abs(a),np.abs(b)\n", + " if metric=='mse':\n", + " error = mean_squared_error(a,b)\n", + " else:\n", + " error = np.sum(a!=b)/(a.shape[0]*a.shape[1])\n", + " res = error Date: Wed, 17 Oct 2018 19:23:29 -0700 Subject: [PATCH 2/3] update knn wrapper and all notebooks with from_pandas --- python/cuML/knn/knn_wrapper.py | 4 +-- python/notebooks/dbscan_demo.ipynb | 47 +++++++++----------------- python/notebooks/knn_demo.ipynb | 46 +++++++++---------------- python/notebooks/pca_demo.ipynb | 48 +++++++++----------------- python/notebooks/tsvd_demo.ipynb | 54 +++++++++++------------------- 5 files changed, 67 insertions(+), 132 deletions(-) diff --git a/python/cuML/knn/knn_wrapper.py b/python/cuML/knn/knn_wrapper.py index 0d9ba9f28c..8449200485 100644 --- a/python/cuML/knn/knn_wrapper.py +++ b/python/cuML/knn/knn_wrapper.py @@ -44,7 +44,5 @@ def to_pygdf(self,df,col=''): # convert pandas dataframe to pygdf dataframe if isinstance(df,np.ndarray): df = pd.DataFrame({'%s_neighbor_%d'%(col,i):df[:,i] for i in range(df.shape[1])}) - pdf = pygdf.DataFrame() - for c,column in enumerate(df): - pdf[c] = df[column] + pdf = pygdf.DataFrame.from_pandas(df) return pdf diff --git a/python/notebooks/dbscan_demo.ipynb b/python/notebooks/dbscan_demo.ipynb index 57b13d4d9c..271b21f567 100644 --- a/python/notebooks/dbscan_demo.ipynb +++ b/python/notebooks/dbscan_demo.ipynb @@ -75,22 +75,6 @@ "execution_count": 4, "metadata": {}, "outputs": [], - "source": [ - "def pd2pygdf(df):\n", - " # convert pandas dataframe to pygdf dataframe\n", - " if isinstance(df,np.ndarray):\n", - " df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})\n", - " pdf = pygdf.DataFrame()\n", - " for c,column in enumerate(df):\n", - " pdf[c] = df[column]\n", - " return pdf" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "def array_equal(a,b,threshold=5e-3,with_sign=True):\n", @@ -120,16 +104,17 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 
5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "use mortgage data\n", "data (10000, 128)\n", - "CPU times: user 24 ms, sys: 4 ms, total: 28 ms\n", - "Wall time: 26.9 ms\n" + "CPU times: user 4.58 s, sys: 1.43 s, total: 6 s\n", + "Wall time: 5.05 s\n" ] } ], @@ -144,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -154,15 +139,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 27.2 s, sys: 72 ms, total: 27.3 s\n", - "Wall time: 27 s\n" + "CPU times: user 26.7 s, sys: 724 ms, total: 27.4 s\n", + "Wall time: 26.8 s\n" ] } ], @@ -174,34 +159,34 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 112 ms, sys: 436 ms, total: 548 ms\n", - "Wall time: 547 ms\n" + "CPU times: user 5.42 s, sys: 680 ms, total: 6.1 s\n", + "Wall time: 867 ms\n" ] } ], "source": [ "%%time\n", - "X = pd2pygdf(X)" + "X = pygdf.DataFrame.from_pandas(X)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 7.66 s, sys: 108 ms, total: 7.77 s\n", - "Wall time: 7.79 s\n" + "CPU times: user 7.62 s, sys: 100 ms, total: 7.72 s\n", + "Wall time: 7.8 s\n" ] } ], @@ -213,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { diff --git a/python/notebooks/knn_demo.ipynb b/python/notebooks/knn_demo.ipynb index 4cdcbb6638..de964e4f46 100644 --- a/python/notebooks/knn_demo.ipynb +++ b/python/notebooks/knn_demo.ipynb @@ -75,22 +75,6 @@ "execution_count": 4, "metadata": {}, "outputs": [], - "source": [ - "def pd2pygdf(df):\n", - " # convert pandas dataframe to pygdf dataframe\n", - " if isinstance(df,np.ndarray):\n", - " df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})\n", - " pdf = pygdf.DataFrame()\n", - " for c,column in enumerate(df):\n", - " pdf[c] = df[column]\n", - " return pdf" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "def array_equal(a,b,threshold=1e-2,with_sign=True,metric='mse'):\n", @@ -124,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -133,8 +117,8 @@ "text": [ "use mortgage data\n", "data (65536, 40)\n", - "CPU times: user 4.68 s, sys: 696 ms, total: 5.38 s\n", - "Wall time: 5.09 s\n" + "CPU times: user 4.42 s, sys: 784 ms, total: 5.2 s\n", + "Wall time: 5.04 s\n" ] } ], @@ -149,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -158,15 +142,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3min 18s, sys: 820 ms, total: 3min 19s\n", - "Wall time: 3min 11s\n" + "CPU times: user 2min 13s, sys: 888 ms, total: 2min 14s\n", + "Wall time: 1min 59s\n" ] } ], @@ -178,34 +162,34 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: 
user 6.88 s, sys: 1.04 s, total: 7.92 s\n", - "Wall time: 793 ms\n" + "CPU times: user 6.91 s, sys: 1.11 s, total: 8.02 s\n", + "Wall time: 899 ms\n" ] } ], "source": [ "%%time\n", - "X = pd2pygdf(X)" + "X = pygdf.DataFrame.from_pandas(X)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2.28 s, sys: 348 ms, total: 2.62 s\n", - "Wall time: 2.71 s\n" + "CPU times: user 2.27 s, sys: 368 ms, total: 2.64 s\n", + "Wall time: 2.72 s\n" ] } ], @@ -218,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { diff --git a/python/notebooks/pca_demo.ipynb b/python/notebooks/pca_demo.ipynb index f846388df5..cdbcab5d5c 100644 --- a/python/notebooks/pca_demo.ipynb +++ b/python/notebooks/pca_demo.ipynb @@ -75,22 +75,6 @@ "execution_count": 4, "metadata": {}, "outputs": [], - "source": [ - "def pd2pygdf(df):\n", - " # convert pandas dataframe to pygdf dataframe\n", - " if isinstance(df,np.ndarray):\n", - " df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})\n", - " pdf = pygdf.DataFrame()\n", - " for c,column in enumerate(df):\n", - " pdf[c] = df[column]\n", - " return pdf" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "def array_equal(a,b,threshold=2e-3,with_sign=True):\n", @@ -121,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -130,8 +114,8 @@ "text": [ "use mortgage data\n", "data (1048576, 400)\n", - "CPU times: user 16.4 s, sys: 2.32 s, total: 18.7 s\n", - "Wall time: 18.7 s\n" + "CPU times: user 16.7 s, sys: 3.76 s, total: 20.4 s\n", + "Wall time: 18.2 s\n" ] } ], @@ -146,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -158,15 +142,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 6min 7s, sys: 1min 37s, total: 7min 44s\n", - "Wall time: 21.3 s\n" + "CPU times: user 6min 10s, sys: 1min 47s, total: 7min 58s\n", + "Wall time: 21 s\n" ] } ], @@ -179,34 +163,34 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 8.03 s, sys: 2.4 s, total: 10.4 s\n", - "Wall time: 3.3 s\n" + "CPU times: user 8.8 s, sys: 2.24 s, total: 11 s\n", + "Wall time: 3.58 s\n" ] } ], "source": [ "%%time\n", - "X = pd2pygdf(X)" + "X = pygdf.DataFrame.from_pandas(X)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.37 s, sys: 496 ms, total: 3.87 s\n", - "Wall time: 3.89 s\n" + "CPU times: user 3.38 s, sys: 400 ms, total: 3.78 s\n", + "Wall time: 3.87 s\n" ] } ], @@ -219,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -243,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { diff --git a/python/notebooks/tsvd_demo.ipynb b/python/notebooks/tsvd_demo.ipynb index 7bafcce1f7..89eb6704fb 100644 --- a/python/notebooks/tsvd_demo.ipynb +++ 
b/python/notebooks/tsvd_demo.ipynb @@ -75,22 +75,6 @@ "execution_count": 4, "metadata": {}, "outputs": [], - "source": [ - "def pd2pygdf(df):\n", - " # convert pandas dataframe to pygdf dataframe\n", - " if isinstance(df,np.ndarray):\n", - " df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})\n", - " pdf = pygdf.DataFrame()\n", - " for c,column in enumerate(df):\n", - " pdf[c] = df[column]\n", - " return pdf" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "def array_equal(a,b,threshold=5e-3,with_sign=True):\n", @@ -121,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -129,15 +113,15 @@ "output_type": "stream", "text": [ "use mortgage data\n", - "data (2097152, 40)\n", - "CPU times: user 6.21 s, sys: 1.05 s, total: 7.26 s\n", - "Wall time: 7.25 s\n" + "data (4194304, 40)\n", + "CPU times: user 9.03 s, sys: 2.86 s, total: 11.9 s\n", + "Wall time: 9.72 s\n" ] } ], "source": [ "%%time\n", - "nrows = 2**21\n", + "nrows = 2**22\n", "ncols = 40\n", "\n", "X = load_data(nrows,ncols)\n", @@ -146,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -156,15 +140,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 35.6 s, sys: 3.52 s, total: 39.1 s\n", - "Wall time: 1.39 s\n" + "CPU times: user 1min 5s, sys: 4.15 s, total: 1min 9s\n", + "Wall time: 2.74 s\n" ] } ], @@ -178,34 +162,34 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5 s, sys: 768 ms, total: 5.77 s\n", - "Wall time: 729 ms\n" + "CPU times: user 2.89 s, sys: 760 ms, total: 3.65 s\n", + "Wall time: 1.11 s\n" ] } ], "source": [ "%%time\n", - "X = pd2pygdf(X)" + "X = pygdf.DataFrame.from_pandas(X)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.17 s, sys: 412 ms, total: 1.58 s\n", - "Wall time: 1.59 s\n" + "CPU times: user 1.39 s, sys: 480 ms, total: 1.87 s\n", + "Wall time: 1.96 s\n" ] } ], @@ -219,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -241,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -283,7 +267,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.5.5" } }, "nbformat": 4, From f11c336c50a066e58413fd243e20c26e132c3ea7 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Thu, 18 Oct 2018 11:38:58 -0700 Subject: [PATCH 3/3] add doc string and fix a notebook bug --- python/cuML/knn/knn_wrapper.py | 65 +++++++++++++++++++++++++++++++-- python/notebooks/knn_demo.ipynb | 25 ++++++++----- 2 files changed, 77 insertions(+), 13 deletions(-) diff --git a/python/cuML/knn/knn_wrapper.py b/python/cuML/knn/knn_wrapper.py index 8449200485..6eb4b6e35f 100644 --- a/python/cuML/knn/knn_wrapper.py +++ b/python/cuML/knn/knn_wrapper.py @@ -1,3 +1,18 @@ + # Copyright (c) 2018, NVIDIA CORPORATION. 
+ # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # + import faiss import numpy as np import pandas as pd @@ -8,7 +23,51 @@ def __init__(self,n_gpus): self.n_gpus = n_gpus class KNN: + """ + Create a DataFrame, fill it with data, and compute KNN: + .. code-block:: python + import pygdf + from cuML import KNN + import numpy as np + np_float = np.array([ + [1.,2.,3.], # 1st point + [1.,2.,4.], # 2nd point + [2.,2.,4.] # 3rd point + ]).astype('float32') + gdf_float = pygdf.DataFrame() + gdf_float['dim_0'] = np.ascontiguousarray(np_float[:,0]) + gdf_float['dim_1'] = np.ascontiguousarray(np_float[:,1]) + gdf_float['dim_2'] = np.ascontiguousarray(np_float[:,2]) + print('n_samples = 3, n_dims = 3') + print(gdf_float) + knn_float = KNN(n_gpus=1) + knn_float.fit(gdf_float) + Distance,Index = knn_float.query(gdf_float,k=3) #get 3 nearest neighbors + print("Index:") + print(Index) + print("Distance:") + print(Distance) + + Output: + .. code-block:: python + n_samples = 3, n_dims = 3 + dim_0 dim_1 dim_2 + 0 1.0 2.0 3.0 + 1 1.0 2.0 4.0 + 2 2.0 2.0 4.0 + Index: + index_neighbor_0 index_neighbor_1 index_neighbor_2 + 0 0 1 2 + 1 1 0 2 + 2 2 1 0 + Distance: + distance_neighbor_0 distance_neighbor_1 distance_neighbor_2 + 0 0.0 1.0 2.0 + 1 0.0 1.0 1.0 + 2 0.0 1.0 2.0 + For an additional example see `the KNN notebook `_. For additional docs, see `scikitlearn's KDtree `_. 
+ """ def __init__(self, n_gpus=-1): # -1 means using all gpus self.params = KNNparams(n_gpus) @@ -34,10 +93,8 @@ def query(self,X,k): return D,I def to_nparray(self,x): - if isinstance(x,pd.DataFrame): - x = x.values - elif isinstance(x,pygdf.DataFrame): - x = x.to_pandas().values + if isinstance(x,pygdf.DataFrame): + x = x.to_pandas() return np.ascontiguousarray(x) def to_pygdf(self,df,col=''): diff --git a/python/notebooks/knn_demo.ipynb b/python/notebooks/knn_demo.ipynb index de964e4f46..5374cc7c23 100644 --- a/python/notebooks/knn_demo.ipynb +++ b/python/notebooks/knn_demo.ipynb @@ -65,7 +65,7 @@ " X = X[np.random.randint(0,X.shape[0]-1,nrows),:ncols]\n", " else:\n", " print('use random data')\n", - " X = np.random.rand(nrows,ncols)\n", + " X = np.random.random((nrows,ncols)).astype('float32')\n", " df = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])}).fillna(0)\n", " return df" ] @@ -117,8 +117,8 @@ "text": [ "use mortgage data\n", "data (65536, 40)\n", - "CPU times: user 4.42 s, sys: 784 ms, total: 5.2 s\n", - "Wall time: 5.04 s\n" + "CPU times: user 4.4 s, sys: 776 ms, total: 5.17 s\n", + "Wall time: 4.83 s\n" ] } ], @@ -149,8 +149,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2min 13s, sys: 888 ms, total: 2min 14s\n", - "Wall time: 1min 59s\n" + "CPU times: user 3min 23s, sys: 1.15 s, total: 3min 24s\n", + "Wall time: 3min 10s\n" ] } ], @@ -169,8 +169,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 6.91 s, sys: 1.11 s, total: 8.02 s\n", - "Wall time: 899 ms\n" + "CPU times: user 7.12 s, sys: 1.06 s, total: 8.18 s\n", + "Wall time: 908 ms\n" ] } ], @@ -188,8 +188,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2.27 s, sys: 368 ms, total: 2.64 s\n", - "Wall time: 2.72 s\n" + "CPU times: user 600 ms, sys: 320 ms, total: 920 ms\n", + "Wall time: 983 ms\n" ] } ], @@ -222,6 +222,13 @@ "message = 'compare knn: cuml vs sklearn indexes %s'%('equal'if passed else 'NOT equal')\n", "print(message)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {