Merge pull request #7 from Salonijain27/all_tests

All tests
rapidsai · May 3, 2019 · b6b425a · b6b425a
2 parents 18c9987 + 20d27ae
commit b6b425a
Show file tree

Hide file tree

Showing 12 changed files with 130 additions and 57 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -35,6 +35,7 @@
 - PR #500: Added CI check for black listed CUDA Runtime API calls
 - PR #475: exposing cumlHandle for dbscan from python-side
 - PR #395: Edited the CONTRIBUTING.md file
+- PR #407: Test files to run stress, correctness and unit tests for cuml algos
 - PR #512: generic copy method for copying buffers between device/host
 - PR #533: Add cudatoolkit conda dependency
 - PR #524: Use cmake find blas and find lapack to pass configure options to faiss

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -36,7 +36,7 @@ nvidia-smi
 logger "Activate conda env..."
 source activate gdf
 conda install -c rapidsai/label/cuda${CUDA_REL} -c rapidsai-nightly/label/cuda${CUDA_REL} cudf=${CUDF_VERSION} rmm=${RMM_VERSION} nvstrings=${NVSTRINGS_VERSION}
-conda install -c conda-forge lapack cmake==3.14.3
+conda install -c conda-forge lapack cmake==3.14.3 umap-learn
 
 logger "Check versions..."
 python --version

diff --git a/python/cuml/test/conftest.py b/python/cuml/test/conftest.py
@@ -0,0 +1,18 @@
+import pytest
+
+
+def pytest_addoption(parser):
+    """Register the command-line switches that select the test intensity.
+
+    Both options are ``store_true`` flags defaulting to False; the
+    ``run_quality`` and ``run_stress`` fixtures in this file read them back.
+    """
+    parser.addoption("--run_quality", action="store_true",
+                     default=False, help="run correctness tests")
+    parser.addoption("--run_stress", action="store_true",
+                     default=False, help="run stress tests")
+
+
+@pytest.fixture
+def run_stress(request):
+    """Return the value of the ``--run_stress`` command-line option."""
+    return request.config.getoption("--run_stress")
+
+
+@pytest.fixture
+def run_quality(request):
+    """Return the value of the ``--run_quality`` command-line option."""
+    return request.config.getoption("--run_quality")
diff --git a/python/cuml/test/test_coordinate_descent.py b/python/cuml/test/test_coordinate_descent.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2019, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2019, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/cuml/test/test_kalman_filter.py b/python/cuml/test/test_kalman_filter.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2019, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/cuml/test/test_kmeans.py b/python/cuml/test/test_kmeans.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2019, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/cuml/test/test_linear_model.py b/python/cuml/test/test_linear_model.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2019, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/cuml/test/test_nearest_neighbors.py b/python/cuml/test/test_nearest_neighbors.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2019, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/cuml/test/test_pca.py b/python/cuml/test/test_pca.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2019, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/cuml/test/test_tsvd.py b/python/cuml/test/test_tsvd.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2019, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/cuml/test/test_umap.py b/python/cuml/test/test_umap.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2019, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,44 +12,58 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import pytest
 
-from cuml.manifold.umap import UMAP
 
+# These tests compare cuML's UMAP against the reference umap-learn package.
+# Install it first with: conda install -c conda-forge umap-learn
+
+import pytest
+from cuml.test.utils import array_equal
+
+from cuml.manifold.umap import UMAP as UMAP_cuml
+import umap
 import cudf
 import pandas as pd
 import numpy as np
-
 from sklearn import datasets
 from sklearn.manifold.t_sne import trustworthiness
 from sklearn.cluster import KMeans
 from sklearn.metrics import adjusted_rand_score
+from sklearn.datasets.samples_generator import make_blobs
 
+dataset_names = ['iris', 'digits', 'wine', 'blobs']
 
-def test_blobs_cluster():
-    data, labels = datasets.make_blobs(
-        n_samples=500, n_features=10, centers=5)
-    embedding = UMAP().fit_transform(data)
-    score = adjusted_rand_score(labels,
-                                KMeans(5).fit_predict(embedding))
-    assert score == 1.0
 
+def test_umap_fit_transform_score(run_stress, run_quality):
+    """Compare cuML UMAP with umap-learn by clustering their embeddings.
+
+    Blobs with 10 centers are embedded by both implementations using the
+    same hyper-parameters; each embedding is clustered with a seeded KMeans
+    and the two adjusted Rand scores are compared via ``array_equal``
+    (third argument 1e-2, presumably the tolerance -- confirm in
+    cuml.test.utils).
+
+    ``run_stress`` / ``run_quality`` are the conftest fixtures; here they
+    only select the size of the generated dataset.
+    """
 
-def test_umap_transform_on_iris():
-    iris = datasets.load_iris()
-    iris_selection = np.random.choice(
-        [True, False], 150, replace=True, p=[0.75, 0.25])
-    data = iris.data[iris_selection]
+    if run_stress:
+        n_samples = 500000
+        n_features = 1000
+
+    elif run_quality:
+        n_samples = 5000
+        n_features = 100
+
+    else:
+        n_samples = 500
+        n_features = 10
+
+    data, labels = make_blobs(n_samples=n_samples, n_features=n_features,
+                              centers=10, random_state=42)
+
+    # Same n_neighbors/min_dist on both sides so the scores are comparable.
+    model = umap.UMAP(n_neighbors=10, min_dist=0.01)
+    cuml_model = UMAP_cuml(n_neighbors=10, min_dist=0.01, verbose=True)
 
-    fitter = UMAP(n_neighbors=10, min_dist=0.01, verbose=True)
-    fitter.fit(data)
-
-    new_data = iris.data[~iris_selection]
-    embedding = fitter.transform(new_data)
+    embedding = model.fit_transform(data)
+    cuml_embedding = cuml_model.fit_transform(data)
 
-    trust = trustworthiness(new_data, embedding, 10)
-    assert trust >= 0.90
+    # Seed KMeans so the comparison does not depend on its random init.
+    cuml_score = adjusted_rand_score(
+        labels, KMeans(10, random_state=42).fit_predict(cuml_embedding))
+    score = adjusted_rand_score(
+        labels, KMeans(10, random_state=42).fit_predict(embedding))
 
+    assert array_equal(score, cuml_score, 1e-2, with_sign=True)
+
 
 def test_supervised_umap_trustworthiness_on_iris():
     iris = datasets.load_iris()
@@ -79,53 +93,93 @@ def test_umap_trustworthiness_on_iris():
     embedding = UMAP(n_neighbors=10, min_dist=0.01).fit_transform(data)
     trust = trustworthiness(iris.data, embedding, 10)
 
-    # We are doing a spectral embedding but not a
-    # multi-component layout (which is marked experimental).
-    # As a result, our score drops by 0.006.
-    assert trust >= 0.964
 
+@pytest.mark.parametrize('name', dataset_names)
+def test_umap_fit_transform_trust(name, run_stress, run_quality):
+    """Compare cuML and umap-learn trustworthiness on one named dataset.
+
+    ``name`` selects iris, digits (5 classes), wine, or -- for any other
+    value -- a 5000x10 synthetic blobs set.  Both models embed the same
+    data with n_neighbors=10 and min_dist=0.01, and the two trustworthiness
+    scores (10 neighbours) are compared via ``array_equal`` (third argument
+    1e-2, presumably the tolerance -- confirm in cuml.test.utils).
+
+    NOTE(review): ``run_stress``, ``run_quality`` and ``labels`` are never
+    read in this test, so the CLI flags do not change the workload here.
+    """
 
+    if name == 'iris':
+        iris = datasets.load_iris()
+        data = iris.data
+        labels = iris.target
+
+    elif name == 'digits':
+        digits = datasets.load_digits(n_class=5)
+        data = digits.data
+        labels = digits.target
+
+    elif name == 'wine':
+        wine = datasets.load_wine()
+        data = wine.data
+        labels = wine.target
+    else:
+        data, labels = make_blobs(n_samples=5000, n_features=10,
+                                  centers=10, random_state=42)
+
+    # Reference (umap-learn) and cuML models share the same settings.
+    model = umap.UMAP(n_neighbors=10, min_dist=0.01)
+    cuml_model = UMAP_cuml(n_neighbors=10, min_dist=0.01, verbose=True)
+    embedding = model.fit_transform(data)
+    cuml_embedding = cuml_model.fit_transform(data)
+
+    trust = trustworthiness(data, embedding, 10)
+    cuml_trust = trustworthiness(data, cuml_embedding, 10)
+
+    assert array_equal(trust, cuml_trust, 1e-2, with_sign=True)
 
 
 @pytest.mark.parametrize('should_downcast', [True, False])
 @pytest.mark.parametrize('input_type', ['dataframe', 'ndarray'])
-def test_umap_data_formats(input_type, should_downcast):
+def test_umap_data_formats(input_type, should_downcast,
+                           run_stress, run_quality):
+    """Check that fit_transform returns the same container type it was given.
+
+    The input is float64 when ``should_downcast`` is True and float32
+    otherwise.  Data is 500000x50 blobs under ``run_stress``, 5000x50 blobs
+    under ``run_quality``, and the 9-class digits set by default.  A cuDF
+    DataFrame input must yield a cuDF DataFrame; an ndarray must yield an
+    ndarray.
+    """
 
     dtype = np.float32 if not should_downcast else np.float64
+    n_samples = 50000
+    n_feats = 50
+    if run_stress:
+        X, y = datasets.make_blobs(n_samples=n_samples*10,
+                                   n_features=n_feats, random_state=0)
 
-    # For now, FAISS based nearest_neighbors only supports single precision
-    digits = datasets.load_digits(n_class=9)
-    X = digits["data"].astype(dtype)
+    elif run_quality:
+        X, y = datasets.make_blobs(n_samples=int(n_samples/10),
+                                   n_features=n_feats, random_state=0)
 
-    umap = UMAP(n_neighbors=3, n_components=2,
-                should_downcast=should_downcast)
+    else:
+        digits = datasets.load_digits(n_class=9)
+        X = digits["data"]
+
+    # For now, FAISS based nearest_neighbors only supports single precision,
+    # so every branch (make_blobs returns float64) must be cast to the dtype
+    # under test before it reaches UMAP.
+    X = X.astype(dtype)
 
-    if input_type == 'dataframe':
-        X = cudf.DataFrame.from_pandas(pd.DataFrame(X))
-        embeds = umap.fit_transform(X)
+    umap = UMAP_cuml(n_neighbors=3, n_components=2,
+                     should_downcast=should_downcast)
 
+    if input_type == 'dataframe':
+        X_pd = pd.DataFrame(
+               {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
+        X_cudf = cudf.DataFrame.from_pandas(X_pd)
+        embeds = umap.fit_transform(X_cudf)
         assert type(embeds) == cudf.DataFrame
+
     else:
         embeds = umap.fit_transform(X)
-
         assert type(embeds) == np.ndarray
 
 
 @pytest.mark.parametrize('input_type', ['dataframe', 'ndarray'])
-def test_umap_downcast_fails(input_type):
+def test_umap_downcast_fails(input_type, run_stress, run_quality):
+    n_samples = 50000
+    n_feats = 50
+    if run_stress:
+        X, y = datasets.make_blobs(n_samples=n_samples*10,
+                                   n_features=n_feats, random_state=0)
+
+    elif run_quality:
+        X, y = datasets.make_blobs(n_samples=int(n_samples/10),
+                                   n_features=n_feats, random_state=0)
 
-    X = np.array([[1.0, 1.0], [50.0, 1.0], [51.0, 1.0]], dtype=np.float64)
+    else:
+        X = np.array([[1.0, 1.0], [50.0, 1.0], [51.0, 1.0]],
+                     dtype=np.float64)
 
     # Test fit() fails with double precision when should_downcast set to False
-    umap = UMAP(should_downcast=False)
+    umap = UMAP_cuml(should_downcast=False)
     if input_type == 'dataframe':
         X = cudf.DataFrame.from_pandas(pd.DataFrame(X))
 
@@ -135,7 +189,7 @@ def test_umap_downcast_fails(input_type):
     # Test fit() fails when downcast corrupted data
     X = np.array([[np.finfo(np.float32).max]], dtype=np.float64)
 
-    umap = UMAP(should_downcast=True)
+    umap = UMAP_cuml(should_downcast=True)
     if input_type == 'dataframe':
         X = cudf.DataFrame.from_pandas(pd.DataFrame(X))