Merge branch 'enh-stress-quality-tests' into pr/9

rapidsai · May 6, 2019 · 14a53a2 · 14a53a2
2 parents b8fe721 + b6b425a
commit 14a53a2
Show file tree

Hide file tree

Showing 13 changed files with 647 additions and 254 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -37,6 +37,7 @@
 - PR #500: Added CI check for black listed CUDA Runtime API calls
 - PR #475: exposing cumlHandle for dbscan from python-side
 - PR #395: Edited the CONTRIBUTING.md file
+- PR #407: Test files to run stress, correctness and unit tests for cuml algos
 - PR #512: generic copy method for copying buffers between device/host
 - PR #533: Add cudatoolkit conda dependency
 - PR #524: Use cmake find blas and find lapack to pass configure options to faiss

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -36,7 +36,7 @@ nvidia-smi
 logger "Activate conda env..."
 source activate gdf
 conda install -c rapidsai/label/cuda${CUDA_REL} -c rapidsai-nightly/label/cuda${CUDA_REL} cudf=${CUDF_VERSION} rmm=${RMM_VERSION} nvstrings=${NVSTRINGS_VERSION}
-conda install -c conda-forge lapack cmake==3.14.3
+conda install -c conda-forge lapack cmake==3.14.3 umap-learn
 
 logger "Check versions..."
 python --version

diff --git a/python/cuml/test/conftest.py b/python/cuml/test/conftest.py
@@ -0,0 +1,18 @@
+import pytest
+
+
+def pytest_addoption(parser):
+    """Register the opt-in ``--run_quality`` and ``--run_stress`` flags."""
+    for flag, text in (("--run_quality", "run correctness tests"),
+                       ("--run_stress", "run stress tests")):
+        parser.addoption(flag, action="store_true", default=False, help=text)
+
+
+@pytest.fixture
+def run_stress(request):
+    """Expose the value of the ``--run_stress`` command-line flag."""
+    stress_enabled = request.config.getoption("--run_stress")
+    return stress_enabled
+
+
+@pytest.fixture
+def run_quality(request):
+    """Expose the value of the ``--run_quality`` command-line flag."""
+    quality_enabled = request.config.getoption("--run_quality")
+    return quality_enabled
diff --git a/python/cuml/test/test_coordinate_descent.py b/python/cuml/test/test_coordinate_descent.py
@@ -0,0 +1,153 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pytest
+import cudf
+import numpy as np
+import pandas as pd
+from cuml import Lasso as cuLasso
+from sklearn.linear_model import Lasso
+from cuml.linear_model import ElasticNet as cuElasticNet
+from sklearn.linear_model import ElasticNet
+from cuml.test.utils import array_equal
+from sklearn.datasets import make_regression
+
+
+@pytest.mark.parametrize('datatype', [np.float32, np.float64])
+@pytest.mark.parametrize('X_type', ['dataframe', 'ndarray'])
+@pytest.mark.parametrize('lr', [0.1, 0.001])
+@pytest.mark.parametrize('algorithm', ['cyclic', 'random'])
+def test_lasso(datatype, X_type, lr, algorithm,
+               run_stress, run_quality):
+    """Check cuML Lasso predictions against scikit-learn's Lasso.
+
+    Both models are fit with ``alpha=lr`` on the first 80% of a
+    ``make_regression`` dataset; their predictions for the remaining rows
+    are compared with ``array_equal`` (1e-1, ``with_sign=True``).
+
+    Dataset size: 500000x100 with ``run_stress``, 5000x50 with
+    ``run_quality``, 50x5 otherwise.
+    """
+    # (rows, features, informative features) for the selected mode
+    if run_stress:
+        n_samples, n_features, n_informative = 500000, 100, 50
+    elif run_quality:
+        n_samples, n_features, n_informative = 5000, 50, 25
+    else:
+        n_samples, n_features, n_informative = 50, 5, 3
+    n_train = np.int32(n_samples * 0.8)
+
+    features, target = make_regression(n_samples=n_samples,
+                                       n_features=n_features,
+                                       n_informative=n_informative,
+                                       random_state=0)
+
+    X_train = np.asarray(features[:n_train, :]).astype(datatype)
+    X_test = np.asarray(features[n_train:, :]).astype(datatype)
+    y_train = np.asarray(target[:n_train]).astype(datatype)
+
+    # identical hyper-parameters for both implementations
+    common = dict(fit_intercept=True, normalize=False, max_iter=1000,
+                  selection=algorithm, tol=1e-10)
+
+    sk_lasso = Lasso(alpha=np.array([lr]), **common)
+    sk_lasso.fit(X_train, y_train)
+
+    cu_lasso = cuLasso(alpha=np.array([lr]), **common)
+
+    if X_type == 'dataframe':
+        def as_frame(matrix):
+            # one 'fea<i>' column per feature, in column order
+            return pd.DataFrame(
+                {'fea%d' % i: matrix[0:, i] for i in range(matrix.shape[1])})
+
+        train_gdf = cudf.DataFrame.from_pandas(as_frame(X_train))
+        # X_test stays a pandas frame: sklearn predicts on it further down
+        X_test = as_frame(X_test)
+        test_gdf = cudf.DataFrame.from_pandas(X_test)
+        target_frame = pd.DataFrame({'fea0': y_train[0:, ]})
+        target_gdf = cudf.Series(target_frame.values[:, 0])
+        cu_lasso.fit(train_gdf, target_gdf)
+        cu_predict = cu_lasso.predict(test_gdf).to_array()
+
+    elif X_type == 'ndarray':
+        cu_lasso.fit(X_train, y_train)
+        cu_predict = cu_lasso.predict(X_test).to_array()
+
+    sk_predict = sk_lasso.predict(X_test)
+    assert array_equal(sk_predict, cu_predict, 1e-1, with_sign=True)
+
+
+@pytest.mark.parametrize('datatype', [np.float32, np.float64])
+@pytest.mark.parametrize('X_type', ['dataframe', 'ndarray'])
+@pytest.mark.parametrize('lr', [0.1, 0.001])
+@pytest.mark.parametrize('algorithm', ['cyclic', 'random'])
+def test_elastic_net(datatype, X_type, lr, algorithm,
+                     run_stress, run_quality):
+    """Compare cuML ElasticNet predictions with scikit-learn's ElasticNet.
+
+    Both estimators are fit with ``alpha=lr`` on the leading rows of a
+    ``make_regression`` dataset and must agree on the held-out rows
+    (``array_equal`` with tolerance 1e-1 and ``with_sign=True``).
+
+    Dataset size: 500000x100 with ``run_stress``, 5000x100 with
+    ``run_quality``, 50x5 otherwise. In every mode the first 80% of the
+    rows are used for training.
+    """
+    nrows = 5000
+    ncols = 100
+    n_info = 50
+    if run_stress:
+        # 400000 training rows out of 500000 generated samples
+        train_rows = np.int32(nrows*80)
+        X, y = make_regression(n_samples=(nrows*100), n_features=ncols,
+                               n_informative=n_info, random_state=0)
+
+    elif run_quality:
+        train_rows = np.int32(nrows*0.8)
+        X, y = make_regression(n_samples=nrows, n_features=ncols,
+                               n_informative=n_info, random_state=0)
+
+    else:
+        # small default problem so the plain unit run stays fast
+        nrows = 50
+        ncols = 5
+        n_info = 3
+        train_rows = np.int32(nrows*0.8)
+        X, y = make_regression(n_samples=(nrows), n_features=ncols,
+                               n_informative=n_info, random_state=0)
+
+    X_test = np.asarray(X[train_rows:, 0:]).astype(datatype)
+    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
+    y_train = np.asarray(y[0:train_rows, ]).astype(datatype)
+
+    # alpha comes from the parametrized ``lr`` so that both regularisation
+    # strengths are exercised, mirroring test_lasso
+    elastic_sk = ElasticNet(alpha=np.array([lr]), fit_intercept=True,
+                            normalize=False, max_iter=1000,
+                            selection=algorithm, tol=1e-10)
+
+    elastic_sk.fit(X_train, y_train)
+
+    elastic_cu = cuElasticNet(alpha=np.array([lr]), fit_intercept=True,
+                              normalize=False, max_iter=1000,
+                              selection=algorithm, tol=1e-10)
+
+    if X_type == 'dataframe':
+        y_train = pd.DataFrame({'fea0': y_train[0:, ]})
+        X_train = pd.DataFrame(
+            {'fea%d' % i: X_train[0:, i] for i in range(X_train.shape[1])})
+        # X_test is rebound to a pandas frame; sklearn predicts on it below
+        X_test = pd.DataFrame(
+            {'fea%d' % i: X_test[0:, i] for i in range(X_test.shape[1])})
+        X_cudf = cudf.DataFrame.from_pandas(X_train)
+        X_cudf_test = cudf.DataFrame.from_pandas(X_test)
+        # flatten the single-column target frame into a cudf Series
+        y_cudf = y_train.values
+        y_cudf = y_cudf[:, 0]
+        y_cudf = cudf.Series(y_cudf)
+        elastic_cu.fit(X_cudf, y_cudf)
+        cu_predict = elastic_cu.predict(X_cudf_test).to_array()
+
+    elif X_type == 'ndarray':
+
+        elastic_cu.fit(X_train, y_train)
+        cu_predict = elastic_cu.predict(X_test).to_array()
+
+    sk_predict = elastic_sk.predict(X_test)
+
+    assert array_equal(sk_predict, cu_predict, 1e-1, with_sign=True)
diff --git a/python/cuml/test/test_dbscan.py b/python/cuml/test/test_dbscan.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2019, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,123 +15,94 @@
 
 import pytest
 from cuml import DBSCAN as cuDBSCAN
 from cuml.test.utils import get_handle
 from sklearn.cluster import DBSCAN as skDBSCAN
+from sklearn.datasets.samples_generator import make_blobs
+import pandas as pd
 import cudf
 import numpy as np
-
 from sklearn.preprocessing import StandardScaler
-
 from cuml.test.utils import fit_predict, get_pattern, clusters_equal
 
-dataset_names = ['noisy_moons', 'varied', 'aniso', 'blobs', 'noisy_circles',
-                 'no_structure']
+
+dataset_names = ['noisy_moons', 'varied', 'aniso', 'blobs',
+                 'noisy_circles', 'no_structure']
 
 
 @pytest.mark.parametrize('max_bytes_per_batch', [10, 200, 2e6])
 @pytest.mark.parametrize('datatype', [np.float32, np.float64])
 @pytest.mark.parametrize('input_type', ['dataframe', 'ndarray'])
 @pytest.mark.parametrize('use_handle', [True, False])
-def test_dbscan_predict(datatype, input_type, use_handle, max_bytes_per_batch):
+def test_dbscan_predict(datatype, input_type, use_handle, max_bytes_per_batch,
+                        run_stress, run_quality):
+    """Check that cuML DBSCAN assigns the same labels as scikit-learn.
+
+    The default run clusters a fixed six-point dataset; ``run_quality``
+    switches to 10000 ``make_blobs`` samples with 50 features and
+    ``run_stress`` to 50 times as many samples. The input reaches cuML
+    either as a cudf DataFrame or as a numpy array (``input_type``), and
+    the labels are compared element by element.
+    """
 
     # max_bytes_per_batch sizes: 10=6 batches, 200=2 batches, 2e6=1 batch
-
-    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
-                 dtype=datatype)
-    skdbscan = skDBSCAN(eps=3, min_samples=2)
+    n_samples = 10000
+    n_feats = 50
+    if run_stress or run_quality:
+        # stress mode clusters 50x as many points as quality mode
+        n_blob_rows = n_samples * 50 if run_stress else n_samples
+        X, _ = make_blobs(n_samples=n_blob_rows,
+                          n_features=n_feats, random_state=0)
+        # make_blobs produces float64 data; honour the parametrized dtype
+        X = X.astype(datatype)
+    else:
+        X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
+                     dtype=datatype)
+    # both implementations must share eps/min_samples for labels to agree
+    skdbscan = skDBSCAN(eps=3, min_samples=2)
     sk_labels = skdbscan.fit_predict(X)
 
     handle, stream = get_handle(use_handle)
     cudbscan = cuDBSCAN(handle=handle, eps=3, min_samples=2,
                         max_bytes_per_batch=max_bytes_per_batch)
 
     if input_type == 'dataframe':
-        gdf = cudf.DataFrame()
-        gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
-        gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)
-        cu_labels = cudbscan.fit_predict(gdf)
+        X = pd.DataFrame(
+            {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
+        X_cudf = cudf.DataFrame.from_pandas(X)
+        cu_labels = cudbscan.fit_predict(X_cudf)
     else:
         cu_labels = cudbscan.fit_predict(X)
-    cudbscan.handle.sync()
-
-    for i in range(X.shape[0]):
-        assert cu_labels[i] == sk_labels[i]
-
-
-@pytest.mark.parametrize('datatype', [np.float32, np.float64])
-@pytest.mark.parametrize('use_handle', [True, False])
-def test_dbscan_predict_numpy(datatype, use_handle):
-    gdf = cudf.DataFrame()
-    gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
-    gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)
 
-    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
-                 dtype=datatype)
-
-    print("Calling fit_predict")
-    handle, stream = get_handle(use_handle)
-    cudbscan = cuDBSCAN(handle=handle, eps=3, min_samples=2)
-    cu_labels = cudbscan.fit_predict(gdf)
-    skdbscan = skDBSCAN(eps=3, min_samples=2)
-    sk_labels = skdbscan.fit_predict(X)
-    print(X.shape[0])
-    cudbscan.handle.sync()
     for i in range(X.shape[0]):
         assert cu_labels[i] == sk_labels[i]
 
 
-def test_dbscan_predict_multiple_streams():
-    datatype = np.float32
-    gdf = cudf.DataFrame()
-    gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
-    gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)
-
-    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
-                 dtype=datatype)
-
-    skdbscan = skDBSCAN(eps=3, min_samples=2)
-    sk_labels = skdbscan.fit_predict(X)
-
-    handle1, stream1 = get_handle(True)
-    handle2, stream2 = get_handle(True)
-    cudbscan1 = cuDBSCAN(handle=handle1, eps=3, min_samples=2)
-    cudbscan2 = cuDBSCAN(handle=handle2, eps=3, min_samples=2)
-    cu_labels1 = cudbscan1.fit_predict(gdf)
-    cu_labels2 = cudbscan2.fit_predict(gdf)
-    cudbscan1.handle.sync()
-    cudbscan2.handle.sync()
-    for i in range(X.shape[0]):
-        assert cu_labels1[i] == sk_labels[i]
-        assert cu_labels2[i] == sk_labels[i]
-
-
 @pytest.mark.parametrize("name", [
                                  'noisy_moons',
                                  'blobs',
                                  'no_structure'])
-@pytest.mark.parametrize('use_handle', [True, False])
-@pytest.mark.stress
-def test_dbscan_sklearn_comparison(name, use_handle):
-    # Skipping datasets of known discrepancies in PR83 while they are corrected
+def test_dbscan_sklearn_comparison(name, run_stress, run_quality):
     default_base = {'quantile': .3,
                     'eps': .3,
                     'damping': .9,
                     'preference': -200,
                     'n_neighbors': 10,
-                    'n_clusters': 3}
+                    'n_clusters': 20}
+    n_samples = 10000
+    if run_stress:
+        pat = get_pattern(name, n_samples*50)
+        params = default_base.copy()
+        params.update(pat[1])
+        X, y = pat[0]
+
+    elif run_quality:
+        pat = get_pattern(name, n_samples)
+        params = default_base.copy()
+        params.update(pat[1])
+        X, y = pat[0]
 
-    pat = get_pattern(name, 1500)
+    else:
+        pat = get_pattern(name, np.int32(n_samples/2))
+        params = default_base.copy()
+        params.update(pat[1])
+        X, y = pat[0]
 
-    params = default_base.copy()
-    params.update(pat[1])
+    X = StandardScaler().fit_transform(X)
 
     dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
-    handle, stream = get_handle(use_handle)
-    cuml_dbscan = cuDBSCAN(handle=handle, eps=params['eps'], min_samples=5)
-
-    X, y = pat[0]
-
-    X = StandardScaler().fit_transform(X)
+    cuml_dbscan = cuDBSCAN(eps=params['eps'], min_samples=5)
 
     clustering_algorithms = (
         ('sk_DBSCAN', dbscan),
@@ -144,8 +115,6 @@ def test_dbscan_sklearn_comparison(name, use_handle):
     cu_y_pred, cu_n_clusters = fit_predict(clustering_algorithms[1][1],
                                            clustering_algorithms[1][0], X)
 
-    cuml_dbscan.handle.sync()
-
     assert(sk_n_clusters == cu_n_clusters)
 
     clusters_equal(sk_y_pred, cu_y_pred, sk_n_clusters)
diff --git a/python/cuml/test/test_kalman_filter.py b/python/cuml/test/test_kalman_filter.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2019, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.