Skip to content

Commit

Permalink
Merge branch 'enh-stress-quality-tests' into pr/9
Browse files Browse the repository at this point in the history
  • Loading branch information
Salonijain27 authored May 6, 2019
2 parents b8fe721 + b6b425a commit 14a53a2
Show file tree
Hide file tree
Showing 13 changed files with 647 additions and 254 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
- PR #500: Added CI check for black listed CUDA Runtime API calls
- PR #475: exposing cumlHandle for dbscan from python-side
- PR #395: Edited the CONTRIBUTING.md file
- PR #407: Test files to run stress, correctness and unit tests for cuml algos
- PR #512: generic copy method for copying buffers between device/host
- PR #533: Add cudatoolkit conda dependency
- PR #524: Use cmake find blas and find lapack to pass configure options to faiss
Expand Down
2 changes: 1 addition & 1 deletion ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ nvidia-smi
logger "Activate conda env..."
source activate gdf
conda install -c rapidsai/label/cuda${CUDA_REL} -c rapidsai-nightly/label/cuda${CUDA_REL} cudf=${CUDF_VERSION} rmm=${RMM_VERSION} nvstrings=${NVSTRINGS_VERSION}
conda install -c conda-forge lapack cmake==3.14.3
conda install -c conda-forge lapack cmake==3.14.3 umap-learn

logger "Check versions..."
python --version
Expand Down
18 changes: 18 additions & 0 deletions python/cuml/test/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pytest


def pytest_addoption(parser):
    """Register the command-line flags that select the test tier.

    Adds ``--run_quality`` and ``--run_stress`` to *parser*. Both are
    ``store_true`` switches that default to ``False``.
    """
    # (flag, help text) pairs, registered in this order.
    tier_flags = (
        ("--run_quality", "run correctness tests"),
        ("--run_stress", "run stress tests"),
    )
    for flag, description in tier_flags:
        parser.addoption(flag, action="store_true",
                         default=False, help=description)


@pytest.fixture
def run_stress(request):
    """Return the value of the ``--run_stress`` command-line option."""
    stress_enabled = request.config.getoption("--run_stress")
    return stress_enabled


@pytest.fixture
def run_quality(request):
    """Return the value of the ``--run_quality`` command-line option."""
    quality_enabled = request.config.getoption("--run_quality")
    return quality_enabled
153 changes: 153 additions & 0 deletions python/cuml/test/test_coordinate_descent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Copyright (c) 2019, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
import cudf
import numpy as np
import pandas as pd
from cuml import Lasso as cuLasso
from sklearn.linear_model import Lasso
from cuml.linear_model import ElasticNet as cuElasticNet
from sklearn.linear_model import ElasticNet
from cuml.test.utils import array_equal
from sklearn.datasets import make_regression


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('X_type', ['dataframe', 'ndarray'])
@pytest.mark.parametrize('lr', [0.1, 0.001])
@pytest.mark.parametrize('algorithm', ['cyclic', 'random'])
def test_lasso(datatype, X_type, lr, algorithm,
               run_stress, run_quality):
    """Check that cuML Lasso predictions agree with scikit-learn's.

    A synthetic regression problem is generated with ``make_regression``;
    its size depends on the selected tier:

    * ``run_stress``: 500000 samples, 100 features, 400000 training rows
    * ``run_quality``: 5000 samples, 50 features, 4000 training rows
    * otherwise: 50 samples, 5 features, 40 training rows

    ``lr`` is passed to both estimators as ``alpha``. The cuML model is fed
    either cuDF objects (``X_type == 'dataframe'``) or NumPy arrays
    (``X_type == 'ndarray'``).
    """
    base_rows, base_cols, base_info = 5000, 100, 50
    if run_stress:
        problem = (base_rows * 100, base_cols, base_info)
        train_rows = np.int32(base_rows * 80)
    elif run_quality:
        problem = (base_rows, int(base_cols / 2), int(base_info / 2))
        train_rows = np.int32(base_rows * 0.8)
    else:
        problem = (50, 5, 3)
        train_rows = np.int32(50 * 0.8)

    n_samples, n_features, n_informative = problem
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           n_informative=n_informative, random_state=0)

    # Leading rows are used for training, the remainder for prediction.
    X_train = np.asarray(X[:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[:train_rows]).astype(datatype)
    X_test = np.asarray(X[train_rows:, :]).astype(datatype)

    # Both estimators are configured identically.
    shared_kwargs = dict(fit_intercept=True, normalize=False,
                         max_iter=1000, selection=algorithm, tol=1e-10)

    sk_lasso = Lasso(alpha=np.array([lr]), **shared_kwargs)
    sk_lasso.fit(X_train, y_train)

    cu_lasso = cuLasso(alpha=np.array([lr]), **shared_kwargs)

    if X_type == 'dataframe':

        def as_frame(matrix):
            # One pandas column per feature, named fea0, fea1, ...
            return pd.DataFrame({'fea%d' % col: matrix[0:, col]
                                 for col in range(matrix.shape[1])})

        y_train = pd.DataFrame({'fea0': y_train[0:, ]})
        X_train = as_frame(X_train)
        # X_test stays a pandas frame for the scikit-learn predict below.
        X_test = as_frame(X_test)

        X_cudf = cudf.DataFrame.from_pandas(X_train)
        X_cudf_test = cudf.DataFrame.from_pandas(X_test)
        y_cudf = cudf.Series(y_train.values[:, 0])

        cu_lasso.fit(X_cudf, y_cudf)
        cu_predict = cu_lasso.predict(X_cudf_test).to_array()

    elif X_type == 'ndarray':
        cu_lasso.fit(X_train, y_train)
        cu_predict = cu_lasso.predict(X_test).to_array()

    sk_predict = sk_lasso.predict(X_test)
    assert array_equal(sk_predict, cu_predict, 1e-1, with_sign=True)


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('X_type', ['dataframe', 'ndarray'])
@pytest.mark.parametrize('lr', [0.1, 0.001])
@pytest.mark.parametrize('algorithm', ['cyclic', 'random'])
def test_elastic_net(datatype, X_type, lr, algorithm,
                     run_stress, run_quality):
    """Check that cuML ElasticNet predictions agree with scikit-learn's.

    A synthetic regression problem is generated with ``make_regression``;
    its size depends on the selected tier:

    * ``run_stress``: 500000 samples, 100 features, 400000 training rows
    * ``run_quality``: 5000 samples, 100 features, 4000 training rows
    * otherwise: 50 samples, 5 features, 40 training rows

    ``lr`` is passed to both estimators as ``alpha`` so that each
    parametrized value exercises a different regularization strength.
    The cuML model is fed either cuDF objects
    (``X_type == 'dataframe'``) or NumPy arrays (``X_type == 'ndarray'``).
    """
    nrows = 5000
    ncols = 100
    n_info = 50
    if run_stress:
        train_rows = np.int32(nrows*80)
        X, y = make_regression(n_samples=(nrows*100), n_features=ncols,
                               n_informative=n_info, random_state=0)

    elif run_quality:
        train_rows = np.int32(nrows*0.8)
        X, y = make_regression(n_samples=nrows, n_features=ncols,
                               n_informative=n_info, random_state=0)

    else:
        nrows = 50
        ncols = 5
        n_info = 3
        train_rows = np.int32(nrows*0.8)
        X, y = make_regression(n_samples=(nrows), n_features=ncols,
                               n_informative=n_info, random_state=0)

    # Leading rows are used for training, the remainder for prediction.
    X_test = np.asarray(X[train_rows:, 0:]).astype(datatype)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows, ]).astype(datatype)

    # alpha comes from the parametrized ``lr``; it was previously fixed at
    # 0.1, which made the two ``lr`` cases identical.
    elastic_sk = ElasticNet(alpha=np.array([lr]), fit_intercept=True,
                            normalize=False, max_iter=1000,
                            selection=algorithm, tol=1e-10)

    elastic_sk.fit(X_train, y_train)

    elastic_cu = cuElasticNet(alpha=np.array([lr]), fit_intercept=True,
                              normalize=False, max_iter=1000,
                              selection=algorithm, tol=1e-10)

    if X_type == 'dataframe':
        y_train = pd.DataFrame({'fea0': y_train[0:, ]})
        X_train = pd.DataFrame(
            {'fea%d' % i: X_train[0:, i] for i in range(X_train.shape[1])})
        # X_test stays a pandas frame for the scikit-learn predict below.
        X_test = pd.DataFrame(
            {'fea%d' % i: X_test[0:, i] for i in range(X_test.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_train)
        X_cudf_test = cudf.DataFrame.from_pandas(X_test)
        y_cudf = cudf.Series(y_train.values[:, 0])
        elastic_cu.fit(X_cudf, y_cudf)
        cu_predict = elastic_cu.predict(X_cudf_test).to_array()

    elif X_type == 'ndarray':
        elastic_cu.fit(X_train, y_train)
        cu_predict = elastic_cu.predict(X_test).to_array()

    sk_predict = elastic_sk.predict(X_test)

    assert array_equal(sk_predict, cu_predict, 1e-1, with_sign=True)
127 changes: 48 additions & 79 deletions python/cuml/test/test_dbscan.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018, NVIDIA CORPORATION.
# Copyright (c) 2019, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -15,123 +15,94 @@

import pytest
from cuml import DBSCAN as cuDBSCAN
from cuml.test.utils import get_handle
from sklearn.cluster import DBSCAN as skDBSCAN
from sklearn.datasets.samples_generator import make_blobs
import pandas as pd
import cudf
import numpy as np

from sklearn.preprocessing import StandardScaler

from cuml.test.utils import fit_predict, get_pattern, clusters_equal

dataset_names = ['noisy_moons', 'varied', 'aniso', 'blobs', 'noisy_circles',
'no_structure']

dataset_names = ['noisy_moons', 'varied', 'aniso', 'blobs',
'noisy_circles', 'no_structure']


@pytest.mark.parametrize('max_bytes_per_batch', [10, 200, 2e6])
@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('input_type', ['dataframe', 'ndarray'])
@pytest.mark.parametrize('use_handle', [True, False])
def test_dbscan_predict(datatype, input_type, use_handle, max_bytes_per_batch,
                        run_stress, run_quality):
    """Compare cuML DBSCAN labels with scikit-learn's, point by point.

    The data is ``make_blobs`` output with 50 features (500000 samples for
    ``run_stress``, 10000 for ``run_quality``) or, by default, a fixed
    six-point 2-D array cast to ``datatype``. cuML receives either a cuDF
    DataFrame (``input_type == 'dataframe'``) or the NumPy array.
    """
    # max_bytes_per_batch sizes: 10=6 batches, 200=2 batches, 2e6=1 batch
    # (these batch counts refer to the six-point default data set)
    n_samples = 10000
    n_feats = 50
    if run_stress:
        X, _ = make_blobs(n_samples=n_samples*50,
                          n_features=n_feats, random_state=0)
    elif run_quality:
        X, _ = make_blobs(n_samples=n_samples,
                          n_features=n_feats, random_state=0)
    else:
        X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                     dtype=datatype)

    # NOTE(review): scikit-learn is configured with min_samples=10 while the
    # cuML estimator below uses min_samples=2 -- confirm which is intended.
    skdbscan = skDBSCAN(eps=3, min_samples=10)
    sk_labels = skdbscan.fit_predict(X)

    handle, stream = get_handle(use_handle)
    cudbscan = cuDBSCAN(handle=handle, eps=3, min_samples=2,
                        max_bytes_per_batch=max_bytes_per_batch)

    if input_type == 'dataframe':
        X = pd.DataFrame(
            {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        cu_labels = cudbscan.fit_predict(X_cudf)
    else:
        cu_labels = cudbscan.fit_predict(X)
    cudbscan.handle.sync()

    for i in range(X.shape[0]):
        assert cu_labels[i] == sk_labels[i]


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('use_handle', [True, False])
def test_dbscan_predict_numpy(datatype, use_handle):
gdf = cudf.DataFrame()
gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)

X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
dtype=datatype)

print("Calling fit_predict")
handle, stream = get_handle(use_handle)
cudbscan = cuDBSCAN(handle=handle, eps=3, min_samples=2)
cu_labels = cudbscan.fit_predict(gdf)
skdbscan = skDBSCAN(eps=3, min_samples=2)
sk_labels = skdbscan.fit_predict(X)
print(X.shape[0])
cudbscan.handle.sync()
for i in range(X.shape[0]):
assert cu_labels[i] == sk_labels[i]


def test_dbscan_predict_multiple_streams():
datatype = np.float32
gdf = cudf.DataFrame()
gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)

X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
dtype=datatype)

skdbscan = skDBSCAN(eps=3, min_samples=2)
sk_labels = skdbscan.fit_predict(X)

handle1, stream1 = get_handle(True)
handle2, stream2 = get_handle(True)
cudbscan1 = cuDBSCAN(handle=handle1, eps=3, min_samples=2)
cudbscan2 = cuDBSCAN(handle=handle2, eps=3, min_samples=2)
cu_labels1 = cudbscan1.fit_predict(gdf)
cu_labels2 = cudbscan2.fit_predict(gdf)
cudbscan1.handle.sync()
cudbscan2.handle.sync()
for i in range(X.shape[0]):
assert cu_labels1[i] == sk_labels[i]
assert cu_labels2[i] == sk_labels[i]


@pytest.mark.parametrize("name", [
'noisy_moons',
'blobs',
'no_structure'])
@pytest.mark.parametrize('use_handle', [True, False])
@pytest.mark.stress
def test_dbscan_sklearn_comparison(name, use_handle):
# Skipping datasets of known discrepancies in PR83 while they are corrected
def test_dbscan_sklearn_comparison(name, run_stress, run_quality):
default_base = {'quantile': .3,
'eps': .3,
'damping': .9,
'preference': -200,
'n_neighbors': 10,
'n_clusters': 3}
'n_clusters': 20}
n_samples = 10000
if run_stress:
pat = get_pattern(name, n_samples*50)
params = default_base.copy()
params.update(pat[1])
X, y = pat[0]

elif run_quality:
pat = get_pattern(name, n_samples)
params = default_base.copy()
params.update(pat[1])
X, y = pat[0]

pat = get_pattern(name, 1500)
else:
pat = get_pattern(name, np.int32(n_samples/2))
params = default_base.copy()
params.update(pat[1])
X, y = pat[0]

params = default_base.copy()
params.update(pat[1])
X = StandardScaler().fit_transform(X)

dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
handle, stream = get_handle(use_handle)
cuml_dbscan = cuDBSCAN(handle=handle, eps=params['eps'], min_samples=5)

X, y = pat[0]

X = StandardScaler().fit_transform(X)
cuml_dbscan = cuDBSCAN(eps=params['eps'], min_samples=5)

clustering_algorithms = (
('sk_DBSCAN', dbscan),
Expand All @@ -144,8 +115,6 @@ def test_dbscan_sklearn_comparison(name, use_handle):
cu_y_pred, cu_n_clusters = fit_predict(clustering_algorithms[1][1],
clustering_algorithms[1][0], X)

cuml_dbscan.handle.sync()

assert(sk_n_clusters == cu_n_clusters)

clusters_equal(sk_y_pred, cu_y_pred, sk_n_clusters)
2 changes: 1 addition & 1 deletion python/cuml/test/test_kalman_filter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018, NVIDIA CORPORATION.
# Copyright (c) 2019, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
Loading

0 comments on commit 14a53a2

Please sign in to comment.