From 3de3aa25b9ecd8ab8f651b08de72aef825be5022 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Wed, 22 Jul 2020 19:51:29 +0200 Subject: [PATCH 1/4] Generate datasets for benchmarks using cuml.datasets. --- python/cuml/benchmark/datagen.py | 41 +++++++++++++++++--------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/python/cuml/benchmark/datagen.py b/python/cuml/benchmark/datagen.py index 6ad12fc95f..e157bd6326 100644 --- a/python/cuml/benchmark/datagen.py +++ b/python/cuml/benchmark/datagen.py @@ -35,13 +35,14 @@ """ import cudf +import cupy as cp import gzip import functools import numpy as np import os import pandas as pd -import sklearn.datasets +import cuml.datasets import sklearn.model_selection from urllib.request import urlretrieve @@ -55,12 +56,9 @@ def _gen_data_regression(n_samples, n_features, random_state=42): n_samples = int(1e6) if n_features == 0: n_features = 100 - X_arr, y_arr = sklearn.datasets.make_regression( + X_arr, y_arr = cuml.datasets.make_regression( n_samples=n_samples, n_features=n_features, random_state=random_state) - return ( - pd.DataFrame(X_arr.astype(np.float32)), - pd.Series(y_arr.astype(np.float32)), - ) + return cudf.DataFrame.from_gpu_matrix(X_arr), cudf.Series(y_arr) def _gen_data_blobs(n_samples, n_features, random_state=42, centers=None): @@ -69,20 +67,21 @@ def _gen_data_blobs(n_samples, n_features, random_state=42, centers=None): n_samples = int(1e6) if n_features == 0: n_samples = 100 - X_arr, y_arr = sklearn.datasets.make_blobs( + X_arr, y_arr = cuml.datasets.make_blobs( n_samples=n_samples, n_features=n_features, centers=centers, random_state=random_state) + print(type(X_arr), type(y_arr)) return ( - pd.DataFrame(X_arr.astype(np.float32)), - pd.Series(y_arr.astype(np.float32)), + cudf.DataFrame(X_arr.astype(np.float32)), + cudf.Series(y_arr.astype(np.float32)), ) def _gen_data_zeros(n_samples, n_features, random_state=42): """Dummy generator for use in testing - returns all 0s""" return ( - 
np.zeros((n_samples, n_features), dtype=np.float32), - np.zeros(n_samples, dtype=np.float32), + cudf.DataFrame(np.zeros((n_samples, n_features), dtype=np.float32)), + cudf.Series(np.zeros(n_samples, dtype=np.float32)), ) @@ -95,13 +94,13 @@ def _gen_data_classification( if n_features == 0: n_samples = 100 - X_arr, y_arr = sklearn.datasets.make_classification( + X_arr, y_arr = cuml.datasets.make_classification( n_samples=n_samples, n_features=n_features, n_classes=n_classes, random_state=random_state) return ( - pd.DataFrame(X_arr.astype(np.float32)), - pd.Series(y_arr.astype(np.float32)), + cudf.DataFrame(X_arr.astype(np.float32)), + cudf.Series(y_arr.astype(np.float32)), ) @@ -159,7 +158,7 @@ def load_higgs(): ) X_df = data_df[data_df.columns.difference(['label'])] y_df = data_df['label'] - return X_df, y_df + return cudf.DataFrame.from_pandas(X_df), cudf.Series.from_pandas(y_df) def _convert_to_numpy(data): @@ -170,6 +169,10 @@ def _convert_to_numpy(data): return tuple([_convert_to_numpy(d) for d in data]) elif isinstance(data, np.ndarray): return data + elif isinstance(data, cudf.DataFrame): + return data.as_matrix() + elif isinstance(data, cudf.Series): + return data.to_array() elif isinstance(data, (pd.DataFrame, pd.Series)): return data.to_numpy() else: @@ -181,6 +184,8 @@ def _convert_to_cudf(data): return None elif isinstance(data, tuple): return tuple([_convert_to_cudf(d) for d in data]) + elif isinstance(data, (cudf.DataFrame, cudf.Series)): + return data elif isinstance(data, pd.DataFrame): return cudf.DataFrame.from_pandas(data) elif isinstance(data, pd.Series): @@ -194,11 +199,9 @@ def _convert_to_pandas(data): return None elif isinstance(data, tuple): return tuple([_convert_to_pandas(d) for d in data]) - elif isinstance(data, pd.DataFrame): - return data - elif isinstance(data, pd.Series): + elif isinstance(data, (pd.DataFrame, pd.Series)): return data - elif isinstance(data, cudf.DataFrame): + elif isinstance(data, (cudf.DataFrame, cudf.Series)): 
return data.to_pandas() else: raise Exception("Unsupported type %s" % str(type(data))) From 8ceb0d2511c674b2d2f9d3178117da4b58f6e248 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Wed, 22 Jul 2020 19:54:44 +0200 Subject: [PATCH 2/4] Updated CHANGELOG.md. --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6bb62e801..325deae9da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ - PR #2566: Remove deprecated cuDF from_gpu_matrix calls - PR #2577: Fully removing NVGraph dependency for CUDA 11 compatibility - PR #2575: Speed up TfidfTransformer +- PR #2591: Generate benchmark datasets using `cuml.datasets` ## Bug Fixes - PR #2369: Update RF code to fix set_params memory leak From 18db67a14dbd2a6c571829109df7a84a628124e5 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Wed, 22 Jul 2020 20:03:30 +0200 Subject: [PATCH 3/4] Fixed style errors. --- python/cuml/benchmark/datagen.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cuml/benchmark/datagen.py b/python/cuml/benchmark/datagen.py index e157bd6326..95744e0591 100644 --- a/python/cuml/benchmark/datagen.py +++ b/python/cuml/benchmark/datagen.py @@ -35,7 +35,6 @@ """ import cudf -import cupy as cp import gzip import functools import numpy as np From 60c66ed18df2965aa384c9d3b969634432178303 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 23 Jul 2020 00:48:02 +0200 Subject: [PATCH 4/4] Using cudf.DataFrame() instead of a deprecated method. 
--- python/cuml/benchmark/datagen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/benchmark/datagen.py b/python/cuml/benchmark/datagen.py index 95744e0591..f74965848f 100644 --- a/python/cuml/benchmark/datagen.py +++ b/python/cuml/benchmark/datagen.py @@ -57,7 +57,7 @@ def _gen_data_regression(n_samples, n_features, random_state=42): n_features = 100 X_arr, y_arr = cuml.datasets.make_regression( n_samples=n_samples, n_features=n_features, random_state=random_state) - return cudf.DataFrame.from_gpu_matrix(X_arr), cudf.Series(y_arr) + return cudf.DataFrame(X_arr), cudf.Series(y_arr) def _gen_data_blobs(n_samples, n_features, random_state=42, centers=None):