diff --git a/.gitignore b/.gitignore
index f2a13f361357..ed3efaba51fc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -81,6 +81,7 @@ target
 *.gcno
 build_tests
 /tests/cpp/xgboost_test
+/tests/benchmark/data
 .DS_Store
 lib/
diff --git a/tests/benchmark/hist_method/README.md b/tests/benchmark/hist_method/README.md
new file mode 100755
index 000000000000..924744cd0c1b
--- /dev/null
+++ b/tests/benchmark/hist_method/README.md
@@ -0,0 +1,39 @@
+## How to run the benchmarks:
+1. Obtain the XGBoost python package. There are a few options:
+    - Build XGBoost from source manually:
+      ```sh
+      git clone --recursive https://github.com/dmlc/xgboost
+      cd xgboost
+      make -j8
+      cd python-package
+      python setup.py install
+      cd ..
+      ```
+    - Or install the latest released version from pip:
+      ```sh
+      pip install xgboost
+      ```
+    - More details are available [here](https://xgboost.readthedocs.io/en/latest/build.html).
+
+2. Install the other python packages the benchmark depends on: requests, scikit-learn, pandas, and numpy. You can install them through pip:
+    ```sh
+    pip install requests scikit-learn pandas numpy
+    ```
+3. Run the benchmark with the desired parameters:
+    ```sh
+    cd tests/benchmark/hist_method
+    python xgboost_hist_method_bench.py --dataset <dataset> \
+                                        --hw <platform> \
+                                        --n_iter <n_iter> \
+                                        --n_runs <n_runs> \
+                                        --log <enable_log>
+    ```
+
+The benchmark downloads the required datasets from the Internet automatically; no manual preparation is needed. A complete example invocation is shown at the end of this README.
+
+## Available parameters:
+* **dataset** - the dataset to benchmark on. Possible values: *"higgs1m", "airline-ohe", "msrank-10k"*. [Required].
+* **platform** - the platform to run the computation on. Possible values: *cpu, gpu*. [Default=cpu].
+* **n_iter** - the number of boosting iterations. Possible values: *integer > 0*. [Default=1000].
+* **n_runs** - the number of training and prediction measurements taken to obtain stable performance results. Possible values: *integer > 0*. [Default=5].
+* **enable_log** - if False, no additional debug info is printed ("silent"=1); if True ("verbosity"=3), execution time per kernel is printed. Possible values: *True, False*. [Default=False].
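+
+## Example run:
+An illustrative end-to-end run on CPU with the higgs1m dataset, spelling out the default values documented above:
+```sh
+cd tests/benchmark/hist_method
+python xgboost_hist_method_bench.py --dataset higgs1m --hw cpu --n_iter 1000 --n_runs 5 --log False
+```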
diff --git a/tests/benchmark/hist_method/bench_utils.py b/tests/benchmark/hist_method/bench_utils.py
new file mode 100755
index 000000000000..61437b2f40bb
--- /dev/null
+++ b/tests/benchmark/hist_method/bench_utils.py
@@ -0,0 +1,220 @@
+#*******************************************************************************
+# Copyright 2017-2019 by Contributors
+# \file bench_utils.py
+# \brief utils for a benchmark for the 'hist' tree_method on both CPU/GPU architectures
+# \author Egor Smirnov
+#*******************************************************************************
+
+from __future__ import print_function  # for the multi-argument print() calls under Python 2
+
+import os
+import re
+import bz2
+import sys
+import timeit
+import tarfile
+import requests
+import numpy as np
+import pandas as pd
+from sklearn.metrics import log_loss
+
+if sys.version_info[0] >= 3:
+    from urllib.request import urlretrieve  # pylint: disable=import-error,no-name-in-module
+else:
+    from urllib import urlretrieve  # pylint: disable=import-error,no-name-in-module
+
+DATASET_DIR = "./data/"
+
+
+def measure(func, string, nrepeat):
+    t = timeit.Timer(stmt="%s()" % func.__name__, setup="from __main__ import %s" % func.__name__)
+    res = t.repeat(repeat=nrepeat, number=1)
+
+    def box_filter(timing, left=0.25, right=0.75):
+        # statistically remove outliers (box filter) and compute the average
+        timing.sort()
+        size = len(timing)
+        if size == 1:
+            return timing[0]
+
+        Q1, Q3 = timing[int(size * left)], timing[int(size * right)]
+        IQR = Q3 - Q1
+
+        lower = Q1 - 1.5 * IQR
+        upper = Q3 + 1.5 * IQR
+
+        result = np.array([item for item in timing if lower < item < upper])
+        return np.mean(result)
+
+    timing = box_filter(res)
+    print((string + " = {:.4f} sec (").format(timing), res, ")")
+
+
+def compute_logloss(y1, y2):
+    return log_loss(y1.ravel(), y2)
+
+
+def download_file(url):
+    local_filename = DATASET_DIR + url.split('/')[-1]
+    with requests.get(url, stream=True) as r:
+        r.raise_for_status()
+        with open(local_filename, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=2**20):
+                if chunk:
+                    f.write(chunk)
+    return local_filename
+
+
+def load_higgs(nrows_train, nrows_test, dtype):
+    """
+    Higgs dataset from the UCI machine learning repository (
+    https://archive.ics.uci.edu/ml/datasets/HIGGS).
+    TaskType:binclass
+    NumberOfFeatures:28
+    NumberOfInstances:11M
+    """
+    if not os.path.isfile(DATASET_DIR + "HIGGS.csv.gz"):
+        print("Loading data set...")
+        download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz")
+
+    print("Reading data set...")
+    data = pd.read_csv(DATASET_DIR + "HIGGS.csv.gz", delimiter=",", header=None,
+                       compression="gzip", dtype=dtype, nrows=nrows_train + nrows_test)
+    print("Pre-processing data set...")
+
+    # move the label (column 0) to the last column
+    data = data[list(data.columns[1:]) + list(data.columns[0:1])]
+    n_features = data.shape[1] - 1
+    train_data = np.ascontiguousarray(data.values[:nrows_train, :n_features], dtype=dtype)
+    train_label = np.ascontiguousarray(data.values[:nrows_train, n_features], dtype=dtype)
+    test_data = np.ascontiguousarray(data.values[nrows_train:nrows_train + nrows_test, :n_features], dtype=dtype)
+    test_label = np.ascontiguousarray(data.values[nrows_train:nrows_train + nrows_test, n_features], dtype=dtype)
+    n_classes = len(np.unique(train_label))
+    return train_data, train_label, test_data, test_label, n_classes
+
+
+def load_higgs1m(dtype):
+    return load_higgs(1000000, 500000, dtype)
+
+
+def read_libsvm_msrank(file_obj, n_samples, n_features, dtype):
+    X = np.zeros((n_samples, n_features))
+    y = np.zeros((n_samples,))
+
+    counter = 0
+
+    regexp = re.compile(r'[A-Za-z0-9]+:(-?\d*\.?\d+)')
+
+    for line in file_obj:
+        line = str(line).replace("\\n'", "")
+        line = regexp.sub(r'\g<1>', line)  # strip "feature_id:" prefixes, keep only the values
+        line = line.rstrip(" \n\r").split(' ')
+
+        y[counter] = int(line[0])
+        X[counter] = [float(i) for i in line[1:]]
+
+        counter += 1
+        if counter == n_samples:
+            break
+
+    return np.array(X, dtype=dtype), np.array(y, dtype=dtype)
+
+
+def _make_gen(reader):
+    b = reader(1024 * 1024)
+    while b:
+        yield b
+        b = reader(1024 * 1024)
+
+
+def _count_lines(filename):
+    with open(filename, 'rb') as f:
+        f_gen = _make_gen(f.read)
+        return sum(buf.count(b'\n') for buf in f_gen)
+
+
+def load_msrank_10k(dtype):
+    """
+    MSRank dataset (Microsoft Learning to Rank, MSLR-WEB10K):
+    https://www.microsoft.com/en-us/research/project/mslr/
+    TaskType:multiclass (relevance labels 0-4)
+    NumberOfFeatures:137
+    NumberOfInstances:~1.2M
+    """
+
+    url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz"
+    tar = DATASET_DIR + "msrank.tar.gz"
+
+    if not os.path.isfile(tar):
+        print("Loading data set...")
+        download_file(url)
+
+    if not os.path.isfile(DATASET_DIR + "MSRank/train.txt"):
+        tar = tarfile.open(tar, "r:gz")
+        tar.extractall(DATASET_DIR)
+        tar.close()
+
+    sets = []
+    labels = []
+    n_features = 137
+
+    print("Reading data set...")
+    for set_name in ['train.txt', 'vali.txt', 'test.txt']:
+        file_name = DATASET_DIR + os.path.join('MSRank', set_name)
+
+        n_samples = _count_lines(file_name)
+        with open(file_name, 'r') as file_obj:
+            X, y = read_libsvm_msrank(file_obj, n_samples, n_features, dtype)
+
+        sets.append(X)
+        labels.append(y)
+
+    # merge the train and validation parts into a single training set
+    sets[0] = np.vstack((sets[0], sets[1]))
+    labels[0] = np.hstack((labels[0], labels[1]))
+
+    sets = [np.ascontiguousarray(sets[i]) for i in [0, 2]]
+    labels = [np.ascontiguousarray(labels[i]) for i in [0, 2]]
+
+    n_classes = len(np.unique(labels[0]))
+
+    return sets[0], labels[0], sets[1], labels[1], n_classes
+
+
+def load_airline_one_hot(dtype):
+    """
+    Airline dataset from szilard's benchmarks: https://github.com/szilard/GBM-perf
+    TaskType:binclass
+    NumberOfFeatures:700 (after one-hot encoding)
+    NumberOfInstances:10100000
+    """
+    url = 'https://s3.amazonaws.com/benchm-ml--main/'
+
+    name_train = 'train-10m.csv'
+    name_test = 'test.csv'
+
+    sets = []
+    labels = []
+
+    categorical_names = ["Month", "DayofMonth", "DayOfWeek",
"UniqueCarrier", "Origin", "Dest"] + categorical_ids = [0, 1, 2, 4, 5, 6] + + numeric_names = ["DepTime", "Distance"] + numeric_ids = [3, 7] + + for name in [name_train, name_test]: + filename = os.path.join(DATASET_DIR, name) + if not os.path.exists(filename): + print("Loading", filename) + urlretrieve(url + name, filename) + + print("Reading", filename) + df = pd.read_csv(filename, nrows=1000000) if name == 'train-10m.csv' else pd.read_csv(filename) + X = df.drop('dep_delayed_15min', 1) + y = df["dep_delayed_15min"] + + y_num = np.where(y == "Y", 1, 0) + + sets.append(X) + labels.append(y_num) + + n_samples_train = sets[0].shape[0] + + X = pd.concat(sets) + X = pd.get_dummies(X, columns=categorical_names) + sets = [X[:n_samples_train], X[n_samples_train:]] + + return sets[0], labels[0], sets[1], labels[1], 2 diff --git a/tests/benchmark/hist_method/xgboost_hist_method_bench.py b/tests/benchmark/hist_method/xgboost_hist_method_bench.py new file mode 100755 index 000000000000..aa7dbcb23621 --- /dev/null +++ b/tests/benchmark/hist_method/xgboost_hist_method_bench.py @@ -0,0 +1,114 @@ +#******************************************************************************* +# Copyright 2017-2019 by Contributors +# \file xgboost_hist_method_bench.py +# \brief a benchmark for 'hist' tree_method on both CPU/GPU arhitectures +# \author Egor Smirnov +#******************************************************************************* + +import argparse +import xgboost as xgb +from bench_utils import * + +N_PERF_RUNS = 5 +DTYPE=np.float32 + +xgb_params = { + 'alpha': 0.9, + 'max_bin': 256, + 'scale_pos_weight': 2, + 'learning_rate': 0.1, + 'subsample': 1, + 'reg_lambda': 1, + "min_child_weight": 0, + 'max_depth': 8, + 'max_leaves': 2**8, +} + +def xbg_fit(): + global model_xgb + dtrain = xgb.DMatrix(x_train, label=y_train) + model_xgb = xgb.train(xgb_params, dtrain, xgb_params['n_estimators']) + +def xgb_predict_of_train_data(): + global result_predict_xgb_train + dtest = xgb.DMatrix(x_train) + result_predict_xgb_train = model_xgb.predict(dtest) + +def xgb_predict_of_test_data(): + global result_predict_xgb_test + dtest = xgb.DMatrix(x_test) + result_predict_xgb_test = model_xgb.predict(dtest) + + +def load_dataset(dataset): + global x_train, y_train, x_test, y_test + + try: + os.mkdir(DATASET_DIR) + except: + pass + + datasets_dict = { + 'higgs1m': load_higgs1m, + 'msrank-10k': load_msrank_10k, + 'airline-ohe':load_airline_one_hot + } + + x_train, y_train, x_test, y_test, n_classes = datasets_dict[dataset](DTYPE) + + if n_classes == -1: + xgb_params['objective'] = 'reg:squarederror' + elif n_classes == 2: + xgb_params['objective'] = 'binary:logistic' + else: + xgb_params['objective'] = 'multi:softprob' + xgb_params['num_class'] = n_classes + +def parse_args(): + global N_PERF_RUNS + parser = argparse.ArgumentParser() + parser.add_argument('--n_iter', required=False, type=int, default=1000) + parser.add_argument('--n_runs', default=N_PERF_RUNS, required=False, type=int) + parser.add_argument('--hw', choices=['cpu', 'gpu'], metavar='stage', required=False, default='cpu') + parser.add_argument('--log', metavar='stage', required=False, type=bool, default=False) + parser.add_argument('--dataset', choices=['higgs1m', "airline-ohe", "msrank-10k"], + metavar='stage', required=True) + + args = parser.parse_args() + N_PERF_RUNS = args.n_runs + + xgb_params['n_estimators'] = args.n_iter + + if args.log: + xgb_params['verbosity'] = 3 + else: + xgb_params['silent'] = 1 + + if args.hw == "cpu": + xgb_params['tree_method'] 
+        xgb_params['tree_method'] = 'hist'
+        xgb_params['predictor'] = 'cpu_predictor'
+    elif args.hw == "gpu":
+        xgb_params['tree_method'] = 'gpu_hist'
+        xgb_params['predictor'] = 'gpu_predictor'
+
+    load_dataset(args.dataset)
+
+
+def main():
+    parse_args()
+
+    print("Running ...")
+    measure(xgb_fit,                   "XGBOOST training            ", N_PERF_RUNS)
+    measure(xgb_predict_of_train_data, "XGBOOST predict (train data)", N_PERF_RUNS)
+    measure(xgb_predict_of_test_data,  "XGBOOST predict (test data) ", N_PERF_RUNS)
+
+    print("Compute quality metrics...")
+
+    train_logloss = compute_logloss(y_train, result_predict_xgb_train)
+    test_logloss = compute_logloss(y_test, result_predict_xgb_test)
+
+    print("LogLoss for train data set = {:.6f}".format(train_logloss))
+    print("LogLoss for test data set  = {:.6f}".format(test_logloss))
+
+
+if __name__ == '__main__':
+    main()
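For reference, the box-filter averaging that `measure()` in `bench_utils.py` applies to the raw timings can be seen in isolation in the sketch below. The timing values are made up for illustration and are not real benchmark results:

```python
import numpy as np

def box_filter(timing, left=0.25, right=0.75):
    # Same logic as in bench_utils.py: drop timings outside
    # [Q1 - 1.5*IQR, Q3 + 1.5*IQR], then average the rest.
    timing.sort()
    size = len(timing)
    if size == 1:
        return timing[0]
    q1, q3 = timing[int(size * left)], timing[int(size * right)]
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return np.mean([t for t in timing if lower < t < upper])

# Five hypothetical timings: the 5.70 s run (e.g. a cold start) falls
# outside the upper bound and is rejected, so the reported time is the
# mean of the remaining four measurements.
print(box_filter([1.02, 0.98, 1.01, 0.99, 5.70]))  # -> 1.0
```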