Benchmarks to track performance changes in 'hist' method #5126

Closed
wants to merge 1 commit
1 change: 1 addition & 0 deletions .gitignore
@@ -81,6 +81,7 @@ target
*.gcno
build_tests
/tests/cpp/xgboost_test
/tests/benchmark/data

.DS_Store
lib/
39 changes: 39 additions & 0 deletions tests/benchmark/hist_method/README.md
@@ -0,0 +1,39 @@
## How to run the benchmarks:
1. Obtain the XGBoost Python package. There are a few options:
- Build XGBoost from sources manually:
```sh
git clone --recursive https://github.com/dmlc/xgboost
cd xgboost
make -j8
cd python-package
python setup.py install
cd ..
```
- Or download the latest available version from pip:
```sh
pip install xgboost
```
- More details are available [here](https://xgboost.readthedocs.io/en/latest/build.html)

2. Install the remaining Python dependencies: requests, scikit-learn, pandas, and numpy (numpy is installed automatically as a dependency of pandas and scikit-learn). They can be installed through pip:
```sh
pip install requests scikit-learn pandas
```
3. Run the benchmark with the desired parameters:
```sh
cd tests/benchmark/hist_method
python xgboost_hist_method_bench.py --dataset <dataset> \
                                    --hw <platform> \
                                    --n_iter <n_iter> \
                                    --n_runs <n_runs> \
                                    --log <enable_log>
```

The benchmark downloads the required datasets from the Internet automatically; no manual data preparation is needed.

## Available parameters:
* **dataset** - dataset to use in the benchmark. Possible values: *"higgs1m", "airline-ohe", "msrank-10k"*. [Required].
* **platform** - platform to run the computations on. Possible values: *cpu, gpu*. [Default=cpu].
* **n_iter** - number of boosting iterations. Possible values: *integer > 0*. [Default=1000].
* **n_runs** - number of repeated training and prediction measurements used to obtain stable performance results. Possible values: *integer > 0*. [Default=5].
* **enable_log** - if False, no additional debug output is produced ("silent"=1); if True ("verbosity"=3), execution time per kernel is printed. Possible values: *True, False*. [Default=False].
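For example, a CPU run on the *higgs1m* dataset with a shorter boosting schedule could look like the sketch below (timings and LogLoss values depend on your hardware and XGBoost build, so exact output is not shown):
```sh
cd tests/benchmark/hist_method
# 100 boosting iterations, 3 repeated measurements, default (non-verbose) logging
python xgboost_hist_method_bench.py --dataset higgs1m --hw cpu --n_iter 100 --n_runs 3
```
The script prints the training and prediction times (averaged after outlier filtering) followed by the LogLoss on the train and test sets.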
220 changes: 220 additions & 0 deletions tests/benchmark/hist_method/bench_utils.py
@@ -0,0 +1,220 @@
#*******************************************************************************
# Copyright 2017-2019 by Contributors
# \file bench_utils.py
# \brief utilities for the 'hist' tree_method benchmark on both CPU and GPU architectures
# \author Egor Smirnov
#*******************************************************************************

import os
import re
import bz2
import sys
import timeit
import tarfile
import requests
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

if sys.version_info[0] >= 3:
from urllib.request import urlretrieve # pylint: disable=import-error,no-name-in-module
else:
from urllib import urlretrieve # pylint: disable=import-error,no-name-in-module

DATASET_DIR="./data/"


def measure(func, string, nrepeat):
t = timeit.Timer(stmt="%s()" % func.__name__, setup="from __main__ import %s" % func.__name__)
res = t.repeat(repeat=nrepeat, number=1)

    def box_filter(timing, left=0.25, right=0.75):
        """Drop outliers outside the inter-quartile fences and return the mean."""
        timing.sort()
        size = len(timing)
        if size == 1:
            return timing[0]

        # approximate first and third quartiles of the sorted timings
        Q1, Q3 = timing[int(size * left)], timing[int(size * right)]

        IQR = Q3 - Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

        result = np.array([item for item in timing if lower < item < upper])
        return np.mean(result)

timing = box_filter(res)
print((string + " = {:.4f} sec (").format(timing), res, ")")
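
# NOTE: measure() relies on timeit's setup statement "from __main__ import <func name>",
# so the functions passed to it must be defined at module level in the __main__ script
# (as xgboost_hist_method_bench.py does with its fit/predict helpers).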


def compute_logloss(y1, y2):
return log_loss(y1.ravel(), y2)


def download_file(url):
local_filename = DATASET_DIR + url.split('/')[-1]
with requests.get(url, stream=True) as r:
r.raise_for_status()
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=2**20):
if chunk:
f.write(chunk)
return local_filename


def load_higgs(nrows_train, nrows_test, dtype):
"""
Higgs dataset from UCI machine learning repository (
https://archive.ics.uci.edu/ml/datasets/HIGGS).
TaskType:binclass
NumberOfFeatures:28
NumberOfInstances:11M
"""
if not os.path.isfile(DATASET_DIR + "HIGGS.csv.gz"):
print("Loading data set...")
download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz")

print("Reading data set...")
data = pd.read_csv(DATASET_DIR + "HIGGS.csv.gz", delimiter=",", header=None, compression="gzip", dtype=dtype, nrows=nrows_train+nrows_test)
print("Pre-processing data set...")

data = data[list(data.columns[1:])+list(data.columns[0:1])]
n_features = data.shape[1]-1
train_data = np.ascontiguousarray(data.values[:nrows_train,:n_features], dtype=dtype)
train_label = np.ascontiguousarray(data.values[:nrows_train,n_features], dtype=dtype)
test_data = np.ascontiguousarray(data.values[nrows_train:nrows_train+nrows_test,:n_features], dtype=dtype)
test_label = np.ascontiguousarray(data.values[nrows_train:nrows_train+nrows_test,n_features], dtype=dtype)
n_classes = len(np.unique(train_label))
return train_data, train_label, test_data, test_label, n_classes


def load_higgs1m(dtype):
return load_higgs(1000000, 500000, dtype)


def read_libsvm_msrank(file_obj, n_samples, n_features, dtype):
X = np.zeros((n_samples, n_features))
y = np.zeros((n_samples,))

counter = 0

regexp = re.compile(r'[A-Za-z0-9]+:(-?\d*\.?\d+)')

for line in file_obj:
line = str(line).replace("\\n'", "")
        # strip libsvm-style "index:" / "qid:" prefixes, keeping only the numeric values
        line = regexp.sub(r'\g<1>', line)
line = line.rstrip(" \n\r").split(' ')

y[counter] = int(line[0])
X[counter] = [float(i) for i in line[1:]]

counter += 1
if counter == n_samples:
break

return np.array(X, dtype=dtype), np.array(y, dtype=dtype)


def _make_gen(reader):
b = reader(1024 * 1024)
while b:
yield b
b = reader(1024 * 1024)


def _count_lines(filename):
with open(filename, 'rb') as f:
f_gen = _make_gen(f.read)
return sum(buf.count(b'\n') for buf in f_gen)

def load_msrank_10k(dtype):
"""
Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
TaskType:binclass
NumberOfFeatures:700
NumberOfInstances:10100000
"""

url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz"
tar = DATASET_DIR + "msrank.tar.gz"

if not os.path.isfile(tar):
print("Loading data set...")
download_file(url)

    if not os.path.isfile(DATASET_DIR + "MSRank/train.txt"):
        with tarfile.open(tar, "r:gz") as tar_file:
            tar_file.extractall(DATASET_DIR)

sets = []
labels = []
n_features = 137

print("Reading data set...")
for set_name in ['train.txt', 'vali.txt', 'test.txt']:
file_name = DATASET_DIR + os.path.join('MSRank', set_name)

n_samples = _count_lines(file_name)
with open(file_name, 'r') as file_obj:
X, y = read_libsvm_msrank(file_obj, n_samples, n_features, dtype)

sets.append(X)
labels.append(y)

sets[0] = np.vstack((sets[0], sets[1]))
labels[0] = np.hstack((labels[0], labels[1]))

sets = [ np.ascontiguousarray(sets[i]) for i in [0, 2]]
labels = [ np.ascontiguousarray(labels[i]) for i in [0, 2]]

n_classes = len(np.unique(labels[0]))

return sets[0], labels[0], sets[1], labels[1], n_classes


def load_airline_one_hot(dtype):
"""
Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
TaskType:binclass
NumberOfFeatures:700
NumberOfInstances:10100000
"""
url = 'https://s3.amazonaws.com/benchm-ml--main/'

name_train = 'train-10m.csv'
name_test = 'test.csv'

sets = []
labels = []

categorical_names = ["Month", "DayofMonth", "DayOfWeek", "UniqueCarrier", "Origin", "Dest"]
categorical_ids = [0, 1, 2, 4, 5, 6]

numeric_names = ["DepTime", "Distance"]
numeric_ids = [3, 7]

for name in [name_train, name_test]:
filename = os.path.join(DATASET_DIR, name)
if not os.path.exists(filename):
print("Loading", filename)
urlretrieve(url + name, filename)

print("Reading", filename)
df = pd.read_csv(filename, nrows=1000000) if name == 'train-10m.csv' else pd.read_csv(filename)
        X = df.drop('dep_delayed_15min', axis=1)
y = df["dep_delayed_15min"]

y_num = np.where(y == "Y", 1, 0)

sets.append(X)
labels.append(y_num)

n_samples_train = sets[0].shape[0]

X = pd.concat(sets)
X = pd.get_dummies(X, columns=categorical_names)
sets = [X[:n_samples_train], X[n_samples_train:]]

return sets[0], labels[0], sets[1], labels[1], 2
114 changes: 114 additions & 0 deletions tests/benchmark/hist_method/xgboost_hist_method_bench.py
@@ -0,0 +1,114 @@
#*******************************************************************************
# Copyright 2017-2019 by Contributors
# \file xgboost_hist_method_bench.py
# \brief a benchmark for the 'hist' tree_method on both CPU and GPU architectures
# \author Egor Smirnov
#*******************************************************************************

import argparse
import xgboost as xgb
from bench_utils import *

N_PERF_RUNS = 5
DTYPE=np.float32

xgb_params = {
'alpha': 0.9,
'max_bin': 256,
'scale_pos_weight': 2,
'learning_rate': 0.1,
'subsample': 1,
'reg_lambda': 1,
"min_child_weight": 0,
'max_depth': 8,
'max_leaves': 2**8,
}
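# 'objective' (and 'num_class' for multiclass), 'n_estimators', 'tree_method' and
# 'predictor' are filled in at runtime by load_dataset() and parse_args(),
# depending on the chosen dataset and the --hw option.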

def xgb_fit():
global model_xgb
dtrain = xgb.DMatrix(x_train, label=y_train)
model_xgb = xgb.train(xgb_params, dtrain, xgb_params['n_estimators'])

def xgb_predict_of_train_data():
global result_predict_xgb_train
dtest = xgb.DMatrix(x_train)
result_predict_xgb_train = model_xgb.predict(dtest)

def xgb_predict_of_test_data():
global result_predict_xgb_test
dtest = xgb.DMatrix(x_test)
result_predict_xgb_test = model_xgb.predict(dtest)


def load_dataset(dataset):
global x_train, y_train, x_test, y_test

    try:
        os.mkdir(DATASET_DIR)
    except OSError:  # the directory may already exist
        pass

datasets_dict = {
'higgs1m': load_higgs1m,
'msrank-10k': load_msrank_10k,
'airline-ohe':load_airline_one_hot
}

x_train, y_train, x_test, y_test, n_classes = datasets_dict[dataset](DTYPE)

if n_classes == -1:
xgb_params['objective'] = 'reg:squarederror'
elif n_classes == 2:
xgb_params['objective'] = 'binary:logistic'
else:
xgb_params['objective'] = 'multi:softprob'
xgb_params['num_class'] = n_classes

def parse_args():
global N_PERF_RUNS
parser = argparse.ArgumentParser()
parser.add_argument('--n_iter', required=False, type=int, default=1000)
parser.add_argument('--n_runs', default=N_PERF_RUNS, required=False, type=int)
parser.add_argument('--hw', choices=['cpu', 'gpu'], metavar='stage', required=False, default='cpu')
    # argparse's type=bool would treat any non-empty string (including "False") as True,
    # so parse the flag value explicitly
    parser.add_argument('--log', metavar='stage', required=False, default=False,
                        type=lambda x: str(x).lower() in ('true', '1', 'yes'))
parser.add_argument('--dataset', choices=['higgs1m', "airline-ohe", "msrank-10k"],
metavar='stage', required=True)

args = parser.parse_args()
N_PERF_RUNS = args.n_runs

xgb_params['n_estimators'] = args.n_iter

if args.log:
xgb_params['verbosity'] = 3
else:
xgb_params['silent'] = 1

if args.hw == "cpu":
xgb_params['tree_method'] = 'hist'
xgb_params['predictor'] = 'cpu_predictor'
elif args.hw == "gpu":
xgb_params['tree_method'] = 'gpu_hist'
xgb_params['predictor'] = 'gpu_predictor'

load_dataset(args.dataset)


def main():
parse_args()

print("Running ...")
    measure(xgb_fit, "XGBOOST training ", N_PERF_RUNS)
measure(xgb_predict_of_train_data, "XGBOOST predict (train data)", N_PERF_RUNS)
measure(xgb_predict_of_test_data, "XGBOOST predict (test data) ", N_PERF_RUNS)

print("Compute quality metrics...")

    train_logloss = compute_logloss(y_train, result_predict_xgb_train)
    test_logloss = compute_logloss(y_test, result_predict_xgb_test)

    print("LogLoss for train data set = {:.6f}".format(train_logloss))
    print("LogLoss for test data set = {:.6f}".format(test_logloss))

if __name__ == '__main__':
main()