Benchmarks to track performance changes in 'hist' method #5126

Closed
wants to merge 1 commit
1 change: 1 addition & 0 deletions .gitignore
@@ -81,6 +81,7 @@ target
*.gcno
build_tests
/tests/cpp/xgboost_test
/tests/benchmark/data

.DS_Store
lib/
39 changes: 39 additions & 0 deletions tests/benchmark/hist_method/README.md
@@ -0,0 +1,39 @@
## How to run the benchmarks:
1. Obtain the XGBoost Python package. There are a few options:
- Build XGBoost from sources manually:
```sh
git clone --recursive https://github.com/dmlc/xgboost
cd xgboost
make -j8
cd python-package
python setup.py install
cd ..
```
- Or download the latest available version from pip:
```sh
pip install xgboost
```
- More details are available [here](https://xgboost.readthedocs.io/en/latest/build.html)

2. Install the remaining Python dependencies: requests, scikit-learn, pandas, and numpy (numpy is installed automatically as a dependency of pandas and scikit-learn). They can be installed through pip:
```sh
pip install requests scikit-learn pandas
```
3. Run the benchmark with the desired parameters:
```sh
cd tests/benchmark/hist_method
python xgboost_hist_method_bench.py --dataset <dataset> \
                                    --hw <platform> \
                                    --n_iter <n_iter> \
                                    --n_runs <n_runs> \
                                    --log <enable_log>
```

The benchmark downloads the required datasets from the Internet automatically; no manual data preparation is needed.

## Available parameters:
* **dataset** - dataset to use in the benchmark. Possible values: *"higgs1m", "airline-ohe", "msrank-10k"*. [Required].
* **platform** - platform to run the computations on. Possible values: *cpu, gpu*. [Default=cpu].
* **n_iter** - number of boosting iterations. Possible values: *integer > 0*. [Default=1000].
* **n_runs** - number of repeated training and prediction measurements used to obtain stable performance results. Possible values: *integer > 0*. [Default=5].
* **enable_log** - if False, no additional debug output is produced ("silent"=1); if True ("verbosity"=3), execution time per kernel is printed. Possible values: *True, False*. [Default=False].
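For example, a CPU run on the *higgs1m* dataset with a shorter boosting schedule could look like the sketch below (timings and LogLoss values depend on your hardware and XGBoost build, so exact output is not shown):
```sh
cd tests/benchmark/hist_method
# 100 boosting iterations, 3 repeated measurements, default (non-verbose) logging
python xgboost_hist_method_bench.py --dataset higgs1m --hw cpu --n_iter 100 --n_runs 3
```
The script prints the training and prediction times (averaged after outlier filtering) followed by the LogLoss on the train and test sets.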
220 changes: 220 additions & 0 deletions tests/benchmark/hist_method/bench_utils.py
@@ -0,0 +1,220 @@
#*******************************************************************************
# Copyright 2017-2019 by Contributors
# \file bench_utils.py
# \brief utilities for the 'hist' tree_method benchmark on both CPU and GPU architectures
# \author Egor Smirnov
#*******************************************************************************

import os
import re
import bz2
import sys
import timeit
import tarfile
import requests
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

if sys.version_info[0] >= 3:
from urllib.request import urlretrieve # pylint: disable=import-error,no-name-in-module
else:
from urllib import urlretrieve # pylint: disable=import-error,no-name-in-module

DATASET_DIR="./data/"


def measure(func, string, nrepeat):
t = timeit.Timer(stmt="%s()" % func.__name__, setup="from __main__ import %s" % func.__name__)
res = t.repeat(repeat=nrepeat, number=1)

    def box_filter(timing, left=0.25, right=0.75):
        """Drop outliers outside the inter-quartile fences and return the mean."""
        timing.sort()
        size = len(timing)
        if size == 1:
            return timing[0]

        # approximate first and third quartiles of the sorted timings
        Q1, Q3 = timing[int(size * left)], timing[int(size * right)]

        IQR = Q3 - Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

        result = np.array([item for item in timing if lower < item < upper])
        return np.mean(result)

timing = box_filter(res)
print((string + " = {:.4f} sec (").format(timing), res, ")")
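
# NOTE: measure() relies on timeit's setup statement "from __main__ import <func name>",
# so the functions passed to it must be defined at module level in the __main__ script
# (as xgboost_hist_method_bench.py does with its fit/predict helpers).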


def compute_logloss(y1, y2):
return log_loss(y1.ravel(), y2)


def download_file(url):
local_filename = DATASET_DIR + url.split('/')[-1]
with requests.get(url, stream=True) as r:
r.raise_for_status()
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=2**20):
if chunk:
f.write(chunk)
return local_filename


def load_higgs(nrows_train, nrows_test, dtype):
"""
Higgs dataset from UCI machine learning repository (
https://archive.ics.uci.edu/ml/datasets/HIGGS).
TaskType:binclass
NumberOfFeatures:28
NumberOfInstances:11M
"""
if not os.path.isfile(DATASET_DIR + "HIGGS.csv.gz"):
print("Loading data set...")
download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz")

print("Reading data set...")
data = pd.read_csv(DATASET_DIR + "HIGGS.csv.gz", delimiter=",", header=None, compression="gzip", dtype=dtype, nrows=nrows_train+nrows_test)
print("Pre-processing data set...")

data = data[list(data.columns[1:])+list(data.columns[0:1])]
n_features = data.shape[1]-1
train_data = np.ascontiguousarray(data.values[:nrows_train,:n_features], dtype=dtype)
train_label = np.ascontiguousarray(data.values[:nrows_train,n_features], dtype=dtype)
test_data = np.ascontiguousarray(data.values[nrows_train:nrows_train+nrows_test,:n_features], dtype=dtype)
test_label = np.ascontiguousarray(data.values[nrows_train:nrows_train+nrows_test,n_features], dtype=dtype)
n_classes = len(np.unique(train_label))
return train_data, train_label, test_data, test_label, n_classes


def load_higgs1m(dtype):
return load_higgs(1000000, 500000, dtype)


def read_libsvm_msrank(file_obj, n_samples, n_features, dtype):
X = np.zeros((n_samples, n_features))
y = np.zeros((n_samples,))

counter = 0

regexp = re.compile(r'[A-Za-z0-9]+:(-?\d*\.?\d+)')

for line in file_obj:
line = str(line).replace("\\n'", "")
        # strip libsvm-style "index:" / "qid:" prefixes, keeping only the numeric values
        line = regexp.sub(r'\g<1>', line)
line = line.rstrip(" \n\r").split(' ')

y[counter] = int(line[0])
X[counter] = [float(i) for i in line[1:]]

counter += 1
if counter == n_samples:
break

return np.array(X, dtype=dtype), np.array(y, dtype=dtype)


def _make_gen(reader):
b = reader(1024 * 1024)
while b:
yield b
b = reader(1024 * 1024)


def _count_lines(filename):
with open(filename, 'rb') as f:
f_gen = _make_gen(f.read)
return sum(buf.count(b'\n') for buf in f_gen)

def load_msrank_10k(dtype):
"""
Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
TaskType:binclass
NumberOfFeatures:700
NumberOfInstances:10100000
"""

url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz"
tar = DATASET_DIR + "msrank.tar.gz"

if not os.path.isfile(tar):
print("Loading data set...")
download_file(url)

    if not os.path.isfile(DATASET_DIR + "MSRank/train.txt"):
        with tarfile.open(tar, "r:gz") as tar_file:
            tar_file.extractall(DATASET_DIR)

sets = []
labels = []
n_features = 137

print("Reading data set...")
for set_name in ['train.txt', 'vali.txt', 'test.txt']:
file_name = DATASET_DIR + os.path.join('MSRank', set_name)

n_samples = _count_lines(file_name)
with open(file_name, 'r') as file_obj:
X, y = read_libsvm_msrank(file_obj, n_samples, n_features, dtype)

sets.append(X)
labels.append(y)

sets[0] = np.vstack((sets[0], sets[1]))
labels[0] = np.hstack((labels[0], labels[1]))

sets = [ np.ascontiguousarray(sets[i]) for i in [0, 2]]
labels = [ np.ascontiguousarray(labels[i]) for i in [0, 2]]

n_classes = len(np.unique(labels[0]))

return sets[0], labels[0], sets[1], labels[1], n_classes


def load_airline_one_hot(dtype):
"""
Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
TaskType:binclass
NumberOfFeatures:700
NumberOfInstances:10100000
"""
url = 'https://s3.amazonaws.com/benchm-ml--main/'

name_train = 'train-10m.csv'
name_test = 'test.csv'

sets = []
labels = []

categorical_names = ["Month", "DayofMonth", "DayOfWeek", "UniqueCarrier", "Origin", "Dest"]
categorical_ids = [0, 1, 2, 4, 5, 6]

numeric_names = ["DepTime", "Distance"]
numeric_ids = [3, 7]

for name in [name_train, name_test]:
filename = os.path.join(DATASET_DIR, name)
if not os.path.exists(filename):
print("Loading", filename)
urlretrieve(url + name, filename)

print("Reading", filename)
df = pd.read_csv(filename, nrows=1000000) if name == 'train-10m.csv' else pd.read_csv(filename)
        X = df.drop('dep_delayed_15min', axis=1)
y = df["dep_delayed_15min"]

y_num = np.where(y == "Y", 1, 0)

sets.append(X)
labels.append(y_num)

n_samples_train = sets[0].shape[0]

X = pd.concat(sets)
X = pd.get_dummies(X, columns=categorical_names)
sets = [X[:n_samples_train], X[n_samples_train:]]

return sets[0], labels[0], sets[1], labels[1], 2
114 changes: 114 additions & 0 deletions tests/benchmark/hist_method/xgboost_hist_method_bench.py
@@ -0,0 +1,114 @@
#*******************************************************************************
# Copyright 2017-2019 by Contributors
# \file xgboost_hist_method_bench.py
# \brief a benchmark for the 'hist' tree_method on both CPU and GPU architectures
# \author Egor Smirnov
#*******************************************************************************

import argparse
import xgboost as xgb
from bench_utils import *

N_PERF_RUNS = 5
DTYPE=np.float32

xgb_params = {
'alpha': 0.9,
'max_bin': 256,
'scale_pos_weight': 2,
'learning_rate': 0.1,
'subsample': 1,
'reg_lambda': 1,
"min_child_weight": 0,
'max_depth': 8,
'max_leaves': 2**8,
}
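# 'objective' (and 'num_class' for multiclass), 'n_estimators', 'tree_method' and
# 'predictor' are filled in at runtime by load_dataset() and parse_args(),
# depending on the chosen dataset and the --hw option.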

def xgb_fit():
global model_xgb
dtrain = xgb.DMatrix(x_train, label=y_train)
model_xgb = xgb.train(xgb_params, dtrain, xgb_params['n_estimators'])

def xgb_predict_of_train_data():
global result_predict_xgb_train
dtest = xgb.DMatrix(x_train)
result_predict_xgb_train = model_xgb.predict(dtest)

def xgb_predict_of_test_data():
global result_predict_xgb_test
dtest = xgb.DMatrix(x_test)
result_predict_xgb_test = model_xgb.predict(dtest)


def load_dataset(dataset):
global x_train, y_train, x_test, y_test

    try:
        os.mkdir(DATASET_DIR)
    except OSError:  # the directory may already exist
        pass

datasets_dict = {
'higgs1m': load_higgs1m,
'msrank-10k': load_msrank_10k,
'airline-ohe':load_airline_one_hot
}

x_train, y_train, x_test, y_test, n_classes = datasets_dict[dataset](DTYPE)

if n_classes == -1:
xgb_params['objective'] = 'reg:squarederror'
elif n_classes == 2:
xgb_params['objective'] = 'binary:logistic'
else:
xgb_params['objective'] = 'multi:softprob'
xgb_params['num_class'] = n_classes

def parse_args():
global N_PERF_RUNS
parser = argparse.ArgumentParser()
parser.add_argument('--n_iter', required=False, type=int, default=1000)
parser.add_argument('--n_runs', default=N_PERF_RUNS, required=False, type=int)
parser.add_argument('--hw', choices=['cpu', 'gpu'], metavar='stage', required=False, default='cpu')
    # argparse's type=bool would treat any non-empty string (including "False") as True,
    # so parse the flag value explicitly
    parser.add_argument('--log', metavar='stage', required=False, default=False,
                        type=lambda x: str(x).lower() in ('true', '1', 'yes'))
parser.add_argument('--dataset', choices=['higgs1m', "airline-ohe", "msrank-10k"],
metavar='stage', required=True)

args = parser.parse_args()
N_PERF_RUNS = args.n_runs

xgb_params['n_estimators'] = args.n_iter

if args.log:
xgb_params['verbosity'] = 3
else:
xgb_params['silent'] = 1

if args.hw == "cpu":
xgb_params['tree_method'] = 'hist'
xgb_params['predictor'] = 'cpu_predictor'
elif args.hw == "gpu":
xgb_params['tree_method'] = 'gpu_hist'
xgb_params['predictor'] = 'gpu_predictor'

load_dataset(args.dataset)


def main():
parse_args()

print("Running ...")
    measure(xgb_fit, "XGBOOST training ", N_PERF_RUNS)
measure(xgb_predict_of_train_data, "XGBOOST predict (train data)", N_PERF_RUNS)
measure(xgb_predict_of_test_data, "XGBOOST predict (test data) ", N_PERF_RUNS)

print("Compute quality metrics...")

    train_logloss = compute_logloss(y_train, result_predict_xgb_train)
    test_logloss = compute_logloss(y_test, result_predict_xgb_test)

    print("LogLoss for train data set = {:.6f}".format(train_logloss))
    print("LogLoss for test data set = {:.6f}".format(test_logloss))

if __name__ == '__main__':
main()