Update Python demos with tests. (#5651)
* Remove GPU memory usage demo.
* Add tests for demos.
* Remove `silent`.
* Remove shebang as it's not portable.
trivialfis authored May 12, 2020
1 parent 4e64e2e commit 2c1a439
Showing 25 changed files with 158 additions and 158 deletions.
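Note: two patterns recur throughout the Python diffs below: the deprecated `silent` flag is dropped (its role is covered by the `verbosity` parameter), and data files are resolved relative to each script so the demos run from any working directory. A minimal sketch of both, assuming the demo data layout:

import os
import xgboost as xgb

# resolve data relative to this file instead of the caller's cwd
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
# 'verbosity' controls logging now that 'silent' is deprecated
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic',
         'verbosity': 0}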
6 changes: 4 additions & 2 deletions demo/aft_survival/aft_survival_demo.py
@@ -1,14 +1,16 @@
"""
Demo for survival analysis (regression) using Accelerated Failure Time (AFT) model
"""
import os
from sklearn.model_selection import ShuffleSplit
import pandas as pd
import numpy as np
import xgboost as xgb

# The Veterans' Administration Lung Cancer Trial
# The Statistical Analysis of Failure Time Data by Kalbfleisch J. and Prentice R (1980)
df = pd.read_csv('../data/veterans_lung_cancer.csv')
CURRENT_DIR = os.path.dirname(__file__)
df = pd.read_csv(os.path.join(CURRENT_DIR, '../data/veterans_lung_cancer.csv'))
print('Training data:')
print(df)

@@ -39,7 +41,7 @@
'lambda': 0.01,
'alpha': 0.02}
bst = xgb.train(params, dtrain, num_boost_round=10000,
evals=[(dtrain, 'train'), (dvalid, 'valid')],
evals=[(dtrain, 'train'), (dvalid, 'valid')],
early_stopping_rounds=50)

# Run prediction on the validation set
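Note: with `early_stopping_rounds=50`, training halts once the last entry in `evals` ('valid' here) stops improving for 50 rounds. A sketch of reading the result back, assuming early stopping triggered:

# the booster records the best round when early stopping is used
print('best iteration:', bst.best_iteration)
print('best score:', bst.best_score)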
6 changes: 3 additions & 3 deletions demo/c-api/c-api-demo.c
@@ -20,12 +20,12 @@ if (err != 0) { \
int main(int argc, char** argv) {
int silent = 0;
int use_gpu = 0; // set to 1 to use the GPU for training

// load the data
DMatrixHandle dtrain, dtest;
safe_xgboost(XGDMatrixCreateFromFile("../data/agaricus.txt.train", silent, &dtrain));
safe_xgboost(XGDMatrixCreateFromFile("../data/agaricus.txt.test", silent, &dtest));

// create the booster
BoosterHandle booster;
DMatrixHandle eval_dmats[2] = {dtrain, dtest};
@@ -49,7 +49,7 @@ int main(int argc, char** argv) {
safe_xgboost(XGBoosterSetParam(booster, "gamma", "0.1"));
safe_xgboost(XGBoosterSetParam(booster, "max_depth", "3"));
safe_xgboost(XGBoosterSetParam(booster, "verbosity", silent ? "0" : "1"));

// train and evaluate for 10 iterations
int n_trees = 10;
const char* eval_names[2] = {"train", "test"};
4 changes: 1 addition & 3 deletions demo/gpu_acceleration/README.md
@@ -1,5 +1,3 @@
# GPU Acceleration Demo

`cover_type.py` shows how to train a model on the [forest cover type](https://archive.ics.uci.edu/ml/datasets/covertype) dataset using GPU acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it time consuming to process. We compare the run-time and accuracy of the GPU and CPU histogram algorithms.

`memory.py` shows how to repeatedly train xgboost models while freeing memory between iterations.
`cover_type.py` shows how to train a model on the [forest cover type](https://archive.ics.uci.edu/ml/datasets/covertype) dataset using GPU acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it time consuming to process. We compare the run-time and accuracy of the GPU and CPU histogram algorithms.
1 change: 0 additions & 1 deletion demo/gpu_acceleration/cover_type.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import xgboost as xgb
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
import time
51 changes: 0 additions & 51 deletions demo/gpu_acceleration/memory.py

This file was deleted.

1 change: 0 additions & 1 deletion demo/guide-python/basic_walkthrough.py
100755 → 100644
@@ -1,4 +1,3 @@
#!/usr/bin/env python
import numpy as np
import scipy.sparse
import pickle
10 changes: 6 additions & 4 deletions demo/guide-python/boost_from_prediction.py
100755 → 100644
@@ -1,15 +1,17 @@
#!/usr/bin/python
import os
import xgboost as xgb

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')

CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
###
# advanced: start from an initial base prediction
#
print('start running example to start from an initial prediction')
# specify parameters via map; definitions are the same as the C++ version
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
# train xgboost for 1 round
bst = xgb.train(param, dtrain, 1, watchlist)
# Note: we need the margin value instead of transformed prediction in
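Note: the truncated hunk above relies on raw margins; a sketch of how boosting from an initial prediction typically proceeds:

# predict raw margins (not transformed probabilities) ...
ptrain = bst.predict(dtrain, output_margin=True)
ptest = bst.predict(dtest, output_margin=True)
# ... and feed them back as the starting point for further boosting
dtrain.set_base_margin(ptrain)
dtest.set_base_margin(ptest)
bst2 = xgb.train(param, dtrain, 1, watchlist)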
11 changes: 6 additions & 5 deletions demo/guide-python/cross_validation.py
100755 → 100644
@@ -1,10 +1,11 @@
#!/usr/bin/python
import os
import numpy as np
import xgboost as xgb

### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
# load data and do training
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic'}
num_round = 2

print('running cross validation')
@@ -56,7 +57,7 @@ def evalerror(preds, dtrain):
labels = dtrain.get_label()
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

param = {'max_depth':2, 'eta':1, 'silent':1}
param = {'max_depth':2, 'eta':1}
# train with customized objective
xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
obj=logregobj, feval=evalerror)
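Note: `logregobj` is referenced but not shown in this hunk; the conventional logistic-loss objective it names returns the gradient and hessian with respect to the raw margin. A sketch:

def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))  # sigmoid: margin -> probability
    grad = preds - labels                 # first derivative of log loss
    hess = preds * (1.0 - preds)          # second derivative
    return grad, hess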
9 changes: 5 additions & 4 deletions demo/guide-python/custom_objective.py
100755 → 100644
@@ -1,18 +1,19 @@
#!/usr/bin/python
import os
import numpy as np
import xgboost as xgb
###
# advanced: customized loss function
#
print('start running example to use a customized objective function')

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))

# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param = {'max_depth': 2, 'eta': 1, 'silent': 1}
param = {'max_depth': 2, 'eta': 1}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2

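Note: the truncated remainder of this demo trains with the custom pair; a sketch, assuming the same `logregobj` definition sketched under cross_validation.py above:

bst = xgb.train(param, dtrain, num_round, watchlist,
                obj=logregobj, feval=evalerror)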
10 changes: 6 additions & 4 deletions demo/guide-python/evals_result.py
@@ -1,13 +1,15 @@
##
# This script demonstrates how to access the eval metrics in xgboost
##

import os
import xgboost as xgb
dtrain = xgb.DMatrix('../data/agaricus.txt.train', silent=True)
dtest = xgb.DMatrix('../data/agaricus.txt.test', silent=True)

CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))

param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'), ('eval_metric', 'error')]

num_round = 2
watchlist = [(dtest,'eval'), (dtrain,'train')]

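Note: the truncated hunk presumably captures the metrics via the `evals_result` argument of `xgb.train`; a sketch of the idiom:

results = {}
bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=results)
# one list per boosting round, keyed by watchlist name and metric
print(results['eval']['logloss'])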
13 changes: 5 additions & 8 deletions demo/guide-python/external_memory.py
100755 → 100644
@@ -1,18 +1,17 @@
#!/usr/bin/python
import numpy as np
import scipy.sparse
import os
import xgboost as xgb

### simple example for using external memory version

# this is the only difference: add a '#' followed by a cache prefix name
# several cache files with the prefix will be generated
# currently only supports conversion from libsvm files
dtrain = xgb.DMatrix('../data/agaricus.txt.train#dtrain.cache')
dtest = xgb.DMatrix('../data/agaricus.txt.test#dtest.cache')
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))

# specify validations set to watch performance
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic'}

# performance notice: set nthread to the number of physical CPU cores
# some CPUs offer two threads per core (e.g. a 4-core CPU with 8 threads); in that case set nthread=4
@@ -21,5 +20,3 @@
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)
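Note: the lines removed above used XGBoost's external-memory URI: appending `#` and a cache prefix to a libsvm path makes the data stream through on-disk cache files. For reference, the removed form:

# external-memory form removed by this commit
dtrain = xgb.DMatrix('../data/agaricus.txt.train#dtrain.cache')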


3 changes: 1 addition & 2 deletions demo/guide-python/gamma_regression.py
100755 → 100644
@@ -1,4 +1,3 @@
#!/usr/bin/python
import xgboost as xgb
import numpy as np

Expand All @@ -12,7 +11,7 @@

# for gamma regression, we need to set the objective to 'reg:gamma'; it is also suggested
# to set base_score to a value between 1 and 5 if the number of iterations is small
param = {'silent':1, 'objective':'reg:gamma', 'booster':'gbtree', 'base_score':3}
param = {'objective':'reg:gamma', 'booster':'gbtree', 'base_score':3}

# the rest of settings are the same
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
9 changes: 5 additions & 4 deletions demo/guide-python/generalized_linear_model.py
100755 → 100644
@@ -1,16 +1,17 @@
#!/usr/bin/python
import os
import xgboost as xgb
##
# this script demonstrates how to fit a generalized linear model in xgboost
# basically, we are using a linear model instead of trees as our booster
##
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
param = {'objective':'binary:logistic', 'booster':'gblinear',
'alpha': 0.0001, 'lambda': 1}

# normally, you do not need to set eta (step_size)
13 changes: 7 additions & 6 deletions demo/guide-python/predict_first_ntree.py
100755 → 100644
@@ -1,17 +1,18 @@
#!/usr/bin/python
import os
import numpy as np
import xgboost as xgb

### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
# load data and do training
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)

print('start testing prediction from first n trees')
### predict using first 1 tree
# predict using the first tree only
label = dtest.get_label()
ypred1 = bst.predict(dtest, ntree_limit=1)
# by default, we predict using all the trees
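Note: a short follow-on (not shown in the diff) comparing the first tree against the full model, using the `label` and `ypred1` defined above:

err1 = np.mean((ypred1 > 0.5) != label)                 # first tree only
err_all = np.mean((bst.predict(dtest) > 0.5) != label)  # all trees
print('error of ypred1=%f, full model=%f' % (err1, err_all))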
17 changes: 9 additions & 8 deletions demo/guide-python/predict_leaf_indices.py
100755 → 100644
@@ -1,19 +1,20 @@
#!/usr/bin/python
import os
import xgboost as xgb

### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
# load data and do training
CURRENT_DIR = os.path.dirname(__file__)
dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)

print ('start testing predict the leaf indices')
### predict using first 2 tree
print('start testing predict the leaf indices')
# predict using the first 2 trees
leafindex = bst.predict(dtest, ntree_limit=2, pred_leaf=True)
print(leafindex.shape)
print(leafindex)
### predict all trees
# predict all trees
leafindex = bst.predict(dtest, pred_leaf=True)
print(leafindex.shape)
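Note: the leaf-index matrix has shape (n_samples, n_trees); a common follow-on, not part of this demo, is to one-hot encode the indices as sparse features for a downstream model:

from sklearn.preprocessing import OneHotEncoder

# each tree's leaf id becomes a sparse categorical feature
encoded = OneHotEncoder().fit_transform(leafindex)
print(encoded.shape)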
13 changes: 0 additions & 13 deletions demo/guide-python/runall.sh

This file was deleted.

4 changes: 2 additions & 2 deletions demo/guide-python/sklearn_evals_result.py
@@ -20,7 +20,7 @@
# Or you can use: clf = xgb.XGBClassifier(**param_dist)

clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)

@@ -37,7 +37,7 @@
for e_mtr_name, e_mtr_vals in e_mtrs.items():
print(' - {}'.format(e_mtr_name))
print(' - {}'.format(e_mtr_vals))

print('')
print('Access complete dict:')
print(evals_result)
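Note: the dict iterated above comes from the sklearn wrapper's accessor; a sketch:

evals_result = clf.evals_result()
# keys are 'validation_0', 'validation_1', ... one per eval_set entry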
8 changes: 3 additions & 5 deletions demo/guide-python/sklearn_examples.py
100755 → 100644
@@ -1,4 +1,3 @@
#!/usr/bin/python
'''
Created on 1 Apr 2015
@@ -52,9 +51,9 @@
X = boston['data']
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model,
{'max_depth': [2,4,6],
'n_estimators': [50,100,200]}, verbose=1)
clf.fit(X,y)
{'max_depth': [2, 4, 6],
'n_estimators': [50, 100, 200]}, verbose=1)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)

@@ -73,4 +72,3 @@
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
eval_set=[(X_test, y_test)])
