Add Dart booster. #1220

Merged: 1 commit, Jun 8, 2016
25 changes: 24 additions & 1 deletion doc/parameter.md
@@ -13,7 +13,8 @@ In R-package, you can use .(dot) to replace under score in the parameters, for e
General Parameters
------------------
* booster [default=gbtree]
- which booster to use, can be gbtree or gblinear. gbtree uses tree based model while gblinear uses linear function.
- which booster to use, can be gbtree, gblinear or dart.
  gbtree and dart use tree-based models while gblinear uses a linear function.
* silent [default=0]
- 0 means printing running messages, 1 means silent mode.
* nthread [default to maximum number of threads available if not set]
@@ -74,6 +75,28 @@ Parameters for Tree Booster
* scale_pos_weight, [default=0]
- Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases) See [Parameters Tuning](how_to/param_tuning.md) for more discussion. Also see Higgs Kaggle competition demo for examples: [R](../demo/kaggle-higgs/higgs-train.R ), [py1](../demo/kaggle-higgs/higgs-numpy.py ), [py2](../demo/kaggle-higgs/higgs-cv.py ), [py3](../demo/guide-python/cross_validation.py)

Additional parameters for Dart Booster
--------------------------------------
* sample_type [default="uniform"]
- type of sampling algorithm.
- "uniform": dropped trees are selected uniformly.
- "weighted": dropped trees are selected in proportion to weight.
* normalize_type [default="tree"]
- type of normalization algorithm.
- "tree": new trees have the same weight as each of the dropped trees.
  weight of new trees is learning_rate / (k + learning_rate);
  dropped trees are scaled by a factor of k / (k + learning_rate),
  where k is the number of dropped trees.
- "forest": new trees have the same weight as the sum of the dropped trees (forest).
  weight of new trees is learning_rate / (1 + learning_rate);
  dropped trees are scaled by a factor of 1 / (1 + learning_rate).
* rate_drop [default=0.0]
- dropout rate (probability of dropping a previously added tree).
- range: [0.0, 1.0]
* skip_drop [default=0.0]
- probability of skipping the dropout procedure during a boosting iteration.
  If a dropout is skipped, new trees are added in the same manner as gbtree.
- range: [0.0, 1.0]
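
For example, a minimal training sketch using the Python package (the dataset path below is a placeholder; only the parameters listed above are specific to dart):

```python
import xgboost as xgb

# placeholder dataset path; any source accepted by DMatrix works
dtrain = xgb.DMatrix('train.libsvm')

params = {
    'booster': 'dart',             # enable the dart booster
    'objective': 'binary:logistic',
    'max_depth': 5,
    'learning_rate': 0.1,          # alias: eta
    'sample_type': 'weighted',     # or 'uniform'
    'normalize_type': 'forest',    # or 'tree'
    'rate_drop': 0.1,              # dropout rate
    'skip_drop': 0.5,              # probability of skipping dropout
}
bst = xgb.train(params, dtrain, num_boost_round=50)

# pass ntree_limit so that all trees are used deterministically;
# with ntree_limit=0 the dart booster also applies dropout at prediction time
preds = bst.predict(dtrain, ntree_limit=50)
```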

Parameters for Linear Booster
-----------------------------
* lambda [default=0]
265 changes: 263 additions & 2 deletions src/gbm/gbtree.cc
@@ -17,6 +17,8 @@
#include <limits>
#include "../common/common.h"

#include "../common/random.h"

namespace xgboost {
namespace gbm {

@@ -47,6 +49,42 @@ struct GBTreeTrainParam : public dmlc::Parameter<GBTreeTrainParam> {
}
};

/*! \brief training parameters */
struct DartTrainParam : public dmlc::Parameter<DartTrainParam> {
/*! \brief whether to not print info during training */
bool silent;
/*! \brief type of sampling algorithm */
int sample_type;
/*! \brief type of normalization algorithm */
int normalize_type;
/*! \brief fraction of trees to drop during dropout */
float rate_drop;
/*! \brief probability of skipping the dropout procedure */
float skip_drop;
/*! \brief learning step size for a time */
float learning_rate;
// declare parameters
DMLC_DECLARE_PARAMETER(DartTrainParam) {
DMLC_DECLARE_FIELD(silent).set_default(false)
.describe("Not print information during training.");
DMLC_DECLARE_FIELD(sample_type).set_default(0)
.add_enum("uniform", 0)
.add_enum("weighted", 1)
.describe("Different types of sampling algorithm.");
DMLC_DECLARE_FIELD(normalize_type).set_default(0)
.add_enum("tree", 0)
.add_enum("forest", 1)
.describe("Different types of normalization algorithm.");
DMLC_DECLARE_FIELD(rate_drop).set_range(0.0f, 1.0f).set_default(0.0f)
.describe("Parameter of how many trees are dropped.");
DMLC_DECLARE_FIELD(skip_drop).set_range(0.0f, 1.0f).set_default(0.0f)
.describe("Parameter of whether to drop trees.");
DMLC_DECLARE_FIELD(learning_rate).set_lower_bound(0.0f).set_default(0.3f)
.describe("Learning rate (step size) of update.");
DMLC_DECLARE_ALIAS(learning_rate, eta);
}
};

/*! \brief model parameters */
struct GBTreeModelParam : public dmlc::Parameter<GBTreeModelParam> {
/*! \brief number of trees */
@@ -313,8 +351,9 @@ class GBTree : public GradientBooster {
}
}
// commit new trees all at once
inline void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
int bst_group) {
virtual void
CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
int bst_group) {
for (size_t i = 0; i < new_trees.size(); ++i) {
trees.push_back(std::move(new_trees[i]));
tree_info.push_back(bst_group);
@@ -475,14 +514,236 @@ class GBTree : public GradientBooster {
std::vector<std::unique_ptr<TreeUpdater> > updaters;
};

// dart booster: gradient boosted trees with dropout
class Dart : public GBTree {
public:
Dart() {}

void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
GBTree::Configure(cfg);
if (trees.size() == 0) {
dparam.InitAllowUnknown(cfg);
}
}

void Load(dmlc::Stream* fi) override {
GBTree::Load(fi);
weight_drop.resize(mparam.num_trees);
if (mparam.num_trees != 0) {
fi->Read(&weight_drop);
}
}

void Save(dmlc::Stream* fo) const override {
GBTree::Save(fo);
if (weight_drop.size() != 0) {
fo->Write(weight_drop);
}
}

// predict the leaf scores with dropout if ntree_limit = 0
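// NOTE: this largely mirrors GBTree::Predict; the difference is that trees
// selected by DropTrees() are skipped when Pred() accumulates the leaf scores.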
void Predict(DMatrix* p_fmat,
int64_t buffer_offset,
std::vector<float>* out_preds,
unsigned ntree_limit) override {
DropTrees(ntree_limit);
Review comment (Member): Is this the same as GBTree::Predict except for the dropped trees? It would be better to leave a comment here.

const MetaInfo& info = p_fmat->info();
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
}
InitThreadTemp(nthread);
std::vector<float> &preds = *out_preds;
const size_t stride = p_fmat->info().num_row * mparam.num_output_group;
preds.resize(stride * (mparam.size_leaf_vector+1));
// start collecting the prediction
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();

iter->BeforeFirst();
while (iter->Next()) {
const RowBatch &batch = iter->Value();
// parallel over local batch
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) {
const int tid = omp_get_thread_num();
RegTree::FVec &feats = thread_temp[tid];
int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
CHECK_LT(static_cast<size_t>(ridx), info.num_row);
// loop over output groups
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
this->Pred(batch[i],
buffer_offset < 0 ? -1 : buffer_offset + ridx,
gid, info.GetRoot(ridx), &feats,
&preds[ridx * mparam.num_output_group + gid], stride,
ntree_limit);
}
}
}
}

void Predict(const SparseBatch::Inst& inst,
std::vector<float>* out_preds,
unsigned ntree_limit,
unsigned root_index) override {
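// ntree_limit_drop is nonzero here, so DropTrees(1) only clears idx_drop;
// no dropout is applied for single-instance prediction.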
DropTrees(1);
if (thread_temp.size() == 0) {
thread_temp.resize(1, RegTree::FVec());
thread_temp[0].Init(mparam.num_feature);
}
out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1));
// loop over output groups
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
this->Pred(inst, -1, gid, root_index, &thread_temp[0],
&(*out_preds)[gid], mparam.num_output_group,
ntree_limit);
}
}

protected:
// commit new trees all at once
virtual void
CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
int bst_group) {
for (size_t i = 0; i < new_trees.size(); ++i) {
trees.push_back(std::move(new_trees[i]));
tree_info.push_back(bst_group);
}
mparam.num_trees += static_cast<int>(new_trees.size());
size_t num_drop = NormalizeTrees(new_trees.size());
if (dparam.silent != 1) {
LOG(INFO) << "drop " << num_drop << " trees, "
<< "weight = " << weight_drop.back();
}
}
// predict the leaf scores without dropped trees
inline void Pred(const RowBatch::Inst &inst,
int64_t buffer_index,
int bst_group,
unsigned root_index,
RegTree::FVec *p_feats,
float *out_pred,
size_t stride,
unsigned ntree_limit) {
float psum = 0.0f;
// sum of leaf vector
std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f);
const int64_t bid = this->BufferOffset(buffer_index, bst_group);
p_feats->Fill(inst);
for (size_t i = 0; i < trees.size(); ++i) {
if (tree_info[i] == bst_group) {
bool drop = (std::find(idx_drop.begin(), idx_drop.end(), i) != idx_drop.end());
if (!drop) {
int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
psum += weight_drop[i] * (*trees[i])[tid].leaf_value();
for (int j = 0; j < mparam.size_leaf_vector; ++j) {
vec_psum[j] += weight_drop[i] * trees[i]->leafvec(tid)[j];
}
}
}
}
p_feats->Drop(inst);
// update the buffered results
if (bid >= 0 && ntree_limit == 0) {
pred_counter[bid] = static_cast<unsigned>(trees.size());
pred_buffer[bid] = psum;
for (int i = 0; i < mparam.size_leaf_vector; ++i) {
pred_buffer[bid + i + 1] = vec_psum[i];
}
}
out_pred[0] = psum;
for (int i = 0; i < mparam.size_leaf_vector; ++i) {
out_pred[stride * (i + 1)] = vec_psum[i];
}
}

// select dropped trees
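// With sample_type == 1 ("weighted"), tree i is dropped with probability
//   rate_drop * n * weight_drop[i] / sum(weight_drop),
// so the expected number of dropped trees stays rate_drop * n;
// with sample_type == 0 ("uniform") each tree is dropped with probability rate_drop.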
inline void DropTrees(unsigned ntree_limit_drop) {
std::uniform_real_distribution<> runif(0.0, 1.0);
auto& rnd = common::GlobalRandom();
// reset
idx_drop.clear();
// sample dropped trees
bool skip = false;
if (dparam.skip_drop > 0.0) skip = (runif(rnd) < dparam.skip_drop);
if (ntree_limit_drop == 0 && !skip) {
if (dparam.sample_type == 1) {
float sum_weight = 0.0;
for (size_t i = 0; i < weight_drop.size(); ++i) {
sum_weight += weight_drop[i];
}
for (size_t i = 0; i < weight_drop.size(); ++i) {
if (runif(rnd) < dparam.rate_drop * weight_drop.size() * weight_drop[i] / sum_weight) {
idx_drop.push_back(i);
}
}
} else {
for (size_t i = 0; i < weight_drop.size(); ++i) {
if (runif(rnd) < dparam.rate_drop) {
idx_drop.push_back(i);
}
}
}
}
}
// set normalization factors
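// With normalize_type == 0 ("tree"), each new tree gets weight lr / (k + lr) and the
// k dropped trees are rescaled by k / (k + lr); with normalize_type == 1 ("forest"),
// each new tree gets weight lr / (1 + lr) and dropped trees are rescaled by 1 / (1 + lr),
// where lr = learning_rate / size_new_trees and k = idx_drop.size().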
inline size_t NormalizeTrees(size_t size_new_trees) {
float lr = 1.0 * dparam.learning_rate / size_new_trees;
size_t num_drop = idx_drop.size();
if (num_drop == 0) {
for (size_t i = 0; i < size_new_trees; ++i) {
weight_drop.push_back(1.0);
}
} else {
if (dparam.normalize_type == 1) {
// normalize_type == 1 ("forest")
float factor = 1.0 / (1.0 + lr);
for (size_t i = 0; i < idx_drop.size(); ++i) {
weight_drop[i] *= factor;
}
for (size_t i = 0; i < size_new_trees; ++i) {
weight_drop.push_back(lr * factor);
}
} else {
// normalize_type == 0 ("tree")
float factor = 1.0 * num_drop / (num_drop + lr);
for (size_t i = 0; i < idx_drop.size(); ++i) {
weight_drop[i] *= factor;
}
for (size_t i = 0; i < size_new_trees; ++i) {
weight_drop.push_back(1.0 * lr / (num_drop + lr));
}
}
}
// reset
idx_drop.clear();
return num_drop;
}

// --- data structure ---
// training parameter
DartTrainParam dparam;
/*! \brief weight of each tree after dropout normalization */
std::vector<float> weight_drop;
// indexes of dropped trees
std::vector<size_t> idx_drop;
};

// register the parameters and gradient boosters
DMLC_REGISTER_PARAMETER(GBTreeModelParam);
DMLC_REGISTER_PARAMETER(GBTreeTrainParam);
DMLC_REGISTER_PARAMETER(DartTrainParam);

XGBOOST_REGISTER_GBM(GBTree, "gbtree")
.describe("Tree booster, gradient boosted trees.")
.set_body([]() {
return new GBTree();
});
XGBOOST_REGISTER_GBM(Dart, "dart")
.describe("Tree booster, dart.")
.set_body([]() {
return new Dart();
});
} // namespace gbm
} // namespace xgboost
45 changes: 45 additions & 0 deletions tests/python/test_basic_models.py
@@ -23,6 +23,51 @@ def test_glm(self):
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1

def test_dart(self):
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
param = {'max_depth': 5, 'objective': 'binary:logistic', 'booster': 'dart', 'silent': False}
# specify validations set to watch performance
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)
# predict with ntree_limit=num_round so no trees are randomly dropped at prediction time
preds = bst.predict(dtest, ntree_limit=num_round)
labels = dtest.get_label()
err = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
# error must be smaller than 10%
assert err < 0.1
Review comment (Member): Let us add a few test cases with the parameters you proposed, e.g. normalize_type.


# save dmatrix into binary buffer
dtest.save_binary('dtest.buffer')
# save model
bst.save_model('xgb.model.dart')
# load model and data in
bst2 = xgb.Booster(params=param, model_file='xgb.model.dart')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2, ntree_limit=num_round)
# assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0

# check whether sample_type and normalize_type work
num_round = 50
param['silent'] = True
param['learning_rate'] = 0.1
param['rate_drop'] = 0.1
preds_list = []
for p in [[p0, p1] for p0 in ['uniform', 'weighted'] for p1 in ['tree', 'forest']]:
param['sample_type'] = p[0]
param['normalize_type'] = p[1]
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest, ntree_limit=num_round)
err = sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.1
preds_list.append(preds)

for ii in range(len(preds_list)):
for jj in range(ii + 1, len(preds_list)):
assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0

def test_eta_decay(self):
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 4