Make `HistCutMatrix::Init` be aware of groups. #4115

Merged 2 commits on Feb 15, 2019
4 changes: 4 additions & 0 deletions doc/python/python_intro.rst
@@ -101,6 +101,10 @@ The data is stored in a :py:class:`DMatrix <xgboost.DMatrix>` object.
w = np.random.rand(5, 1)
dtrain = xgb.DMatrix(data, label=label, missing=-999.0, weight=w)

When performing ranking tasks, the number of weights should be equal
to the number of groups.
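For example (a minimal sketch with made-up sizes), three query groups covering
twelve rows take exactly three weights:

    import numpy as np
    import xgboost as xgb

    data = np.random.rand(12, 4)                  # 12 rows, 4 features
    label = np.random.randint(2, size=12)
    dtrain = xgb.DMatrix(data, label=label)
    dtrain.set_group([3, 4, 5])                   # three query groups covering all 12 rows
    dtrain.set_weight(np.array([1.0, 0.5, 2.0]))  # one weight per group, not per row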


Setting Parameters
------------------
XGBoost can use either a list of pairs or a dictionary to set :doc:`parameters </parameter>`. For instance:
57 changes: 48 additions & 9 deletions src/common/hist_util.cc
@@ -24,7 +24,25 @@
namespace xgboost {
namespace common {

HistCutMatrix::HistCutMatrix() {
  monitor_.Init("HistCutMatrix");
}

size_t HistCutMatrix::SearchGroupIndFromBaseRow(
    std::vector<bst_uint> const& group_ptr, size_t const base_rowid) const {
  using KIt = std::vector<bst_uint>::const_iterator;
  KIt res = std::lower_bound(group_ptr.cbegin(), group_ptr.cend() - 1, base_rowid);
  // Cannot use CHECK_NE because it will try to print the iterator.
  bool const found = res != group_ptr.cend() - 1;
  if (!found) {
    LOG(FATAL) << "Row " << base_rowid << " does not lie in any group!\n";
  }
  size_t group_ind = std::distance(group_ptr.cbegin(), res);
  return group_ind;
}

void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
  monitor_.Start("Init");
  const MetaInfo& info = p_fmat->Info();

  // safe factor for better accuracy
@@ -33,30 +51,50 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {

  const int nthread = omp_get_max_threads();

  auto nstep = static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
  auto ncol = static_cast<unsigned>(info.num_col_);
  unsigned const nstep =
      static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
  unsigned const ncol = static_cast<unsigned>(info.num_col_);
  sketchs.resize(info.num_col_);
  for (auto& s : sketchs) {
    s.Init(info.num_row_, 1.0 / (max_num_bins * kFactor));
  }

  const auto& weights = info.weights_.HostVector();

  // Data groups, used in ranking.
  std::vector<bst_uint> const& group_ptr = info.group_ptr_;
  size_t const num_groups = group_ptr.size() == 0 ? 0 : group_ptr.size() - 1;
  // Use group index for weights?
  bool const use_group_ind = num_groups != 0 && weights.size() != info.num_row_;

  for (const auto &batch : p_fmat->GetRowBatches()) {
#pragma omp parallel num_threads(nthread)
    size_t group_ind = 0;
    if (use_group_ind) {
      group_ind = this->SearchGroupIndFromBaseRow(group_ptr, batch.base_rowid);
    }
#pragma omp parallel num_threads(nthread) firstprivate(group_ind, use_group_ind)
    {
      CHECK_EQ(nthread, omp_get_num_threads());
      auto tid = static_cast<unsigned>(omp_get_thread_num());
      unsigned begin = std::min(nstep * tid, ncol);
      unsigned end = std::min(nstep * (tid + 1), ncol);

      // do not iterate if no columns are assigned to the thread
      if (begin < end && end <= ncol) {
        for (size_t i = 0; i < batch.Size(); ++i) { // NOLINT(*)
          size_t ridx = batch.base_rowid + i;
          SparsePage::Inst inst = batch[i];
          for (auto& ins : inst) {
            if (ins.index >= begin && ins.index < end) {
              sketchs[ins.index].Push(ins.fvalue,
                                      weights.size() > 0 ? weights[ridx] : 1.0f);
          size_t const ridx = batch.base_rowid + i;
          SparsePage::Inst const inst = batch[i];
          if (use_group_ind &&
              group_ptr[group_ind] == ridx &&
              // maximum equals to weights.size() - 1
              group_ind < num_groups - 1) {
            // move to next group
            group_ind++;
          }
          for (auto const& entry : inst) {
            if (entry.index >= begin && entry.index < end) {
              size_t w_idx = use_group_ind ? group_ind : ridx;
              sketchs[entry.index].Push(entry.fvalue, info.GetWeight(w_idx));
            }
          }
        }
@@ -65,6 +103,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
  }

  Init(&sketchs, max_num_bins);
  monitor_.Stop("Init");
}

void HistCutMatrix::Init
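A note on the semantics above: `group_ptr_` holds cumulative row offsets with a
leading zero, `SearchGroupIndFromBaseRow` is a plain `lower_bound` over those
boundaries (excluding the trailing sentinel), and each row's sketch weight is then
looked up per group instead of per row. A hedged Python sketch of the two lookups,
using the same group sizes (2, 3, 7, 5) as the unit test further down; the helper
names are illustrative only, not part of the XGBoost API:

    import bisect
    import numpy as np

    group_ptr = [0, 2, 5, 12, 17]   # cumulative boundaries for group sizes 2, 3, 7, 5
    weights = [1.0, 0.5, 2.0, 1.5]  # one weight per group

    def search_group_ind_from_base_row(group_ptr, base_rowid):
        # Mirrors the C++ lower_bound over all boundaries except the trailing sentinel.
        ind = bisect.bisect_left(group_ptr, base_rowid, 0, len(group_ptr) - 1)
        if ind == len(group_ptr) - 1:
            raise ValueError("Row {} does not lie in any group!".format(base_rowid))
        return ind

    def group_of_row(ridx):
        # Group whose half-open range [group_ptr[g], group_ptr[g + 1]) contains ridx.
        return int(np.searchsorted(group_ptr, ridx, side="right")) - 1

    assert search_group_ind_from_base_row(group_ptr, 0) == 0
    assert search_group_ind_from_base_row(group_ptr, 5) == 2  # row 5 starts the third group
    assert group_of_row(4) == 1                               # rows 2-4 form the second group
    assert weights[group_of_row(16)] == 1.5                   # last row draws the last group's weight

`group_of_row` shows the intended row-to-group mapping; the C++ loop above tracks the
index incrementally as rows cross group boundaries rather than re-searching per row.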
9 changes: 9 additions & 0 deletions src/common/hist_util.h
@@ -13,6 +13,7 @@
#include "row_set.h"
#include "../tree/param.h"
#include "./quantile.h"
#include "./timer.h"
#include "../include/rabit/rabit.h"

namespace xgboost {
@@ -35,6 +36,14 @@ struct HistCutMatrix {
  void Init(DMatrix* p_fmat, uint32_t max_num_bins);

  void Init(std::vector<WXQSketch>* sketchs, uint32_t max_num_bins);

  HistCutMatrix();

 protected:
  virtual size_t SearchGroupIndFromBaseRow(
      std::vector<bst_uint> const& group_ptr, size_t const base_rowid) const;

  Monitor monitor_;
};

/*! \brief Builds the cut matrix on the GPU */
24 changes: 23 additions & 1 deletion src/learner.cc
@@ -474,12 +474,16 @@ class LearnerImpl : public Learner {

  void UpdateOneIter(int iter, DMatrix* train) override {
    monitor_.Start("UpdateOneIter");

    // TODO(trivialfis): Merge the duplicated code with BoostOneIter
    CHECK(ModelInitialized())
        << "Always call InitModel or LoadModel before update";
    if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
      common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
    }
    this->ValidateDMatrix(train);
    this->PerformTreeMethodHeuristic(train);

    monitor_.Start("PredictRaw");
    this->PredictRaw(train, &preds_);
    monitor_.Stop("PredictRaw");
@@ -493,10 +497,15 @@
  void BoostOneIter(int iter, DMatrix* train,
                    HostDeviceVector<GradientPair>* in_gpair) override {
    monitor_.Start("BoostOneIter");

    CHECK(ModelInitialized())
        << "Always call InitModel or LoadModel before boost.";
    if (tparam_.seed_per_iteration || rabit::IsDistributed()) {
      common::GlobalRandom().seed(tparam_.seed * kRandSeedMagic + iter);
    }
    this->ValidateDMatrix(train);
    this->PerformTreeMethodHeuristic(train);

    gbm_->DoBoost(train, in_gpair);
    monitor_.Stop("BoostOneIter");
  }
@@ -711,7 +720,7 @@ class LearnerImpl : public Learner {
      mparam_.num_feature = num_feature;
    }
    CHECK_NE(mparam_.num_feature, 0)
        << "0 feature is supplied. Are you using raw Booster?";
        << "0 feature is supplied. Are you using raw Booster interface?";
    // setup
    cfg_["num_feature"] = common::ToString(mparam_.num_feature);
    CHECK(obj_ == nullptr && gbm_ == nullptr);
@@ -736,6 +745,19 @@
    gbm_->PredictBatch(data, out_preds, ntree_limit);
  }

  void ValidateDMatrix(DMatrix* p_fmat) {
    MetaInfo const& info = p_fmat->Info();
    auto const& weights = info.weights_.HostVector();
    if (info.group_ptr_.size() != 0 && weights.size() != 0) {
      CHECK(weights.size() == info.group_ptr_.size() - 1)
          << "\n"
          << "weights size: " << weights.size() << ", "
          << "groups size: " << info.group_ptr_.size() - 1 << ", "
          << "num rows: " << p_fmat->Info().num_row_ << "\n"
          << "Number of weights should be equal to number of groups in ranking task.";
    }
  }

// model parameter
LearnerModelParam mparam_;
// training parameter
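The new check surfaces to Python users as a training-time error. A minimal sketch of
input that would trip it (random data, made-up shapes): 17 per-row weights are supplied
where the ranking task expects one weight per group.

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(17, 15)
    y = np.random.randint(2, size=17)
    dtrain = xgb.DMatrix(X, label=y)
    dtrain.set_group([2, 3, 7, 5])  # 4 query groups covering all 17 rows
    dtrain.set_weight(np.ones(17))  # 17 per-row weights; the check expects 4

    try:
        xgb.train({"objective": "rank:pairwise"}, dtrain, num_boost_round=1)
    except xgb.core.XGBoostError as err:
        print(err)  # "... Number of weights should be equal to number of groups in ranking task."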
51 changes: 51 additions & 0 deletions tests/cpp/common/test_hist_util.cc
@@ -0,0 +1,51 @@
#include <gtest/gtest.h>
#include <vector>
#include <string>
#include <utility>

#include "../../../src/common/hist_util.h"
#include "../helpers.h"

namespace xgboost {
namespace common {

class HistCutMatrixMock : public HistCutMatrix {
 public:
  size_t SearchGroupIndFromBaseRow(
      std::vector<bst_uint> const& group_ptr, size_t const base_rowid) {
    return HistCutMatrix::SearchGroupIndFromBaseRow(group_ptr, base_rowid);
  }
};

TEST(HistCutMatrix, SearchGroupInd) {
  size_t constexpr kNumGroups = 4;
  size_t constexpr kNumRows = 17;
  size_t constexpr kNumCols = 15;

  auto pp_mat = CreateDMatrix(kNumRows, kNumCols, 0);

  auto& p_mat = *pp_mat;
  std::vector<bst_int> group(kNumGroups);
  group[0] = 2;
  group[1] = 3;
  group[2] = 7;
  group[3] = 5;

  p_mat->Info().SetInfo(
      "group", group.data(), DataType::kUInt32, kNumGroups);

  HistCutMatrixMock hmat;

  size_t group_ind = hmat.SearchGroupIndFromBaseRow(p_mat->Info().group_ptr_, 0);
  ASSERT_EQ(group_ind, 0);

  group_ind = hmat.SearchGroupIndFromBaseRow(p_mat->Info().group_ptr_, 5);
  ASSERT_EQ(group_ind, 2);

  EXPECT_ANY_THROW(hmat.SearchGroupIndFromBaseRow(p_mat->Info().group_ptr_, 17));

  delete pp_mat;
}

} // namespace common
} // namespace xgboost
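For reference, the `group_ptr_` these assertions probe is the prefix sum of the group
sizes with a leading zero, so sizes (2, 3, 7, 5) give boundaries [0, 2, 5, 12, 17].
A one-line sketch of that construction:

    from itertools import accumulate

    group_sizes = [2, 3, 7, 5]
    group_ptr = [0] + list(accumulate(group_sizes))
    assert group_ptr == [0, 2, 5, 12, 17]  # base row 5 starts the third group (index 2)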
65 changes: 53 additions & 12 deletions tests/cpp/test_learner.cc
@@ -6,9 +6,9 @@

namespace xgboost {

TEST(learner, Test) {
  typedef std::pair<std::string, std::string> arg;
  auto args = {arg("tree_method", "exact")};
TEST(Learner, Basic) {
  typedef std::pair<std::string, std::string> Arg;
  auto args = {Arg("tree_method", "exact")};
  auto mat_ptr = CreateDMatrix(10, 10, 0);
  std::vector<std::shared_ptr<xgboost::DMatrix>> mat = {*mat_ptr};
  auto learner = std::unique_ptr<Learner>(Learner::Create(mat));
@@ -17,38 +17,79 @@ TEST(learner, Test) {
  delete mat_ptr;
}

TEST(learner, SelectTreeMethod) {
  using arg = std::pair<std::string, std::string>;
TEST(Learner, SelectTreeMethod) {
  using Arg = std::pair<std::string, std::string>;
  auto mat_ptr = CreateDMatrix(10, 10, 0);
  std::vector<std::shared_ptr<xgboost::DMatrix>> mat = {*mat_ptr};
  auto learner = std::unique_ptr<Learner>(Learner::Create(mat));

  // Test if `tree_method` can be set
  learner->Configure({arg("tree_method", "approx")});
  learner->Configure({Arg("tree_method", "approx")});
  ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
            "grow_histmaker,prune");
  learner->Configure({arg("tree_method", "exact")});
  learner->Configure({Arg("tree_method", "exact")});
  ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
            "grow_colmaker,prune");
  learner->Configure({arg("tree_method", "hist")});
  learner->Configure({Arg("tree_method", "hist")});
  ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
            "grow_quantile_histmaker");
  learner->Configure({arg{"booster", "dart"}, arg{"tree_method", "hist"}});
  learner->Configure({Arg{"booster", "dart"}, Arg{"tree_method", "hist"}});
  ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
            "grow_quantile_histmaker");
#ifdef XGBOOST_USE_CUDA
  learner->Configure({arg("tree_method", "gpu_exact")});
  learner->Configure({Arg("tree_method", "gpu_exact")});
  ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
            "grow_gpu,prune");
  learner->Configure({arg("tree_method", "gpu_hist")});
  learner->Configure({Arg("tree_method", "gpu_hist")});
  ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
            "grow_gpu_hist");
  learner->Configure({arg{"booster", "dart"}, arg{"tree_method", "gpu_hist"}});
  learner->Configure({Arg{"booster", "dart"}, Arg{"tree_method", "gpu_hist"}});
  ASSERT_EQ(learner->GetConfigurationArguments().at("updater"),
            "grow_gpu_hist");
#endif

  delete mat_ptr;
}

TEST(Learner, CheckGroup) {
  using Arg = std::pair<std::string, std::string>;
  size_t constexpr kNumGroups = 4;
  size_t constexpr kNumRows = 17;
  size_t constexpr kNumCols = 15;

  auto pp_mat = CreateDMatrix(kNumRows, kNumCols, 0);
  auto& p_mat = *pp_mat;
  std::vector<bst_float> weight(kNumGroups);
  std::vector<bst_int> group(kNumGroups);
  group[0] = 2;
  group[1] = 3;
  group[2] = 7;
  group[3] = 5;
  std::vector<bst_float> labels(kNumRows);
  for (size_t i = 0; i < kNumRows; ++i) {
    labels[i] = i % 2;
  }

  p_mat->Info().SetInfo(
      "weight", static_cast<void*>(weight.data()), DataType::kFloat32, kNumGroups);
  p_mat->Info().SetInfo(
      "group", group.data(), DataType::kUInt32, kNumGroups);
  p_mat->Info().SetInfo("label", labels.data(), DataType::kFloat32, kNumRows);

  std::vector<std::shared_ptr<xgboost::DMatrix>> mat = {p_mat};
  auto learner = std::unique_ptr<Learner>(Learner::Create(mat));
  learner->Configure({Arg{"objective", "rank:pairwise"}});
  learner->InitModel();

  EXPECT_NO_THROW(learner->UpdateOneIter(0, p_mat.get()));

  group.resize(kNumGroups + 1);
  group[3] = 4;
  group[4] = 1;
  p_mat->Info().SetInfo("group", group.data(), DataType::kUInt32, kNumGroups + 1);
  EXPECT_ANY_THROW(learner->UpdateOneIter(0, p_mat.get()));

  delete pp_mat;
}

} // namespace xgboost