Skip to content

Commit

Permalink
Rank tests
Browse files Browse the repository at this point in the history
  • Loading branch information
RAMitchell committed Feb 2, 2020
1 parent bf0269e commit 02e68d1
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 54 deletions.
78 changes: 34 additions & 44 deletions tests/cpp/common/test_hist_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -228,68 +228,58 @@ TEST(SparseCuts, MultiThreadedBuild) {
omp_set_num_threads(ori_nthreads);
}

// Four distinct values sketched with four bins: every value is expected to
// be assigned to its own bin, in ascending order.
TEST(hist_util, BasicCategorical) {
  const std::vector<float> values = {0.0, 1.0, 2.0, 4.0};
  const int bin_count = values.size();
  auto matrix = GetDMatrixFromData(values);
  HistogramCuts cuts;
  DenseCuts builder(&cuts);
  builder.Build(&matrix, bin_count);

  // Walk the values in order; the k-th value must fall into bin k.
  unsigned long long expected_bin = 0;
  for (const float& value : values) {
    EXPECT_EQ(cuts.SearchBin(value, 0), expected_bin);
    ++expected_bin;
  }

  // One cut per value, bracketed by the minimum value below and by the
  // smallest/largest inputs above.
  auto cut_values = cuts.Values();
  EXPECT_EQ(cut_values.size(), values.size());
  EXPECT_LE(cuts.MinValues()[0], values.front());
  EXPECT_GE(cut_values.front(), values.front());
  EXPECT_GE(cut_values.back(), values.back());
}

// 256 consecutive values (0..255) sketched with 256 bins: every value is
// expected to be assigned to its own bin.
TEST(hist_util, BasicContinuous) {
  std::vector<float> x(256);
  std::iota(x.begin(), x.end(), 0);
  int num_bins = x.size();
  auto dmat = GetDMatrixFromData(x);
  HistogramCuts cuts;
  DenseCuts dense(&cuts);
  dense.Build(&dmat, num_bins);

  // Expect each unique value to have its own bin
  for (auto i = 0ull; i < x.size(); i++) {
    EXPECT_EQ(cuts.SearchBin(x[i], 0), i);
  }
  auto cuts_from_sketch = cuts.Values();
  EXPECT_EQ(cuts_from_sketch.size(), x.size());
  EXPECT_LE(cuts.MinValues()[0], x.front());
  EXPECT_GE(cuts_from_sketch.front(), x.front());
  EXPECT_GE(cuts_from_sketch.back(), x.back());
}

// For several sample counts and category counts, builds dense cuts over a
// random categorical column with 256 bins and checks that:
//   * the recorded minimum value lies strictly below the smallest input,
//   * the first cut lies strictly above the smallest input,
//   * the last cut is not below the largest input,
//   * exactly one cut is produced per category.
TEST(hist_util, DenseCutsCategorical) {
  int categorical_sizes[] = {2, 6, 8, 12};
  int num_bins = 256;
  int sizes[] = {25, 100, 1000};
  for (auto n : sizes) {
    for (auto num_categories : categorical_sizes) {
      auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
      auto dmat = GetDMatrixFromData(x);
      // Sorted copy gives direct access to the smallest and largest input.
      std::vector<float> x_sorted(x);
      std::sort(x_sorted.begin(), x_sorted.end());
      HistogramCuts cuts;
      DenseCuts dense(&cuts);
      dense.Build(&dmat, num_bins);
      auto cuts_from_sketch = cuts.Values();
      EXPECT_LT(cuts.MinValues()[0], x_sorted.front());
      EXPECT_GT(cuts_from_sketch.front(), x_sorted.front());
      EXPECT_GE(cuts_from_sketch.back(), x_sorted.back());
      EXPECT_EQ(cuts_from_sketch.size(), static_cast<size_t>(num_categories));
    }
  }
}

// Builds dense cuts over a random single column for every combination of
// sample count and bin count, then checks the boundary cuts, the number of
// cuts, the one-bin-per-value property when there are no more values than
// bins, and (for more than 16 cuts) the rank error of the interior cuts.
TEST(hist_util, DenseCutsAccuracyTest) {
  int bin_sizes[] = {2, 16, 256, 512};
  int sizes[] = {25, 100, 1000};
  float low = -100;
  float high = 100;
  for (auto n : sizes) {
    auto x = GenerateRandomSingleColumn(n, low, high);
    // Sorted copy is the reference ordering for the bin and rank checks.
    std::vector<float> x_sorted(x);
    std::sort(x_sorted.begin(), x_sorted.end());
    auto dmat = GetDMatrixFromData(x);
    for (auto num_bins : bin_sizes) {
      HistogramCuts cuts;
      DenseCuts dense(&cuts);
      dense.Build(&dmat, num_bins);
      auto cuts_from_sketch = cuts.Values();
      EXPECT_LT(cuts.MinValues()[0], x_sorted.front());
      EXPECT_GT(cuts_from_sketch.front(), x_sorted.front());
      EXPECT_GE(cuts_from_sketch.back(), x_sorted.back());
      // ASSERT (not EXPECT): the checks below index by cut position.
      ASSERT_EQ(cuts_from_sketch.size(),
                static_cast<size_t>(std::min(n, num_bins)));

      if (x.size() <= static_cast<size_t>(num_bins)) {
        // Less unique values than number of bins
        // Each value should get its own bin
        for (auto i = 0ull; i < x.size(); i++) {
          EXPECT_EQ(cuts.SearchBin(x_sorted[i], 0), i);
        }
      }
      // Don't perform this test for categorical
      if (cuts_from_sketch.size() > 16) {
        TestRank(cuts_from_sketch, x_sorted, 0.01);
      }
    }
  }
}
Expand Down
36 changes: 26 additions & 10 deletions tests/cpp/common/test_hist_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,38 @@ std::vector<float> GenerateRandomSingleColumn(int n, float low = -100,
return x;
}

// Generates a column of `n` categorical values, each an integer in
// [0, num_categories - 1] stored as float.
//
// The generator is seeded with a fixed value, so repeated calls with the same
// arguments return the same column. The first min(n, num_categories) entries
// are then overwritten with 0, 1, 2, ... so that every category is present
// whenever n >= num_categories.
//
// \param n               number of values to generate (must be >= 0)
// \param num_categories  number of distinct categories (must be >= 1, since
//                        std::uniform_int_distribution requires a <= b)
// \return the generated column
inline std::vector<float> GenerateRandomCategoricalSingleColumn(
    int n, int num_categories) {
  std::vector<float> x(n);
  std::mt19937 rng(0);
  std::uniform_int_distribution<int> dist(0, num_categories - 1);
  std::generate(x.begin(), x.end(),
                [&]() { return static_cast<float>(dist(rng)); });
  // Make sure each category is present. Clamp to the column length so that a
  // column shorter than the number of categories is not written out of bounds.
  const int num_seeded = std::min(n, num_categories);
  for (int i = 0; i < num_seeded; ++i) {
    x[i] = static_cast<float>(i);
  }
  return x;
}

// Wraps a single column of values in a SimpleDMatrix via a DenseAdapter.
//
// The adapter is given x.size() and 1 as its dimensions - presumably
// (num_rows, num_features); confirm against the DenseAdapter constructor.
// quiet_NaN is presumably the "missing value" marker and the trailing 1 the
// thread count - confirm against the SimpleDMatrix constructor.
inline data::SimpleDMatrix GetDMatrixFromData(const std::vector<float>& x) {
  data::DenseAdapter adapter(x.data(), x.size(), 1);
  return data::SimpleDMatrix(&adapter, std::numeric_limits<float>::quiet_NaN(),
                             1);
}

std::vector<float> CutsFromSort(const std::vector<float>& x_sorted,
int num_bins) {
if (x_sorted.size() <= num_bins) return x_sorted;
std::vector<float> cuts(num_bins);
for(auto i = 0ull; i < num_bins; i++)
{
double rank = double(i)/num_bins;
cuts[i] = x_sorted[size_t(rank*x_sorted.size())];
void TestRank(const std::vector<float>& cuts,
const std::vector<float>& sorted_x, float eps) {
// Ignore the first and last cut, they are special
size_t j = 0;
for (auto i = 1ull; i < cuts.size() - 1; i++) {
int expected_rank = (i * sorted_x.size()) / cuts.size() + 1;
while (cuts[i] > sorted_x[j]) {
j++;
}
int actual_rank = j;
EXPECT_LT(std::abs(expected_rank - actual_rank), sorted_x.size() * eps);
}
return cuts;
}
} // namespace common
} // namespace xgboost

0 comments on commit 02e68d1

Please sign in to comment.