
Performance optimizations for Intel CPUs #3957

Merged · 15 commits · Jan 9, 2019
1 change: 1 addition & 0 deletions .gitignore
@@ -91,3 +91,4 @@ lib/
metastore_db

plugin/updater_gpu/test/cpp/data
/include/xgboost/build_config.h
26 changes: 26 additions & 0 deletions CMakeLists.txt
@@ -49,6 +49,26 @@ if(WIN32 AND MINGW)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++")
endif()

# Check existence of software pre-fetching
include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
#include <xmmintrin.h>
int main() {
char data = 0;
const char* address = &data;
_mm_prefetch(address, _MM_HINT_NTA);
return 0;
}
" XGBOOST_MM_PREFETCH_PRESENT)
check_cxx_source_compiles("
int main() {
char data = 0;
const char* address = &data;
__builtin_prefetch(address, 0, 0);
return 0;
}
" XGBOOST_BUILTIN_PREFETCH_PRESENT)

# Sanitizer
if(USE_SANITIZER)
include(cmake/Sanitizer.cmake)
@@ -82,6 +102,12 @@ include_directories (
${PROJECT_SOURCE_DIR}/rabit/include
)

# Generate configurable header
set(CMAKE_LOCAL "${PROJECT_SOURCE_DIR}/cmake")
set(INCLUDE_ROOT "${PROJECT_SOURCE_DIR}/include")
message(STATUS "${CMAKE_LOCAL}/build_config.h.in -> ${INCLUDE_ROOT}/xgboost/build_config.h")
configure_file("${CMAKE_LOCAL}/build_config.h.in" "${INCLUDE_ROOT}/xgboost/build_config.h")

file(GLOB_RECURSE SOURCES
src/*.cc
src/*.h
7 changes: 7 additions & 0 deletions cmake/build_config.h.in
@@ -0,0 +1,7 @@
#ifndef XGBOOST_BUILD_CONFIG_H_
#define XGBOOST_BUILD_CONFIG_H_

#cmakedefine XGBOOST_MM_PREFETCH_PRESENT
#cmakedefine XGBOOST_BUILTIN_PREFETCH_PRESENT

#endif // XGBOOST_BUILD_CONFIG_H_
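For context, this template is consumed by the configure_file() call added to CMakeLists.txt above: each #cmakedefine line becomes a plain #define when the corresponding compile check succeeds, and a commented-out #undef when it fails. Assuming both checks pass, the generated include/xgboost/build_config.h would look roughly like this (illustrative output, not part of the diff):

#ifndef XGBOOST_BUILD_CONFIG_H_
#define XGBOOST_BUILD_CONFIG_H_

#define XGBOOST_MM_PREFETCH_PRESENT
#define XGBOOST_BUILTIN_PREFETCH_PRESENT

#endif  // XGBOOST_BUILD_CONFIG_H_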
4 changes: 4 additions & 0 deletions include/xgboost/base.h
@@ -218,4 +218,8 @@ using bst_omp_uint = dmlc::omp_uint; // NOLINT
#endif
#endif
} // namespace xgboost

/* Always keep this #include at the bottom of xgboost/base.h */
#include <xgboost/build_config.h>

#endif // XGBOOST_BASE_H_
20 changes: 20 additions & 0 deletions include/xgboost/build_config.h
@@ -0,0 +1,20 @@
/*!
* Copyright (c) 2018 by Contributors
* \file build_config.h
* \brief Fall-back logic for platform-specific feature detection.
* \author Hyunsu Philip Cho
*/
#ifndef XGBOOST_BUILD_CONFIG_H_
#define XGBOOST_BUILD_CONFIG_H_

/* default logic for software pre-fetching */
#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) || defined(__INTEL_COMPILER)
// Enable _mm_prefetch for Intel compiler and MSVC+x86
#define XGBOOST_MM_PREFETCH_PRESENT
#define XGBOOST_BUILTIN_PREFETCH_PRESENT
#elif defined(__GNUC__)
// Enable __builtin_prefetch for GCC
#define XGBOOST_BUILTIN_PREFETCH_PRESENT
#endif

#endif // XGBOOST_BUILD_CONFIG_H_
128 changes: 85 additions & 43 deletions src/common/hist_util.cc
@@ -14,6 +14,15 @@
#include "./hist_util.h"
#include "./quantile.h"

#if defined(XGBOOST_MM_PREFETCH_PRESENT)
#include <xmmintrin.h>
#define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0)
#elif defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
#define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast<const char*>(addr), 0, 3)
#else // no SW pre-fetching available; PREFETCH_READ_T0 is no-op
#define PREFETCH_READ_T0(addr) do {} while (0)
#endif

namespace xgboost {
namespace common {

@@ -399,56 +408,89 @@ void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow hist) {
data_.resize(nbins_ * nthread_, GHistEntry());
std::fill(data_.begin(), data_.end(), GHistEntry());
const size_t nthread = static_cast<size_t>(this->nthread_);
data_.resize(nbins_ * nthread_);

const size_t* rid = row_indices.begin;
const size_t nrows = row_indices.Size();
const uint32_t* index = gmat.index.data();
const size_t* row_ptr = gmat.row_ptr.data();
const float* pgh = reinterpret_cast<const float*>(gpair.data());

double* hist_data = reinterpret_cast<double*>(hist.begin);
double* data = reinterpret_cast<double*>(data_.data());

const size_t block_size = 512;
size_t n_blocks = nrows/block_size;
n_blocks += !!(nrows - n_blocks*block_size);

const size_t nthread_to_process = std::min(nthread, n_blocks);
memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t));

const size_t cache_line_size = 64;
const size_t prefetch_offset = 10;
size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;

#pragma omp parallel for num_threads(nthread_to_process) schedule(guided)
for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) {
dmlc::omp_uint tid = omp_get_thread_num();
double* data_local_hist = ((nthread_to_process == 1) ? hist_data :
reinterpret_cast<double*>(data_.data() + tid * nbins_));

if (!thread_init_[tid]) {
memset(data_local_hist, '\0', 2*nbins_*sizeof(double));
thread_init_[tid] = true;
}

constexpr int kUnroll = 8; // loop unrolling factor
const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
const size_t nrows = row_indices.end - row_indices.begin;
const size_t rest = nrows % kUnroll;
const size_t istart = iblock*block_size;
const size_t iend = (((iblock+1)*block_size > nrows) ? nrows : istart + block_size);
for (size_t i = istart; i < iend; ++i) {
const size_t icol_start = row_ptr[rid[i]];
const size_t icol_end = row_ptr[rid[i]+1];

#pragma omp parallel for num_threads(nthread) schedule(guided)
for (bst_omp_uint i = 0; i < nrows - rest; i += kUnroll) {
const bst_omp_uint tid = omp_get_thread_num();
const size_t off = tid * nbins_;
size_t rid[kUnroll];
size_t ibegin[kUnroll];
size_t iend[kUnroll];
GradientPair stat[kUnroll];
for (int k = 0; k < kUnroll; ++k) {
rid[k] = row_indices.begin[i + k];
}
for (int k = 0; k < kUnroll; ++k) {
ibegin[k] = gmat.row_ptr[rid[k]];
iend[k] = gmat.row_ptr[rid[k] + 1];
}
for (int k = 0; k < kUnroll; ++k) {
stat[k] = gpair[rid[k]];
}
for (int k = 0; k < kUnroll; ++k) {
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
const uint32_t bin = gmat.index[j];
data_[off + bin].Add(stat[k]);
if (i < nrows - no_prefetch_size) {
PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);
PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
}

for (size_t j = icol_start; j < icol_end; ++j) {
const uint32_t idx_bin = 2*index[j];
const size_t idx_gh = 2*rid[i];

data_local_hist[idx_bin] += pgh[idx_gh];
data_local_hist[idx_bin+1] += pgh[idx_gh+1];
}
}
}
for (size_t i = nrows - rest; i < nrows; ++i) {
const size_t rid = row_indices.begin[i];
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
const GradientPair stat = gpair[rid];
for (size_t j = ibegin; j < iend; ++j) {
const uint32_t bin = gmat.index[j];
data_[bin].Add(stat);

if (nthread_to_process > 1) {
const size_t size = (2*nbins_);
const size_t block_size = 1024;
size_t n_blocks = size/block_size;
n_blocks += !!(size - n_blocks*block_size);

size_t n_worked_bins = 0;
for (size_t i = 0; i < nthread_to_process; ++i) {
if (thread_init_[i]) {
thread_init_[n_worked_bins++] = i;
}
}
}

/* reduction */
const uint32_t nbins = nbins_;
#pragma omp parallel for num_threads(nthread) schedule(static)
for (bst_omp_uint bin_id = 0; bin_id < bst_omp_uint(nbins); ++bin_id) {
for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
hist.begin[bin_id].Add(data_[tid * nbins_ + bin_id]);
#pragma omp parallel for num_threads(std::min(nthread, n_blocks)) schedule(guided)
for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) {
const size_t istart = iblock*block_size;
const size_t iend = (((iblock+1)*block_size > size) ? size : istart + block_size);

const size_t bin = 2*thread_init_[0]*nbins_;
memcpy(hist_data + istart, (data + bin + istart), sizeof(double)*(iend - istart));

for (size_t i_bin_part = 1; i_bin_part < n_worked_bins; ++i_bin_part) {
const size_t bin = 2*thread_init_[i_bin_part]*nbins_;
for (size_t i = istart; i < iend; i++) {
hist_data[i] += data[bin + i];
}
}
}
}
}
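To summarize the restructured loop outside the diff view: each thread walks a block of rows, prefetches the row offsets and gradient pairs it will need a few iterations ahead, and accumulates into a thread-local histogram that is reduced afterwards. Below is a minimal, single-threaded sketch of the accumulation-with-prefetch part; names, types, and the prefetch distance are simplified and not taken verbatim from the PR.

#include <cstddef>
#include <cstdint>
#include <vector>

#if defined(__GNUC__)
#define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast<const char*>(addr), 0, 3)
#else  // no software prefetch available; make it a no-op
#define PREFETCH_READ_T0(addr) do {} while (0)
#endif

// Accumulate (grad, hess) sums per bin for one block of rows of a CSR matrix.
// hist holds 2 doubles per bin (grad, hess); pgh holds 2 floats per row.
void BuildHistBlock(const std::vector<std::size_t>& rid,      // row indices in this block
                    const std::vector<std::size_t>& row_ptr,  // CSR offsets into index
                    const std::vector<std::uint32_t>& index,  // bin id of each nonzero entry
                    const std::vector<float>& pgh,            // interleaved gradient pairs
                    std::vector<double>* hist) {
  const std::size_t prefetch_offset = 10;  // how many rows ahead to prefetch (tuning knob)
  const std::size_t nrows = rid.size();
  const std::size_t prefetch_end =
      prefetch_offset < nrows ? nrows - prefetch_offset : 0;
  for (std::size_t i = 0; i < nrows; ++i) {
    if (i < prefetch_end) {
      // Warm the cache for the row we will visit prefetch_offset iterations from now.
      PREFETCH_READ_T0(row_ptr.data() + rid[i + prefetch_offset]);
      PREFETCH_READ_T0(pgh.data() + 2 * rid[i + prefetch_offset]);
    }
    for (std::size_t j = row_ptr[rid[i]]; j < row_ptr[rid[i] + 1]; ++j) {
      const std::size_t bin = 2 * index[j];
      (*hist)[bin]     += pgh[2 * rid[i]];
      (*hist)[bin + 1] += pgh[2 * rid[i] + 1];
    }
  }
}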
2 changes: 2 additions & 0 deletions src/common/hist_util.h
@@ -238,6 +238,7 @@ class GHistBuilder {
inline void Init(size_t nthread, uint32_t nbins) {
nthread_ = nthread;
nbins_ = nbins;
thread_init_.resize(nthread_);
}

// construct a histogram via histogram aggregation
@@ -259,6 +260,7 @@
/*! \brief number of all bins over all features */
uint32_t nbins_;
std::vector<GHistEntry> data_;
std::vector<size_t> thread_init_;
};
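A note on the new thread_init_ member declared above: in BuildHist it first acts as a per-thread "this thread wrote to its local histogram" flag, and is then compacted in place into the list of thread ids that actually did work, so the reduction step only visits those per-thread copies. A simplified, sequential sketch of that compact-then-reduce idea follows; the PR additionally blocks and parallelizes the copy/add loops, and the names here are illustrative.

#include <algorithm>
#include <cstddef>
#include <vector>

// Sum only the per-thread histograms that were actually written to.
// flags[t] is nonzero if thread t touched its copy; local stores all
// per-thread histograms back to back, `size` doubles each; out receives the sum.
void ReduceUsedHistograms(std::vector<std::size_t>* flags,
                          const std::vector<double>& local,
                          std::size_t size, double* out) {
  // Compact the flag array in place into a list of "worked" thread ids.
  std::size_t n_worked = 0;
  for (std::size_t t = 0; t < flags->size(); ++t) {
    if ((*flags)[t]) (*flags)[n_worked++] = t;
  }
  if (n_worked == 0) return;
  // Copy the first worked histogram, then add the remaining ones on top.
  const double* first = local.data() + (*flags)[0] * size;
  std::copy(first, first + size, out);
  for (std::size_t k = 1; k < n_worked; ++k) {
    const double* src = local.data() + (*flags)[k] * size;
    for (std::size_t i = 0; i < size; ++i) out[i] += src[i];
  }
}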

