diff --git a/.gitignore b/.gitignore
index bdfa3322a55a..5131e198399a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,4 @@
 lib/
 metastore_db
 plugin/updater_gpu/test/cpp/data
+/include/xgboost/build_config.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7b0d90797a44..061431de5684 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,6 +49,26 @@ if(WIN32 AND MINGW)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++")
 endif()
 
+# Check existence of software pre-fetching
+include(CheckCXXSourceCompiles)
+check_cxx_source_compiles("
+#include <xmmintrin.h>
+int main() {
+  char data = 0;
+  const char* address = &data;
+  _mm_prefetch(address, _MM_HINT_NTA);
+  return 0;
+}
+" XGBOOST_MM_PREFETCH_PRESENT)
+check_cxx_source_compiles("
+int main() {
+  char data = 0;
+  const char* address = &data;
+  __builtin_prefetch(address, 0, 0);
+  return 0;
+}
+" XGBOOST_BUILTIN_PREFETCH_PRESENT)
+
 # Sanitizer
 if(USE_SANITIZER)
   include(cmake/Sanitizer.cmake)
@@ -82,6 +102,12 @@ include_directories (
   ${PROJECT_SOURCE_DIR}/rabit/include
 )
 
+# Generate configurable header
+set(CMAKE_LOCAL "${PROJECT_SOURCE_DIR}/cmake")
+set(INCLUDE_ROOT "${PROJECT_SOURCE_DIR}/include")
+message(STATUS "${CMAKE_LOCAL}/build_config.h.in -> ${INCLUDE_ROOT}/xgboost/build_config.h")
+configure_file("${CMAKE_LOCAL}/build_config.h.in" "${INCLUDE_ROOT}/xgboost/build_config.h")
+
 file(GLOB_RECURSE SOURCES
   src/*.cc
   src/*.h
diff --git a/cmake/build_config.h.in b/cmake/build_config.h.in
new file mode 100644
index 000000000000..b49dde12e123
--- /dev/null
+++ b/cmake/build_config.h.in
@@ -0,0 +1,7 @@
+#ifndef XGBOOST_BUILD_CONFIG_H_
+#define XGBOOST_BUILD_CONFIG_H_
+
+#cmakedefine XGBOOST_MM_PREFETCH_PRESENT
+#cmakedefine XGBOOST_BUILTIN_PREFETCH_PRESENT
+
+#endif  // XGBOOST_BUILD_CONFIG_H_
diff --git a/include/xgboost/base.h b/include/xgboost/base.h
index 97b140294f22..55c2e4ac7144 100644
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -218,4 +218,8 @@ using bst_omp_uint = dmlc::omp_uint;  // NOLINT
 #endif
 #endif
 }  // namespace xgboost
+
+/* Always keep this #include at the bottom of xgboost/base.h */
+#include <xgboost/build_config.h>
+
 #endif  // XGBOOST_BASE_H_
diff --git a/include/xgboost/build_config.h b/include/xgboost/build_config.h
new file mode 100644
index 000000000000..1e36dc80889c
--- /dev/null
+++ b/include/xgboost/build_config.h
@@ -0,0 +1,20 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file build_config.h
+ * \brief Fall-back logic for platform-specific feature detection.
+ * \author Hyunsu Philip Cho
+ */
+#ifndef XGBOOST_BUILD_CONFIG_H_
+#define XGBOOST_BUILD_CONFIG_H_
+
+/* default logic for software pre-fetching */
+#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) || defined(__INTEL_COMPILER)
+  // Enable _mm_prefetch for Intel compiler and MSVC+x86
+  #define XGBOOST_MM_PREFETCH_PRESENT
+  #define XGBOOST_BUILTIN_PREFETCH_PRESENT
+#elif defined(__GNUC__)
+  // Enable __builtin_prefetch for GCC
+  #define XGBOOST_BUILTIN_PREFETCH_PRESENT
+#endif
+
+#endif  // XGBOOST_BUILD_CONFIG_H_
diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index da6e4d770bff..a988d3baf9ff 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -14,6 +14,15 @@
 #include "./hist_util.h"
 #include "./quantile.h"
 
+#if defined(XGBOOST_MM_PREFETCH_PRESENT)
+  #include <xmmintrin.h>
+  #define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0)
+#elif defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
+  #define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast<const char*>(addr), 0, 3)
+#else  // no SW pre-fetching available; PREFETCH_READ_T0 is no-op
+  #define PREFETCH_READ_T0(addr) do {} while (0)
+#endif
+
 namespace xgboost {
 namespace common {
 
@@ -399,56 +408,89 @@ void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
                              const RowSetCollection::Elem row_indices,
                              const GHistIndexMatrix& gmat,
                              GHistRow hist) {
-  data_.resize(nbins_ * nthread_, GHistEntry());
-  std::fill(data_.begin(), data_.end(), GHistEntry());
+  const size_t nthread = static_cast<size_t>(this->nthread_);
+  data_.resize(nbins_ * nthread_);
+
+  const size_t* rid = row_indices.begin;
+  const size_t nrows = row_indices.Size();
+  const uint32_t* index = gmat.index.data();
+  const size_t* row_ptr = gmat.row_ptr.data();
+  const float* pgh = reinterpret_cast<const float*>(gpair.data());
+
+  double* hist_data = reinterpret_cast<double*>(hist.begin);
+  double* data = reinterpret_cast<double*>(data_.data());
+
+  const size_t block_size = 512;
+  size_t n_blocks = nrows/block_size;
+  n_blocks += !!(nrows - n_blocks*block_size);
+
+  const size_t nthread_to_process = std::min(nthread, n_blocks);
+  memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t));
+
+  const size_t cache_line_size = 64;
+  const size_t prefetch_offset = 10;
+  size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
+  no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;
+
+  #pragma omp parallel for num_threads(nthread_to_process) schedule(guided)
+  for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) {
+    dmlc::omp_uint tid = omp_get_thread_num();
+    double* data_local_hist = ((nthread_to_process == 1) ? hist_data :
+                               reinterpret_cast<double*>(data_.data() + tid * nbins_));
+
+    if (!thread_init_[tid]) {
+      memset(data_local_hist, '\0', 2*nbins_*sizeof(double));
+      thread_init_[tid] = true;
+    }
 
-  constexpr int kUnroll = 8;  // loop unrolling factor
-  const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
-  const size_t nrows = row_indices.end - row_indices.begin;
-  const size_t rest = nrows % kUnroll;
+    const size_t istart = iblock*block_size;
+    const size_t iend = (((iblock+1)*block_size > nrows) ? nrows : istart + block_size);
+    for (size_t i = istart; i < iend; ++i) {
+      const size_t icol_start = row_ptr[rid[i]];
+      const size_t icol_end = row_ptr[rid[i]+1];
 
-  #pragma omp parallel for num_threads(nthread) schedule(guided)
-  for (bst_omp_uint i = 0; i < nrows - rest; i += kUnroll) {
-    const bst_omp_uint tid = omp_get_thread_num();
-    const size_t off = tid * nbins_;
-    size_t rid[kUnroll];
-    size_t ibegin[kUnroll];
-    size_t iend[kUnroll];
-    GradientPair stat[kUnroll];
-    for (int k = 0; k < kUnroll; ++k) {
-      rid[k] = row_indices.begin[i + k];
-    }
-    for (int k = 0; k < kUnroll; ++k) {
-      ibegin[k] = gmat.row_ptr[rid[k]];
-      iend[k] = gmat.row_ptr[rid[k] + 1];
-    }
-    for (int k = 0; k < kUnroll; ++k) {
-      stat[k] = gpair[rid[k]];
-    }
-    for (int k = 0; k < kUnroll; ++k) {
-      for (size_t j = ibegin[k]; j < iend[k]; ++j) {
-        const uint32_t bin = gmat.index[j];
-        data_[off + bin].Add(stat[k]);
+      if (i < nrows - no_prefetch_size) {
+        PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);
+        PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
+      }
+
+      for (size_t j = icol_start; j < icol_end; ++j) {
+        const uint32_t idx_bin = 2*index[j];
+        const size_t idx_gh = 2*rid[i];
+
+        data_local_hist[idx_bin] += pgh[idx_gh];
+        data_local_hist[idx_bin+1] += pgh[idx_gh+1];
       }
     }
   }
-  for (size_t i = nrows - rest; i < nrows; ++i) {
-    const size_t rid = row_indices.begin[i];
-    const size_t ibegin = gmat.row_ptr[rid];
-    const size_t iend = gmat.row_ptr[rid + 1];
-    const GradientPair stat = gpair[rid];
-    for (size_t j = ibegin; j < iend; ++j) {
-      const uint32_t bin = gmat.index[j];
-      data_[bin].Add(stat);
+
+  if (nthread_to_process > 1) {
+    const size_t size = (2*nbins_);
+    const size_t block_size = 1024;
+    size_t n_blocks = size/block_size;
+    n_blocks += !!(size - n_blocks*block_size);
+
+    size_t n_worked_bins = 0;
+    for (size_t i = 0; i < nthread_to_process; ++i) {
+      if (thread_init_[i]) {
+        thread_init_[n_worked_bins++] = i;
+      }
     }
-  }
 
-  /* reduction */
-  const uint32_t nbins = nbins_;
-  #pragma omp parallel for num_threads(nthread) schedule(static)
-  for (bst_omp_uint bin_id = 0; bin_id < bst_omp_uint(nbins); ++bin_id) {
-    for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
-      hist.begin[bin_id].Add(data_[tid * nbins_ + bin_id]);
+    #pragma omp parallel for num_threads(std::min(nthread, n_blocks)) schedule(guided)
+    for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) {
+      const size_t istart = iblock*block_size;
+      const size_t iend = (((iblock+1)*block_size > size) ? size : istart + block_size);
+
+      const size_t bin = 2*thread_init_[0]*nbins_;
+      memcpy(hist_data + istart, (data + bin + istart), sizeof(double)*(iend - istart));
+
+      for (size_t i_bin_part = 1; i_bin_part < n_worked_bins; ++i_bin_part) {
+        const size_t bin = 2*thread_init_[i_bin_part]*nbins_;
+        for (size_t i = istart; i < iend; i++) {
+          hist_data[i] += data[bin + i];
+        }
+      }
     }
   }
 }
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index ad83dd6c8e18..30d0454c6454 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -238,6 +238,7 @@ class GHistBuilder {
   inline void Init(size_t nthread, uint32_t nbins) {
     nthread_ = nthread;
     nbins_ = nbins;
+    thread_init_.resize(nthread_);
   }
 
   // construct a histogram via histogram aggregation
@@ -259,6 +260,7 @@ class GHistBuilder {
   /*! \brief number of all bins over all features */
   uint32_t nbins_;
   std::vector<GHistEntry> data_;
+  std::vector<size_t> thread_init_;
 };
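
---

A note on the hot loop for readers following the change: the rewritten `BuildHist` treats each bin counter as two interleaved `double`s (gradient and hessian sums), has every OpenMP thread fill a private copy of the histogram while software-prefetching the `row_ptr` and gradient entries it will need a few rows ahead, and then reduces the per-thread copies block by block. The sketch below distills the per-thread accumulation kernel into a self-contained form. It is an illustration rather than code from this patch: `BuildHistBlock` and its toy CSR inputs are hypothetical names, and the prefetch fallback simply mirrors the logic added in `build_config.h` and `hist_util.cc`.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Mirror of the fallback in this patch: _mm_prefetch on MSVC(x86)/ICC,
// __builtin_prefetch on GCC-compatible compilers, otherwise a no-op.
#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) || defined(__INTEL_COMPILER)
  #include <xmmintrin.h>
  #define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0)
#elif defined(__GNUC__)
  #define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast<const char*>(addr), 0, 3)
#else
  #define PREFETCH_READ_T0(addr) do {} while (0)
#endif

// Hypothetical single-thread kernel in the style of the new BuildHist.
// row_ptr/index form a CSR matrix of bin ids; pgh holds interleaved
// (gradient, hessian) floats per row; hist holds interleaved
// (sum_gradient, sum_hessian) doubles -- two slots per bin.
void BuildHistBlock(const std::vector<std::size_t>& rows,     // row ids to process
                    const std::vector<std::size_t>& row_ptr,
                    const std::vector<std::uint32_t>& index,
                    const std::vector<float>& pgh,
                    std::vector<double>* hist) {
  const std::size_t nrows = rows.size();
  const std::size_t prefetch_offset = 10;  // look-ahead distance, in rows
  // Near the end of the range the look-ahead would run past the buffers,
  // so the last few iterations skip prefetching (as in the patch).
  std::size_t no_prefetch = prefetch_offset + 64 / sizeof(std::size_t);
  no_prefetch = std::min(no_prefetch, nrows);

  for (std::size_t i = 0; i < nrows; ++i) {
    if (i < nrows - no_prefetch) {
      // Warm the cache lines that iteration i + prefetch_offset will touch.
      PREFETCH_READ_T0(row_ptr.data() + rows[i + prefetch_offset]);
      PREFETCH_READ_T0(pgh.data() + 2 * rows[i + prefetch_offset]);
    }
    const std::size_t idx_gh = 2 * rows[i];
    for (std::size_t j = row_ptr[rows[i]]; j < row_ptr[rows[i] + 1]; ++j) {
      const std::uint32_t idx_bin = 2 * index[j];
      (*hist)[idx_bin]     += pgh[idx_gh];      // accumulate gradient
      (*hist)[idx_bin + 1] += pgh[idx_gh + 1];  // accumulate hessian
    }
  }
}
```

Because every thread in the patch owns a disjoint `2 * nbins_` slice of `data_`, the final reduction needs no atomics: the first initialized slice is copied into the output with `memcpy` and the remaining slices are added in with plain loops over doubles, which vectorizes far better than the old per-bin `GHistEntry::Add` reduction.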