Enhance nvtx support. (#5636)
trivialfis authored May 6, 2020
1 parent 67d267f commit eaf2a00
Showing 13 changed files with 98 additions and 87 deletions.
7 changes: 7 additions & 0 deletions CMakeLists.txt
@@ -158,6 +158,10 @@ else (BUILD_STATIC_LIB)
add_library(xgboost SHARED ${XGBOOST_OBJ_SOURCES})
endif (BUILD_STATIC_LIB)

if (USE_NVTX)
enable_nvtx(xgboost)
endif (USE_NVTX)

#-- Hide all C++ symbols
if (HIDE_CXX_SYMBOLS)
set_target_properties(objxgboost PROPERTIES CXX_VISIBILITY_PRESET hidden)
@@ -178,6 +182,9 @@ endif (JVM_BINDINGS)

#-- CLI for xgboost
add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc ${XGBOOST_OBJ_SOURCES})
if (USE_NVTX)
enable_nvtx(runxgboost)
endif (USE_NVTX)

target_include_directories(runxgboost
PRIVATE
7 changes: 7 additions & 0 deletions cmake/Utils.cmake
@@ -141,3 +141,10 @@ DESTINATION \"${build_dir}/bak\")")
install(CODE "file(RENAME \"${build_dir}/bak/cmake_install.cmake\"
\"${build_dir}/R-package/cmake_install.cmake\")")
endfunction(setup_rpackage_install_target)

macro(enable_nvtx target)
find_package(NVTX REQUIRED)
target_include_directories(${target} PRIVATE "${NVTX_INCLUDE_DIR}")
target_link_libraries(${target} PRIVATE "${NVTX_LIBRARY}")
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
endmacro()
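
Note (not part of this diff): the -DXGBOOST_USE_NVTX=1 definition added by enable_nvtx() is what gates the NVTX calls in the sources changed below (see src/common/timer.cc). A minimal C++ sketch of that pattern, for illustration only; the function and range names are made up, and nvToolsExt.h is assumed to be on the include path, which the macro arranges via NVTX_INCLUDE_DIR:

// Sketch: the NVTX calls compile out unless XGBOOST_USE_NVTX is defined,
// i.e. unless the target was built through enable_nvtx().
#if defined(XGBOOST_USE_NVTX)
#include <nvToolsExt.h>
#endif  // defined(XGBOOST_USE_NVTX)

void TrainOneStep() {  // illustrative name
#if defined(XGBOOST_USE_NVTX)
  // Open a named range; the returned id is needed to close it.
  nvtxRangeId_t range = nvtxRangeStartA("TrainOneStep");
#endif  // defined(XGBOOST_USE_NVTX)

  // ... work to be profiled ...

#if defined(XGBOOST_USE_NVTX)
  nvtxRangeEnd(range);  // the range then shows up in NVIDIA profiling tools
#endif  // defined(XGBOOST_USE_NVTX)
}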
26 changes: 26 additions & 0 deletions cmake/modules/FindNVTX.cmake
@@ -0,0 +1,26 @@
if (NVTX_LIBRARY)
unset(NVTX_LIBRARY CACHE)
endif (NVTX_LIBRARY)

set(NVTX_LIB_NAME nvToolsExt)


find_path(NVTX_INCLUDE_DIR
NAMES nvToolsExt.h
PATHS ${CUDA_HOME}/include ${CUDA_INCLUDE} /usr/local/cuda/include)


find_library(NVTX_LIBRARY
NAMES nvToolsExt
PATHS ${CUDA_HOME}/lib64 /usr/local/cuda/lib64)

message(STATUS "Using nvtx library: ${NVTX_LIBRARY}")

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NVTX DEFAULT_MSG
NVTX_INCLUDE_DIR NVTX_LIBRARY)

mark_as_advanced(
NVTX_INCLUDE_DIR
NVTX_LIBRARY
)
3 changes: 1 addition & 2 deletions src/CMakeLists.txt
@@ -25,8 +25,7 @@ if (USE_CUDA)
endif (USE_NCCL)

if (USE_NVTX)
target_include_directories(objxgboost PRIVATE "${NVTX_HEADER_DIR}")
target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_NVTX=1)
enable_nvtx(objxgboost)
endif (USE_NVTX)

target_compile_options(objxgboost PRIVATE
14 changes: 13 additions & 1 deletion src/common/timer.cc
@@ -10,12 +10,21 @@
#include "timer.h"
#include "xgboost/json.h"

#if defined(XGBOOST_USE_NVTX)
#include <nvToolsExt.h>
#endif // defined(XGBOOST_USE_NVTX)

namespace xgboost {
namespace common {

void Monitor::Start(std::string const &name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
statistics_map_[name].timer.Start();
auto &stats = statistics_map_[name];
stats.timer.Start();
#if defined(XGBOOST_USE_NVTX)
std::string nvtx_name = label_ + "::" + name;
stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str());
#endif // defined(XGBOOST_USE_NVTX)
}
}

@@ -24,6 +33,9 @@ void Monitor::Stop(const std::string &name) {
auto &stats = statistics_map_[name];
stats.timer.Stop();
stats.count++;
#if defined(XGBOOST_USE_NVTX)
nvtxRangeEnd(stats.nvtx_id);
#endif // defined(XGBOOST_USE_NVTX)
}
}
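
Usage note: Start()/Stop() now replace the CUDA-specific StartCuda()/StopCuda() pair (removed from src/common/timer.h below) and, when the library is built with USE_NVTX, also open and close an NVTX range named "<label>::<name>"; as the hunk above shows, the range is only emitted when verbose logging is at the debug level. A small illustrative sketch of the call pattern, mirroring how updater_gpu_hist.cu uses the monitor elsewhere in this diff (the label and name here are made up):

#include "../common/timer.h"  // xgboost::common::Monitor; path depends on the caller's location

void ExampleStep() {
  xgboost::common::Monitor monitor;
  monitor.Init("example");     // label_; NVTX ranges become "example::<name>"
  monitor.Start("BuildHist");  // starts the timer and, under USE_NVTX, an NVTX range
  // ... the work being timed ...
  monitor.Stop("BuildHist");   // stops the timer, bumps the call count, ends the range
}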

38 changes: 0 additions & 38 deletions src/common/timer.cu

This file was deleted.

2 changes: 0 additions & 2 deletions src/common/timer.h
@@ -82,8 +82,6 @@ struct Monitor {
void Init(std::string label) { this->label_ = label; }
void Start(const std::string &name);
void Stop(const std::string &name);
void StartCuda(const std::string &name);
void StopCuda(const std::string &name);
};
} // namespace common
} // namespace xgboost
26 changes: 13 additions & 13 deletions src/data/ellpack_page.cu
@@ -77,9 +77,9 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(device));

monitor_.StartCuda("InitCompressedData");
this->InitCompressedData(device);
monitor_.StopCuda("InitCompressedData");
monitor_.Start("InitCompressedData");
InitCompressedData(device);
monitor_.Stop("InitCompressedData");
}

EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
@@ -101,21 +101,21 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)

n_rows = dmat->Info().num_row_;

monitor_.StartCuda("Quantiles");
monitor_.Start("Quantiles");
// Create the quantile sketches for the dmatrix and initialize HistogramCuts.
row_stride = GetRowStride(dmat);
cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
monitor_.StopCuda("Quantiles");
monitor_.Stop("Quantiles");

monitor_.StartCuda("InitCompressedData");
monitor_.Start("InitCompressedData");
InitCompressedData(param.gpu_id);
monitor_.StopCuda("InitCompressedData");
monitor_.Stop("InitCompressedData");

monitor_.StartCuda("BinningCompression");
monitor_.Start("BinningCompression");
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
CreateHistIndices(param.gpu_id, batch);
}
monitor_.StopCuda("BinningCompression");
monitor_.Stop("BinningCompression");
}

template <typename AdapterBatchT>
@@ -324,15 +324,15 @@ struct CopyPage {

// Copy the data from the given EllpackPage to the current page.
size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
monitor_.StartCuda("Copy");
monitor_.Start("Copy");
size_t num_elements = page->n_rows * page->row_stride;
CHECK_EQ(row_stride, page->row_stride);
CHECK_EQ(NumSymbols(), page->NumSymbols());
CHECK_GE(n_rows * row_stride, offset + num_elements);
gidx_buffer.SetDevice(device);
page->gidx_buffer.SetDevice(device);
dh::LaunchN(device, num_elements, CopyPage(this, page, offset));
monitor_.StopCuda("Copy");
monitor_.Stop("Copy");
return num_elements;
}

@@ -381,14 +381,14 @@ struct CompactPage {
// Compacts the data from the given EllpackPage into the current page.
void EllpackPageImpl::Compact(int device, EllpackPageImpl* page,
common::Span<size_t> row_indexes) {
monitor_.StartCuda("Compact");
monitor_.Start("Compact");
CHECK_EQ(row_stride, page->row_stride);
CHECK_EQ(NumSymbols(), page->NumSymbols());
CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
gidx_buffer.SetDevice(device);
page->gidx_buffer.SetDevice(device);
dh::LaunchN(device, page->n_rows, CompactPage(this, page, row_indexes));
monitor_.StopCuda("Compact");
monitor_.Stop("Compact");
}

// Initialize the buffer to stored compressed features.
8 changes: 4 additions & 4 deletions src/data/ellpack_page_source.cu
@@ -29,14 +29,14 @@ EllpackPageSource::EllpackPageSource(DMatrix* dmat,
monitor_.Init("ellpack_page_source");
dh::safe_cuda(cudaSetDevice(param.gpu_id));

monitor_.StartCuda("Quantiles");
monitor_.Start("Quantiles");
size_t row_stride = GetRowStride(dmat);
auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
monitor_.StopCuda("Quantiles");
monitor_.Stop("Quantiles");

monitor_.StartCuda("WriteEllpackPages");
monitor_.Start("WriteEllpackPages");
WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride);
monitor_.StopCuda("WriteEllpackPages");
monitor_.Stop("WriteEllpackPages");

external_prefetcher_.reset(
new ExternalMemoryPrefetcher<EllpackPage>(cache_info_));
4 changes: 2 additions & 2 deletions src/tree/gpu_hist/gradient_based_sampler.cu
@@ -354,9 +354,9 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
// Sample a DMatrix based on the given gradient pairs.
GradientBasedSample GradientBasedSampler::Sample(common::Span<GradientPair> gpair,
DMatrix* dmat) {
monitor_.StartCuda("Sample");
monitor_.Start("Sample");
GradientBasedSample sample = strategy_->Sample(gpair, dmat);
monitor_.StopCuda("Sample");
monitor_.Stop("Sample");
return sample;
}

44 changes: 22 additions & 22 deletions src/tree/updater_gpu_hist.cu
@@ -557,15 +557,15 @@ struct GPUHistMakerDevice {
}

void AllReduceHist(int nidx, dh::AllReducer* reducer) {
monitor.StartCuda("AllReduce");
monitor.Start("AllReduce");
auto d_node_hist = hist.GetNodeHistogram(nidx).data();
reducer->AllReduceSum(
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
reducer->Synchronize();

monitor.StopCuda("AllReduce");
monitor.Stop("AllReduce");
}

/**
@@ -670,13 +670,13 @@ struct GPUHistMakerDevice {
RegTree* p_tree, dh::AllReducer* reducer) {
auto& tree = *p_tree;

monitor.StartCuda("Reset");
monitor.Start("Reset");
this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_);
monitor.StopCuda("Reset");
monitor.Stop("Reset");

monitor.StartCuda("InitRoot");
monitor.Start("InitRoot");
this->InitRoot(p_tree, reducer);
monitor.StopCuda("InitRoot");
monitor.Stop("InitRoot");

auto timestamp = qexpand->size();
auto num_leaves = 1;
@@ -696,19 +696,19 @@
// Only create child entries if needed
if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
num_leaves)) {
monitor.StartCuda("UpdatePosition");
monitor.Start("UpdatePosition");
this->UpdatePosition(candidate.nid, (*p_tree)[candidate.nid]);
monitor.StopCuda("UpdatePosition");
monitor.Stop("UpdatePosition");

monitor.StartCuda("BuildHist");
monitor.Start("BuildHist");
this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer);
monitor.StopCuda("BuildHist");
monitor.Stop("BuildHist");

monitor.StartCuda("EvaluateSplits");
monitor.Start("EvaluateSplits");
auto splits = this->EvaluateLeftRightSplits(candidate, left_child_nidx,
right_child_nidx,
*p_tree);
monitor.StopCuda("EvaluateSplits");
monitor.Stop("EvaluateSplits");

qexpand->push(ExpandEntry(left_child_nidx,
tree.GetDepth(left_child_nidx), splits.at(0),
Expand All @@ -719,9 +719,9 @@ struct GPUHistMakerDevice {
}
}

monitor.StartCuda("FinalisePosition");
monitor.Start("FinalisePosition");
this->FinalisePosition(p_tree, p_fmat);
monitor.StopCuda("FinalisePosition");
monitor.Stop("FinalisePosition");
}
};

@@ -744,7 +744,7 @@ class GPUHistMakerSpecialised {

void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
const std::vector<RegTree*>& trees) {
monitor_.StartCuda("Update");
monitor_.Start("Update");

// rescale learning rate according to size of trees
float lr = param_.learning_rate;
@@ -765,7 +765,7 @@ class GPUHistMakerSpecialised {
}

param_.learning_rate = lr;
monitor_.StopCuda("Update");
monitor_.Stop("Update");
}

void InitDataOnce(DMatrix* dmat) {
@@ -800,9 +800,9 @@

void InitData(DMatrix* dmat) {
if (!initialised_) {
monitor_.StartCuda("InitDataOnce");
monitor_.Start("InitDataOnce");
this->InitDataOnce(dmat);
monitor_.StopCuda("InitDataOnce");
monitor_.Stop("InitDataOnce");
}
}

@@ -823,9 +823,9 @@

void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
RegTree* p_tree) {
monitor_.StartCuda("InitData");
monitor_.Start("InitData");
this->InitData(p_fmat);
monitor_.StopCuda("InitData");
monitor_.Stop("InitData");

gpair->SetDevice(device_);
maker->UpdateTree(gpair, p_fmat, p_tree, &reducer_);
@@ -835,10 +835,10 @@
if (maker == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
return false;
}
monitor_.StartCuda("UpdatePredictionCache");
monitor_.Start("UpdatePredictionCache");
p_out_preds->SetDevice(device_);
maker->UpdatePredictionCache(p_out_preds->DevicePointer());
monitor_.StopCuda("UpdatePredictionCache");
monitor_.Stop("UpdatePredictionCache");
return true;
}

4 changes: 2 additions & 2 deletions tests/cpp/CMakeLists.txt
@@ -40,9 +40,9 @@ if (USE_CUDA)
endif (USE_NCCL)

if (USE_NVTX)
target_include_directories(testxgboost PRIVATE "${NVTX_HEADER_DIR}")
target_compile_definitions(testxgboost PRIVATE -DXGBOOST_USE_NVTX=1)
enable_nvtx(testxgboost)
endif (USE_NVTX)

if (MSVC)
target_compile_options(testxgboost PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>