From eaf2a00b5c83c48d026bdae4108cffe5607b6185 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 6 May 2020 22:54:24 +0800 Subject: [PATCH] Enhance nvtx support. (#5636) --- CMakeLists.txt | 7 ++++ cmake/Utils.cmake | 7 ++++ cmake/modules/FindNVTX.cmake | 26 ++++++++++++ src/CMakeLists.txt | 3 +- src/common/timer.cc | 14 ++++++- src/common/timer.cu | 38 ------------------ src/common/timer.h | 2 - src/data/ellpack_page.cu | 26 ++++++------ src/data/ellpack_page_source.cu | 8 ++-- src/tree/gpu_hist/gradient_based_sampler.cu | 4 +- src/tree/updater_gpu_hist.cu | 44 ++++++++++----------- tests/cpp/CMakeLists.txt | 4 +- tests/cpp/data/test_sparse_page_dmatrix.cc | 2 +- 13 files changed, 98 insertions(+), 87 deletions(-) create mode 100644 cmake/modules/FindNVTX.cmake delete mode 100644 src/common/timer.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 2eb445e884bf..56c0fd7de0ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -158,6 +158,10 @@ else (BUILD_STATIC_LIB) add_library(xgboost SHARED ${XGBOOST_OBJ_SOURCES}) endif (BUILD_STATIC_LIB) +if (USE_NVTX) + enable_nvtx(xgboost) +endif (USE_NVTX) + #-- Hide all C++ symbols if (HIDE_CXX_SYMBOLS) set_target_properties(objxgboost PROPERTIES CXX_VISIBILITY_PRESET hidden) @@ -178,6 +182,9 @@ endif (JVM_BINDINGS) #-- CLI for xgboost add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc ${XGBOOST_OBJ_SOURCES}) +if (USE_NVTX) + enable_nvtx(runxgboost) +endif (USE_NVTX) target_include_directories(runxgboost PRIVATE diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 4a9e63a77aba..6105330b11f6 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -141,3 +141,10 @@ DESTINATION \"${build_dir}/bak\")") install(CODE "file(RENAME \"${build_dir}/bak/cmake_install.cmake\" \"${build_dir}/R-package/cmake_install.cmake\")") endfunction(setup_rpackage_install_target) + +macro(enable_nvtx target) + find_package(NVTX REQUIRED) + target_include_directories(${target} PRIVATE "${NVTX_INCLUDE_DIR}") + target_link_libraries(${target} PRIVATE "${NVTX_LIBRARY}") + target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1) +endmacro() diff --git a/cmake/modules/FindNVTX.cmake b/cmake/modules/FindNVTX.cmake new file mode 100644 index 000000000000..173e255c8951 --- /dev/null +++ b/cmake/modules/FindNVTX.cmake @@ -0,0 +1,26 @@ +if (NVTX_LIBRARY) + unset(NVTX_LIBRARY CACHE) +endif (NVTX_LIBRARY) + +set(NVTX_LIB_NAME nvToolsExt) + + +find_path(NVTX_INCLUDE_DIR + NAMES nvToolsExt.h + PATHS ${CUDA_HOME}/include ${CUDA_INCLUDE} /usr/local/cuda/include) + + +find_library(NVTX_LIBRARY + NAMES nvToolsExt + PATHS ${CUDA_HOME}/lib64 /usr/local/cuda/lib64) + +message(STATUS "Using nvtx library: ${NVTX_LIBRARY}") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NVTX DEFAULT_MSG + NVTX_INCLUDE_DIR NVTX_LIBRARY) + +mark_as_advanced( + NVTX_INCLUDE_DIR + NVTX_LIBRARY +) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9a60603168a4..2dff25cee56a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -25,8 +25,7 @@ if (USE_CUDA) endif (USE_NCCL) if (USE_NVTX) - target_include_directories(objxgboost PRIVATE "${NVTX_HEADER_DIR}") - target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_NVTX=1) + enable_nvtx(objxgboost) endif (USE_NVTX) target_compile_options(objxgboost PRIVATE diff --git a/src/common/timer.cc b/src/common/timer.cc index 2cc140496951..49d08a35cae6 100644 --- a/src/common/timer.cc +++ b/src/common/timer.cc @@ -10,12 +10,21 @@ #include "timer.h" #include "xgboost/json.h" +#if defined(XGBOOST_USE_NVTX) +#include +#endif // defined(XGBOOST_USE_NVTX) + namespace xgboost { namespace common { void Monitor::Start(std::string const &name) { if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { - statistics_map_[name].timer.Start(); + auto &stats = statistics_map_[name]; + stats.timer.Start(); +#if defined(XGBOOST_USE_NVTX) + std::string nvtx_name = label_ + "::" + name; + stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str()); +#endif // defined(XGBOOST_USE_NVTX) } } @@ -24,6 +33,9 @@ void Monitor::Stop(const std::string &name) { auto &stats = statistics_map_[name]; stats.timer.Stop(); stats.count++; +#if defined(XGBOOST_USE_NVTX) + nvtxRangeEnd(stats.nvtx_id); +#endif // defined(XGBOOST_USE_NVTX) } } diff --git a/src/common/timer.cu b/src/common/timer.cu deleted file mode 100644 index 8b8e54bc31c9..000000000000 --- a/src/common/timer.cu +++ /dev/null @@ -1,38 +0,0 @@ -/*! - * Copyright by Contributors 2019 - */ -#if defined(XGBOOST_USE_NVTX) -#include -#endif // defined(XGBOOST_USE_NVTX) - -#include - -#include "xgboost/logging.h" -#include "device_helpers.cuh" -#include "timer.h" - -namespace xgboost { -namespace common { - -void Monitor::StartCuda(const std::string& name) { - if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { - auto &stats = statistics_map_[name]; - stats.timer.Start(); -#if defined(XGBOOST_USE_NVTX) - stats.nvtx_id = nvtxRangeStartA(name.c_str()); -#endif // defined(XGBOOST_USE_NVTX) - } -} - -void Monitor::StopCuda(const std::string& name) { - if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { - auto &stats = statistics_map_[name]; - stats.timer.Stop(); - stats.count++; -#if defined(XGBOOST_USE_NVTX) - nvtxRangeEnd(stats.nvtx_id); -#endif // defined(XGBOOST_USE_NVTX) - } -} -} // namespace common -} // namespace xgboost diff --git a/src/common/timer.h b/src/common/timer.h index bc71dca8e6d8..b2dc2a428d13 100644 --- a/src/common/timer.h +++ b/src/common/timer.h @@ -82,8 +82,6 @@ struct Monitor { void Init(std::string label) { this->label_ = label; } void Start(const std::string &name); void Stop(const std::string &name); - void StartCuda(const std::string &name); - void StopCuda(const std::string &name); }; } // namespace common } // namespace xgboost diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index cd9b1360466d..9adffb3a7ef2 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -77,9 +77,9 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, monitor_.Init("ellpack_page"); dh::safe_cuda(cudaSetDevice(device)); - monitor_.StartCuda("InitCompressedData"); - this->InitCompressedData(device); - monitor_.StopCuda("InitCompressedData"); + monitor_.Start("InitCompressedData"); + InitCompressedData(device); + monitor_.Stop("InitCompressedData"); } EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, @@ -101,21 +101,21 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) n_rows = dmat->Info().num_row_; - monitor_.StartCuda("Quantiles"); + monitor_.Start("Quantiles"); // Create the quantile sketches for the dmatrix and initialize HistogramCuts. row_stride = GetRowStride(dmat); cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin); - monitor_.StopCuda("Quantiles"); + monitor_.Stop("Quantiles"); - monitor_.StartCuda("InitCompressedData"); + monitor_.Start("InitCompressedData"); InitCompressedData(param.gpu_id); - monitor_.StopCuda("InitCompressedData"); + monitor_.Stop("InitCompressedData"); - monitor_.StartCuda("BinningCompression"); + monitor_.Start("BinningCompression"); for (const auto& batch : dmat->GetBatches()) { CreateHistIndices(param.gpu_id, batch); } - monitor_.StopCuda("BinningCompression"); + monitor_.Stop("BinningCompression"); } template @@ -324,7 +324,7 @@ struct CopyPage { // Copy the data from the given EllpackPage to the current page. size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) { - monitor_.StartCuda("Copy"); + monitor_.Start("Copy"); size_t num_elements = page->n_rows * page->row_stride; CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(NumSymbols(), page->NumSymbols()); @@ -332,7 +332,7 @@ size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) { gidx_buffer.SetDevice(device); page->gidx_buffer.SetDevice(device); dh::LaunchN(device, num_elements, CopyPage(this, page, offset)); - monitor_.StopCuda("Copy"); + monitor_.Stop("Copy"); return num_elements; } @@ -381,14 +381,14 @@ struct CompactPage { // Compacts the data from the given EllpackPage into the current page. void EllpackPageImpl::Compact(int device, EllpackPageImpl* page, common::Span row_indexes) { - monitor_.StartCuda("Compact"); + monitor_.Start("Compact"); CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(NumSymbols(), page->NumSymbols()); CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size()); gidx_buffer.SetDevice(device); page->gidx_buffer.SetDevice(device); dh::LaunchN(device, page->n_rows, CompactPage(this, page, row_indexes)); - monitor_.StopCuda("Compact"); + monitor_.Stop("Compact"); } // Initialize the buffer to stored compressed features. diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 1b91d1ba2bb9..d23042472f9e 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -29,14 +29,14 @@ EllpackPageSource::EllpackPageSource(DMatrix* dmat, monitor_.Init("ellpack_page_source"); dh::safe_cuda(cudaSetDevice(param.gpu_id)); - monitor_.StartCuda("Quantiles"); + monitor_.Start("Quantiles"); size_t row_stride = GetRowStride(dmat); auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin); - monitor_.StopCuda("Quantiles"); + monitor_.Stop("Quantiles"); - monitor_.StartCuda("WriteEllpackPages"); + monitor_.Start("WriteEllpackPages"); WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride); - monitor_.StopCuda("WriteEllpackPages"); + monitor_.Stop("WriteEllpackPages"); external_prefetcher_.reset( new ExternalMemoryPrefetcher(cache_info_)); diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 6c70af90b995..eb441f39e67b 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -354,9 +354,9 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page, // Sample a DMatrix based on the given gradient pairs. GradientBasedSample GradientBasedSampler::Sample(common::Span gpair, DMatrix* dmat) { - monitor_.StartCuda("Sample"); + monitor_.Start("Sample"); GradientBasedSample sample = strategy_->Sample(gpair, dmat); - monitor_.StopCuda("Sample"); + monitor_.Stop("Sample"); return sample; } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 370f8c5f6329..69a3d79770ba 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -557,7 +557,7 @@ struct GPUHistMakerDevice { } void AllReduceHist(int nidx, dh::AllReducer* reducer) { - monitor.StartCuda("AllReduce"); + monitor.Start("AllReduce"); auto d_node_hist = hist.GetNodeHistogram(nidx).data(); reducer->AllReduceSum( reinterpret_cast(d_node_hist), @@ -565,7 +565,7 @@ struct GPUHistMakerDevice { page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); reducer->Synchronize(); - monitor.StopCuda("AllReduce"); + monitor.Stop("AllReduce"); } /** @@ -670,13 +670,13 @@ struct GPUHistMakerDevice { RegTree* p_tree, dh::AllReducer* reducer) { auto& tree = *p_tree; - monitor.StartCuda("Reset"); + monitor.Start("Reset"); this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_); - monitor.StopCuda("Reset"); + monitor.Stop("Reset"); - monitor.StartCuda("InitRoot"); + monitor.Start("InitRoot"); this->InitRoot(p_tree, reducer); - monitor.StopCuda("InitRoot"); + monitor.Stop("InitRoot"); auto timestamp = qexpand->size(); auto num_leaves = 1; @@ -696,19 +696,19 @@ struct GPUHistMakerDevice { // Only create child entries if needed if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx), num_leaves)) { - monitor.StartCuda("UpdatePosition"); + monitor.Start("UpdatePosition"); this->UpdatePosition(candidate.nid, (*p_tree)[candidate.nid]); - monitor.StopCuda("UpdatePosition"); + monitor.Stop("UpdatePosition"); - monitor.StartCuda("BuildHist"); + monitor.Start("BuildHist"); this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); - monitor.StopCuda("BuildHist"); + monitor.Stop("BuildHist"); - monitor.StartCuda("EvaluateSplits"); + monitor.Start("EvaluateSplits"); auto splits = this->EvaluateLeftRightSplits(candidate, left_child_nidx, right_child_nidx, *p_tree); - monitor.StopCuda("EvaluateSplits"); + monitor.Stop("EvaluateSplits"); qexpand->push(ExpandEntry(left_child_nidx, tree.GetDepth(left_child_nidx), splits.at(0), @@ -719,9 +719,9 @@ struct GPUHistMakerDevice { } } - monitor.StartCuda("FinalisePosition"); + monitor.Start("FinalisePosition"); this->FinalisePosition(p_tree, p_fmat); - monitor.StopCuda("FinalisePosition"); + monitor.Stop("FinalisePosition"); } }; @@ -744,7 +744,7 @@ class GPUHistMakerSpecialised { void Update(HostDeviceVector* gpair, DMatrix* dmat, const std::vector& trees) { - monitor_.StartCuda("Update"); + monitor_.Start("Update"); // rescale learning rate according to size of trees float lr = param_.learning_rate; @@ -765,7 +765,7 @@ class GPUHistMakerSpecialised { } param_.learning_rate = lr; - monitor_.StopCuda("Update"); + monitor_.Stop("Update"); } void InitDataOnce(DMatrix* dmat) { @@ -800,9 +800,9 @@ class GPUHistMakerSpecialised { void InitData(DMatrix* dmat) { if (!initialised_) { - monitor_.StartCuda("InitDataOnce"); + monitor_.Start("InitDataOnce"); this->InitDataOnce(dmat); - monitor_.StopCuda("InitDataOnce"); + monitor_.Stop("InitDataOnce"); } } @@ -823,9 +823,9 @@ class GPUHistMakerSpecialised { void UpdateTree(HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree) { - monitor_.StartCuda("InitData"); + monitor_.Start("InitData"); this->InitData(p_fmat); - monitor_.StopCuda("InitData"); + monitor_.Stop("InitData"); gpair->SetDevice(device_); maker->UpdateTree(gpair, p_fmat, p_tree, &reducer_); @@ -835,10 +835,10 @@ class GPUHistMakerSpecialised { if (maker == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) { return false; } - monitor_.StartCuda("UpdatePredictionCache"); + monitor_.Start("UpdatePredictionCache"); p_out_preds->SetDevice(device_); maker->UpdatePredictionCache(p_out_preds->DevicePointer()); - monitor_.StopCuda("UpdatePredictionCache"); + monitor_.Stop("UpdatePredictionCache"); return true; } diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index 98a81ea7f432..53f3e8809660 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -40,9 +40,9 @@ if (USE_CUDA) endif (USE_NCCL) if (USE_NVTX) - target_include_directories(testxgboost PRIVATE "${NVTX_HEADER_DIR}") - target_compile_definitions(testxgboost PRIVATE -DXGBOOST_USE_NVTX=1) + enable_nvtx(testxgboost) endif (USE_NVTX) + if (MSVC) target_compile_options(testxgboost PRIVATE $<$:-Xcompiler=/utf-8> diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index 341630016a15..5c719f78ae4c 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -94,7 +94,7 @@ TEST(SparsePageDMatrix, ThreadSafetyException) { bool exception = false; int threads = 1000; -#pragma omp parallel for +#pragma omp parallel for num_threads(threads) for (auto i = 0; i < threads; i++) { try { auto iter = dmat->GetBatches().begin();