From 72ef553550521dd0512c2e98d4c2b3c211f86e50 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Thu, 24 Sep 2020 11:04:50 -0700 Subject: [PATCH] Fall back to CUB allocator if RMM memory pool is not set up (#6150) * Fall back to CUB allocator if RMM memory pool is not set up * Fix build * Prevent memory leak * Add note about lack of memory initialisation * Add check for other fast allocators * Set use_cub_allocator_ to true when RMM is not enabled * Fix clang-tidy * Do not demangle symbol; add check to ensure Linux+Clang/GCC combo --- CMakeLists.txt | 6 +++ src/common/device_helpers.cuh | 76 ++++++++++++++++++++--------------- 2 files changed, 49 insertions(+), 33 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45992046b86d..f6b821de3ee8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,6 +90,12 @@ endif (USE_AVX) if (PLUGIN_RMM AND NOT (USE_CUDA)) message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.") endif (PLUGIN_RMM AND NOT (USE_CUDA)) +if (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))) + message(SEND_ERROR "`PLUGIN_RMM` must be used with GCC or Clang compiler.") +endif (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))) +if (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")) + message(SEND_ERROR "`PLUGIN_RMM` must be used with Linux.") +endif (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")) if (ENABLE_ALL_WARNINGS) if ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) message(SEND_ERROR "ENABLE_ALL_WARNINGS is only available for Clang and GCC.") diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 70b3895fc7fe..471ec31f42ac 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -402,7 +402,7 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { } void deallocate(pointer ptr, size_t n) { // NOLINT GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); - return SuperT::deallocate(ptr, n); + SuperT::deallocate(ptr, n); } #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 XGBDefaultDeviceAllocatorImpl() @@ -410,49 +410,59 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 }; -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -template -using XGBCachingDeviceAllocatorImpl = XGBDefaultDeviceAllocatorImpl; -#else /** - * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end and logs - * allocations if verbose. Does not initialise memory on construction. + * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless + * RMM pool allocator is enabled. Does not initialise memory on construction. */ template -struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator { +struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { + using SuperT = XGBBaseDeviceAllocator; using pointer = thrust::device_ptr; // NOLINT template struct rebind // NOLINT { using other = XGBCachingDeviceAllocatorImpl; // NOLINT }; - cub::CachingDeviceAllocator& GetGlobalCachingAllocator () - { - // Configure allocator with maximum cached bin size of ~1GB and no limit on - // maximum cached bytes - static cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29); - return *allocator; - } - pointer allocate(size_t n) { // NOLINT - T *ptr; - GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast(&ptr), - n * sizeof(T)); - pointer thrust_ptr(ptr); - GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); - return thrust_ptr; - } - void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); - GetGlobalCachingAllocator().DeviceFree(ptr.get()); - } - - __host__ __device__ - void construct(T *) // NOLINT - { - // no-op + cub::CachingDeviceAllocator& GetGlobalCachingAllocator() { + // Configure allocator with maximum cached bin size of ~1GB and no limit on + // maximum cached bytes + static cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29); + return *allocator; + } + pointer allocate(size_t n) { // NOLINT + pointer ptr; + if (use_cub_allocator_) { + T* raw_ptr{nullptr}; + GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast(&raw_ptr), n * sizeof(T)); + ptr = pointer(raw_ptr); + } else { + ptr = SuperT::allocate(n); + } + GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); + return ptr; + } + void deallocate(pointer ptr, size_t n) { // NOLINT + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + if (use_cub_allocator_) { + GetGlobalCachingAllocator().DeviceFree(ptr.get()); + } else { + SuperT::deallocate(ptr, n); + } + } +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBCachingDeviceAllocatorImpl() + : SuperT(rmm::mr::get_current_device_resource(), cudaStream_t{nullptr}) { + std::string symbol{typeid(*SuperT::resource()).name()}; + if (symbol.find("pool_memory_resource") != std::string::npos + || symbol.find("binning_memory_resource") != std::string::npos + || symbol.find("arena_memory_resource") != std::string::npos) { + use_cub_allocator_ = false; + } } -}; #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + private: + bool use_cub_allocator_{true}; +}; } // namespace detail // Declare xgboost allocators