Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use cuda virtual memory management and merge blocks #36189

Merged
merged 44 commits into from
Nov 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
12bba85
Use cuda virtual memory management and merge blocks, test=develop
wanghuancoder Sep 28, 2021
4ca9d2f
refine, test=develop
wanghuancoder Sep 28, 2021
1fa7328
refine, test=develop
wanghuancoder Sep 28, 2021
3544756
refine, test=develop
wanghuancoder Sep 28, 2021
ae06a0b
refine, test=develop
wanghuancoder Sep 28, 2021
97234d4
refine, test=develop
wanghuancoder Sep 28, 2021
41a4b97
refine, test=develop
wanghuancoder Sep 29, 2021
a21c982
refine, test=develop
wanghuancoder Sep 29, 2021
d38050b
window dll, test=develop
wanghuancoder Sep 29, 2021
5b83128
merge, test=develop
wanghuancoder Sep 29, 2021
1296daa
fix cuda error of CUDA_ERROR_NOT_INITIALIZED, test=develop
wanghuancoder Oct 8, 2021
cc368b5
use autogrowthv2 for system allocator, test=develop
wanghuancoder Oct 8, 2021
c3889e7
remove ~CUDAVirtualMemAllocator(), test=develop
wanghuancoder Oct 8, 2021
4d8cfc1
refine, test=develop
wanghuancoder Oct 9, 2021
7863dfb
fix cuda error of CUDA_ERROR_NOT_INITIALIZED, test=develop
wanghuancoder Oct 9, 2021
34983a8
fix cuda error of CUDA_ERROR_NOT_INITIALIZED, test=develop
wanghuancoder Oct 9, 2021
c53a782
fix bug, test=develop
wanghuancoder Oct 9, 2021
8208d51
revert system allocator, test =develop
wanghuancoder Oct 9, 2021
52d021f
revert multiprocessing, test=develop
wanghuancoder Oct 11, 2021
c01bf0a
fix AutoGrowthBestFitAllocatorV2 mutxt, test=develop
wanghuancoder Oct 11, 2021
a257984
catch cudaErrorInitializationError when create allocator, test=develop
wanghuancoder Oct 11, 2021
629a80c
Merge branch 'develop' into auto_growth_v2
wanghuancoder Oct 12, 2021
13d4285
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
wanghuancoder Oct 14, 2021
ec85a0f
fix cuMemSetAccess use, test=develop
wanghuancoder Oct 14, 2021
329c568
refine cuda api use, test=develop
wanghuancoder Oct 14, 2021
3ba1985
refine, test=develop
wanghuancoder Oct 14, 2021
f253ffb
for test, test=develop
wanghuancoder Oct 15, 2021
8312f3c
for test, test=develop
wanghuancoder Oct 18, 2021
7f1891e
switch to v2, test=develop
wanghuancoder Oct 18, 2021
ce93e11
refine virtual allocator, test=develop
wanghuancoder Oct 19, 2021
6ab7de3
Record cuMemCreate and cuMemRelease, test=develop
wanghuancoder Oct 19, 2021
1d90246
refine, test=develop
wanghuancoder Oct 19, 2021
b9c04cc
avoid out of bounds, test=develop
wanghuancoder Oct 19, 2021
5fca3b0
rename allocator, test=develop
wanghuancoder Oct 20, 2021
f164521
refine, test=develop
wanghuancoder Oct 21, 2021
55ef100
use PADDLE_ENFORCE_CUDA_SUCCESS, test=develop
wanghuancoder Oct 22, 2021
fa077f3
for test,test=develop
wanghuancoder Oct 22, 2021
b82a059
refine, test=develop
wanghuancoder Oct 22, 2021
a4db9cb
refine, test=develop
wanghuancoder Nov 1, 2021
7eadf41
refine, test=develop
wanghuancoder Nov 2, 2021
4b20091
refine, test=develop
wanghuancoder Nov 3, 2021
84211e5
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
wanghuancoder Nov 3, 2021
e8469f4
refine, test=develop
wanghuancoder Nov 5, 2021
f7df2b8
refine, test=develop
wanghuancoder Nov 5, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ if (WITH_GPU)
nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
if(CUDA_VERSION GREATER_EQUAL 10.2)
nv_library(cuda_virtual_mem_allocator SRCS cuda_virtual_mem_allocator.cc DEPS dynload_cuda)
endif()
endif()

if (WITH_ROCM)
Expand All @@ -36,6 +39,9 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)

if (WITH_GPU OR WITH_ROCM)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
if(CUDA_VERSION GREATER_EQUAL 10.2)
list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
endif()
elseif(WITH_XPU)
set(AllocatorFacadeDeps xpu_info)
elseif(WITH_ASCEND)
Expand Down Expand Up @@ -72,7 +78,7 @@ else()
cpu_allocator)
endif()

list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator)
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator virtual_memory_auto_growth_best_fit_allocator best_fit_allocator)

if (WITH_ASCEND_CL)
list(APPEND AllocatorFacadeDeps npu_pinned_allocator)
Expand Down Expand Up @@ -107,6 +113,8 @@ cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc
cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator)

cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)

if(NOT WIN32)
cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator)
cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator)
Expand Down
44 changes: 44 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#if CUDA_VERSION >= 10020
#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_graph.h"
#endif
Expand All @@ -51,6 +56,9 @@ PADDLE_DEFINE_EXPORTED_bool(
"Whether to use system allocator to allocate CPU and GPU memory. "
"Only used for unittests.");

PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
"Use VirtualMemoryAutoGrowthBestFitAllocator.");

DECLARE_string(allocator_strategy);

namespace paddle {
Expand Down Expand Up @@ -258,6 +266,40 @@ class AllocatorFacadePrivate {

void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
bool allow_free_idle_chunk) {
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
#endif

#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10020
CUdevice device;
int val;
try {
PADDLE_ENFORCE_CUDA_SUCCESS(
paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));

PADDLE_ENFORCE_CUDA_SUCCESS(
paddle::platform::dynload::cuDeviceGetAttribute(
&val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
device));
} catch (...) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in which case it may raise exception?

Copy link
Contributor Author

@wanghuancoder wanghuancoder Oct 22, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, plz add comments on that.

val = 0;
}

if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
allocators_[p] =
std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), p);
} else {
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
}

#else
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
auto alignment = platform::GpuMinChunkSize();
bool need_addr_align = true;
Expand Down Expand Up @@ -292,6 +334,8 @@ class AllocatorFacadePrivate {
}
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
underlying_allocator, alignment, 0, allow_free_idle_chunk);
#endif
#endif
}
#endif

Expand Down
225 changes: 225 additions & 0 deletions paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif

#include <string>
#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
#include "paddle/fluid/platform/enforce.h"

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#if CUDA_VERSION >= 10020

namespace paddle {
namespace memory {
namespace allocation {

// Sets up a virtual-memory based allocator for the GPU described by `place`.
//
// The constructor
//   1. records the properties used for every physical chunk (cuMemCreate),
//   2. collects one access descriptor per device that is allowed to touch the
//      memory (the owning device plus every peer-capable device),
//   3. computes the allocation granularity (max over all devices), and
//   4. reserves a contiguous virtual address range as large as the total
//      memory of the GPU, rounded up to that granularity.
//
// Throws (via PADDLE_ENFORCE_CUDA_SUCCESS) if any CUDA call fails.
CUDAVirtualMemAllocator::CUDAVirtualMemAllocator(
    const platform::CUDAPlace& place)
    : place_(place) {
  // The device count is needed by both loops below; query it once.
  const int device_count = platform::GetCUDADeviceCount();

  // Properties common to all chunks. This structure describes the physical
  // location where memory will be allocated via cuMemCreate, along with
  // additional properties. Here the allocation is pinned device memory local
  // to the device of `place`.
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = place.device;
  prop_ = prop;

  // Prepare the access descriptor array indicating where and how the backings
  // should be visible. Only devices that can actually access the memory get a
  // descriptor: leaving a zero-initialized slot for a non-peer device would
  // hand cuMemSetAccess a descriptor with an invalid location type.
  access_desc_.clear();
  access_desc_.reserve(device_count);
  for (int dev_id = 0; dev_id < device_count; ++dev_id) {
    if (place.device != dev_id) {
      int capable = 0;
      PADDLE_ENFORCE_CUDA_SUCCESS(
          cudaDeviceCanAccessPeer(&capable, place.device, dev_id));
      if (!capable) {
        continue;
      }
    }

    CUmemAccessDesc desc = {};
    // Specify which device we are adding mappings for.
    desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    desc.location.id = dev_id;
    // Specify both read and write access.
    desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    access_desc_.push_back(desc);
  }

  // Get the minimum granularity needed for all devices
  // (the max of the minimum granularity of each device).
  // Note: only the local `prop` is modified here; `prop_` keeps the location
  // of the owning device.
  granularity_ = 0;
  for (int dev_id = 0; dev_id < device_count; ++dev_id) {
    size_t granularity = 0;
    prop.location.id = dev_id;
    PADDLE_ENFORCE_CUDA_SUCCESS(
        paddle::platform::dynload::cuMemGetAllocationGranularity(
            &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    granularity_ = std::max(granularity, granularity_);
  }

  size_t actual_avail = 0;
  size_t actual_total = 0;
  paddle::platform::CUDADeviceGuard guard(place.device);
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total));

  virtual_mem_size_ = AlignedSize(actual_total, granularity_);

  // Reserve the required contiguous virtual address space for the
  // allocations. The maximum amount of video memory we can ever request is
  // the video memory size of the GPU, so the reserved virtual address space
  // is as large as the GPU video memory.
  PADDLE_ENFORCE_CUDA_SUCCESS(paddle::platform::dynload::cuMemAddressReserve(
      &virtual_mem_base_, virtual_mem_size_, 0, 0, 0));

  virtual_mem_alloced_offset_ = 0;
}

// Always reports false: AllocateImpl/FreeImpl update the allocator's
// bookkeeping members without taking any lock.
bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const {
  return false;
}

// Unmaps the virtual range backing `allocation`, releases its physical
// memory handle, drops the bookkeeping entry and deletes `allocation`.
//
// Throws if `allocation` belongs to another place, if its address was not
// handed out by this allocator, or if unmapping/releasing fails with anything
// other than CUDA_ERROR_DEINITIALIZED.
void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) {
  PADDLE_ENFORCE_EQ(
      BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_,
      platform::errors::PermissionDenied(
          "GPU memory is freed in incorrect device. This may be a bug"));

  // Look up the (handle, size) record registered when the range was mapped.
  const auto mapped_addr = reinterpret_cast<CUdeviceptr>(allocation->ptr());
  auto entry = virtual_2_physical_map_.find(mapped_addr);
  if (entry == virtual_2_physical_map_.end()) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "Can not find virtual memory address at %s", allocation->ptr()));
  }

  // Temporarily switch to the owning device when the caller is on another
  // one. The runtime return codes are intentionally not checked here.
  int caller_device;
  cudaGetDevice(&caller_device);
  const bool switch_device = (caller_device != place_.device);
  if (switch_device) {
    cudaSetDevice(place_.device);
  }

  // CUDA_ERROR_DEINITIALIZED from the unmap is tolerated: in that case the
  // physical handle is not released either and no error is raised.
  auto unmap_status = paddle::platform::dynload::cuMemUnmap(
      entry->first, entry->second.second);
  if (unmap_status != CUDA_ERROR_DEINITIALIZED) {
    PADDLE_ENFORCE_CUDA_SUCCESS(unmap_status);
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::RecordedCuMemRelease(
        entry->second.first, entry->second.second, place_.device));
  }

  if (switch_device) {
    cudaSetDevice(caller_device);
  }

  virtual_2_physical_map_.erase(entry);

  delete allocation;
}

// Allocates `size` bytes (rounded up to the allocation granularity) of GPU
// memory and maps them at the next free offset of the reserved virtual range.
//
// Steps: create the physical backing (cuMemCreate), map it into the virtual
// range (cuMemMap), grant access to the configured devices (cuMemSetAccess)
// and record the (handle, size) pair so FreeImpl can undo the mapping.
//
// Returns a newly created Allocation owned by the caller.
// Throws a bad-alloc error when the reserved virtual range or the physical
// memory is exhausted, and a CUDA error for any other driver failure.
Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) {
  size = AlignedSize(size, granularity_);

  // Virtual addresses are handed out linearly from the reserved range.
  CUdeviceptr ptr = virtual_mem_base_ + virtual_mem_alloced_offset_;

  if (ptr + size > virtual_mem_base_ + virtual_mem_size_) {
    // The argument list matches the five conversion specifiers exactly.
    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
        "\n\nOut of memory error on GPU Virtual Memory %d. "
        "Cannot allocate %s memory on GPU Virtual Memory %d, %s memory has "
        "been allocated and "
        "available memory is only %s.\n\n"
        "Please decrease the batch size of your model.\n\n",
        place_.device, string::HumanReadableSize(size), place_.device,
        string::HumanReadableSize(virtual_mem_alloced_offset_),
        string::HumanReadableSize(virtual_mem_size_ -
                                  virtual_mem_alloced_offset_)));
    return nullptr;
  }

  CUmemGenericAllocationHandle handle;

  paddle::platform::CUDADeviceGuard guard(place_.device);

  // Create physical memory backing allocation.
  auto result =
      platform::RecordedCuMemCreate(&handle, size, &prop_, 0, place_.device);

  if (result != CUDA_SUCCESS) {
    if (result == CUDA_ERROR_OUT_OF_MEMORY) {
      size_t actual_avail = 0;
      size_t actual_total = 0;
      PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total));
      size_t actual_allocated = actual_total - actual_avail;

      PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
          "\n\nOut of memory error on GPU %d. "
          "Cannot allocate %s memory on GPU %d, %s memory has been allocated "
          "and "
          "available memory is only %s.\n\n"
          "Please check whether there is any other process using GPU %d.\n"
          "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
          "2. If no, please decrease the batch size of your model.\n\n",
          place_.device, string::HumanReadableSize(size), place_.device,
          string::HumanReadableSize(actual_allocated),
          string::HumanReadableSize(actual_avail), place_.device));
    } else {
      PADDLE_ENFORCE_CUDA_SUCCESS(result);
    }
    return nullptr;
  }

  // Assign the chunk to the appropriate VA range.
  // After mapping the memory, it can be referenced by virtual address.
  // The allocation will be kept live until it is unmapped.
  result = paddle::platform::dynload::cuMemMap(ptr, size, 0, handle, 0);

  if (result != CUDA_SUCCESS) {
    // Best-effort cleanup of the physical chunk before reporting the error.
    platform::RecordedCuMemRelease(handle, size, place_.device);
    PADDLE_ENFORCE_CUDA_SUCCESS(result);
    return nullptr;
  }

  // Apply the access descriptors to the whole VA range.
  result = paddle::platform::dynload::cuMemSetAccess(
      ptr, size, access_desc_.data(), access_desc_.size());

  if (result != CUDA_SUCCESS) {
    // Best-effort rollback of the mapping and the physical chunk.
    paddle::platform::dynload::cuMemUnmap(ptr, size);
    platform::RecordedCuMemRelease(handle, size, place_.device);
    PADDLE_ENFORCE_CUDA_SUCCESS(result);
    return nullptr;
  }

  // Remember which physical chunk backs this address so FreeImpl can find it.
  virtual_2_physical_map_.emplace(ptr, std::make_pair(handle, size));

  virtual_mem_alloced_offset_ += size;

  return new Allocation(reinterpret_cast<void*>(ptr), size,
                        platform::Place(place_));
}

} // namespace allocation
} // namespace memory
} // namespace paddle

#endif
62 changes: 62 additions & 0 deletions paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif

#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"

#if CUDA_VERSION >= 10020

namespace paddle {
namespace memory {
namespace allocation {

// Allocate memory using NVIDIA's virtual memory management technology
class CUDAVirtualMemAllocator : public Allocator {
public:
explicit CUDAVirtualMemAllocator(const platform::CUDAPlace& place);

bool IsAllocThreadSafe() const override;

protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;

private:
platform::CUDAPlace place_;

CUdeviceptr virtual_mem_base_;
size_t virtual_mem_size_;
size_t virtual_mem_alloced_offset_;
size_t granularity_;

CUmemAllocationProp prop_;
std::vector<CUmemAccessDesc> access_desc_;

std::map<CUdeviceptr, std::pair<CUmemGenericAllocationHandle, size_t>>
virtual_2_physical_map_;
};

} // namespace allocation
} // namespace memory
} // namespace paddle

#endif
Loading