From 5c8517f5dd60b25e86fa0efcb6ce2b52633add2d Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Wed, 29 Sep 2021 12:26:09 +0000
Subject: [PATCH 1/3] do not use alignedAllocator when cuda has alignment

---
 .../memory/allocation/aligned_allocator.cc       |  1 +
 .../fluid/memory/allocation/allocator_facade.cc  | 17 ++++++++++++++++-
 .../auto_growth_best_fit_allocator.cc            | 15 +++++++++------
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc
index 1d89918bfebf6..f0b7f1a4b0d9e 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.cc
+++ b/paddle/fluid/memory/allocation/aligned_allocator.cc
@@ -20,6 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// For memory address alignment
 class AlignedAllocation : public Allocation {
  public:
  AlignedAllocation(AllocationPtr underlying_allocation, size_t offset)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 0388e2d13afb0..78d0941410f5e 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -23,6 +23,7 @@
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
 #endif
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
@@ -256,8 +257,22 @@ class AllocatorFacadePrivate {
   void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                    bool allow_free_idle_chunk) {
     auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto alignment = platform::GpuMinChunkSize();
+    const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
+    bool need_addr_align = prop.textureAlignment < alignment;
+    std::shared_ptr<Allocator> underlying_allocator{nullptr};
+    if (need_addr_align) {
+      VLOG(10) << "use AlignedAllocator with alignment: " << alignment
+               << ", textureAlignment: " << prop.textureAlignment;
+      underlying_allocator =
+          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
+    } else {
+      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment
+               << ", textureAlignment: " << prop.textureAlignment;
+      underlying_allocator = cuda_allocator;
+    }
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
+        underlying_allocator, alignment, 0, allow_free_idle_chunk);
   }
 #endif
 
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index f36d589f907fb..9f34f5198a179 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -40,14 +40,14 @@ namespace allocation {
 
 AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
     const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
     size_t chunk_size, bool allow_free_idle_chunk)
-    : underlying_allocator_(
-          std::make_shared<AlignedAllocator>(underlying_allocator, alignment)),
+    : underlying_allocator_(underlying_allocator),
       alignment_(alignment),
       chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)),
       allow_free_idle_chunk_(allow_free_idle_chunk) {}
 
-Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
-  size = AlignedSize(size, alignment_);
+Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) {
+  size_t size = AlignedSize(unaligned_size, alignment_);
+  VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size;
   std::lock_guard<SpinLock> guard(spinlock_);
   auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
@@ -57,6 +57,8 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
     free_blocks_.erase(iter);
     auto *chunk = block_it->chunk_;
     size_t remaining_size = block_it->size_ - size;
+    VLOG(10) << "Allocate " << size << " bytes from chunk size "
+             << block_it->size_ << ", remaining " << remaining_size;
     if (remaining_size == 0) {
       block_it->is_free_ = false;
     } else {
@@ -95,13 +97,14 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
     }
     blocks.emplace_back(p + remaining_size, size, false, chunk);
     block_it = --(blocks.end());
-    VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining "
-            << remaining_size;
+    VLOG(2) << "Not found and reallocate " << realloc_size << "("
+            << static_cast<void *>(p) << "), and remaining " << remaining_size;
   }
   return new BlockAllocation(block_it);
 }
 
 void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
+  VLOG(10) << "Free " << allocation->size() << " bytes";
   std::lock_guard<SpinLock> guard(spinlock_);
   auto block_it = static_cast<BlockAllocation *>(allocation)->block_it_;
   auto &blocks = block_it->chunk_->blocks_;
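PATCH 1 above changes who is responsible for alignment: AutoGrowthBestFitAllocator no longer wraps its underlying allocator in an AlignedAllocator unconditionally, and allocator_facade adds the wrapper only when the device's textureAlignment guarantee is smaller than the requested alignment (GpuMinChunkSize()). The wrapper itself is just pointer rounding over an over-sized allocation; below is a minimal stand-alone sketch of that arithmetic, assuming power-of-two alignments. The helper names are illustrative, not Paddle's API.

    #include <cstddef>
    #include <cstdint>

    // Round a requested size up to the next multiple of `alignment`,
    // as AutoGrowthBestFitAllocator does via AlignedSize().
    static size_t RoundUpSize(size_t size, size_t alignment) {
      return (size + alignment - 1) / alignment * alignment;
    }

    // Shift an address forward to the next aligned boundary (power-of-two
    // alignments only). An AlignedAllocator-style wrapper over-allocates
    // size + alignment bytes so that the shifted address, plus the
    // requested size, still fits inside the underlying allocation.
    static uintptr_t RoundUpAddr(uintptr_t addr, size_t alignment) {
      return (addr + alignment - 1) & ~(static_cast<uintptr_t>(alignment) - 1);
    }

When cudaMalloc already hands out addresses aligned to at least GpuMinChunkSize(), that extra padding and shifting is pure overhead, which is exactly what the textureAlignment check avoids.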
From 8efe79da281733dfd30453474d07a22696b2f389 Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Sat, 9 Oct 2021 08:41:56 +0000
Subject: [PATCH 2/3] update test

---
 .../allocation/auto_growth_best_fit_allocator_test.cc | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
index 6f2591c8b15c8..926af8292d2e8 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
-
 #include <gflags/gflags.h>
 
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
+
 #include "gtest/gtest.h"
 
 DECLARE_bool(free_idle_chunk);
@@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk,
   FLAGS_free_idle_chunk = free_idle_chunk;
   FLAGS_free_when_no_cache_hit = free_when_no_cache_hit;
   auto recorded_allocator = std::make_shared<RecordedAllocator>();
+
   size_t alignment = 4096;
   size_t memory_size = 8192;
+  auto underlying_allocator =
+      std::make_shared<AlignedAllocator>(recorded_allocator, alignment);
   auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
-      recorded_allocator, alignment);
+      underlying_allocator, alignment);
 
   for (size_t i = 0; i < 10; ++i) {
     auto allocation = ag_allocator->Allocate(memory_size);
@@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) {
 
   auto underlying_allocator =
       std::make_shared<LimitedResourceAllocator>(memory_capacity);
+  auto aligned_allocator =
+      std::make_shared<AlignedAllocator>(underlying_allocator, alignment);
   auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
-      underlying_allocator, alignment);
+      aligned_allocator, alignment);
 
   ag_allocator->Allocate(allocate_size[0]);
   ASSERT_EQ(underlying_allocator->AllocatedSize(),
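Since the constructor no longer aligns implicitly, the tests in PATCH 2 now compose the chain (recorded or limited allocator) -> AlignedAllocator -> AutoGrowthBestFitAllocator by hand. A hypothetical extra test, not part of this patch, that pins down the property the explicit AlignedAllocator layer provides; it assumes the file-local RecordedAllocator and Paddle's Allocation::ptr() accessor:

    TEST(aligned_allocator, returns_aligned_addresses) {
      size_t alignment = 4096;
      auto aligned = std::make_shared<AlignedAllocator>(
          std::make_shared<RecordedAllocator>(), alignment);
      // A deliberately non-aligned request size.
      auto allocation = aligned->Allocate(100);
      // Every pointer handed out through the wrapper must sit on an
      // alignment boundary, whatever the underlying allocator returned.
      ASSERT_EQ(reinterpret_cast<uintptr_t>(allocation->ptr()) % alignment, 0u);
    }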
-#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" - #include +#include "paddle/fluid/memory/allocation/aligned_allocator.h" +#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" + #include "gtest/gtest.h" DECLARE_bool(free_idle_chunk); @@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk, FLAGS_free_idle_chunk = free_idle_chunk; FLAGS_free_when_no_cache_hit = free_when_no_cache_hit; auto recorded_allocator = std::make_shared(); + size_t alignment = 4096; size_t memory_size = 8192; + auto underlying_allocator = + std::make_shared(recorded_allocator, alignment); auto ag_allocator = std::make_shared( - recorded_allocator, alignment); + underlying_allocator, alignment); for (size_t i = 0; i < 10; ++i) { auto allocation = ag_allocator->Allocate(memory_size); @@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) { auto underlying_allocator = std::make_shared(memory_capacity); + auto aligned_allocator = + std::make_shared(underlying_allocator, alignment); auto ag_allocator = std::make_shared( - underlying_allocator, alignment); + aligned_allocator, alignment); ag_allocator->Allocate(allocate_size[0]); ASSERT_EQ(underlying_allocator->AllocatedSize(), From e562d20c871745932433076e0b4ab4c83eb1c4ae Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 11 Oct 2021 07:18:49 +0000 Subject: [PATCH 3/3] fix error during multiple process --- .../memory/allocation/allocator_facade.cc | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 78d0941410f5e..281902f3a2b12 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -202,6 +202,8 @@ class AllocatorFacadePrivate { inline const std::shared_ptr& GetAllocator( const platform::Place& place, size_t size) { + VLOG(4) << "GetAllocator" + << " " << place << " " << size; const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_ : GetAllocatorMap()) @@ -258,17 +260,34 @@ class AllocatorFacadePrivate { bool allow_free_idle_chunk) { auto cuda_allocator = std::make_shared(p); auto alignment = platform::GpuMinChunkSize(); - const auto& prop = platform::GetDeviceProperties(p.GetDeviceId()); - bool need_addr_align = prop.textureAlignment < alignment; + bool need_addr_align = true; + // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda + // API in that case may got cuda error(3), i.e., + // cudaErrorInitializationError. And, the CUDAAllocator is only initialized + // but not really used. + // Here, the try-catch block is added to handle the case that + // GetDeviceProperties() may failed in the multiple process(for example, in + // dataloader with num_worker > 0) + try { + const auto& prop = platform::GetDeviceProperties(p.GetDeviceId()); + need_addr_align = prop.textureAlignment < alignment; + VLOG(4) << "GetDeviceProperties ok, textureAlignment: " + << prop.textureAlignment + << ", set need_addr_align=" << need_addr_align; + } catch (...) 
+      need_addr_align = true;
+      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
+    }
+    // The address returned by cudaMalloc is already aligned in that case;
+    // ref:
+    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
     std::shared_ptr<Allocator> underlying_allocator{nullptr};
     if (need_addr_align) {
-      VLOG(10) << "use AlignedAllocator with alignment: " << alignment
-               << ", textureAlignment: " << prop.textureAlignment;
+      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
       underlying_allocator =
           std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
     } else {
-      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment
-               << ", textureAlignment: " << prop.textureAlignment;
+      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
       underlying_allocator = cuda_allocator;
     }
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
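The try/catch in PATCH 3 exists because a forked worker inherits a CUDA context it cannot reinitialize, so even a read-only query like GetDeviceProperties() can throw; the safe fallback is to keep the alignment wrapper. The same defensive probe, sketched against the raw CUDA runtime API rather than Paddle's platform wrapper (the function name is illustrative):

    #include <cstddef>

    #include <cuda_runtime.h>

    // Returns true when an extra alignment wrapper is needed. If the
    // runtime is unusable (e.g., cudaErrorInitializationError after a
    // fork), assume it is needed: the conservative path costs a little
    // padding but cannot crash the process.
    static bool NeedAddrAlign(int device_id, size_t alignment) {
      cudaDeviceProp prop;
      if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) {
        return true;
      }
      // cudaMalloc results are aligned to at least prop.textureAlignment.
      return prop.textureAlignment < alignment;
    }

Erring toward true only re-enables the AlignedAllocator path, which works whether or not the device would have made it unnecessary.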