From 5c8517f5dd60b25e86fa0efcb6ce2b52633add2d Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Wed, 29 Sep 2021 12:26:09 +0000
Subject: [PATCH 1/3] do not use alignedAllocator when cuda has alignment

---
 .../memory/allocation/aligned_allocator.cc       |  1 +
 .../fluid/memory/allocation/allocator_facade.cc  | 17 ++++++++++++++++-
 .../auto_growth_best_fit_allocator.cc            | 15 +++++++++------
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc
index 1d89918bfebf6..f0b7f1a4b0d9e 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.cc
+++ b/paddle/fluid/memory/allocation/aligned_allocator.cc
@@ -20,6 +20,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
+// For memory address alignment
 class AlignedAllocation : public Allocation {
  public:
  AlignedAllocation(AllocationPtr underlying_allocation, size_t offset)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 0388e2d13afb0..78d0941410f5e 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -23,6 +23,7 @@
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
 #endif
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
@@ -256,8 +257,22 @@ class AllocatorFacadePrivate {
   void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                    bool allow_free_idle_chunk) {
     auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto alignment = platform::GpuMinChunkSize();
+    const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
+    bool need_addr_align = prop.textureAlignment < alignment;
+    std::shared_ptr<Allocator> underlying_allocator{nullptr};
+    if (need_addr_align) {
+      VLOG(10) << "use AlignedAllocator with alignment: " << alignment
+               << ", textureAlignment: " << prop.textureAlignment;
+      underlying_allocator =
+          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
+    } else {
+      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment
+               << ", textureAlignment: " << prop.textureAlignment;
+      underlying_allocator = cuda_allocator;
+    }
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
+        underlying_allocator, alignment, 0, allow_free_idle_chunk);
   }
 #endif
 
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index f36d589f907fb..9f34f5198a179 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -40,14 +40,14 @@ namespace allocation {
 
 AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
     const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
     size_t chunk_size, bool allow_free_idle_chunk)
-    : underlying_allocator_(
-          std::make_shared<AlignedAllocator>(underlying_allocator, alignment)),
+    : underlying_allocator_(underlying_allocator),
       alignment_(alignment),
       chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)),
       allow_free_idle_chunk_(allow_free_idle_chunk) {}
 
-Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
-  size = AlignedSize(size, alignment_);
+Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) {
+  size_t size = AlignedSize(unaligned_size, alignment_);
+  VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size;
   std::lock_guard<SpinLock> guard(spinlock_);
   auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
@@ -57,6 +57,8 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
     free_blocks_.erase(iter);
     auto *chunk = block_it->chunk_;
     size_t remaining_size = block_it->size_ - size;
+    VLOG(10) << "Allocate " << size << " bytes from chunk size "
+             << block_it->size_ << ", remaining " << remaining_size;
     if (remaining_size == 0) {
       block_it->is_free_ = false;
     } else {
@@ -95,13 +97,14 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
     }
     blocks.emplace_back(p + remaining_size, size, false, chunk);
     block_it = --(blocks.end());
-    VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining "
-            << remaining_size;
+    VLOG(2) << "Not found and reallocate " << realloc_size << "("
+            << static_cast<void *>(p) << "), and remaining " << remaining_size;
   }
   return new BlockAllocation(block_it);
 }
 
 void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
+  VLOG(10) << "Free " << allocation->size() << " bytes";
   std::lock_guard<SpinLock> guard(spinlock_);
   auto block_it = static_cast<BlockAllocation *>(allocation)->block_it_;
   auto &blocks = block_it->chunk_->blocks_;
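PATCH 1 above changes who is responsible for alignment: AutoGrowthBestFitAllocator no longer wraps its underlying allocator in an AlignedAllocator unconditionally, and allocator_facade adds the wrapper only when the device's textureAlignment guarantee is smaller than the requested alignment (GpuMinChunkSize()). The wrapper itself is just pointer rounding over an over-sized allocation; below is a minimal stand-alone sketch of that arithmetic, assuming power-of-two alignments. The helper names are illustrative, not Paddle's API.

    #include <cstddef>
    #include <cstdint>

    // Round a requested size up to the next multiple of `alignment`,
    // as AutoGrowthBestFitAllocator does via AlignedSize().
    static size_t RoundUpSize(size_t size, size_t alignment) {
      return (size + alignment - 1) / alignment * alignment;
    }

    // Shift an address forward to the next aligned boundary (power-of-two
    // alignments only). An AlignedAllocator-style wrapper over-allocates
    // size + alignment bytes so that the shifted address, plus the
    // requested size, still fits inside the underlying allocation.
    static uintptr_t RoundUpAddr(uintptr_t addr, size_t alignment) {
      return (addr + alignment - 1) & ~(static_cast<uintptr_t>(alignment) - 1);
    }

When cudaMalloc already hands out addresses aligned to at least GpuMinChunkSize(), that extra padding and shifting is pure overhead, which is exactly what the textureAlignment check avoids.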
From 8efe79da281733dfd30453474d07a22696b2f389 Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Sat, 9 Oct 2021 08:41:56 +0000
Subject: [PATCH 2/3] update test

---
 .../allocation/auto_growth_best_fit_allocator_test.cc | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
index 6f2591c8b15c8..926af8292d2e8 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
-
 #include <gflags/gflags.h>
 
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
+
 #include "gtest/gtest.h"
 
 DECLARE_bool(free_idle_chunk);
@@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk,
   FLAGS_free_idle_chunk = free_idle_chunk;
   FLAGS_free_when_no_cache_hit = free_when_no_cache_hit;
   auto recorded_allocator = std::make_shared<RecordedAllocator>();
+
   size_t alignment = 4096;
   size_t memory_size = 8192;
+  auto underlying_allocator =
+      std::make_shared<AlignedAllocator>(recorded_allocator, alignment);
   auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
-      recorded_allocator, alignment);
+      underlying_allocator, alignment);
 
   for (size_t i = 0; i < 10; ++i) {
     auto allocation = ag_allocator->Allocate(memory_size);
@@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) {
 
   auto underlying_allocator =
       std::make_shared<LimitedResourceAllocator>(memory_capacity);
+  auto aligned_allocator =
+      std::make_shared<AlignedAllocator>(underlying_allocator, alignment);
   auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
-      underlying_allocator, alignment);
+      aligned_allocator, alignment);
 
   ag_allocator->Allocate(allocate_size[0]);
   ASSERT_EQ(underlying_allocator->AllocatedSize(),
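Since the constructor no longer aligns implicitly, the tests in PATCH 2 now compose the chain (recorded or limited allocator) -> AlignedAllocator -> AutoGrowthBestFitAllocator by hand. A hypothetical extra test, not part of this patch, that pins down the property the explicit AlignedAllocator layer provides; it assumes the file-local RecordedAllocator and Paddle's Allocation::ptr() accessor:

    TEST(aligned_allocator, returns_aligned_addresses) {
      size_t alignment = 4096;
      auto aligned = std::make_shared<AlignedAllocator>(
          std::make_shared<RecordedAllocator>(), alignment);
      // A deliberately non-aligned request size.
      auto allocation = aligned->Allocate(100);
      // Every pointer handed out through the wrapper must sit on an
      // alignment boundary, whatever the underlying allocator returned.
      ASSERT_EQ(reinterpret_cast<uintptr_t>(allocation->ptr()) % alignment, 0u);
    }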
-#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" - #include +#include "paddle/fluid/memory/allocation/aligned_allocator.h" +#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" + #include "gtest/gtest.h" DECLARE_bool(free_idle_chunk); @@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk, FLAGS_free_idle_chunk = free_idle_chunk; FLAGS_free_when_no_cache_hit = free_when_no_cache_hit; auto recorded_allocator = std::make_shared(); + size_t alignment = 4096; size_t memory_size = 8192; + auto underlying_allocator = + std::make_shared(recorded_allocator, alignment); auto ag_allocator = std::make_shared( - recorded_allocator, alignment); + underlying_allocator, alignment); for (size_t i = 0; i < 10; ++i) { auto allocation = ag_allocator->Allocate(memory_size); @@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) { auto underlying_allocator = std::make_shared(memory_capacity); + auto aligned_allocator = + std::make_shared(underlying_allocator, alignment); auto ag_allocator = std::make_shared( - underlying_allocator, alignment); + aligned_allocator, alignment); ag_allocator->Allocate(allocate_size[0]); ASSERT_EQ(underlying_allocator->AllocatedSize(), From e562d20c871745932433076e0b4ab4c83eb1c4ae Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 11 Oct 2021 07:18:49 +0000 Subject: [PATCH 3/3] fix error during multiple process --- .../memory/allocation/allocator_facade.cc | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 78d0941410f5e..281902f3a2b12 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -202,6 +202,8 @@ class AllocatorFacadePrivate { inline const std::shared_ptr& GetAllocator( const platform::Place& place, size_t size) { + VLOG(4) << "GetAllocator" + << " " << place << " " << size; const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_ : GetAllocatorMap()) @@ -258,17 +260,34 @@ class AllocatorFacadePrivate { bool allow_free_idle_chunk) { auto cuda_allocator = std::make_shared(p); auto alignment = platform::GpuMinChunkSize(); - const auto& prop = platform::GetDeviceProperties(p.GetDeviceId()); - bool need_addr_align = prop.textureAlignment < alignment; + bool need_addr_align = true; + // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda + // API in that case may got cuda error(3), i.e., + // cudaErrorInitializationError. And, the CUDAAllocator is only initialized + // but not really used. + // Here, the try-catch block is added to handle the case that + // GetDeviceProperties() may failed in the multiple process(for example, in + // dataloader with num_worker > 0) + try { + const auto& prop = platform::GetDeviceProperties(p.GetDeviceId()); + need_addr_align = prop.textureAlignment < alignment; + VLOG(4) << "GetDeviceProperties ok, textureAlignment: " + << prop.textureAlignment + << ", set need_addr_align=" << need_addr_align; + } catch (...) 
+      need_addr_align = true;
+      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
+    }
+    // The address returned by cudaMalloc is already aligned in that case;
+    // ref:
+    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
     std::shared_ptr<Allocator> underlying_allocator{nullptr};
     if (need_addr_align) {
-      VLOG(10) << "use AlignedAllocator with alignment: " << alignment
-               << ", textureAlignment: " << prop.textureAlignment;
+      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
       underlying_allocator =
           std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
     } else {
-      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment
-               << ", textureAlignment: " << prop.textureAlignment;
+      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
       underlying_allocator = cuda_allocator;
     }
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
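The try/catch in PATCH 3 exists because a forked worker inherits a CUDA context it cannot reinitialize, so even a read-only query like GetDeviceProperties() can throw; the safe fallback is to keep the alignment wrapper. The same defensive probe, sketched against the raw CUDA runtime API rather than Paddle's platform wrapper (the function name is illustrative):

    #include <cstddef>

    #include <cuda_runtime.h>

    // Returns true when an extra alignment wrapper is needed. If the
    // runtime is unusable (e.g., cudaErrorInitializationError after a
    // fork), assume it is needed: the conservative path costs a little
    // padding but cannot crash the process.
    static bool NeedAddrAlign(int device_id, size_t alignment) {
      cudaDeviceProp prop;
      if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) {
        return true;
      }
      // cudaMalloc results are aligned to at least prop.textureAlignment.
      return prop.textureAlignment < alignment;
    }

Erring toward true only re-enables the AlignedAllocator path, which works whether or not the device would have made it unnecessary.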