refine auto_growth allocator #35732

Merged 3 commits on Oct 11, 2021
1 change: 1 addition & 0 deletions paddle/fluid/memory/allocation/aligned_allocator.cc
@@ -20,6 +20,7 @@ namespace paddle {
namespace memory {
namespace allocation {

// For memory address alignment
class AlignedAllocation : public Allocation {
public:
AlignedAllocation(AllocationPtr underlying_allocation, size_t offset)
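For context, the `offset` stored by `AlignedAllocation` is what shifts the exposed pointer onto an alignment boundary inside a slightly over-sized underlying allocation. Below is a minimal sketch of that pointer arithmetic; the helper name is illustrative, not Paddle API.

```cpp
#include <cstddef>
#include <cstdint>

// Illustrative helper (not Paddle API): how many bytes must be skipped so
// that the exposed address becomes a multiple of `alignment` (assumed to be
// a power of two). The wrapper then requests `size + alignment` bytes from
// the underlying allocator so that `size` usable bytes remain after the
// offset is applied.
static std::size_t AlignOffset(const void* ptr, std::size_t alignment) {
  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  auto aligned = (addr + alignment - 1) & ~(alignment - 1);
  return static_cast<std::size_t>(aligned - addr);
}
```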
36 changes: 35 additions & 1 deletion paddle/fluid/memory/allocation/allocator_facade.cc
@@ -23,6 +23,7 @@
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
@@ -201,6 +202,8 @@ class AllocatorFacadePrivate {

inline const std::shared_ptr<Allocator>& GetAllocator(
const platform::Place& place, size_t size) {
VLOG(4) << "GetAllocator"
<< " " << place << " " << size;
const auto& allocators =
(size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
: GetAllocatorMap())
@@ -256,8 +259,39 @@ class AllocatorFacadePrivate {
void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
bool allow_free_idle_chunk) {
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
auto alignment = platform::GpuMinChunkSize();
bool need_addr_align = true;
// NOTE: the CUDA runtime cannot survive a fork, so calling any CUDA API in
// a forked child process may return cuda error(3), i.e.,
// cudaErrorInitializationError, even though the CUDAAllocator is only being
// initialized here and not actually used yet.
// The try-catch block handles the case where GetDeviceProperties() fails in
// such multi-process settings (for example, a DataLoader with
// num_workers > 0).
try {
const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
need_addr_align = prop.textureAlignment < alignment;
VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
<< prop.textureAlignment
<< ", set need_addr_align=" << need_addr_align;
} catch (...) {
need_addr_align = true;
VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
}
// The addresses returned by cudaMalloc are already aligned, ref:
// https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
std::shared_ptr<Allocator> underlying_allocator{nullptr};
if (need_addr_align) {
VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
underlying_allocator =
std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
} else {
VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
underlying_allocator = cuda_allocator;
}
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-    cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
+    underlying_allocator, alignment, 0, allow_free_idle_chunk);
}
#endif

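To summarize the new wiring in InitAutoGrowthCUDAAllocator: cudaMalloc already guarantees some address alignment (at least `textureAlignment`), so the extra `AlignedAllocator` wrapper is only inserted when that guarantee is weaker than `GpuMinChunkSize()` or when the device properties cannot be queried at all (e.g. after a fork in a DataLoader worker). The sketch below isolates just that decision, with hypothetical stand-in types replacing the Paddle classes.

```cpp
#include <cstddef>
#include <memory>
#include <utility>

// Stand-ins for the Paddle classes; only the wrapping decision is shown.
struct Allocator {
  virtual ~Allocator() = default;
};
struct CudaAllocator : Allocator {};
struct AlignedAllocator : Allocator {
  AlignedAllocator(std::shared_ptr<Allocator> underlying, std::size_t alignment)
      : underlying_(std::move(underlying)), alignment_(alignment) {}
  std::shared_ptr<Allocator> underlying_;
  std::size_t alignment_;
};

// Decide which allocator the auto-growth allocator should sit on top of.
std::shared_ptr<Allocator> BuildUnderlying(std::size_t alignment,
                                           bool props_available,
                                           std::size_t texture_alignment) {
  auto cuda = std::make_shared<CudaAllocator>();
  // If the device properties are unknown (GetDeviceProperties failed in a
  // forked worker) or the guaranteed alignment is too small, wrap the raw
  // allocator so returned addresses are rounded up explicitly.
  bool need_addr_align = !props_available || texture_alignment < alignment;
  if (need_addr_align) {
    return std::make_shared<AlignedAllocator>(cuda, alignment);
  }
  return cuda;
}
```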
15 changes: 9 additions & 6 deletions paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -40,14 +40,14 @@ namespace allocation {
AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
size_t chunk_size, bool allow_free_idle_chunk)
-    : underlying_allocator_(
-          std::make_shared<AlignedAllocator>(underlying_allocator, alignment)),
+    : underlying_allocator_(underlying_allocator),
alignment_(alignment),
chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)),
allow_free_idle_chunk_(allow_free_idle_chunk) {}

-Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
-  size = AlignedSize(size, alignment_);
+Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) {
+  size_t size = AlignedSize(unaligned_size, alignment_);
VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size;

std::lock_guard<SpinLock> guard(spinlock_);
auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
@@ -57,6 +57,8 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
free_blocks_.erase(iter);
auto *chunk = block_it->chunk_;
size_t remaining_size = block_it->size_ - size;
VLOG(10) << "Allocate " << size << " bytes from chunk size "
<< block_it->size_ << ", remaining " << remaining_size;
if (remaining_size == 0) {
block_it->is_free_ = false;
} else {
@@ -95,13 +97,14 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
}
blocks.emplace_back(p + remaining_size, size, false, chunk);
block_it = --(blocks.end());
-  VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining "
-          << remaining_size;
+  VLOG(2) << "Not found and reallocate " << realloc_size << "("
+          << static_cast<void *>(p) << "), and remaining " << remaining_size;
}
return new BlockAllocation(block_it);
}

void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
VLOG(10) << "Free " << allocation->size() << " bytes";
std::lock_guard<SpinLock> guard(spinlock_);
auto block_it = static_cast<BlockAllocation *>(allocation)->block_it_;
auto &blocks = block_it->chunk_->blocks_;
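The allocation path above is a best-fit search: `free_blocks_` is ordered by (size, address), so `lower_bound({size, nullptr})` lands on the smallest free block that can hold the aligned request, and any surplus goes back to the free list while the allocation takes the tail of the block (the `p + remaining_size` placement). Below is a simplified, self-contained sketch of that lookup and split, assuming a plain `std::map` rather than Paddle's block/chunk bookkeeping.

```cpp
#include <cstddef>
#include <map>
#include <utility>

// Free list keyed by (block size, block start address); lower_bound on
// (size, nullptr) yields the smallest block that is large enough.
using FreeBlocks = std::map<std::pair<std::size_t, char*>, bool>;

// Returns the start of the allocated region, or nullptr if no free block
// fits (the real allocator would then grow a new chunk).
char* BestFitAllocate(FreeBlocks* free_blocks, std::size_t unaligned_size,
                      std::size_t alignment) {
  // Mirror AlignedSize: round the request up to a multiple of `alignment`.
  std::size_t size = (unaligned_size + alignment - 1) / alignment * alignment;
  auto it = free_blocks->lower_bound(
      std::make_pair(size, static_cast<char*>(nullptr)));
  if (it == free_blocks->end()) return nullptr;

  char* base = it->first.second;
  std::size_t remaining = it->first.first - size;
  free_blocks->erase(it);
  if (remaining > 0) {
    // The head of the block stays free; the allocation takes the tail,
    // matching the `p + remaining_size` placement in the diff above.
    free_blocks->emplace(std::make_pair(remaining, base), true);
  }
  return base + remaining;
}
```

In the real allocator the map value is an iterator into the chunk's block list, so the split also updates the chunk's blocks; the sketch keeps only the size/address bookkeeping.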
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
@@ -12,10 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"

#include <cstdlib>

#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"

#include "gtest/gtest.h"

DECLARE_bool(free_idle_chunk);
@@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk,
FLAGS_free_idle_chunk = free_idle_chunk;
FLAGS_free_when_no_cache_hit = free_when_no_cache_hit;
auto recorded_allocator = std::make_shared<RecordedAllocator>();

size_t alignment = 4096;
size_t memory_size = 8192;
auto underlying_allocator =
std::make_shared<AlignedAllocator>(recorded_allocator, alignment);
auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
-    recorded_allocator, alignment);
+    underlying_allocator, alignment);

for (size_t i = 0; i < 10; ++i) {
auto allocation = ag_allocator->Allocate(memory_size);
@@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) {

auto underlying_allocator =
std::make_shared<LimitedResourceAllocator>(memory_capacity);
auto aligned_allocator =
std::make_shared<AlignedAllocator>(underlying_allocator, alignment);
auto ag_allocator = std::make_shared<AutoGrowthBestFitAllocator>(
-    underlying_allocator, alignment);
+    aligned_allocator, alignment);

ag_allocator->Allocate(allocate_size[0]);
ASSERT_EQ(underlying_allocator->AllocatedSize(),
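Taken together, the updated tests assemble the same three-layer stack the facade now builds: a raw allocator, an explicit `AlignedAllocator`, and the `AutoGrowthBestFitAllocator` on top. Below is a hedged usage sketch of that composition; the cuda_allocator.h include path and the two-argument AutoGrowthBestFitAllocator constructor (relying on defaults for chunk_size and allow_free_idle_chunk, as the tests do) should be treated as assumptions based on this diff.

```cpp
#include <memory>

#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {

std::shared_ptr<Allocator> BuildAutoGrowthStack(platform::CUDAPlace place,
                                                size_t alignment) {
  // Raw device allocator.
  auto cuda = std::make_shared<CUDAAllocator>(place);
  // Address alignment is now an explicit wrapper instead of being applied
  // inside AutoGrowthBestFitAllocator's constructor.
  auto aligned = std::make_shared<AlignedAllocator>(cuda, alignment);
  // Size alignment and best-fit reuse happen in the auto-growth allocator.
  return std::make_shared<AutoGrowthBestFitAllocator>(aligned, alignment);
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
```

Whether a test wraps a real device allocator or a recorded/limited one depends on what it asserts; the tests in this diff use RecordedAllocator and LimitedResourceAllocator so they can count and cap the raw allocations beneath the wrapper.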