From e562d20c871745932433076e0b4ab4c83eb1c4ae Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Mon, 11 Oct 2021 07:18:49 +0000
Subject: [PATCH] fix error during multiple processes

---
 .../memory/allocation/allocator_facade.cc     | 31 +++++++++++++++----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 78d0941410f5e..281902f3a2b12 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -202,6 +202,8 @@ class AllocatorFacadePrivate {
 
   inline const std::shared_ptr<Allocator>& GetAllocator(
       const platform::Place& place, size_t size) {
+    VLOG(4) << "GetAllocator"
+            << " " << place << " " << size;
     const auto& allocators =
         (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
                                                           : GetAllocatorMap())
@@ -258,17 +260,34 @@ class AllocatorFacadePrivate {
       bool allow_free_idle_chunk) {
     auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
     auto alignment = platform::GpuMinChunkSize();
-    const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
-    bool need_addr_align = prop.textureAlignment < alignment;
+    bool need_addr_align = true;
+    // NOTE: sometimes, since the cuda runtime cannot be forked, calling any
+    // cuda API in that case may get cuda error(3), i.e.,
+    // cudaErrorInitializationError. And, the CUDAAllocator is only initialized
+    // but not really used.
+    // Here, the try-catch block is added to handle the case that
+    // GetDeviceProperties() may fail in multiple processes (for example, in a
+    // dataloader with num_worker > 0).
+    try {
+      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
+      need_addr_align = prop.textureAlignment < alignment;
+      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
+              << prop.textureAlignment
+              << ", set need_addr_align=" << need_addr_align;
+    } catch (...) {
+      need_addr_align = true;
+      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
+    }
+    // The address returned is aligned already,
+    // ref:
+    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
     std::shared_ptr<Allocator> underlying_allocator{nullptr};
     if (need_addr_align) {
-      VLOG(10) << "use AlignedAllocator with alignment: " << alignment
-               << ", textureAlignment: " << prop.textureAlignment;
+      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
       underlying_allocator =
           std::make_shared<AlignedAllocator>(underlying_allocator, alignment);
     } else {
-      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment
-               << ", textureAlignment: " << prop.textureAlignment;
+      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
       underlying_allocator = cuda_allocator;
     }
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(