deal with conflict
YuanRisheng committed Jan 25, 2022
2 parents 33bb053 + c1e5a39 commit feedeed
Showing 36 changed files with 1,004 additions and 475 deletions.
7 changes: 7 additions & 0 deletions paddle/fluid/framework/pten_utils.h
@@ -86,5 +86,12 @@ struct ConvertToPtenContext<platform::CPUDeviceContext> {
using TYPE = pten::CPUContext;
};

#ifdef PADDLE_WITH_XPU
template <>
struct ConvertToPtenContext<platform::XPUDeviceContext> {
using TYPE = pten::XPUContext;
};
#endif

} // namespace framework
} // namespace paddle
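
With this specialization in place, framework code can resolve the pten context type for any fluid device context, XPU included. A minimal illustrative sketch of that pattern (the helper ToPtenContext is not part of this commit; it assumes each fluid context derives from its pten counterpart, which the reshape_op.cc changes below rely on):

    // Sketch only (not part of this commit): map a fluid device context to
    // its pten context type via ConvertToPtenContext and downcast to it.
    // Valid because each fluid context derives from its pten counterpart
    // (e.g. platform::XPUDeviceContext -> pten::XPUContext).
    template <typename DeviceCtx>
    const typename paddle::framework::ConvertToPtenContext<DeviceCtx>::TYPE&
    ToPtenContext(const DeviceCtx& ctx) {
      using PtenCtx =
          typename paddle::framework::ConvertToPtenContext<DeviceCtx>::TYPE;
      return static_cast<const PtenCtx&>(ctx);
    }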
7 changes: 7 additions & 0 deletions paddle/fluid/memory/CMakeLists.txt
@@ -34,6 +34,13 @@ if (WITH_ROCM)
DEPS device_context malloc)
endif()

if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
nv_test(get_base_ptr_test SRCS get_base_ptr_test.cu DEPS malloc gpu_info)
set_tests_properties(get_base_ptr_test PROPERTIES
ENVIRONMENT "FLAGS_allocator_strategy=auto_growth;
FLAGS_use_stream_safe_cuda_allocator=true;")
endif()

#if (WITH_GPU)
# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory)
#endif()
7 changes: 0 additions & 7 deletions paddle/fluid/memory/allocation/CMakeLists.txt
@@ -125,10 +125,3 @@ if(NOT WIN32)
cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator)
cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator)
endif(NOT WIN32)

if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
nv_test(base_ptr_test SRCS base_ptr_test.cu DEPS malloc gpu_info)
set_tests_properties(base_ptr_test PROPERTIES
ENVIRONMENT "FLAGS_allocator_strategy=auto_growth;
FLAGS_use_stream_safe_cuda_allocator=true;")
endif()
9 changes: 1 addition & 8 deletions paddle/fluid/memory/allocation/allocator.h
@@ -93,14 +93,7 @@ class Allocation : public pten::Allocation {
const platform::Place& place)
: pten::Allocation(ptr, size, place), base_ptr_(base_ptr) {}

void* base_ptr() const {
PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth",
paddle::platform::errors::Unimplemented(
"base_ptr() is only implemented for auto_growth "
"strategy, not support %s strategy",
FLAGS_allocator_strategy));
return base_ptr_;
}
void* base_ptr() const { return base_ptr_; }

private:
inline void RegisterDecoratedAllocator(Allocator* allocator) {
19 changes: 19 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -282,6 +282,10 @@ class AllocatorFacadePrivate {
return iter->second;
}

void* GetBasePtr(const std::shared_ptr<pten::Allocation>& allocation) {
return static_cast<Allocation*>(allocation.get())->base_ptr();
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool HasCUDAAllocator(const platform::CUDAPlace& place,
const gpuStream_t& stream) {
@@ -821,6 +825,21 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}

void* AllocatorFacade::GetBasePtr(
const std::shared_ptr<pten::Allocation>& allocation) {
PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
paddle::platform::errors::Unimplemented(
"GetBasePtr() is only implemented for auto_growth "
"strategy, not support allocator strategy: %d",
static_cast<int>(GetAllocatorStrategy())));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(allocation->place()), true,
paddle::platform::errors::Unimplemented(
"GetBasePtr() is only implemented for CUDAPlace(), not "
"suppot place: %s",
allocation->place()));
return m_->GetBasePtr(allocation);
}

std::shared_ptr<pten::Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size) {
return std::shared_ptr<pten::Allocation>(Alloc(place, size));
2 changes: 2 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.h
@@ -51,6 +51,8 @@ class AllocatorFacade {

const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);

void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);

// Allocate a shared allocation.
std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
size_t size);
paddle/fluid/memory/allocation/base_ptr_test.cu → paddle/fluid/memory/get_base_ptr_test.cu
@@ -35,9 +35,9 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test {
void OneByOneAllocTest() {
for (size_t i = 0; i < alloc_times_; ++i) {
size_t size = dis_(random_engine_);
AllocationPtr allocation = Alloc(place_, size);
auto allocation = AllocShared(place_, size);

void* base_ptr = static_cast<Allocation*>(allocation.get())->base_ptr();
void* base_ptr = GetBasePtr(allocation);
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
@@ -47,21 +47,21 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test {
}

void BatchByBatchAllocTest() {
std::vector<AllocationPtr> allocations;
std::vector<std::shared_ptr<pten::Allocation>> allocations;
allocations.reserve(batch_size_);
size_t batch_num = alloc_times_ / batch_size_;

for (size_t i = 0; i < batch_num; ++i) {
for (size_t j = 0; j < batch_size_; ++j) {
size_t size = dis_(random_engine_);
AllocationPtr allocation = Alloc(place_, size);
auto allocation = AllocShared(place_, size);

void* base_ptr = static_cast<Allocation*>(allocation.get())->base_ptr();
void* base_ptr = GetBasePtr(allocation);
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);

allocations.emplace_back(std::move(allocation));
allocations.emplace_back(allocation);
}
allocations.clear();
}
@@ -70,28 +70,28 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test {
}

void ContinuousAllocTest() {
std::vector<AllocationPtr> allocations;
std::vector<std::shared_ptr<pten::Allocation>> allocations;
allocations.reserve(alloc_times_);

for (size_t i = 0; i < alloc_times_; ++i) {
size_t size = dis_(random_engine_);
AllocationPtr allocation = Alloc(place_, size);
auto allocation = AllocShared(place_, size);

void* base_ptr = static_cast<Allocation*>(allocation.get())->base_ptr();
void* base_ptr = GetBasePtr(allocation);
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);

allocations.emplace_back(std::move(allocation));
allocations.emplace_back(allocation);
}

allocations.clear();
Release(place_);
}

void ZeroSizeAllocTest() {
AllocationPtr allocation = Alloc(place_, 0);
void* base_ptr = static_cast<Allocation*>(allocation.get())->base_ptr();
auto allocation = AllocShared(place_, 0);
void* base_ptr = GetBasePtr(allocation);
void* system_ptr =
platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId());
EXPECT_EQ(base_ptr, system_ptr);
4 changes: 4 additions & 0 deletions paddle/fluid/memory/malloc.cc
@@ -47,6 +47,10 @@ bool InSameStream(const std::shared_ptr<Allocation>& allocation,
stream);
}

void* GetBasePtr(const std::shared_ptr<Allocation>& allocation) {
return allocation::AllocatorFacade::Instance().GetBasePtr(allocation);
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const gpuStream_t& stream) {
2 changes: 2 additions & 0 deletions paddle/fluid/memory/malloc.h
@@ -44,6 +44,8 @@ extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
extern bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const platform::Stream& stream);

extern void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const gpuStream_t& stream);
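
malloc.h and malloc.cc together expose GetBasePtr() as the public entry point that forwards to AllocatorFacade. A minimal usage sketch, assuming a CUDA build with FLAGS_allocator_strategy=auto_growth (the facade enforces both preconditions above); the size and function name here are arbitrary:

    // Sketch only: exercise the new public API added in this commit.
    #include "paddle/fluid/memory/malloc.h"
    #include "paddle/fluid/platform/place.h"

    void BasePtrDemo() {
      paddle::platform::CUDAPlace place(0);
      auto allocation = paddle::memory::AllocShared(place, 1024);
      // Start of the auto_growth chunk that allocation->ptr() was carved
      // from; allocation->ptr() points somewhere inside this chunk.
      void* base = paddle::memory::GetBasePtr(allocation);
    }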
4 changes: 2 additions & 2 deletions paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
@@ -94,11 +94,11 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
inverse_scale = 0.0;
}

paddle::platform::XPUVersion version = dev_ctx.xpu_version();
auto version = dev_ctx.xpu_version();
framework::Tensor float_x;
framework::Tensor float_out;
if (std::is_same<T, paddle::platform::float16>::value &&
(version == paddle::platform::XPUVersion::XPU1)) {
(version == pten::backends::xpu::XPUVersion::XPU1)) {
float_x.mutable_data<MPDType>(dev_ctx.GetPlace(),
x->numel() * sizeof(MPDType));
float_out.mutable_data<MPDType>(dev_ctx.GetPlace(),
4 changes: 2 additions & 2 deletions paddle/fluid/operators/dropout_op_xpu.cc
@@ -107,8 +107,8 @@ class DropoutGradXPUKernel : public framework::OpKernel<T> {
return;
}

paddle::platform::XPUVersion version = dev_ctx.xpu_version();
if (version == paddle::platform::XPUVersion::XPU1) {
auto version = dev_ctx.xpu_version();
if (version == pten::backends::xpu::XPUVersion::XPU1) {
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
XPUType* mask_new = RAII_GUARD.alloc_l3_or_gm<XPUType>(mask->numel());
float scale =
9 changes: 6 additions & 3 deletions paddle/fluid/operators/reshape_op.cc
@@ -395,7 +395,8 @@ class ReshapeKernel {
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::XPUDeviceContext>();
pten::ReshapeKernel(dev_ctx, *in, pt_scalar_shape, out);
pten::ReshapeKernel(static_cast<const pten::XPUContext &>(dev_ctx), *in,
pt_scalar_shape, out);
}
#endif
}
@@ -422,7 +423,8 @@ class ReshapeGradKernel {
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::XPUDeviceContext>();
pten::ReshapeGradKernel(dev_ctx, *d_out, d_x);
pten::ReshapeGradKernel(static_cast<const pten::XPUContext &>(dev_ctx),
*d_out, d_x);
}
#endif
}
@@ -449,7 +451,8 @@ class ReshapeDoubleGradKernel {
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.device_context<platform::XPUDeviceContext>();
pten::ReshapeDoubleGradKernel(dev_ctx, *dd_x, dd_out);
pten::ReshapeDoubleGradKernel(
static_cast<const pten::XPUContext &>(dev_ctx), *dd_x, dd_out);
}
#endif
}
4 changes: 2 additions & 2 deletions paddle/fluid/operators/softmax_op_xpu.cc
@@ -45,8 +45,8 @@ class SoftmaxXPUKernel : public framework::OpKernel<T> {
auto& dev_ctx = context.template device_context<DeviceContext>();

int r = XPU_SUCCESS;
paddle::platform::XPUVersion version = dev_ctx.xpu_version();
if (version == paddle::platform::XPUVersion::XPU1) {
auto version = dev_ctx.xpu_version();
if (version == pten::backends::xpu::XPUVersion::XPU1) {
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
XPUType* clip_x_data_l3 = RAII_GUARD.alloc_l3_or_gm<XPUType>(x->numel());
r = xpu::clip_v2(dev_ctx.x_context(),
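
The check_finite_and_unscale, dropout, and softmax kernels above all share one dispatch on the XPU generation, whose enum now comes from pten::backends::xpu rather than paddle::platform. A sketch of the recurring shape (kernel calls elided; dev_ctx is assumed to be a platform::XPUDeviceContext):

    // Sketch only: the XPU1-vs-newer dispatch pattern used by these kernels.
    auto version = dev_ctx.xpu_version();
    if (version == pten::backends::xpu::XPUVersion::XPU1) {
      // XPU1 compatibility path: stage data through L3/global-memory
      // scratch buffers owned by the RAII guard before the kernel call.
      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
      // ... allocate scratch via RAII_GUARD and run the fallback path ...
    } else {
      // Newer generations run the kernel directly on the inputs.
    }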
3 changes: 3 additions & 0 deletions paddle/fluid/platform/CMakeLists.txt
@@ -121,6 +121,9 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} cpu_context)
if(WITH_XPU)
target_link_libraries(device_context xpu_context)
endif()

cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
if(WITH_ASCEND_CL)
2 changes: 1 addition & 1 deletion paddle/fluid/platform/device/xpu/CMakeLists.txt
@@ -4,7 +4,7 @@ endif()

set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl)

cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place pten_xpu_info)
cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context)

add_subdirectory(tests)