From b2a7261d3cecde4153fbfcbc9aaef769fe936c4e Mon Sep 17 00:00:00 2001 From: From00 Date: Tue, 25 Jan 2022 10:53:33 +0800 Subject: [PATCH 1/2] Add GetBasePtr interface in paddle::memory (#39145) --- paddle/fluid/memory/CMakeLists.txt | 7 ++++++ paddle/fluid/memory/allocation/CMakeLists.txt | 7 ------ paddle/fluid/memory/allocation/allocator.h | 9 +------ .../memory/allocation/allocator_facade.cc | 19 +++++++++++++++ .../memory/allocation/allocator_facade.h | 2 ++ .../base_ptr_test.cu => get_base_ptr_test.cu} | 24 +++++++++---------- paddle/fluid/memory/malloc.cc | 4 ++++ paddle/fluid/memory/malloc.h | 2 ++ 8 files changed, 47 insertions(+), 27 deletions(-) rename paddle/fluid/memory/{allocation/base_ptr_test.cu => get_base_ptr_test.cu} (80%) diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 97952e4b71641..023b40518edf2 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -34,6 +34,13 @@ if (WITH_ROCM) DEPS device_context malloc) endif() +if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + nv_test(get_base_ptr_test SRCS get_base_ptr_test.cu DEPS malloc gpu_info) + set_tests_properties(get_base_ptr_test PROPERTIES + ENVIRONMENT "FLAGS_allocator_strategy=auto_growth; + FLAGS_use_stream_safe_cuda_allocator=true;") +endif() + #if (WITH_GPU) # nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory) #endif() diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 939ad140415df..c0d1934a703b6 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -125,10 +125,3 @@ if(NOT WIN32) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) endif(NOT WIN32) - -if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") - 
nv_test(base_ptr_test SRCS base_ptr_test.cu DEPS malloc gpu_info) - set_tests_properties(base_ptr_test PROPERTIES - ENVIRONMENT "FLAGS_allocator_strategy=auto_growth; - FLAGS_use_stream_safe_cuda_allocator=true;") -endif() diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 3f04d47516377..878633d1a6291 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -93,14 +93,7 @@ class Allocation : public pten::Allocation { const platform::Place& place) : pten::Allocation(ptr, size, place), base_ptr_(base_ptr) {} - void* base_ptr() const { - PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth", - paddle::platform::errors::Unimplemented( - "base_ptr() is only implemented for auto_growth " - "strategy, not support %s strategy", - FLAGS_allocator_strategy)); - return base_ptr_; - } + void* base_ptr() const { return base_ptr_; } private: inline void RegisterDecoratedAllocator(Allocator* allocator) { diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 6615bdf4b138b..7cdac0de6138f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -282,6 +282,10 @@ class AllocatorFacadePrivate { return iter->second; } + void* GetBasePtr(const std::shared_ptr& allocation) { + return static_cast(allocation.get())->base_ptr(); + } + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool HasCUDAAllocator(const platform::CUDAPlace& place, const gpuStream_t& stream) { @@ -821,6 +825,21 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } +void* AllocatorFacade::GetBasePtr( + const std::shared_ptr& allocation) { + PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, + paddle::platform::errors::Unimplemented( + "GetBasePtr() is only 
implemented for auto_growth " + "strategy, not support allocator strategy: %d", + static_cast(GetAllocatorStrategy()))); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(allocation->place()), true, + paddle::platform::errors::Unimplemented( + "GetBasePtr() is only implemented for CUDAPlace(), not " + "suppot place: %s", + allocation->place())); + return m_->GetBasePtr(allocation); +} + std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size) { return std::shared_ptr(Alloc(place, size)); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 76e2f0b5a94f6..a9b92e1801e4a 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -51,6 +51,8 @@ class AllocatorFacade { const std::shared_ptr& GetAllocator(const platform::Place& place); + void* GetBasePtr(const std::shared_ptr& allocation); + // Allocate a shared allocation. std::shared_ptr AllocShared(const platform::Place& place, size_t size); diff --git a/paddle/fluid/memory/allocation/base_ptr_test.cu b/paddle/fluid/memory/get_base_ptr_test.cu similarity index 80% rename from paddle/fluid/memory/allocation/base_ptr_test.cu rename to paddle/fluid/memory/get_base_ptr_test.cu index 5edabfcb9f5e7..fe1d73b602849 100644 --- a/paddle/fluid/memory/allocation/base_ptr_test.cu +++ b/paddle/fluid/memory/get_base_ptr_test.cu @@ -35,9 +35,9 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { void OneByOneAllocTest() { for (size_t i = 0; i < alloc_times_; ++i) { size_t size = dis_(random_engine_); - AllocationPtr allocation = Alloc(place_, size); + auto allocation = AllocShared(place_, size); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -47,21 +47,21 @@ class CUDAAllocatoionBasePtrTest : 
public ::testing::Test { } void BatchByBatchAllocTest() { - std::vector allocations; + std::vector> allocations; allocations.reserve(batch_size_); size_t batch_num = alloc_times_ / batch_size_; for (size_t i = 0; i < batch_num; ++i) { for (size_t j = 0; j < batch_size_; ++j) { size_t size = dis_(random_engine_); - AllocationPtr allocation = Alloc(place_, size); + auto allocation = AllocShared(place_, size); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); - allocations.emplace_back(std::move(allocation)); + allocations.emplace_back(allocation); } allocations.clear(); } @@ -70,19 +70,19 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void ContinuousAllocTest() { - std::vector allocations; + std::vector> allocations; allocations.reserve(alloc_times_); for (size_t i = 0; i < alloc_times_; ++i) { size_t size = dis_(random_engine_); - AllocationPtr allocation = Alloc(place_, size); + auto allocation = AllocShared(place_, size); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); - allocations.emplace_back(std::move(allocation)); + allocations.emplace_back(allocation); } allocations.clear(); @@ -90,8 +90,8 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void ZeroSizeAllocTest() { - AllocationPtr allocation = Alloc(place_, 0); - void* base_ptr = static_cast(allocation.get())->base_ptr(); + auto allocation = AllocShared(place_, 0); + void* base_ptr = GetBasePtr(allocation); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 3e859377e98d8..63c562be97fa0 
100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -47,6 +47,10 @@ bool InSameStream(const std::shared_ptr& allocation, stream); } +void* GetBasePtr(const std::shared_ptr& allocation) { + return allocation::AllocatorFacade::Instance().GetBasePtr(allocation); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream) { diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 6443e91f08cbe..855cbb775a109 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -44,6 +44,8 @@ extern std::shared_ptr AllocShared(const platform::Place& place, extern bool InSameStream(const std::shared_ptr& allocation, const platform::Stream& stream); +extern void* GetBasePtr(const std::shared_ptr& allocation); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream); From c1e5a393694041046f6556019c73e16ddf53d5e3 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 25 Jan 2022 11:04:48 +0800 Subject: [PATCH 2/2] [PTEN] Add xpu context. 
(#39098) --- paddle/fluid/framework/pten_utils.h | 7 + .../amp/check_finite_and_unscale_op_xpu.cc | 4 +- paddle/fluid/operators/dropout_op_xpu.cc | 4 +- paddle/fluid/operators/reshape_op.cc | 10 +- paddle/fluid/operators/softmax_op_xpu.cc | 4 +- paddle/fluid/platform/CMakeLists.txt | 3 + .../fluid/platform/device/xpu/CMakeLists.txt | 2 +- .../fluid/platform/device/xpu/enforce_xpu.h | 157 +------------- paddle/fluid/platform/device/xpu/xpu_header.h | 39 +--- paddle/fluid/platform/device/xpu/xpu_info.cc | 120 ++--------- paddle/fluid/platform/device/xpu/xpu_info.h | 27 +-- .../fluid/platform/device/xpu/xpu_op_list.cc | 14 +- .../fluid/platform/device/xpu/xpu_op_list.h | 6 +- paddle/fluid/platform/device_context.cc | 44 +--- paddle/fluid/platform/device_context.h | 30 +-- paddle/fluid/pybind/pybind.cc | 23 +- paddle/pten/backends/CMakeLists.txt | 10 +- paddle/pten/backends/cpu/cpu_context.cc | 22 +- paddle/pten/backends/xpu/CMakeLists.txt | 2 + paddle/pten/backends/xpu/enforce_xpu.h | 194 +++++++++++++++++ paddle/pten/backends/xpu/forwards.h | 28 +++ paddle/pten/backends/xpu/xpu_context.cc | 169 +++++++++++++++ paddle/pten/backends/xpu/xpu_context.h | 59 +++++- paddle/pten/backends/xpu/xpu_header.h | 56 +++++ paddle/pten/backends/xpu/xpu_info.cc | 199 ++++++++++++++++++ paddle/pten/backends/xpu/xpu_info.h | 93 ++++++++ paddle/pten/core/device_context.cc | 51 ++++- paddle/pten/core/device_context.h | 29 ++- 28 files changed, 958 insertions(+), 448 deletions(-) create mode 100644 paddle/pten/backends/xpu/CMakeLists.txt create mode 100644 paddle/pten/backends/xpu/enforce_xpu.h create mode 100644 paddle/pten/backends/xpu/forwards.h create mode 100644 paddle/pten/backends/xpu/xpu_context.cc create mode 100644 paddle/pten/backends/xpu/xpu_header.h create mode 100644 paddle/pten/backends/xpu/xpu_info.cc create mode 100644 paddle/pten/backends/xpu/xpu_info.h diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h index 
ab129c6313dab..4985e53ee656c 100644 --- a/paddle/fluid/framework/pten_utils.h +++ b/paddle/fluid/framework/pten_utils.h @@ -86,5 +86,12 @@ struct ConvertToPtenContext { using TYPE = pten::CPUContext; }; +#ifdef PADDLE_WITH_XPU +template <> +struct ConvertToPtenContext { + using TYPE = pten::XPUContext; +}; +#endif + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc index 979ae5c508c6b..5d769214df4d1 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -94,11 +94,11 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { inverse_scale = 0.0; } - paddle::platform::XPUVersion version = dev_ctx.xpu_version(); + auto version = dev_ctx.xpu_version(); framework::Tensor float_x; framework::Tensor float_out; if (std::is_same::value && - (version == paddle::platform::XPUVersion::XPU1)) { + (version == pten::backends::xpu::XPUVersion::XPU1)) { float_x.mutable_data(dev_ctx.GetPlace(), x->numel() * sizeof(MPDType)); float_out.mutable_data(dev_ctx.GetPlace(), diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index cded525b030d8..e80797bd9b971 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -107,8 +107,8 @@ class DropoutGradXPUKernel : public framework::OpKernel { return; } - paddle::platform::XPUVersion version = dev_ctx.xpu_version(); - if (version == paddle::platform::XPUVersion::XPU1) { + auto version = dev_ctx.xpu_version(); + if (version == pten::backends::xpu::XPUVersion::XPU1) { xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); XPUType* mask_new = RAII_GUARD.alloc_l3_or_gm(mask->numel()); float scale = diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index dc82d7c6c1ee4..5170729a7692f 100644 
--- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -448,7 +448,8 @@ class ReshapeKernel { #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(static_cast(dev_ctx), + *pt_x.get(), pt_scalar_shape, pt_out); } #endif // non-inplace need move all result from pt_out to out, inplace need set @@ -485,7 +486,8 @@ class ReshapeGradKernel { #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeGradKernel(dev_ctx, *pt_d_out.get(), pt_d_x.get()); + pten::ReshapeGradKernel(static_cast(dev_ctx), + *pt_d_out.get(), pt_d_x.get()); } #endif } @@ -516,7 +518,9 @@ class ReshapeDoubleGradKernel { #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::ReshapeDoubleGradKernel(dev_ctx, *pt_dd_x.get(), pt_dd_out.get()); + pten::ReshapeDoubleGradKernel( + static_cast(dev_ctx), *pt_dd_x.get(), + pt_dd_out.get()); } #endif } diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc index 0adc12e684c3a..a0d4b4c4eb460 100644 --- a/paddle/fluid/operators/softmax_op_xpu.cc +++ b/paddle/fluid/operators/softmax_op_xpu.cc @@ -45,8 +45,8 @@ class SoftmaxXPUKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); int r = XPU_SUCCESS; - paddle::platform::XPUVersion version = dev_ctx.xpu_version(); - if (version == paddle::platform::XPUVersion::XPU1) { + auto version = dev_ctx.xpu_version(); + if (version == pten::backends::xpu::XPUVersion::XPU1) { xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); XPUType* clip_x_data_l3 = RAII_GUARD.alloc_l3_or_gm(x->numel()); r = xpu::clip_v2(dev_ctx.x_context(), diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 97a3175233357..5695fd03bacf3 100644 
--- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -121,6 +121,9 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} cpu_context) +if(WITH_XPU) + target_link_libraries(device_context xpu_context) +endif() cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) if(WITH_ASCEND_CL) diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index f89c8c193ae7c..d292ce130eb34 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -4,7 +4,7 @@ endif() set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) -cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place) +cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place pten_xpu_info) cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context) add_subdirectory(tests) diff --git a/paddle/fluid/platform/device/xpu/enforce_xpu.h b/paddle/fluid/platform/device/xpu/enforce_xpu.h index 4c85168f68dd3..ae5ec8e851d68 100644 --- a/paddle/fluid/platform/device/xpu/enforce_xpu.h +++ b/paddle/fluid/platform/device/xpu/enforce_xpu.h @@ -15,177 +15,36 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/platform/device/xpu/xpu_header.h" -#include "paddle/fluid/platform/enforce.h" -#include "xpu/bkcl.h" + +#include "paddle/pten/backends/xpu/enforce_xpu.h" namespace paddle { namespace platform { // Note: XPU runtime api return int, not XPUError_t inline const char* xpuGetErrorString(int stat) { - switch (stat) { - case XPU_SUCCESS: - return "Success"; - case XPUERR_INVALID_DEVICE: - return "Invalid XPU device"; - case XPUERR_UNINIT: - return "XPU runtime not properly inited"; - case XPUERR_NOMEM: - return "Device memory not enough"; - case XPUERR_NOCPUMEM: - return "CPU memory not enough"; - case XPUERR_INVALID_PARAM: - return "Invalid parameter"; - case XPUERR_NOXPUFUNC: - return "Cannot get XPU Func"; - case XPUERR_LDSO: - return "Error loading dynamic library"; - case XPUERR_LDSYM: - return "Error loading func from dynamic library"; - case XPUERR_SIMULATOR: - return "Error from XPU Simulator"; - case XPUERR_NOSUPPORT: - return "Operation not supported"; - case XPUERR_ABNORMAL: - return "Device abnormal due to previous error"; - case XPUERR_KEXCEPTION: - return "Exception in kernel execution"; - case XPUERR_TIMEOUT: - return "Kernel execution timed out"; - case XPUERR_BUSY: - return "Resource busy"; - case XPUERR_USEAFCLOSE: - return "Use a stream after closed"; - case XPUERR_UCECC: - return "Uncorrectable ECC"; - case XPUERR_OVERHEAT: - return "Overheat"; - case XPUERR_UNEXPECT: - return "Execution error, reach unexpected control flow"; - case XPUERR_DEVRESET: - return "Device is being reset, try again later"; - case XPUERR_HWEXCEPTION: - return "Hardware module exception"; - case XPUERR_HBM_INIT: - return "Error init HBM"; - case XPUERR_DEVINIT: - return "Error init device"; - case XPUERR_PEERRESET: - return "Device is being reset, try again later"; - case XPUERR_MAXDEV: - return "Device count exceed limit"; - case XPUERR_NOIOC: - return "Unknown IOCTL command"; - case XPUERR_DMATIMEOUT: - return "DMA timed out, a reboot 
maybe needed"; - case XPUERR_DMAABORT: - return "DMA aborted due to error, possibly wrong address or hardware " - "state"; - case XPUERR_MCUUNINIT: - return "Firmware not initialized"; - case XPUERR_OLDFW: - return "Firmware version too old (<15), please update."; - case XPUERR_PCIE: - return "Error in PCIE"; - case XPUERR_FAULT: - return "Error copy between kernel and user space"; - case XPUERR_INTERRUPTED: - return "Execution interrupted by user"; - default: - return "unkonwn error"; - } + return pten::backends::xpu::xpuGetErrorString(stat); } inline const char* bkclGetErrorString(BKCLResult_t stat) { - switch (stat) { - case BKCL_SUCCESS: - return "BKCL_SUCCESS"; - case BKCL_INVALID_ARGUMENT: - return "BKCL_INVALID_ARGUMENT"; - case BKCL_RUNTIME_ERROR: - return "BKCL_RUNTIME_ERROR"; - case BKCL_SYSTEM_ERROR: - return "BKCL_SYSTEM_ERROR"; - case BKCL_INTERNAL_ERROR: - return "BKCL_INTERNAL_ERROR"; - default: - return "Unknown BKCL status"; - } + return pten::backends::xpu::bkclGetErrorString(stat); } inline const char* xdnnGetErrorString(int stat) { - switch (stat) { - case xpu::Error_t::SUCCESS: - return "XDNN_SUCCESS"; - case xpu::Error_t::INVALID_PARAM: - return "XDNN_INVALID_PARAM"; - case xpu::Error_t::RUNTIME_ERROR: - return "XDNN_RUNTIME_ERROR"; - case xpu::Error_t::NO_ENOUGH_WORKSPACE: - return "XDNN_NO_ENOUGH_WORKSPACE"; - case xpu::Error_t::NOT_IMPLEMENT: - return "XDNN_NOT_IMPLEMENT"; - default: - return "Unknown XDNN status"; - } + return pten::backends::xpu::xdnnGetErrorString(stat); } inline std::string build_xpu_error_msg(int stat) { - std::string msg("XPU Error <" + std::to_string(stat) + ">, "); - return msg + xpuGetErrorString(stat) + " "; + return pten::backends::xpu::build_xpu_error_msg(stat); } inline std::string build_xpu_error_msg(BKCLResult_t stat) { - std::string msg("BKCL Error, "); - return msg + bkclGetErrorString(stat) + " "; + return pten::backends::xpu::build_xpu_error_msg(stat); } inline std::string build_xpu_xdnn_error_msg(int 
stat, std::string msg) { - return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " "; + return pten::backends::xpu::build_xpu_xdnn_error_msg(stat, msg); } -namespace details { - -template -struct ExternalApiType {}; - -#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ - template <> \ - struct ExternalApiType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - } - -DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); - -#undef DEFINE_EXTERNAL_API_TYPE - -} // namespace details - -#define PADDLE_ENFORCE_XPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __XPU_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::paddle::platform::details::ExternalApiType< \ - __XPU_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = paddle::platform::errors::External( \ - ::paddle::platform::build_xpu_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG) \ - do { \ - auto __cond__ = (COND); \ - if (UNLIKELY(__cond__ != xpu::Error_t::SUCCESS)) { \ - auto __summary__ = paddle::platform::errors::External( \ - ::paddle::platform::build_xpu_xdnn_error_msg(__cond__, MSG)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu_header.h b/paddle/fluid/platform/device/xpu/xpu_header.h index 1177fd63742b3..6b5c32fd511b3 100644 --- a/paddle/fluid/platform/device/xpu/xpu_header.h +++ b/paddle/fluid/platform/device/xpu/xpu_header.h @@ -15,42 +15,5 @@ limitations under the License. 
*/ #pragma once #ifdef PADDLE_WITH_XPU -#include -#include -#include - -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" - -#include "xpu/runtime.h" -#include "xpu/runtime_ex.h" -#include "xpu/xdnn.h" - -namespace xpu = baidu::xpu::api; - -static std::map XPUAPIErrorMsg = { - {xpu::Error_t::SUCCESS, "xpu api success"}, - {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"}, - {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"}, - {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}}; - -template -class XPUTypeTrait { - public: - using Type = T; -}; - -template <> -class XPUTypeTrait { - public: - using Type = float16; -}; - -template <> -class XPUTypeTrait { - public: - using Type = bfloat16; -}; - +#include "paddle/pten/backends/xpu/xpu_header.h" #endif diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index a8c6ee8f3b035..cf08f9ada6b30 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -14,22 +14,14 @@ limitations under the License. */ #include #include #include "gflags/gflags.h" + #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -PADDLE_DEFINE_EXPORTED_string( - selected_xpus, "", - "A list of device ids separated by comma, like: 0,1,2,3. " - "This option is useful when doing multi process training and " - "each process have only one device (XPU). If you want to use " - "all visible devices, set this to empty string. 
NOTE: the " - "reason of doing this is that we want to use P2P communication" - "between XPU devices, use XPU_VISIBLE_DEVICES can only use" - "share-memory only."); + +#include "paddle/pten/backends/xpu/xpu_info.h" namespace paddle { namespace platform { @@ -37,101 +29,40 @@ namespace platform { /**************************** Version Management **************************/ //! Get the version of XPU Driver -int GetDriverVersion() { - uint32_t driver_version_major = 0; - uint32_t driver_version_minor = 0; - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_get_driver_version(&driver_version_major, &driver_version_minor)); - int driver_version = driver_version_major * 10 + driver_version_minor; - return driver_version; -} +int GetDriverVersion() { return pten::backends::xpu::GetDriverVersion(); } //! Get the version of XPU Runtime -int GetRuntimeVersion() { - uint32_t rumtime_version_major = 0; - uint32_t rumtime_version_minor = 0; - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_get_runtime_version(&rumtime_version_major, &rumtime_version_minor)); - int runtime_version = rumtime_version_major * 10 + rumtime_version_minor; - return runtime_version; -} +int GetRuntimeVersion() { return pten::backends::xpu::GetRuntimeVersion(); } /**************************** Device Management **************************/ -static int GetDeviceCountImpl() { - const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES"); - if (xpu_visible_devices != nullptr) { - std::string xpu_visible_devices_str(xpu_visible_devices); - if (std::all_of(xpu_visible_devices_str.begin(), - xpu_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. 
No XPU detected."; - return 0; - } - } - - int count = 0; - PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count)); - return count; -} - -int GetXPUDeviceCount() { - static auto dev_cnt = GetDeviceCountImpl(); - return dev_cnt; -} +int GetXPUDeviceCount() { return pten::backends::xpu::GetXPUDeviceCount(); } int GetXPUCurrentDeviceId() { - int dev_id; - PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - return dev_id; + return pten::backends::xpu::GetXPUCurrentDeviceId(); } -void SetXPUDeviceId(int id) { - PADDLE_ENFORCE_LT( - id, GetXPUDeviceCount(), - platform::errors::InvalidArgument("id must less than XPU count")); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id)); -} +void SetXPUDeviceId(int id) { pten::backends::xpu::SetXPUDeviceId(id); } //! Get a list of device ids from environment variable or use all. std::vector GetXPUSelectedDevices() { // use user specified XPUs in single-node multi-process mode. 
- std::vector devices; - if (!FLAGS_selected_xpus.empty()) { - auto devices_str = paddle::string::Split(FLAGS_selected_xpus, ','); - for (auto id : devices_str) { - devices.push_back(atoi(id.c_str())); - } - } else { - int count = GetXPUDeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } - } - return devices; + return pten::backends::xpu::GetXPUSelectedDevices(); } /**************************** Memory Management **************************/ void MemcpySyncH2D(void* dst, const void* src, size_t count, const platform::XPUPlace& dst_place) { - platform::XPUDeviceGuard guard(dst_place.device); - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + pten::backends::xpu::MemcpySyncH2D(dst, src, count, dst_place); } void MemcpySyncD2H(void* dst, const void* src, size_t count, const platform::XPUPlace& src_place) { - platform::XPUDeviceGuard guard(src_place.device); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.GetByPlace(src_place); dev_ctx->Wait(); - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST)); + pten::backends::xpu::MemcpySyncD2H(dst, src, count, src_place, *dev_ctx); } // if src.device == dst.device and you need sync , after call this function, @@ -139,33 +70,16 @@ void MemcpySyncD2H(void* dst, const void* src, size_t count, void MemcpySyncD2D(void* dst, const platform::XPUPlace& dst_place, const void* src, const platform::XPUPlace& src_place, size_t count) { - int dev_id = GetXPUCurrentDeviceId(); - if (dst_place.device == dev_id && src_place.device == dev_id) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.GetByPlace(src_place); - PADDLE_ENFORCE_XDNN_SUCCESS( - xpu::copy(dev_ctx->x_context(), static_cast(src), - static_cast(dst), count), - "copy "); - } else { - PADDLE_ENFORCE_XPU_SUCCESS( - xpu_memcpy_peer(dst_place.device, dst, 
src_place.device, src, count)); - } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.GetByPlace(src_place); + pten::backends::xpu::MemcpySyncD2D(dst, dst_place, src, src_place, count, + *dev_ctx); } /**************************** Others **************************/ -XPUVersion get_xpu_version(int dev_id) { - uint64_t v = 0; - PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id)); - - if (v == K100 || v == K200) { - VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n"; - return XPU1; - } else { - VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n"; - return XPU2; - } +pten::backends::xpu::XPUVersion get_xpu_version(int dev_id) { + return pten::backends::xpu::get_xpu_version(dev_id); } } // namespace platform diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 220bebb9e6b05..03082e8dc50ec 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -13,6 +13,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include #include "paddle/fluid/platform/place.h" +#include "paddle/pten/backends/xpu/xpu_info.h" namespace paddle { namespace platform { @@ -50,31 +51,9 @@ void MemcpySyncD2D(void *dst, const platform::XPUPlace &dst_place, const void *src, const platform::XPUPlace &src_place, size_t count); -class XPUDeviceGuard { - public: - explicit inline XPUDeviceGuard(int dev_id) { - int prev_id = platform::GetXPUCurrentDeviceId(); - if (prev_id != dev_id) { - prev_id_ = prev_id; - platform::SetXPUDeviceId(dev_id); - } - } +using XPUDeviceGuard = pten::backends::xpu::XPUDeviceGuard; - inline ~XPUDeviceGuard() { - if (prev_id_ != -1) { - platform::SetXPUDeviceId(prev_id_); - } - } - - XPUDeviceGuard(const XPUDeviceGuard &o) = delete; - XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete; - - private: - int prev_id_{-1}; -}; - -enum XPUVersion { XPU1, XPU2 }; -XPUVersion get_xpu_version(int dev_id); +pten::backends::xpu::XPUVersion get_xpu_version(int dev_id); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index 36be4a55d0a6f..e9b494024bd69 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -24,7 +24,7 @@ namespace platform { bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { auto& ops = get_kl1_ops(); auto v = get_xpu_version(type.place_.device); - if (v == XPU2) { + if (v == pten::backends::xpu::XPUVersion::XPU2) { ops = get_kl2_ops(); } @@ -74,10 +74,11 @@ bool is_in_xpu_black_list(const std::string& op_name) { return false; } -std::vector get_xpu_op_support_type(const std::string& op_name, - XPUVersion version) { +std::vector get_xpu_op_support_type( + const std::string& op_name, pten::backends::xpu::XPUVersion version) { std::vector res; - auto& ops = version == XPU1 ? 
get_kl1_ops() : get_kl2_ops(); + auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops() + : get_kl2_ops(); if (ops.find(op_name) != ops.end()) { XPUKernelSet& type_set = ops[op_name]; for (auto& item : type_set) { @@ -87,9 +88,10 @@ std::vector get_xpu_op_support_type(const std::string& op_name, return res; } -XPUOpListMap get_xpu_op_list(XPUVersion version) { +XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version) { XPUOpListMap res; - auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops(); + auto& ops = version == pten::backends::xpu::XPUVersion::XPU1 ? get_kl1_ops() + : get_kl2_ops(); for (auto& op : ops) { std::vector op_vartypes; for (auto& item : op.second) { diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h index 3672d68492a6f..4c3eb097a147e 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h @@ -27,9 +27,9 @@ using XPUOpListMap = bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type); bool is_in_xpu_black_list(const std::string& op_name); -std::vector get_xpu_op_support_type(const std::string& op_name, - XPUVersion version); -XPUOpListMap get_xpu_op_list(XPUVersion version); +std::vector get_xpu_op_support_type( + const std::string& op_name, pten::backends::xpu::XPUVersion version); +XPUOpListMap get_xpu_op_list(pten::backends::xpu::XPUVersion version); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 6ffeaf101feca..bfb1f572068e0 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -246,52 +246,14 @@ IPUDeviceContext::~IPUDeviceContext() {} #endif #ifdef PADDLE_WITH_XPU -XPUDeviceContext::XPUDeviceContext() { - context_ = xpu::create_context(); - xpu_version_ = get_xpu_version(place_.device); -} 
+XPUDeviceContext::XPUDeviceContext() : pten::XPUContext() {} XPUDeviceContext::~XPUDeviceContext() {} -XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) { - platform::XPUDeviceGuard guard(place.device); - +XPUDeviceContext::XPUDeviceContext(XPUPlace place) : pten::XPUContext(place) { LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " - << static_cast(place_.device); - - context_ = xpu::create_context(); - const int MAX_XPU_NUM = 16; - static void* l3ptrs[MAX_XPU_NUM] = {nullptr}; - - int l3_size = 13.5 * 1024 * 1024; - if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) { - l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE")); - } - - auto selected_xpus = GetXPUSelectedDevices(); - for (unsigned int i = 0; i < selected_xpus.size(); i++) { - if (place.device == selected_xpus[i]) { - if (l3ptrs[place.device] == nullptr) { - xpu_malloc(static_cast(&l3ptrs[place.device]), l3_size, - XPU_MEM_L3); - } - if (l3ptrs[place.device] != nullptr) { - context_->_l3_mgr.set(l3ptrs[place.device], l3_size); - VLOG(3) << "xpu place " << place.device << " set l3 size " << l3_size; - } - break; - } - } + << static_cast(place.device); } - -void XPUDeviceContext::Wait() const { - platform::SetXPUDeviceId(place_.device); - xpu_wait(context_->xpu_stream); -} - -Place XPUDeviceContext::GetPlace() const { return place_; } - -xpu::Context* XPUDeviceContext::x_context() const { return context_; } #endif #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 78c09dca5b488..52f17cd986ce2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -78,6 +78,7 @@ struct GpuDevice; #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/pten/backends/xpu/xpu_context.h" #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -171,39 +172,12 @@ struct DefaultDeviceContextType; #ifdef 
PADDLE_WITH_XPU namespace xpu = baidu::xpu::api; -class XPUDeviceContext : public DeviceContext { +class XPUDeviceContext : public pten::XPUContext { public: XPUDeviceContext(); explicit XPUDeviceContext(XPUPlace place); virtual ~XPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } - XPUVersion xpu_version() const { return xpu_version_; } - Place GetPlace() const override; - xpu::Context* x_context() const; - - /*! \brief Wait for all operations completion in the stream. */ - void Wait() const override; - -#ifdef PADDLE_WITH_XPU_BKCL - /*! \brief Return bkcl context. */ - BKCLContext_t bkcl_context() const { return bkcl_context_; } - - /*! \brief Set bkcl context. */ - void set_bkcl_context(BKCLContext_t context) { bkcl_context_ = context; } -#endif - - private: - XPUPlace place_; - XPUVersion xpu_version_; - xpu::Context* context_; -#ifdef PADDLE_WITH_XPU_BKCL - BKCLContext_t bkcl_context_; -#endif - - // Need to be the same with other DeviceContext, - // Eventhough eigen_device_ is not used in XPU - std::unique_ptr eigen_device_; - DISABLE_COPY_AND_ASSIGN(XPUDeviceContext); }; template <> diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 454e3b524f5f1..34dc0b2c050c9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1756,27 +1756,30 @@ All parameter, weight, gradient are variables in Paddle. 
.def("__repr__", string::to_string) .def("__str__", string::to_string); #ifdef PADDLE_WITH_XPU - py::enum_(m, "XPUVersion", py::arithmetic()) - .value("XPU1", platform::XPUVersion::XPU1) - .value("XPU2", platform::XPUVersion::XPU2) + py::enum_(m, "XPUVersion", py::arithmetic()) + .value("XPU1", pten::backends::xpu::XPUVersion::XPU1) + .value("XPU2", pten::backends::xpu::XPUVersion::XPU2) .export_values(); m.def("get_xpu_device_count", platform::GetXPUDeviceCount); m.def("get_xpu_device_version", [](int device_id) { return platform::get_xpu_version(device_id); }); - m.def("get_xpu_device_op_support_types", - [](const std::string &op_name, platform::XPUVersion version) { - return platform::get_xpu_op_support_type(op_name, version); - }); - m.def("get_xpu_device_op_list", [](platform::XPUVersion version) { + m.def( + "get_xpu_device_op_support_types", + [](const std::string &op_name, pten::backends::xpu::XPUVersion version) { + return platform::get_xpu_op_support_type(op_name, version); + }); + m.def("get_xpu_device_op_list", [](pten::backends::xpu::XPUVersion version) { return platform::get_xpu_op_list(version); }); m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool { // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1; + return platform::get_xpu_version(place.device) > + pten::backends::xpu::XPUVersion::XPU1; }); m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool { // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1; + return platform::get_xpu_version(place.device) > + pten::backends::xpu::XPUVersion::XPU1; }); #endif diff --git a/paddle/pten/backends/CMakeLists.txt b/paddle/pten/backends/CMakeLists.txt index 3587910ff506e..e9f222d642ea0 100644 --- a/paddle/pten/backends/CMakeLists.txt +++ b/paddle/pten/backends/CMakeLists.txt @@ -2,4 
+2,12 @@ add_subdirectory(dynload) add_subdirectory(cpu) -cc_library(pten_context SRCS all_context.cc DEPS device_context) +if(WITH_XPU) + add_subdirectory(xpu) +endif() + +cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context) + +if(WITH_XPU) + add_dependencies(pten_context xpu_context) +endif() diff --git a/paddle/pten/backends/cpu/cpu_context.cc b/paddle/pten/backends/cpu/cpu_context.cc index e749dfb9bd70e..efce128596b81 100644 --- a/paddle/pten/backends/cpu/cpu_context.cc +++ b/paddle/pten/backends/cpu/cpu_context.cc @@ -18,16 +18,11 @@ // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. -#include "paddle/pten/core/device_context.h" #include "unsupported/Eigen/CXX11/Tensor" namespace pten { struct CPUContext::CPUImpl { - Eigen::DefaultDevice* device_{nullptr}; - CPUContextResource res_; - CPUPlace place_; - CPUImpl() { device_ = new Eigen::DefaultDevice(); } // Users need to manage external resources. @@ -36,7 +31,7 @@ struct CPUContext::CPUImpl { } ~CPUImpl() { - if (res_.device == nullptr) { + if (res_.device == nullptr && device_ != nullptr) { delete device_; device_ = nullptr; } @@ -56,27 +51,28 @@ struct CPUContext::CPUImpl { } Place GetPlace() const { return place_; } + + Eigen::DefaultDevice* device_{nullptr}; + CPUContextResource res_; + CPUPlace place_; }; -CPUContext::CPUContext() : DeviceContext(), cpu_impl_(nullptr) { +CPUContext::CPUContext() : DeviceContext() { cpu_impl_ = std::make_unique(); } -CPUContext::CPUContext(const CPUContext& other) - : DeviceContext(), cpu_impl_(nullptr) { +CPUContext::CPUContext(const CPUContext& other) : DeviceContext() { cpu_impl_ = std::make_unique(); cpu_impl_->SetEigenDevice(other.eigen_device()); } -CPUContext::CPUContext(CPUContext&& other) - : DeviceContext(), cpu_impl_(nullptr) { +CPUContext::CPUContext(CPUContext&& other) : DeviceContext() { cpu_impl_ = std::move(other.cpu_impl_); } CPUContext::~CPUContext() = default; 
-CPUContext::CPUContext(const CPUContextResource& ctx_res) - : DeviceContext(), cpu_impl_(nullptr) { +CPUContext::CPUContext(const CPUContextResource& ctx_res) : DeviceContext() { cpu_impl_ = std::make_unique(ctx_res); } diff --git a/paddle/pten/backends/xpu/CMakeLists.txt b/paddle/pten/backends/xpu/CMakeLists.txt new file mode 100644 index 0000000000000..65341dd206fd3 --- /dev/null +++ b/paddle/pten/backends/xpu/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(pten_xpu_info SRCS xpu_info.cc DEPS enforce xpulib pten_place) +cc_library(xpu_context SRCS xpu_context.cc DEPS pten_device_context pten_xpu_info) diff --git a/paddle/pten/backends/xpu/enforce_xpu.h b/paddle/pten/backends/xpu/enforce_xpu.h new file mode 100644 index 0000000000000..38aeff198d44b --- /dev/null +++ b/paddle/pten/backends/xpu/enforce_xpu.h @@ -0,0 +1,194 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/pten/backends/xpu/xpu_header.h" +#include "xpu/bkcl.h" + +#include "paddle/fluid/platform/enforce.h" + +namespace pten { +namespace backends { +namespace xpu { + +// Note: XPU runtime api return int, not XPUError_t +inline const char* xpuGetErrorString(int stat) { + switch (stat) { + case XPU_SUCCESS: + return "Success"; + case XPUERR_INVALID_DEVICE: + return "Invalid XPU device"; + case XPUERR_UNINIT: + return "XPU runtime not properly inited"; + case XPUERR_NOMEM: + return "Device memory not enough"; + case XPUERR_NOCPUMEM: + return "CPU memory not enough"; + case XPUERR_INVALID_PARAM: + return "Invalid parameter"; + case XPUERR_NOXPUFUNC: + return "Cannot get XPU Func"; + case XPUERR_LDSO: + return "Error loading dynamic library"; + case XPUERR_LDSYM: + return "Error loading func from dynamic library"; + case XPUERR_SIMULATOR: + return "Error from XPU Simulator"; + case XPUERR_NOSUPPORT: + return "Operation not supported"; + case XPUERR_ABNORMAL: + return "Device abnormal due to previous error"; + case XPUERR_KEXCEPTION: + return "Exception in kernel execution"; + case XPUERR_TIMEOUT: + return "Kernel execution timed out"; + case XPUERR_BUSY: + return "Resource busy"; + case XPUERR_USEAFCLOSE: + return "Use a stream after closed"; + case XPUERR_UCECC: + return "Uncorrectable ECC"; + case XPUERR_OVERHEAT: + return "Overheat"; + case XPUERR_UNEXPECT: + return "Execution error, reach unexpected control flow"; + case XPUERR_DEVRESET: + return "Device is being reset, try again later"; + case XPUERR_HWEXCEPTION: + return "Hardware module exception"; + case XPUERR_HBM_INIT: + return "Error init HBM"; + case XPUERR_DEVINIT: + return "Error init device"; + case XPUERR_PEERRESET: + return "Device is being reset, try again later"; + case XPUERR_MAXDEV: + return "Device count exceed limit"; + case XPUERR_NOIOC: + return "Unknown IOCTL command"; + case XPUERR_DMATIMEOUT: + return "DMA timed out, a reboot maybe needed"; + case 
XPUERR_DMAABORT: + return "DMA aborted due to error, possibly wrong address or hardware " + "state"; + case XPUERR_MCUUNINIT: + return "Firmware not initialized"; + case XPUERR_OLDFW: + return "Firmware version too old (<15), please update."; + case XPUERR_PCIE: + return "Error in PCIE"; + case XPUERR_FAULT: + return "Error copy between kernel and user space"; + case XPUERR_INTERRUPTED: + return "Execution interrupted by user"; + default: + return "unkonwn error"; + } +} + +inline const char* bkclGetErrorString(BKCLResult_t stat) { + switch (stat) { + case BKCL_SUCCESS: + return "BKCL_SUCCESS"; + case BKCL_INVALID_ARGUMENT: + return "BKCL_INVALID_ARGUMENT"; + case BKCL_RUNTIME_ERROR: + return "BKCL_RUNTIME_ERROR"; + case BKCL_SYSTEM_ERROR: + return "BKCL_SYSTEM_ERROR"; + case BKCL_INTERNAL_ERROR: + return "BKCL_INTERNAL_ERROR"; + default: + return "Unknown BKCL status"; + } +} + +inline const char* xdnnGetErrorString(int stat) { + switch (stat) { + case baidu::xpu::api::Error_t::SUCCESS: + return "XDNN_SUCCESS"; + case baidu::xpu::api::Error_t::INVALID_PARAM: + return "XDNN_INVALID_PARAM"; + case baidu::xpu::api::Error_t::RUNTIME_ERROR: + return "XDNN_RUNTIME_ERROR"; + case baidu::xpu::api::Error_t::NO_ENOUGH_WORKSPACE: + return "XDNN_NO_ENOUGH_WORKSPACE"; + case baidu::xpu::api::Error_t::NOT_IMPLEMENT: + return "XDNN_NOT_IMPLEMENT"; + default: + return "Unknown XDNN status"; + } +} + +inline std::string build_xpu_error_msg(int stat) { + std::string msg("XPU Error <" + std::to_string(stat) + ">, "); + return msg + xpuGetErrorString(stat) + " "; +} + +inline std::string build_xpu_error_msg(BKCLResult_t stat) { + std::string msg("BKCL Error, "); + return msg + bkclGetErrorString(stat) + " "; +} + +inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) { + return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " "; +} + +namespace details { + +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template 
<> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); + +#undef DEFINE_EXTERNAL_API_TYPE + +} // namespace details + +#define PADDLE_ENFORCE_XPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __XPU_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::pten::backends::xpu::details::ExternalApiType< \ + __XPU_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = paddle::platform::errors::External( \ + ::pten::backends::xpu::build_xpu_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#define PADDLE_ENFORCE_XDNN_SUCCESS(COND, MSG) \ + do { \ + auto __cond__ = (COND); \ + if (UNLIKELY(__cond__ != baidu::xpu::api::Error_t::SUCCESS)) { \ + auto __summary__ = paddle::platform::errors::External( \ + ::pten::backends::xpu::build_xpu_xdnn_error_msg(__cond__, MSG)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +} // namespace xpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/xpu/forwards.h b/paddle/pten/backends/xpu/forwards.h new file mode 100644 index 0000000000000..805a74865b6d8 --- /dev/null +++ b/paddle/pten/backends/xpu/forwards.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +// Forward-declares. +#pragma once + +// Forward declaration of xpu context. +namespace baidu { +namespace xpu { +namespace api { + +struct Context; +typedef void* BKCLContext_t; + +} // namespace api +} // namespace xpu +} // namespace baidu diff --git a/paddle/pten/backends/xpu/xpu_context.cc b/paddle/pten/backends/xpu/xpu_context.cc new file mode 100644 index 0000000000000..af4478662a53b --- /dev/null +++ b/paddle/pten/backends/xpu/xpu_context.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/pten/backends/xpu/xpu_context.h" +#include +#include "paddle/pten/api/ext/exception.h" + +#include "xpu/runtime.h" +#include "xpu/runtime_ex.h" +#include "xpu/xdnn.h" + +namespace xpu = baidu::xpu::api; + +namespace pten { + +struct XPUContext::XPUImpl { + void SetL3Cache() { + const int MAX_XPU_NUM = 16; + static void* l3ptrs[MAX_XPU_NUM] = {nullptr}; + + int l3_size = 13.5 * 1024 * 1024; + if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) { + l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE")); + } + + auto selected_xpus = backends::xpu::GetXPUSelectedDevices(); + for (unsigned int i = 0; i < selected_xpus.size(); i++) { + if (place_.GetDeviceId() == selected_xpus[i]) { + if (l3ptrs[place_.GetDeviceId()] == nullptr) { + xpu_malloc(static_cast(&l3ptrs[place_.GetDeviceId()]), + l3_size, + XPU_MEM_L3); + } + if (l3ptrs[place_.GetDeviceId()] != nullptr) { + context_->_l3_mgr.set(l3ptrs[place_.GetDeviceId()], l3_size); + VLOG(3) << "xpu place " << place_.GetDeviceId() << " set l3 size " + << l3_size; + } + break; + } + } + } + + XPUImpl() { + context_ = xpu::create_context(); + xpu_version_ = backends::xpu::get_xpu_version(place_.device); + } + + explicit XPUImpl(XPUPlace place) : place_(place) { + backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); + + LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " + << static_cast(place_.device); + + context_ = xpu::create_context(); + xpu_version_ = backends::xpu::get_xpu_version(place_.device); + SetL3Cache(); + } + + // Users need to manage external resources. 
+ explicit XPUImpl(const XPUContextResource& ctx_res, + const XPUPlace& place = XPUPlace(0)) + : res_(ctx_res), place_(place) { + context_ = res_.context; + xpu_version_ = backends::xpu::get_xpu_version(place_.device); + SetL3Cache(); + } + + ~XPUImpl() { + if (res_.context == nullptr && context_ != nullptr) { + xpu::destroy_context(context_); + context_ = nullptr; + } + } + + Place GetPlace() const { return place_; } + + backends::xpu::XPUVersion GetXpuVersion() const { return xpu_version_; } + + xpu::Context* GetXContext() const { + PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); + return context_; + } + + xpu::BKCLContext_t GetBkclContext() const { return bkcl_context_; } + + void Wait() const { + backends::xpu::SetXPUDeviceId(place_.GetDeviceId()); + PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); + xpu_wait(context_->xpu_stream); + } + + void SetXContext(xpu::Context* context) { + if (context == nullptr) { + return; + } + res_.context = context; + context_ = context; + } + + void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; } + + XPUContextResource res_; + XPUPlace place_; + backends::xpu::XPUVersion xpu_version_; + xpu::Context* context_{nullptr}; + // NOTE: Distributed communicator, distributed framework manages its + // resources, XPUContext only holds references. 
+ xpu::BKCLContext_t bkcl_context_{nullptr}; +}; + +XPUContext::XPUContext() : DeviceContext() { + impl_ = std::make_unique(); +} + +XPUContext::XPUContext(const XPUPlace& place) { + impl_ = std::make_unique(place); +} + +XPUContext::XPUContext(const XPUContext& other) : DeviceContext() { + impl_ = std::make_unique(); + impl_->SetXContext(other.x_context()); + impl_->SetBkclContext(other.bkcl_context()); +} + +XPUContext::XPUContext(XPUContext&& other) : DeviceContext() { + impl_ = std::move(other.impl_); +} + +XPUContext::~XPUContext() = default; + +XPUContext::XPUContext(const XPUContextResource& ctx_res) : DeviceContext() { + impl_ = std::make_unique(ctx_res); +} + +Place XPUContext::GetPlace() const { return impl_->GetPlace(); } + +backends::xpu::XPUVersion XPUContext::xpu_version() const { + return impl_->GetXpuVersion(); +} + +xpu::Context* XPUContext::x_context() const { return impl_->GetXContext(); } + +xpu::BKCLContext_t XPUContext::bkcl_context() const { + return impl_->GetBkclContext(); +} + +void XPUContext::Wait() const { impl_->Wait(); } + +void XPUContext::set_x_context(xpu::Context* context) { + impl_->SetXContext(context); +} + +void XPUContext::set_bkcl_context(xpu::BKCLContext_t context) { + impl_->SetBkclContext(context); +} + +} // namespace pten diff --git a/paddle/pten/backends/xpu/xpu_context.h b/paddle/pten/backends/xpu/xpu_context.h index 94d2a1532f636..4ae5786211dd2 100644 --- a/paddle/pten/backends/xpu/xpu_context.h +++ b/paddle/pten/backends/xpu/xpu_context.h @@ -14,13 +14,60 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_XPU +#include +#include "paddle/pten/backends/xpu/forwards.h" +#include "paddle/pten/common/place.h" +#include "paddle/pten/core/device_context.h" -// See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/backends/xpu/xpu_header.h" +#include "paddle/pten/backends/xpu/xpu_info.h" + +namespace xpu = baidu::xpu::api; namespace pten { -using XPUContext = paddle::platform::XPUDeviceContext; -} // namespace pten -#endif // PADDLE_WITH_XPU +struct XPUContextResource { + xpu::Context* context{nullptr}; +}; + +class XPUContext : public DeviceContext { + public: + // NOTE: DeviceContext hold resources. Used in training scenarios. + XPUContext(); + + explicit XPUContext(const XPUPlace&); + + // NOTE: Share the same underlying resources, please ensure that resources are + // not released. + XPUContext(const XPUContext&); + + XPUContext(XPUContext&&); + + virtual ~XPUContext(); + + Place GetPlace() const override; + + backends::xpu::XPUVersion xpu_version() const; + + xpu::Context* x_context() const; + + // Return bkcl context. + xpu::BKCLContext_t bkcl_context() const; + + // Wait for all operations completion in the stream. + void Wait() const override; + + public: + // NOTE: External users manage resources. Used in inference scenarios. + explicit XPUContext(const XPUContextResource&); + + void set_x_context(xpu::Context*); + + void set_bkcl_context(xpu::BKCLContext_t context); + + private: + struct XPUImpl; + std::unique_ptr impl_; +}; + +} // namespace pten diff --git a/paddle/pten/backends/xpu/xpu_header.h b/paddle/pten/backends/xpu/xpu_header.h new file mode 100644 index 0000000000000..99e4a06720f22 --- /dev/null +++ b/paddle/pten/backends/xpu/xpu_header.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_XPU +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/common/bfloat16.h" +#include "paddle/pten/common/float16.h" + +#include "xpu/runtime.h" +#include "xpu/runtime_ex.h" +#include "xpu/xdnn.h" + +namespace xpu = baidu::xpu::api; + +static std::map XPUAPIErrorMsg = { + {xpu::Error_t::SUCCESS, "xpu api success"}, + {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"}, + {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"}, + {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}}; + +template +class XPUTypeTrait { + public: + using Type = T; +}; + +template <> +class XPUTypeTrait { + public: + using Type = float16; +}; + +template <> +class XPUTypeTrait { + public: + using Type = bfloat16; +}; + +#endif diff --git a/paddle/pten/backends/xpu/xpu_info.cc b/paddle/pten/backends/xpu/xpu_info.cc new file mode 100644 index 0000000000000..01d23be848bde --- /dev/null +++ b/paddle/pten/backends/xpu/xpu_info.cc @@ -0,0 +1,199 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/pten/backends/xpu/xpu_info.h" + +#include +#include +#include + +#include "paddle/pten/backends/xpu/enforce_xpu.h" +#include "paddle/pten/backends/xpu/xpu_context.h" +#include "paddle/pten/backends/xpu/xpu_header.h" +#include "paddle/pten/common/place.h" + +// TODO(wilber): The pten computing library requires a component to manage +// flags. +#include "paddle/fluid/platform/flags.h" + +PADDLE_DEFINE_EXPORTED_string( + selected_xpus, + "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (XPU). If you want to use " + "all visible devices, set this to empty string. NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between XPU devices, use XPU_VISIBLE_DEVICES can only use" + "share-memory only."); + +namespace pten { +class XPUContext; + +namespace backends { +namespace xpu { + +/**************************** Version Management **************************/ + +//! Get the version of XPU Driver +int GetDriverVersion() { + uint32_t driver_version_major = 0; + uint32_t driver_version_minor = 0; + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_get_driver_version(&driver_version_major, &driver_version_minor)); + int driver_version = driver_version_major * 10 + driver_version_minor; + return driver_version; +} + +//! 
Get the version of XPU Runtime +int GetRuntimeVersion() { + uint32_t rumtime_version_major = 0; + uint32_t rumtime_version_minor = 0; + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_get_runtime_version(&rumtime_version_major, &rumtime_version_minor)); + int runtime_version = rumtime_version_major * 10 + rumtime_version_minor; + return runtime_version; +} + +/**************************** Device Management **************************/ + +static int GetDeviceCountImpl() { + const auto* xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES"); + if (xpu_visible_devices != nullptr) { + std::string xpu_visible_devices_str(xpu_visible_devices); + if (std::all_of(xpu_visible_devices_str.begin(), + xpu_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "XPU_VISIBLE_DEVICES is set to be empty. No XPU detected."; + return 0; + } + } + + int count = 0; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count)); + return count; +} + +int GetXPUDeviceCount() { + static auto dev_cnt = GetDeviceCountImpl(); + return dev_cnt; +} + +int GetXPUCurrentDeviceId() { + int dev_id; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id)); + if (dev_id >= 64) { + // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id + dev_id -= 64; + } + return dev_id; +} + +void SetXPUDeviceId(int id) { + PADDLE_ENFORCE_LT( + id, + GetXPUDeviceCount(), + paddle::platform::errors::InvalidArgument("id must less than XPU count")); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id)); +} + +static inline std::vector Split(std::string const& original, + char separator) { + std::vector results; + std::string token; + std::istringstream is(original); + while (std::getline(is, token, separator)) { + if (!token.empty()) { + results.push_back(token); + } + } + return results; +} + +//! Get a list of device ids from environment variable or use all. +std::vector GetXPUSelectedDevices() { + // use user specified XPUs in single-node multi-process mode. 
+ std::vector devices; + if (!FLAGS_selected_xpus.empty()) { + auto devices_str = Split(FLAGS_selected_xpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetXPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +/**************************** Memory Management **************************/ + +void MemcpySyncH2D(void* dst, + const void* src, + size_t count, + const pten::XPUPlace& dst_place) { + XPUDeviceGuard guard(dst_place.device); + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); +} + +void MemcpySyncD2H(void* dst, + const void* src, + size_t count, + const pten::XPUPlace& src_place, + const pten::XPUContext& dev_ctx) { + XPUDeviceGuard guard(src_place.GetDeviceId()); + dev_ctx.Wait(); + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST)); +} + +// if src.device == dst.device and you need sync , after call this function, +// need to call xpu_wait() +void MemcpySyncD2D(void* dst, + const pten::XPUPlace& dst_place, + const void* src, + const pten::XPUPlace& src_place, + size_t count, + const pten::XPUContext& dev_ctx) { + int dev_id = GetXPUCurrentDeviceId(); + if (dst_place.device == dev_id && src_place.device == dev_id) { + PADDLE_ENFORCE_XDNN_SUCCESS( + baidu::xpu::api::copy(dev_ctx.x_context(), + static_cast(src), + static_cast(dst), + count), + "copy "); + } else { + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count)); + } +} + +/**************************** Others **************************/ + +XPUVersion get_xpu_version(int dev_id) { + uint64_t v = 0; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id)); + + if (v == K100 || v == K200) { + VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n"; + return XPU1; + } else { + VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n"; + return 
XPU2; + } +} + +} // namespace xpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/backends/xpu/xpu_info.h b/paddle/pten/backends/xpu/xpu_info.h new file mode 100644 index 0000000000000..8cf836ba16dc6 --- /dev/null +++ b/paddle/pten/backends/xpu/xpu_info.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include +#include "paddle/pten/common/place.h" + +namespace pten { + +class XPUContext; + +namespace backends { +namespace xpu { + +/***** Version Management *****/ + +//! Get the version of XPU Driver +int GetDriverVersion(); + +//! Get the version of XPU Runtime +int GetRuntimeVersion(); + +/***** Device Management *****/ + +//! Get the total number of XPU devices in system. +int GetXPUDeviceCount(); + +//! Set the XPU device id for next execution. +void SetXPUDeviceId(int device_id); + +//! Get the current XPU device id in system. +int GetXPUCurrentDeviceId(); + +//! Get a list of device ids from environment variable or use all. +std::vector GetXPUSelectedDevices(); + +/***** Memory Management *****/ + +//! Copy memory from address src to dst synchronously. 
+void MemcpySyncH2D(void *dst, + const void *src, + size_t count, + const pten::XPUPlace &dst_place); +void MemcpySyncD2H(void *dst, + const void *src, + size_t count, + const pten::XPUPlace &src_place, + const pten::XPUContext &dev_ctx); +void MemcpySyncD2D(void *dst, + const pten::XPUPlace &dst_place, + const void *src, + const pten::XPUPlace &src_place, + size_t count, + const pten::XPUContext &dev_ctx); + +class XPUDeviceGuard { + public: + explicit inline XPUDeviceGuard(int dev_id) { + int prev_id = GetXPUCurrentDeviceId(); + if (prev_id != dev_id) { + prev_id_ = prev_id; + SetXPUDeviceId(dev_id); + } + } + + inline ~XPUDeviceGuard() { + if (prev_id_ != -1) { + SetXPUDeviceId(prev_id_); + } + } + + XPUDeviceGuard(const XPUDeviceGuard &o) = delete; + XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete; + + private: + int prev_id_{-1}; +}; + +enum XPUVersion { XPU1, XPU2 }; +XPUVersion get_xpu_version(int dev_id); + +} // namespace xpu +} // namespace backends +} // namespace pten diff --git a/paddle/pten/core/device_context.cc b/paddle/pten/core/device_context.cc index 7b2c4a2cf170f..7566b351bf634 100644 --- a/paddle/pten/core/device_context.cc +++ b/paddle/pten/core/device_context.cc @@ -13,28 +13,45 @@ // limitations under the License. 
#include "paddle/pten/core/device_context.h" +#include "paddle/pten/api/ext/exception.h" namespace pten { struct DeviceContext::Impl { - Allocator* allocator_{nullptr}; - Impl() = default; ~Impl() = default; - void SetAllocator(Allocator* allocator) { allocator_ = allocator; } + void SetDeviceAllocator(Allocator* allocator) { + device_allocator_ = allocator; + } + + void SetHostAllocator(Allocator* allocator) { host_allocator_ = allocator; } + + const Allocator& GetDeviceAllocator() const { + PD_CHECK(device_allocator_ != nullptr, "the device_allocator is nullptr."); + return *device_allocator_; + } - const Allocator& GetAllocator() const { return *allocator_; } + const Allocator& GetHostAllocator() const { + PD_CHECK(host_allocator_ != nullptr, "the host_allocator is nullptr."); + return *host_allocator_; + } // TODO(Wilber): Add impl. It seems that tensorbase not have interface to // communicate with allocator. - void Alloc(TensorBase* tensor) {} + void HostAlloc(TensorBase* tensor) {} + void DeviceAlloc(TensorBase* tensor) {} + + Allocator* device_allocator_{nullptr}; + Allocator* host_allocator_{nullptr}; }; DeviceContext::DeviceContext() { impl_ = std::make_unique(); } DeviceContext::DeviceContext(const DeviceContext& other) { - impl_->SetAllocator(const_cast(&other.GetAllocator())); + impl_->SetDeviceAllocator( + const_cast(&other.GetDeviceAllocator())); + impl_->SetHostAllocator(const_cast(&other.GetHostAllocator())); } DeviceContext::DeviceContext(DeviceContext&& other) { @@ -43,14 +60,26 @@ DeviceContext::DeviceContext(DeviceContext&& other) { DeviceContext::~DeviceContext() = default; -void DeviceContext::SetAllocator(Allocator* allocator) { - impl_->SetAllocator(allocator); +void DeviceContext::SetHostAllocator(Allocator* allocator) { + impl_->SetHostAllocator(allocator); +} + +void DeviceContext::SetDeviceAllocator(Allocator* allocator) { + impl_->SetDeviceAllocator(allocator); +} + +const Allocator& DeviceContext::GetHostAllocator() const { + return 
impl_->GetHostAllocator(); } -const Allocator& DeviceContext::GetAllocator() const { - return impl_->GetAllocator(); +const Allocator& DeviceContext::GetDeviceAllocator() const { + return impl_->GetDeviceAllocator(); } -void DeviceContext::Alloc(TensorBase* tensor) { impl_->Alloc(tensor); } +void DeviceContext::HostAlloc(TensorBase* tensor) { impl_->HostAlloc(tensor); } + +void DeviceContext::DeviceAlloc(TensorBase* tensor) { + impl_->DeviceAlloc(tensor); +} } // namespace pten diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h index 1ee2e21494bf5..c658a24c3527d 100644 --- a/paddle/pten/core/device_context.h +++ b/paddle/pten/core/device_context.h @@ -57,19 +57,38 @@ class DeviceContext { * * @param allocator */ - void SetAllocator(Allocator*); + void SetDeviceAllocator(Allocator*); /** - * @brief Get the const Allocator object. + * @brief Get the const device-related Allocator object. * * @return Allocator */ - const Allocator& GetAllocator() const; + const Allocator& GetDeviceAllocator() const; /** - * @brief Allocate memory for tensor. + * @brief Allocate device memory for tensor. */ - void Alloc(pten::TensorBase*); + void DeviceAlloc(pten::TensorBase*); + + /** + * @brief Set the host Allocator object. + * + * @param allocator + */ + void SetHostAllocator(Allocator*); + + /** + * @brief Get the const host Allocator object. + * + * @return Allocator + */ + const Allocator& GetHostAllocator() const; + + /** + * @brief Allocate host memory for tensor. + */ + void HostAlloc(pten::TensorBase*); // TODO(wilber): Just for the convenience of migrating the code, it will be // modified or removed later.