[XPU][Auto Parallel] Compatible new comm library upgrade for XPUs. (P…
ZibinGuo authored and runzhech committed Apr 30, 2024
1 parent 98ef510 commit 8dddd83
Showing 9 changed files with 186 additions and 7 deletions.
20 changes: 20 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -1813,6 +1813,26 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) {
}
}
#endif
#elif defined(PADDLE_WITH_XPU)
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place, XPUStream stream) {
AllocatorFacadePrivate* m = GetPrivate();

// XPU has no MallocAsyncAllocator counterpart yet, so it shares the
// IsStreamSafeCUDAAllocatorUsed() logic.
if (!m->IsStreamSafeCUDAAllocatorUsed()) {
VLOG(6) << "Warning: StreamSafeCUDAAllocator "
"are not used!";
return GetAllocator(place);
}

if (platform::is_xpu_place(place) && FLAGS_use_system_allocator == false) {
return m->GetAllocator(place,
stream,
/*create_if_not_found=*/true);
}
return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
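
As a minimal usage sketch (not part of this commit, and assuming a valid XPUStream named stream on device 0), the new overload would be called the same way the existing CUDA-path overload is:

#include "paddle/fluid/memory/allocation/allocator_facade.h"

// Hypothetical call site; `stream` is assumed to be a valid XPUStream.
void AllocateOnXpuStream(XPUStream stream) {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  // With the stream-safe allocator enabled, this returns a stream-bound
  // allocator; otherwise it falls back to the place-only overload.
  const auto& allocator = facade.GetAllocator(phi::XPUPlace(0), stream);
  auto allocation = allocator->Allocate(1024);  // 1 KiB on XPU device 0
}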
3 changes: 3 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.h
@@ -93,6 +93,9 @@ class AllocatorFacade {
const platform::Place& place, gpuStream_t stream);
gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation) const;
void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream);
#elif defined(PADDLE_WITH_XPU)
TEST_API const std::shared_ptr<Allocator>& GetAllocator(
const platform::Place& place, XPUStream stream);
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
30 changes: 30 additions & 0 deletions paddle/fluid/platform/init.cc
@@ -61,6 +61,8 @@ limitations under the License. */
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
(defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h"
#endif

COMMON_DECLARE_int32(paddle_num_threads);
@@ -476,6 +478,34 @@ void InitMemoryMethod() {
memory_method->get_new_cuda_event = [](int device_id) {
return paddle::platform::CudaEventResourcePool::Instance().New(device_id);
};
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
// TODO(ZibinGuo): Use phi methods later.
memory_method->get_allocator =
[](int device_id, XPUStream stream) -> phi::Allocator * {
return paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::XPUPlace(device_id), stream)
.get();
};
memory_method->get_host_allocator = []() -> phi::Allocator * {
return paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get();
};
memory_method->get_zero_allocator = [](int device_id) -> phi::Allocator * {
return paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(phi::XPUPlace(device_id))
.get();
};
memory_method->get_host_zero_allocator = []() -> phi::Allocator * {
return paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(phi::CPUPlace())
.get();
};
// XPUs do not have the concept of pinned memory,
// so the get_pinned_allocator function is not set.
memory_method->get_new_xpu_event = [](int device_id) {
return paddle::platform::XpuEventResourcePool::Instance().New(device_id);
};
#endif

memory_method->emplace_device_contexts =
23 changes: 23 additions & 0 deletions paddle/phi/common/memory_utils.cc
@@ -116,6 +116,29 @@ std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> GetCudaEvent(
int device_id) {
return MemoryUtils::Instance().GetCudaEvent(device_id);
}
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
const phi::Allocator* GetAllocator(int device_id, XPUStream stream) {
return MemoryUtils::Instance().GetAllocator(device_id, stream);
}

const phi::Allocator* GetHostAllocator() {
return MemoryUtils::Instance().GetHostAllocator();
}

const phi::Allocator* GetZeroAllocator(int device_id) {
return MemoryUtils::Instance().GetZeroAllocator(device_id);
}

const phi::Allocator* GetHostZeroAllocator() {
return MemoryUtils::Instance().GetHostZeroAllocator();
}

// XPUs do not have the concept of pinned memory,
// so the get_pinned_allocator function is not set.
std::shared_ptr<std::remove_pointer<XPUEvent>::type> GetXpuEvent(
int device_id) {
return MemoryUtils::Instance().GetXpuEvent(device_id);
}
#endif

} // namespace memory_utils
41 changes: 41 additions & 0 deletions paddle/phi/common/memory_utils.h
@@ -170,6 +170,14 @@ struct MemoryInterface {
phi::Allocator* (*get_pinned_allocator)();
std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> (
*get_new_cuda_event)(int device_id);
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
phi::Allocator* (*get_allocator)(int device_id, XPUStream stream);
phi::Allocator* (*get_host_allocator)();
phi::Allocator* (*get_zero_allocator)(int device_id);
phi::Allocator* (*get_host_zero_allocator)();
// phi::Allocator* (*get_pinned_allocator)();
std::shared_ptr<std::remove_pointer<XPUEvent>::type> (*get_new_xpu_event)(
int device_id);
#endif
};

@@ -370,6 +378,27 @@ class MemoryUtils {
int device_id) {
return memory_method_->get_new_cuda_event(device_id);
}
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
const phi::Allocator* GetAllocator(int device_id, XPUStream stream) {
return memory_method_->get_allocator(device_id, stream);
}

const phi::Allocator* GetHostAllocator() {
return memory_method_->get_host_allocator();
}

const phi::Allocator* GetZeroAllocator(int device_id) {
return memory_method_->get_zero_allocator(device_id);
}

const phi::Allocator* GetHostZeroAllocator() {
return memory_method_->get_host_zero_allocator();
}

std::shared_ptr<std::remove_pointer<XPUEvent>::type> GetXpuEvent(
int device_id) {
return memory_method_->get_new_xpu_event(device_id);
}
#endif

private:
@@ -448,6 +477,18 @@ const Allocator* GetPinnedAllocator();

std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> GetCudaEvent(
int device_id);
#elif (defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL))
const Allocator* GetAllocator(int device_id, XPUStream stream);

const Allocator* GetHostAllocator();

const Allocator* GetZeroAllocator(int device_id);

const Allocator* GetHostZeroAllocator();

// XPUs do not have the concept of pinned memory,
// so the get_pinned_allocator function is not set.
std::shared_ptr<std::remove_pointer<XPUEvent>::type> GetXpuEvent(int device_id);
#endif

class Buffer {
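
Taken together with the init.cc hunk above, the call chain this commit sets up reads roughly as follows — a sketch that assumes InitMemoryMethod() has already registered the XPU callbacks and that stream is a valid XPUStream:

// phi side (e.g. comm_context_manager.cc below):
//   phi::memory_utils::GetAllocator(device_id, stream)
//     -> MemoryUtils::Instance().GetAllocator(device_id, stream)
//       -> memory_method_->get_allocator(device_id, stream)   // registered in init.cc
//         -> AllocatorFacade::Instance().GetAllocator(XPUPlace(device_id), stream)
const phi::Allocator* alloc =
    phi::memory_utils::GetAllocator(/*device_id=*/0, stream);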
14 changes: 14 additions & 0 deletions paddle/phi/core/distributed/bkcl_comm_context.cc
@@ -42,6 +42,20 @@ void BKCLCommContext::SetDevContext(
dev_ctx_ = std::move(dev_ctx);
}

XPUEvent BKCLCommContext::GetComputeEvent() { return compute_event_.get(); }

void BKCLCommContext::SetComputeEvent(
std::shared_ptr<std::remove_pointer<XPUEvent>::type>&& compute_event) {
compute_event_ = std::move(compute_event);
}

XPUEvent BKCLCommContext::GetCommEvent() { return comm_event_.get(); }

void BKCLCommContext::SetCommEvent(
std::shared_ptr<std::remove_pointer<XPUEvent>::type>&& comm_event) {
comm_event_ = std::move(comm_event);
}

void BKCLCommContext::Broadcast(phi::DenseTensor* out_tensor,
const phi::DenseTensor& in_tensor,
int root,
16 changes: 16 additions & 0 deletions paddle/phi/core/distributed/bkcl_comm_context.h
@@ -30,6 +30,16 @@ class BKCLCommContext final : public CommContext {

XPUStream GetStream();

XPUEvent GetComputeEvent();

void SetComputeEvent(
std::shared_ptr<std::remove_pointer<XPUEvent>::type>&& compute_event);

XPUEvent GetCommEvent();

void SetCommEvent(
std::shared_ptr<std::remove_pointer<XPUEvent>::type>&& comm_event);

phi::XPUContext* GetDevContext();

void SetDevContext(std::unique_ptr<phi::XPUContext>&& dev_ctx);
@@ -79,6 +89,12 @@ class BKCLCommContext final : public CommContext {
BKCLContext_t bkcl_comm_;

std::unique_ptr<phi::XPUContext> dev_ctx_;

// Used to make the comm stream wait for compute: compute_stream --> event --> comm_stream
std::shared_ptr<std::remove_pointer<XPUEvent>::type> compute_event_;

// Used to make the compute stream wait for comm: comm_stream --> event --> compute_stream
std::shared_ptr<std::remove_pointer<XPUEvent>::type> comm_event_;
};

} // namespace distributed
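
A sketch of the stream ordering the two comments above describe. Assumptions (not part of this diff): comm_ctx is a BKCLCommContext*, compute_stream is the XPU compute stream, and xpu_event_record / xpu_stream_wait_event are the XPU runtime calls used to record and wait on events.

// Make the comm stream wait for work already queued on the compute stream:
// compute_stream --> event --> comm_stream
xpu_event_record(comm_ctx->GetComputeEvent(), compute_stream);
xpu_stream_wait_event(comm_ctx->GetStream(), comm_ctx->GetComputeEvent());

// ... enqueue the BKCL collective on comm_ctx->GetStream() ...

// Make the compute stream wait for the collective to finish:
// comm_stream --> event --> compute_stream
xpu_event_record(comm_ctx->GetCommEvent(), comm_ctx->GetStream());
xpu_stream_wait_event(compute_stream, comm_ctx->GetCommEvent());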
34 changes: 30 additions & 4 deletions paddle/phi/core/distributed/comm_context_manager.cc
@@ -34,14 +34,14 @@
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/distributed/nccl_comm_context.h"
#include "paddle/phi/core/distributed/nccl_tools.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/phi/backends/xpu/xpu_info.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/distributed/bkcl_comm_context.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/core/distributed/xccl_comm_context.h"
#endif
#ifdef PADDLE_WITH_XPU_BKCL
#include "paddle/phi/backends/xpu/xpu_info.h"
#include "paddle/phi/core/distributed/bkcl_comm_context.h"
#endif

namespace phi {
namespace distributed {
@@ -52,6 +52,9 @@ void CommContextManager::SetDeviceId(int dev_id) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
phi::backends::gpu::SetDeviceId(dev_id);
CommContextManager::device_id = dev_id;
#elif defined(PADDLE_WITH_XPU_BKCL)
phi::backends::xpu::SetXPUDeviceId(dev_id);
CommContextManager::device_id = dev_id;
#endif
}

@@ -207,6 +210,29 @@ void CommContextManager::CreateBKCLCommContext(
auto bkcl_comm_context =
std::make_unique<BKCLCommContext>(rank, size, bkcl_id);

if (CommContextManager::device_id != -1) {
std::unique_ptr<phi::XPUContext> dev_ctx(
new phi::XPUContext(phi::XPUPlace(CommContextManager::device_id)));
dev_ctx->SetAllocator(phi::memory_utils::GetAllocator(
CommContextManager::device_id, dev_ctx->stream()));
dev_ctx->SetHostAllocator(phi::memory_utils::GetHostAllocator());
dev_ctx->SetZeroAllocator(
phi::memory_utils::GetZeroAllocator(CommContextManager::device_id));
dev_ctx->SetHostZeroAllocator(phi::memory_utils::GetHostZeroAllocator());
// XPUs do not have the concept of pinned memory,
// so the get_pinned_allocator function is not set.

// It currently does not support dev_ctx->PartialInitWithAllocator().
auto compute_event =
phi::memory_utils::GetXpuEvent(CommContextManager::device_id);
auto comm_event =
phi::memory_utils::GetXpuEvent(CommContextManager::device_id);

bkcl_comm_context->SetDevContext(std::move(dev_ctx));
bkcl_comm_context->SetComputeEvent(std::move(compute_event));
bkcl_comm_context->SetCommEvent(std::move(comm_event));
}

comm_context_manager.SetStore(store);
comm_context_manager.Emplace(unique_comm_key, std::move(bkcl_comm_context));
}
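
Once CreateBKCLCommContext has wired everything up, a consumer could fetch the dedicated context roughly like this (a hypothetical sketch; unique_comm_key is assumed to be the key the context was registered under):

auto* comm_ctx = static_cast<phi::distributed::BKCLCommContext*>(
    phi::distributed::CommContextManager::GetInstance().Get(unique_comm_key));
phi::XPUContext* dev_ctx = comm_ctx->GetDevContext();  // dedicated comm dev_ctx
XPUStream comm_stream = comm_ctx->GetStream();          // its own XPU stream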
12 changes: 9 additions & 3 deletions python/paddle/distributed/launch/context/device.py
@@ -101,12 +101,18 @@ def parse_device(self):
)
if visible_devices_str in os.environ:
visible_devices = os.getenv(visible_devices_str)
elif 'CUDA_VISIBLE_DEVICES' in os.environ:
dev._dtype = DeviceType.GPU
visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
elif 'XPULINK_VISIBLE_DEVICES' in os.environ:
dev._dtype = DeviceType.XPU
visible_devices = os.getenv("XPULINK_VISIBLE_DEVICES")
elif 'XPU_VISIBLE_DEVICES' in os.environ:
dev._dtype = DeviceType.XPU
visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
elif 'CUDA_VISIBLE_DEVICES' in os.environ:
if core.is_compiled_with_xpu():
dev._dtype = DeviceType.XPU
else:
dev._dtype = DeviceType.GPU
visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")

if visible_devices is not None and visible_devices != 'all':
dev._labels = visible_devices.split(',')
