Skip to content

Commit

Permalink
[CustomDevice] release all xccl_comm in DeviceManager::Release (#60465)
Browse files Browse the repository at this point in the history
  • Loading branch information
ronny1996 authored Dec 30, 2023
1 parent 5bc7a59 commit 3177d59
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 0 deletions.
4 changes: 4 additions & 0 deletions paddle/phi/backends/device_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/distributed/xccl_comm_context.h"

#if !defined(_WIN32)
#include <dirent.h>
Expand Down Expand Up @@ -699,6 +700,9 @@ DeviceManager& DeviceManager::Instance() {
// Releases global device-related state in a fixed order: all events, then all
// streams, then (custom-device builds only) all XCCL communicators, and
// finally the manager's device registries.
void DeviceManager::Release() {
  event::Event::ReleaseAll();
  stream::Stream::ReleaseAll();
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  // Runs before the device maps are cleared below.
  // NOTE(review): presumably because destroying a communicator needs the
  // device type to still be registered -- confirm against CCLDestroyComm.
  phi::distributed::XCCLCommContext::ReleaseAll();
#endif
  // Fetch the singleton once and drop both registries through it.
  auto& manager = Instance();
  manager.device_map_.clear();
  manager.device_impl_map_.clear();
}
Expand Down
27 changes: 27 additions & 0 deletions paddle/phi/core/distributed/xccl_comm_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

#include "paddle/phi/core/distributed/xccl_comm_context.h"

#include <list>

#include "glog/logging.h"

#include "paddle/phi/core/dense_tensor.h"
Expand All @@ -25,6 +27,29 @@
namespace phi {
namespace distributed {

// Registry of XCCLCommContext instances: the constructor appends `this`, the
// destructor removes it, and XCCLCommContext::ReleaseAll() walks and then
// clears the list. Entries are raw pointers; nothing in this file deletes
// through them.
std::list<XCCLCommContext*> g_xccl_comm_contexts;
// Locked by the constructor, the destructor and ReleaseAll() before they touch
// g_xccl_comm_contexts.
std::mutex g_xccl_comm_contexts_mutex;

// Destroys the XCCL communicator of every registered context and then empties
// the registry. The context objects themselves are not deleted; each one is
// left with a null communicator handle so that its destructor will not destroy
// the communicator a second time.
void XCCLCommContext::ReleaseAll() {
  std::lock_guard<std::mutex> guard(g_xccl_comm_contexts_mutex);
  for (auto* ctx : g_xccl_comm_contexts) {
    // Apply the same guards as ~XCCLCommContext(): never hand a null handle to
    // CCLDestroyComm, and skip contexts whose device type is not registered
    // with the DeviceManager.
    if (phi::DeviceManager::HasDeviceType(ctx->GetDeviceType()) &&
        ctx->xccl_comm_ != nullptr) {
      phi::DeviceManager::CCLDestroyComm(ctx->GetDeviceType(),
                                         ctx->xccl_comm_);
      ctx->xccl_comm_ = nullptr;
    }
  }
  g_xccl_comm_contexts.clear();
}

// Destroys this context's communicator if it is still alive, then removes the
// context from the global registry. The registry mutex is held for the whole
// destructor body.
XCCLCommContext::~XCCLCommContext() {
  std::lock_guard<std::mutex> guard(g_xccl_comm_contexts_mutex);
  // Only destroy the communicator when the device type is registered with the
  // DeviceManager and the handle has not already been cleared (ReleaseAll()
  // sets it to nullptr after destroying it).
  if (phi::DeviceManager::HasDeviceType(GetDeviceType())) {
    if (xccl_comm_ != nullptr) {
      phi::DeviceManager::CCLDestroyComm(GetDeviceType(), xccl_comm_);
      xccl_comm_ = nullptr;
    }
  }
  // No-op when ReleaseAll() has already cleared the registry.
  g_xccl_comm_contexts.remove(this);
}

XCCLCommContext::XCCLCommContext(const phi::Place& place,
int rank,
int size,
Expand All @@ -38,6 +63,8 @@ XCCLCommContext::XCCLCommContext(const phi::Place& place,
&xccl_comm_);
stream_ = std::make_shared<phi::stream::Stream>();
stream_->Init(place_);
std::unique_lock lock(g_xccl_comm_contexts_mutex);
g_xccl_comm_contexts.push_back(this);
}

void XCCLCommContext::Broadcast(phi::DenseTensor* out_tensor,
Expand Down
3 changes: 3 additions & 0 deletions paddle/phi/core/distributed/xccl_comm_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ class XCCLCommContext final : public CommContext {
int rank,
int size,
const ccl::CCLRootId& xccl_id);
~XCCLCommContext();

static void ReleaseAll();

ccl::CCLComm GetXcclComm() const { return xccl_comm_; }

Expand Down

0 comments on commit 3177d59

Please sign in to comment.