Revert "prov/efa: Refactor dmabuf reg"
This reverts commit 6aa6708
because the NCCL alltoall test fails to register memory with dmabuf
on 16 nodes.

Signed-off-by: Jessie Yang <jiaxiyan@amazon.com>
jiaxiyan authored and shijin-aws committed Jun 24, 2024
1 parent fb052c6 commit 5e48b55
Showing 4 changed files with 91 additions and 81 deletions.
43 changes: 18 additions & 25 deletions prov/efa/src/efa_hmem.c
@@ -93,8 +93,6 @@ static int efa_domain_hmem_info_init_system(struct efa_domain *efa_domain)
info->p2p_disabled_by_user = false;
info->p2p_required_by_impl = false;
info->p2p_supported_by_device = true;
info->dmabuf_supported = false;

efa_domain_hmem_info_init_protocol_thresholds(efa_domain, FI_HMEM_SYSTEM);
return 0;
}
@@ -135,7 +133,6 @@ static int efa_domain_hmem_info_init_cuda(struct efa_domain *efa_domain)

info->initialized = true;
info->p2p_disabled_by_user = false;
info->dmabuf_supported = false;

/* If user is using libfabric API 1.18 or later, by default EFA provider is permitted to
* use CUDA library to support CUDA memory, therefore p2p is not required.
@@ -145,24 +142,26 @@ static int efa_domain_hmem_info_init_cuda(struct efa_domain *efa_domain)
else
info->p2p_required_by_impl = true;

ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_CUDA, ptr, len, &dmabuf_fd, &dmabuf_offset);
#if HAVE_EFA_DMABUF_MR
ret = cuda_get_dmabuf_fd(ptr, len, &dmabuf_fd, &dmabuf_offset);
if (ret == FI_SUCCESS) {
ibv_mr = efa_mr_reg_ibv_dmabuf_mr(efa_domain->ibv_pd, dmabuf_offset,
ibv_mr = ibv_reg_dmabuf_mr(g_device_list[0].ibv_pd, dmabuf_offset,
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
if (!ibv_mr) {
EFA_INFO(FI_LOG_DOMAIN,
"Unable to register CUDA device buffer via dmabuf: %s. "
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, ibv_access);
} else {
info->dmabuf_supported = true;
ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
}
} else {
EFA_INFO(FI_LOG_DOMAIN,
"Unable to retrieve dmabuf fd of CUDA device buffer: %d. "
"Fall back to ibv_reg_mr\n", ret);
ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, ibv_access);
ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
}
#else
ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
#endif

if (!ibv_mr) {
info->p2p_supported_by_device = false;
@@ -244,27 +243,22 @@ static int efa_domain_hmem_info_init_neuron(struct efa_domain *efa_domain)
info->p2p_disabled_by_user = false;
/* Neuron currently requires P2P */
info->p2p_required_by_impl = true;
info->dmabuf_supported = false;

ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_NEURON, ptr, (uint64_t)len, &dmabuf_fd, &offset);
#if HAVE_EFA_DMABUF_MR
ret = neuron_get_dmabuf_fd(ptr, (uint64_t)len, &dmabuf_fd, &offset);
if (ret == FI_SUCCESS) {
ibv_mr = efa_mr_reg_ibv_dmabuf_mr(
efa_domain->ibv_pd, offset,
ibv_mr = ibv_reg_dmabuf_mr(
g_device_list[0].ibv_pd, offset,
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
if (!ibv_mr) {
EFA_INFO(FI_LOG_DOMAIN,
"Unable to register neuron device buffer via dmabuf: %s. "
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, ibv_access);
} else {
info->dmabuf_supported = true;
}
} else {
EFA_INFO(FI_LOG_DOMAIN,
} else if (ret == -FI_ENOPROTOOPT) {
EFA_INFO(FI_LOG_MR,
"Unable to retrieve dmabuf fd of Neuron device buffer, "
"Fall back to ibv_reg_mr\n");
ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, ibv_access);
ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
}
#else
ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
#endif

if (!ibv_mr) {
info->p2p_supported_by_device = false;
@@ -327,7 +321,6 @@ static int efa_domain_hmem_info_init_synapseai(struct efa_domain *efa_domain)
/* SynapseAI currently requires P2P */
info->p2p_required_by_impl = true;
info->p2p_supported_by_device = true;
info->dmabuf_supported = true;
efa_domain_hmem_info_init_protocol_thresholds(efa_domain, FI_HMEM_SYNAPSEAI);

/* Only the long read protocol is supported */
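The init paths restored above share one probe-and-fallback shape: ask the device runtime for a dmabuf fd backing a test allocation, attempt ibv_reg_dmabuf_mr(), and fall back to plain ibv_reg_mr() if either step fails. Below is a minimal sketch of that shape, assuming rdma-core >= 34 headers; get_dmabuf_fd() is a hypothetical stand-in for cuda_get_dmabuf_fd()/neuron_get_dmabuf_fd(), and the real code additionally guards the dmabuf branch with HAVE_EFA_DMABUF_MR.

#include <stddef.h>
#include <stdint.h>
#include <infiniband/verbs.h>

/* Hypothetical stand-in for cuda_get_dmabuf_fd()/neuron_get_dmabuf_fd():
 * returns 0 on success and fills fd/offset when the runtime can export
 * the buffer as a dmabuf. */
extern int get_dmabuf_fd(void *ptr, size_t len, int *fd, uint64_t *offset);

static struct ibv_mr *probe_and_register(struct ibv_pd *pd, void *ptr,
					 size_t len, int access)
{
	struct ibv_mr *mr;
	uint64_t offset;
	int fd;

	/* Probe: can this buffer be exported as a dmabuf? */
	if (get_dmabuf_fd(ptr, len, &fd, &offset) == 0) {
		mr = ibv_reg_dmabuf_mr(pd, offset, len,
				       (uint64_t)(uintptr_t)ptr, fd, access);
		if (mr)
			return mr;
		/* dmabuf registration failed; fall through to ibv_reg_mr. */
	}
	return ibv_reg_mr(pd, ptr, len, access);
}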
1 change: 0 additions & 1 deletion prov/efa/src/efa_hmem.h
@@ -26,7 +26,6 @@ struct efa_hmem_info {
bool p2p_disabled_by_user; /* Did the user disable p2p via FI_OPT_FI_HMEM_P2P? */
bool p2p_required_by_impl; /* Is p2p required for this interface? */
bool p2p_supported_by_device; /* do we support p2p with this device */
bool dmabuf_supported;

size_t max_medium_msg_size;
size_t runt_size;
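Of the flags that remain in struct efa_hmem_info, the three p2p booleans combine into a simple usability decision. The helper below is hypothetical, written only to illustrate how the fields interact, and is not the provider's actual code.

#include <stdbool.h>

/* Hypothetical: p2p is usable only when the device supports it and the
 * user has not disabled it via FI_OPT_FI_HMEM_P2P. When an interface
 * sets p2p_required_by_impl and p2p is unusable, device-memory
 * registration cannot proceed. */
static bool p2p_usable(const struct efa_hmem_info *info)
{
	return info->p2p_supported_by_device && !info->p2p_disabled_by_user;
}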
100 changes: 73 additions & 27 deletions prov/efa/src/efa_mr.c
@@ -475,6 +475,30 @@ struct fi_ops efa_mr_ops = {
.ops_open = fi_no_ops_open,
};

#if HAVE_EFA_DMABUF_MR

static inline
struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset,
size_t len, uint64_t iova, int fd, int access)
{
return ibv_reg_dmabuf_mr(pd, offset, len, iova, fd, access);
}

#else

static inline
struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset,
size_t len, uint64_t iova, int fd, int access)
{
EFA_WARN(FI_LOG_MR,
"ibv_reg_dmabuf_mr is required for memory"
" registration with FI_MR_DMABUF flags, but "
" not available in the current rdma-core library."
" please build libfabric with rdma-core >= 34.0\n");
return NULL;
}

#endif
/**
* @brief Register a memory buffer with rdma-core api.
*
@@ -487,20 +511,7 @@ struct fi_ops efa_mr_ops = {
static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr *mr_attr,
int access, const uint64_t flags)
{
int dmabuf_fd;
uint64_t offset;
int ret;

assert(efa_mr->domain->hmem_info[mr_attr->iface].p2p_supported_by_device);

if (flags & FI_MR_DMABUF) {
if (OFI_UNLIKELY(!efa_mr->domain->hmem_info[mr_attr->iface].dmabuf_supported)) {
EFA_WARN(FI_LOG_MR, "Requested FI_MR_DMABUF, but dmabuf is not supported.\n");
return NULL;
}

EFA_INFO(FI_LOG_MR, "FI_MR_DMABUF is set. Registering dmabuf mr with fd: %d, offset: %lu, len: %zu\n",
mr_attr->dmabuf->fd, mr_attr->dmabuf->offset, mr_attr->dmabuf->len);
if (flags & FI_MR_DMABUF)
return efa_mr_reg_ibv_dmabuf_mr(
efa_mr->domain->ibv_pd,
mr_attr->dmabuf->offset,
@@ -509,29 +520,64 @@ static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr
mr_attr->dmabuf->fd,
access
);
}

if (efa_mr->domain->hmem_info[mr_attr->iface].dmabuf_supported) {
ret = ofi_hmem_get_dmabuf_fd(
mr_attr->iface,
mr_attr->mr_iov->iov_base,
(uint64_t) mr_attr->mr_iov->iov_len,
&dmabuf_fd, &offset);
/*
* TODO: remove the synapseai and neuron blocks by onboarding the
* ofi_hmem_get_dmabuf_fd API.
*/
#if HAVE_SYNAPSEAI
if (efa_mr_is_synapseai(efa_mr)) {
int dmabuf_fd;
uint64_t offset;
int ret;

ret = synapseai_get_dmabuf_fd(mr_attr->mr_iov->iov_base,
(uint64_t) mr_attr->mr_iov->iov_len,
&dmabuf_fd, &offset);
if (ret != FI_SUCCESS) {
EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for device buffer. errno: %d, err_msg: %s\n",
ret, fi_strerror(-ret));
EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for Gaudi device buffer \n");
return NULL;
}
EFA_INFO(FI_LOG_MR, "Registering dmabuf mr with fd: %d, offset: %lu, len: %zu\n",
dmabuf_fd, offset, mr_attr->mr_iov->iov_len);
return efa_mr_reg_ibv_dmabuf_mr(efa_mr->domain->ibv_pd, offset,
mr_attr->mr_iov->iov_len,
(uint64_t)mr_attr->mr_iov->iov_base,
dmabuf_fd, access);
}
#endif

#if HAVE_NEURON
if (efa_mr_is_neuron(efa_mr)) {
int dmabuf_fd;
uint64_t offset;
int ret;

ret = neuron_get_dmabuf_fd(
mr_attr->mr_iov->iov_base,
mr_attr->mr_iov->iov_len,
&dmabuf_fd,
&offset);

if (ret == FI_SUCCESS) {
/* Success => invoke ibv_reg_dmabuf_mr */
return efa_mr_reg_ibv_dmabuf_mr(
efa_mr->domain->ibv_pd, 0,
mr_attr->mr_iov->iov_len,
(uint64_t)mr_attr->mr_iov->iov_base,
dmabuf_fd, access);
} else if (ret == -FI_ENOPROTOOPT) {
/* Protocol not available => fallback */
EFA_INFO(FI_LOG_MR,
"Unable to get dmabuf fd for Neuron device buffer, "
"Fall back to ibv_reg_mr\n");
return ibv_reg_mr(
efa_mr->domain->ibv_pd,
(void *)mr_attr->mr_iov->iov_base,
mr_attr->mr_iov->iov_len, access);
}
return NULL;
}
#endif

EFA_INFO(FI_LOG_MR, "Dmabuf is not supported. Registering memory via ibv_reg_mr with addr: %lu, len: %zu\n",
(uint64_t)mr_attr->mr_iov->iov_base, mr_attr->mr_iov->iov_len);
return ibv_reg_mr(efa_mr->domain->ibv_pd,
(void *)mr_attr->mr_iov->iov_base,
mr_attr->mr_iov->iov_len, access);
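For context on the FI_MR_DMABUF branch kept at the top of efa_mr_reg_ibv_mr() above, an application hands a pre-exported dmabuf to libfabric through the public fi_mr_regattr() call. The sketch below is hedged: the access bits and FI_HMEM_CUDA iface are illustrative placeholders, and error handling is elided.

#include <rdma/fi_domain.h>

/* Sketch: register a device buffer already exported as dmabuf `fd`.
 * `base` is the device address used as the MR's iova. */
static int reg_dmabuf(struct fid_domain *domain, int fd, uint64_t offset,
		      size_t len, void *base, struct fid_mr **mr)
{
	struct fi_mr_dmabuf dmabuf = {
		.fd = fd,
		.offset = offset,
		.len = len,
		.base_addr = base,
	};
	struct fi_mr_attr attr = {
		.dmabuf = &dmabuf,
		.iov_count = 1,
		.access = FI_SEND | FI_RECV,
		.iface = FI_HMEM_CUDA,	/* illustrative */
	};

	return fi_mr_regattr(domain, &attr, FI_MR_DMABUF, mr);
}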
28 changes: 0 additions & 28 deletions prov/efa/src/efa_mr.h
@@ -6,9 +6,6 @@

#include <stdbool.h>
#include <ofi_mr.h>
#include <infiniband/verbs.h>

#include "efa_prov.h"

/*
* Descriptor returned for FI_HMEM peer memory registrations
@@ -38,31 +35,6 @@ struct efa_mr {
bool needs_sync;
};

#if HAVE_EFA_DMABUF_MR

static inline
struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset,
size_t len, uint64_t iova, int fd, int access)
{
return ibv_reg_dmabuf_mr(pd, offset, len, iova, fd, access);
}

#else

static inline
struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset,
size_t len, uint64_t iova, int fd, int access)
{
EFA_WARN(FI_LOG_MR,
"ibv_reg_dmabuf_mr is required for memory"
" registration with FI_MR_DMABUF flags, but "
" not available in the current rdma-core library."
" please build libfabric with rdma-core >= 34.0\n");
return NULL;
}

#endif

extern int efa_mr_cache_enable;
extern size_t efa_mr_max_cached_count;
extern size_t efa_mr_max_cached_size;
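The #if HAVE_EFA_DMABUF_MR wrapper deleted from this header (and restored into efa_mr.c above) keeps preprocessor guards out of call sites: on rdma-core older than 34.0 the stub logs a warning and returns NULL, which a caller treats like any other failed registration. A hypothetical call site:

/* Hypothetical call site: the wrapper hides the rdma-core version check,
 * so a NULL return is handled like an ordinary registration failure. */
struct ibv_mr *mr = efa_mr_reg_ibv_dmabuf_mr(pd, offset, len, iova, fd,
					     IBV_ACCESS_LOCAL_WRITE);
if (!mr)
	mr = ibv_reg_mr(pd, ptr, len, IBV_ACCESS_LOCAL_WRITE);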
