Skip to content

Commit

Permalink
[v1.21.x] prov/efa: Use srx lock from domain directly
Browse files Browse the repository at this point in the history
Currently, efa_rdm_cq_readfrom accesses the srx lock
via util_domain->srx->peer_srx.ep_fid.fid.context.
However, the srx is destroyed during ep close and may not
be accessible if a cq read is called after ep close.
This patch fixes the issue by accessing the srx lock
via efa_domain directly, as the lock is created by
the efa domain and cannot be destroyed before domain close.

The same issue applies to efa cntr.

Signed-off-by: Shi Jin <sjina@amazon.com>
(cherry picked from commit 0bb72fd)
  • Loading branch information
shijin-aws committed Apr 16, 2024
1 parent fe6cde6 commit 9562dc5
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 38 deletions.
32 changes: 13 additions & 19 deletions prov/efa/src/efa_cntr.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,13 @@ static int efa_cntr_wait(struct fid_cntr *cntr_fid, uint64_t threshold, int time
int numtry = 5;
int tryid = 0;
int waitim = 1;
struct util_srx_ctx *srx_ctx;
struct efa_domain *domain;

srx_ctx = efa_cntr_get_srx_ctx(cntr_fid);
cntr = container_of(cntr_fid, struct util_cntr, cntr_fid);
domain = container_of(cntr->domain, struct efa_domain, util_domain);

if (srx_ctx)
ofi_genlock_lock(srx_ctx->lock);
ofi_genlock_lock(&domain->srx_lock);

cntr = container_of(cntr_fid, struct util_cntr, cntr_fid);
assert(cntr->wait);
errcnt = ofi_atomic_get64(&cntr->err);
start = (timeout >= 0) ? ofi_gettime_ms() : 0;
Expand Down Expand Up @@ -55,52 +54,47 @@ static int efa_cntr_wait(struct fid_cntr *cntr_fid, uint64_t threshold, int time
}

unlock:
if (srx_ctx)
ofi_genlock_unlock(srx_ctx->lock);
ofi_genlock_unlock(&domain->srx_lock);
return ret;
}

static uint64_t efa_cntr_read(struct fid_cntr *cntr_fid)
{
struct util_srx_ctx *srx_ctx;
struct efa_domain *domain;
struct efa_cntr *efa_cntr;
uint64_t ret;

efa_cntr = container_of(cntr_fid, struct efa_cntr, util_cntr.cntr_fid);

srx_ctx = efa_cntr_get_srx_ctx(cntr_fid);
domain = container_of(efa_cntr->util_cntr.domain, struct efa_domain, util_domain);

if (srx_ctx)
ofi_genlock_lock(srx_ctx->lock);
ofi_genlock_lock(&domain->srx_lock);

if (efa_cntr->shm_cntr)
fi_cntr_read(efa_cntr->shm_cntr);
ret = ofi_cntr_read(cntr_fid);

if (srx_ctx)
ofi_genlock_unlock(srx_ctx->lock);
ofi_genlock_unlock(&domain->srx_lock);

return ret;
}

static uint64_t efa_cntr_readerr(struct fid_cntr *cntr_fid)
{
struct util_srx_ctx *srx_ctx;
struct efa_domain *domain;
struct efa_cntr *efa_cntr;
uint64_t ret;

efa_cntr = container_of(cntr_fid, struct efa_cntr, util_cntr.cntr_fid);

srx_ctx = efa_cntr_get_srx_ctx(cntr_fid);
domain = container_of(efa_cntr->util_cntr.domain, struct efa_domain, util_domain);

if (srx_ctx)
ofi_genlock_lock(srx_ctx->lock);
ofi_genlock_lock(&domain->srx_lock);
if (efa_cntr->shm_cntr)
fi_cntr_read(efa_cntr->shm_cntr);
ret = ofi_cntr_readerr(cntr_fid);

if (srx_ctx)
ofi_genlock_unlock(srx_ctx->lock);
ofi_genlock_unlock(&domain->srx_lock);

return ret;
}
Expand Down
15 changes: 0 additions & 15 deletions prov/efa/src/efa_cntr.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,5 @@ void efa_cntr_report_rx_completion(struct util_ep *ep, uint64_t flags);

void efa_cntr_report_error(struct util_ep *ep, uint64_t flags);

static inline
void *efa_cntr_get_srx_ctx(struct fid_cntr *cntr_fid)
{
struct efa_cntr *efa_cntr;
struct fid_peer_srx *srx = NULL;

efa_cntr = container_of(cntr_fid, struct efa_cntr, util_cntr.cntr_fid);

srx = efa_cntr->util_cntr.domain->srx;
if (!srx)
return NULL;

return srx->ep_fid.fid.context;
}

#endif

8 changes: 4 additions & 4 deletions prov/efa/src/rdm/efa_rdm_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -401,13 +401,13 @@ static ssize_t efa_rdm_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t coun
{
struct efa_rdm_cq *cq;
ssize_t ret;
struct util_srx_ctx *srx_ctx;
struct efa_domain *domain;

cq = container_of(cq_fid, struct efa_rdm_cq, util_cq.cq_fid.fid);

srx_ctx = cq->util_cq.domain->srx->ep_fid.fid.context;
domain = container_of(cq->util_cq.domain, struct efa_domain, util_domain);

ofi_genlock_lock(srx_ctx->lock);
ofi_genlock_lock(&domain->srx_lock);

if (cq->shm_cq) {
fi_cq_read(cq->shm_cq, NULL, 0);
Expand All @@ -426,7 +426,7 @@ static ssize_t efa_rdm_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t coun
ret = ofi_cq_readfrom(&cq->util_cq.cq_fid, buf, count, src_addr);

out:
ofi_genlock_unlock(srx_ctx->lock);
ofi_genlock_unlock(&domain->srx_lock);

return ret;
}
Expand Down

0 comments on commit 9562dc5

Please sign in to comment.