Skip to content

Commit

Permalink
prov/verbs: Enable profiling on the verbs provider.
Browse files Browse the repository at this point in the history
Add profiling counters and timers to count:
(1) the number of verbs QPs created, the number of connections established,
    and the number of rdma_connect, rdma_accept, and rdma_reject calls made.
(2) the time an endpoint takes to reach each of the connection states.
(3) the execution time of function calls.
(4) memory allocated by OFI for rxm-verbs provider.

The profiling results are reported through the trace log.

Two environment variables are added:
(1) FI_DISENABLE_PROF="prov1,..,prov2"
    Disable profiling on the providers in the list. By default, profiling is
    enabled on all providers if libfabric is built with the "--enable-profile"
    option.
(2) FI_PROF_DATA_SIZE=<size_in_byte>
    Change the buffer size for reporting profile data. Default is 4K

Signed-off-by: Peinan Zhang <peinan.zhang@intel.com>
  • Loading branch information
peinanz authored and j-xiong committed Jun 25, 2024
1 parent 5bfad94 commit da62d0f
Show file tree
Hide file tree
Showing 11 changed files with 880 additions and 9 deletions.
3 changes: 3 additions & 0 deletions libfabric.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -839,6 +839,9 @@
<ClCompile Include="prov\verbs\src\verbs_rma.c">
<AdditionalIncludeDirectories>$(ProjectDir)prov\verbs\src;$(ProjectDir)prov\verbs\include;$(ProjectDir)prov\verbs\include\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<ClCompile Include="prov\verbs\src\verbs_profile.c">
<AdditionalIncludeDirectories>$(ProjectDir)prov\verbs\src;$(ProjectDir)prov\verbs\include;$(ProjectDir)prov\verbs\include\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<ClCompile Include="prov\verbs\src\windows\addrinfo.c">
<AdditionalIncludeDirectories>$(ProjectDir)prov\verbs\src;$(ProjectDir)prov\verbs\include;$(ProjectDir)prov\verbs\include\windows;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
Expand Down
3 changes: 3 additions & 0 deletions libfabric.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,9 @@
<ClCompile Include="prov\verbs\src\verbs_rma.c">
<Filter>Source Files\prov\verbs\src</Filter>
</ClCompile>
<ClCompile Include="prov\verbs\src\verbs_profile.c">
<Filter>Source Files\prov\verbs\src</Filter>
</ClCompile>
<ClCompile Include="prov\verbs\src\windows\addrinfo.c">
<Filter>Source Files\prov\verbs\src\windows</Filter>
</ClCompile>
Expand Down
1 change: 1 addition & 0 deletions prov/verbs/Makefile.include
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ _verbs_files = \
prov/verbs/src/verbs_rma.c \
prov/verbs/src/verbs_dgram_ep_msg.c \
prov/verbs/src/verbs_dgram_av.c \
prov/verbs/src/verbs_profile.c \
prov/verbs/include/ofi_verbs_compat.h \
prov/verbs/include/linux/verbs_osd.h

Expand Down
22 changes: 21 additions & 1 deletion prov/verbs/src/verbs_cm.c
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,11 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr,
struct vrb_cm_data_hdr *cm_hdr;
int ret = 0;

if (ep->profile)
vrb_prof_st_start(ep->profile, ofi_gettime_ns());

vrb_prof_func_start(__func__);

if (OFI_UNLIKELY(paramlen > VERBS_CM_DATA_SIZE))
return -FI_EINVAL;

Expand Down Expand Up @@ -200,6 +205,7 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr,
ofi_genlock_lock(&vrb_ep2_progress(ep)->ep_lock);
assert(ep->state == VRB_IDLE);
ep->state = VRB_RESOLVE_ADDR;
vrb_prof_func_start("rdma_resolve_addr");
if (rdma_resolve_addr(ep->id, ep->info_attr.src_addr,
ep->info_attr.dest_addr, VERBS_RESOLVE_TIMEOUT)) {
ret = -errno;
Expand All @@ -212,7 +218,9 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr,
ep->cm_priv_data = NULL;
ep->state = VRB_IDLE;
}
vrb_prof_func_end("rdma_resolve_addr");
ofi_genlock_unlock(&vrb_ep2_progress(ep)->ep_lock);
vrb_prof_func_end(__func__);
return ret;
}

Expand Down Expand Up @@ -247,7 +255,9 @@ vrb_msg_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen)
ofi_genlock_lock(&vrb_ep2_progress(_ep)->ep_lock);
assert(_ep->state == VRB_REQ_RCVD);
_ep->state = VRB_ACCEPTING;
vrb_prof_func_start("rdma_accept");
ret = rdma_accept(_ep->id, &conn_param);
vrb_prof_func_end("rdma_accept");
if (ret) {
VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_accept");
_ep->state = VRB_DISCONNECTED;
Expand All @@ -257,6 +267,9 @@ vrb_msg_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen)
struct vrb_connreq, handle);
free(connreq);
}
if (!ret && _ep->profile)
vrb_prof_cntr_inc(_ep->profile, FI_VAR_CONN_ACCEPT);

ofi_genlock_unlock(&vrb_ep2_progress(_ep)->ep_lock);
return ret;
}
Expand Down Expand Up @@ -302,8 +315,11 @@ vrb_msg_xrc_ep_reject(struct vrb_connreq *connreq,

vrb_set_xrc_cm_data(cm_data, connreq->xrc.is_reciprocal,
connreq->xrc.conn_tag, connreq->xrc.port, 0, 0);
vrb_prof_func_start("rdma_reject");
ret = rdma_reject(connreq->id, cm_data,
(uint8_t) paramlen) ? -errno : 0;
vrb_prof_func_end("rdma_reject");

if (rdma_destroy_id(connreq->id))
VRB_WARN_ERR(FI_LOG_EP_CTRL, "rdma_destroy_id", -errno);
connreq->id = NULL;
Expand Down Expand Up @@ -333,16 +349,20 @@ vrb_msg_ep_reject(struct fid_pep *pep, fid_t handle,
ret = vrb_msg_xrc_ep_reject(connreq, cm_hdr,
(uint8_t)(sizeof(*cm_hdr) + paramlen));
} else if (connreq->id) {
vrb_prof_func_start("rdma_reject");
ret = rdma_reject(connreq->id, cm_hdr,
(uint8_t)(sizeof(*cm_hdr) + paramlen)) ? -errno : 0;
vrb_prof_func_end("rdma_reject");
if (rdma_destroy_id(connreq->id))
VRB_WARN_ERR(FI_LOG_EP_CTRL, "rdma_destroy_id", -errno);
connreq->id = NULL;
} else {
ret = -FI_EBUSY;
}
ofi_mutex_unlock(&_pep->eq->event_lock);
if (!ret && _pep->profile)
vrb_prof_cntr_inc(_pep->profile, FI_VAR_CONN_REJECT);

ofi_mutex_unlock(&_pep->eq->event_lock);
if (ret)
VRB_WARN_ERR(FI_LOG_EP_CTRL, "rdma_reject", ret);

Expand Down
5 changes: 5 additions & 0 deletions prov/verbs/src/verbs_domain.c
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,9 @@ static int vrb_domain_close(fid_t fid)

vrb_close_progress(&domain->progress);

if (domain->profile)
vrb_prof_report(domain->profile);

switch (domain->ep_type) {
case FI_EP_DGRAM:
fab = container_of(&domain->util_domain.fabric->fabric_fid,
Expand Down Expand Up @@ -420,6 +423,8 @@ vrb_domain(struct fid_fabric *fabric, struct fi_info *info,
if (ret)
goto err4;

vrb_prof_create(&_domain->profile);

*domain = &_domain->util_domain.domain_fid;
return FI_SUCCESS;
err4:
Expand Down
44 changes: 43 additions & 1 deletion prov/verbs/src/verbs_ep.c
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,10 @@ static int vrb_ep_close(fid_t fid)
struct vrb_ep *ep =
container_of(fid, struct vrb_ep, util_ep.ep_fid.fid);

if (ep->profile)
vrb_prof_set_st_time(ep->profile, (ofi_gettime_ns()),
VRB_DISCONNECTED);

switch (ep->util_ep.type) {
case FI_EP_MSG:
if (ep->eq) {
Expand Down Expand Up @@ -704,6 +708,8 @@ static int vrb_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
struct vrb_dgram_av *av;
int ret;

vrb_prof_func_start(__func__);

ep = container_of(fid, struct vrb_ep, util_ep.ep_fid.fid);
ret = ofi_ep_bind_valid(&vrb_prov, bfid, flags);
if (ret)
Expand Down Expand Up @@ -749,6 +755,7 @@ static int vrb_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
ret = -FI_EINVAL;
break;
}
vrb_prof_func_start(__func__);

return ret;
}
Expand All @@ -770,11 +777,13 @@ static int vrb_create_dgram_ep(struct vrb_domain *domain, struct vrb_ep *ep,

init_attr->qp_type = IBV_QPT_UD;

vrb_prof_func_start("ibv_create_qp");
ep->ibv_qp = ibv_create_qp(domain->pd, init_attr);
if (!ep->ibv_qp) {
VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "ibv_create_qp");
return -errno;
}
vrb_prof_func_end("ibv_create_qp");

ret = ibv_modify_qp(ep->ibv_qp, &attr,
IBV_QP_STATE |
Expand Down Expand Up @@ -987,6 +996,8 @@ static int vrb_ep_enable(struct fid_ep *ep_fid)
struct vrb_domain *domain = vrb_ep2_domain(ep);
int ret;

vrb_prof_func_start(__func__);

if (!ep->eq && (ep->util_ep.type == FI_EP_MSG)) {
VRB_WARN(FI_LOG_EP_CTRL,
"Endpoint is not bound to an event queue\n");
Expand Down Expand Up @@ -1036,11 +1047,16 @@ static int vrb_ep_enable(struct fid_ep *ep_fid)
/* Server-side QP creation, after RDMA_CM_EVENT_CONNECT_REQUEST
* is recevied */
if (ep->id->verbs && ep->ibv_qp == NULL) {
vrb_prof_func_start("rdma_create_qp");
ret = rdma_create_qp(ep->id, domain->pd, &attr);
if (ret) {
VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_create_qp");
return -errno;
}
vrb_prof_func_end("rdma_create_qp");
if (ep->profile)
vrb_prof_cntr_inc(ep->profile,
FI_VAR_MSG_QUEUE_CNT);

/* Allow shared XRC INI QP not controlled by RDMA CM
* to share same post functions as RC QP. */
Expand All @@ -1061,6 +1077,7 @@ static int vrb_ep_enable(struct fid_ep *ep_fid)
assert(0);
return -FI_EINVAL;
}
vrb_prof_func_end(__func__);
return 0;
}

Expand Down Expand Up @@ -1200,6 +1217,8 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
struct fi_info *fi;
int ret;

vrb_prof_func_start("vrb_open_ep");

if (!info->ep_attr || !info->rx_attr || !info->tx_attr)
return -FI_EINVAL;

Expand Down Expand Up @@ -1251,6 +1270,22 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
if (ret)
goto close_ep;

// initiate profile
if ((info->ep_attr->type == FI_EP_MSG) ||
(info->ep_attr->type == FI_EP_DGRAM)) {
ret = vrb_prof_create(&ep->profile);
if (!ret) {
if (info->handle &&
(info->handle->fclass == FI_CLASS_CONNREQ)) {
vrb_prof_init_state(ep->profile,
ofi_gettime_ns(), VRB_PASSIVE_CONN);
} else {
vrb_prof_init_state(ep->profile,
ofi_gettime_ns(), VRB_ACTIVE_CONN);
}
}
}

switch (info->ep_attr->type) {
case FI_EP_MSG:
if (dom->ext_flags & VRB_USE_XRC) {
Expand Down Expand Up @@ -1313,7 +1348,7 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
ep->id = pep->id;
ep->ibv_qp = ep->id->qp;
pep->id = NULL;

vrb_prof_func_start("rdma_resolve_addr");
if (rdma_resolve_addr(ep->id, info->src_addr, info->dest_addr,
VERBS_RESOLVE_TIMEOUT)) {
ret = -errno;
Expand All @@ -1323,6 +1358,7 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
rdma_destroy_ep(ep->id);
goto close_ep;
}
vrb_prof_func_end("rdma_resolve_addr");
ep->id->context = &ep->util_ep.ep_fid.fid;
} else {
ret = -FI_ENOSYS;
Expand Down Expand Up @@ -1356,6 +1392,9 @@ int vrb_open_ep(struct fid_domain *domain, struct fi_info *info,
*ep_fid = &ep->util_ep.ep_fid;
ep->util_ep.ep_fid.fid.ops = &vrb_ep_ops;
ep->util_ep.ep_fid.ops = &vrb_ep_base_ops;
(*ep_fid)->fid.ops->ops_open = vrb_ep_ops_open;

vrb_prof_func_end("vrb_open_ep");

return FI_SUCCESS;

Expand Down Expand Up @@ -1530,6 +1569,9 @@ int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info,
_pep->src_addrlen = info->src_addrlen;

*pep = &_pep->pep_fid;

vrb_prof_create(&_pep->profile);

return 0;

err4:
Expand Down
Loading

0 comments on commit da62d0f

Please sign in to comment.