Skip to content

Commit

Permalink
TL/MLX5: fix context create hang (#887)
Browse files: browse the repository at this point in the history
  • Loading branch information
Sergei-Lebedev authored Dec 7, 2023
1 parent a2ae31c commit 91a7560
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/clang-tidy-nvidia.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
sudo dpkg -i cuda-keyring_1.0-1_all.deb
sudo apt-get update
sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER}
sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER}
- name: Get UCX
run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
- name: Build UCX
Expand Down
19 changes: 13 additions & 6 deletions src/components/tl/mlx5/tl_mlx5_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "tl_mlx5_ib.h"

#define PD_OWNER_RANK 0
#define TL_MLX5_IB_PORT_INVALID -1

UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t,
const ucc_base_context_params_t *params,
Expand Down Expand Up @@ -210,7 +211,8 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context)
if (!ctx->is_imported) {
status = ucc_tl_mlx5_ib_ctx_pd_init(ctx);
if (UCC_OK != status) {
goto err_ib_ctx_pd_init;
ctx->ib_port = TL_MLX5_IB_PORT_INVALID;
goto start_bcast;
}
if (UCC_SBGP_NOT_EXISTS == sbgp->status) {
goto topo_ppn_1;
Expand All @@ -228,21 +230,20 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context)
tl_debug(context->lib, "failed to create tmp file for socket path");
sock_path[0] = '\0';
}
sbcast_data->ib_port = ctx->ib_port;
memcpy(sbcast_data->sock_path, sock_path, sizeof(sock_path));
}
start_bcast:
sbcast_data->ib_port = ctx->ib_port;
steam = core_ctx->service_team;

s.map = sbgp->map;
s.myrank = sbgp->group_rank;
status = UCC_TL_TEAM_IFACE(steam)->scoll.bcast(
status = UCC_TL_TEAM_IFACE(steam)->scoll.bcast(
&steam->super, sbcast_data, sbcast_data_length, PD_OWNER_RANK, s, &req);

if (UCC_OK != status) {
tl_debug(context->lib, "failed to start mlx5 ctx bcast");
goto err;
}

while (UCC_INPROGRESS == (status = ucc_collective_test(&req->super))) {
ucc_context_progress(core_ctx);
}
Expand All @@ -256,9 +257,15 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context)
ctx->ib_port = sbcast_data->ib_port;
memcpy(sock_path, sbcast_data->sock_path, sizeof(sock_path));

if (ctx->ib_port == TL_MLX5_IB_PORT_INVALID) {
tl_debug(context->lib, "invalid ib port received");
status = UCC_ERR_NO_RESOURCE;
goto err_ib_ctx_pd_init;
}

if (strlen(sock_path) == 0) {
tl_debug(context->lib, "failed to share ctx and pd");
status = UCC_ERR_NO_MESSAGE;
status = UCC_ERR_NO_RESOURCE;
goto err;
}
status = ucc_tl_mlx5_share_ctx_pd(ctx, sock_path, sbgp->group_size,
Expand Down
7 changes: 5 additions & 2 deletions src/components/tl/mlx5/tl_mlx5_pd.c
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,8 @@ ucc_status_t ucc_tl_mlx5_share_ctx_pd(ucc_tl_mlx5_context_t *ctx,
}

static void ucc_tl_mlx5_context_barrier(ucc_context_oob_coll_t *oob,
ucc_base_lib_t *lib)
ucc_context_t *core_ctx,
ucc_base_lib_t *lib)
{
char *rbuf;
char sbuf;
Expand All @@ -284,6 +285,7 @@ static void ucc_tl_mlx5_context_barrier(ucc_context_oob_coll_t *oob,
oob->allgather(&sbuf, rbuf, sizeof(char), oob->coll_info, &req)) {
ucc_assert(req != NULL);
while (UCC_OK != (status = oob->req_test(req))) {
ucc_context_progress(core_ctx);
if (status < 0) {
tl_debug(lib, "failed to test oob req");
break;
Expand All @@ -303,7 +305,8 @@ ucc_status_t ucc_tl_mlx5_remove_shared_ctx_pd(ucc_tl_mlx5_context_t *ctx)
if (ctx->shared_pd && ctx->is_imported) {
ibv_unimport_pd(ctx->shared_pd);
}
ucc_tl_mlx5_context_barrier(&UCC_TL_CTX_OOB(ctx), lib);
ucc_tl_mlx5_context_barrier(&UCC_TL_CTX_OOB(ctx),
ctx->super.super.ucc_context, lib);
if (ctx->shared_pd && !ctx->is_imported) {
err = ibv_dealloc_pd(ctx->shared_pd);
if (err) {
Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/mlx5/tl_mlx5_team.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_team_t, ucc_base_context_t *tl_context,
}

self->a2a = NULL;
status = ucc_tl_mlx5_team_init_alltoall(self);
status = ucc_tl_mlx5_team_init_alltoall(self);
if (UCC_OK != status) {
return status;
}
Expand Down

0 comments on commit 91a7560

Please sign in to comment.