From 91a7560ad5f490d375d01f5cbcbc804e6c010817 Mon Sep 17 00:00:00 2001 From: Sergey Lebedev Date: Thu, 7 Dec 2023 08:47:54 +0100 Subject: [PATCH] TL/MLX5: fix context create hang (#887) --- .github/workflows/clang-tidy-nvidia.yaml | 2 +- src/components/tl/mlx5/tl_mlx5_context.c | 19 +++++++++++++------ src/components/tl/mlx5/tl_mlx5_pd.c | 7 +++++-- src/components/tl/mlx5/tl_mlx5_team.c | 2 +- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/.github/workflows/clang-tidy-nvidia.yaml b/.github/workflows/clang-tidy-nvidia.yaml index 408f145f83..3609a0a7a1 100644 --- a/.github/workflows/clang-tidy-nvidia.yaml +++ b/.github/workflows/clang-tidy-nvidia.yaml @@ -33,7 +33,7 @@ jobs: wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb sudo dpkg -i cuda-keyring_1.0-1_all.deb sudo apt-get update - sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER} + sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER} - name: Get UCX run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx - name: Build UCX diff --git a/src/components/tl/mlx5/tl_mlx5_context.c b/src/components/tl/mlx5/tl_mlx5_context.c index 0c56ff9390..5ac7b59f7d 100644 --- a/src/components/tl/mlx5/tl_mlx5_context.c +++ b/src/components/tl/mlx5/tl_mlx5_context.c @@ -14,6 +14,7 @@ #include "tl_mlx5_ib.h" #define PD_OWNER_RANK 0 +#define TL_MLX5_IB_PORT_INVALID -1 UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t, const ucc_base_context_params_t *params, @@ -210,7 +211,8 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context) if (!ctx->is_imported) { status = ucc_tl_mlx5_ib_ctx_pd_init(ctx); if (UCC_OK != status) { - goto err_ib_ctx_pd_init; + ctx->ib_port = TL_MLX5_IB_PORT_INVALID; + goto start_bcast; } if (UCC_SBGP_NOT_EXISTS == sbgp->status) { goto topo_ppn_1; @@ -228,21 +230,20 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context) tl_debug(context->lib, "failed to create tmp file for socket path"); sock_path[0] = '\0'; } - sbcast_data->ib_port = ctx->ib_port; memcpy(sbcast_data->sock_path, sock_path, sizeof(sock_path)); } +start_bcast: + sbcast_data->ib_port = ctx->ib_port; steam = core_ctx->service_team; - s.map = sbgp->map; s.myrank = sbgp->group_rank; - status = UCC_TL_TEAM_IFACE(steam)->scoll.bcast( + status = UCC_TL_TEAM_IFACE(steam)->scoll.bcast( &steam->super, sbcast_data, sbcast_data_length, PD_OWNER_RANK, s, &req); if (UCC_OK != status) { tl_debug(context->lib, "failed to start mlx5 ctx bcast"); goto err; } - while (UCC_INPROGRESS == (status = ucc_collective_test(&req->super))) { ucc_context_progress(core_ctx); } @@ -256,9 +257,15 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context) ctx->ib_port = sbcast_data->ib_port; memcpy(sock_path, sbcast_data->sock_path, sizeof(sock_path)); + if (ctx->ib_port == TL_MLX5_IB_PORT_INVALID) { + tl_debug(context->lib, "invalid ib port received"); + status = UCC_ERR_NO_RESOURCE; + goto err_ib_ctx_pd_init; + } + if (strlen(sock_path) == 0) { tl_debug(context->lib, "failed to share ctx and pd"); - status = UCC_ERR_NO_MESSAGE; + status = UCC_ERR_NO_RESOURCE; goto err; } status = ucc_tl_mlx5_share_ctx_pd(ctx, sock_path, sbgp->group_size, diff --git a/src/components/tl/mlx5/tl_mlx5_pd.c b/src/components/tl/mlx5/tl_mlx5_pd.c index a553dbc5f5..bf98352883 100644 --- a/src/components/tl/mlx5/tl_mlx5_pd.c +++ b/src/components/tl/mlx5/tl_mlx5_pd.c @@ -263,7 +263,8 @@ ucc_status_t ucc_tl_mlx5_share_ctx_pd(ucc_tl_mlx5_context_t *ctx, } static void ucc_tl_mlx5_context_barrier(ucc_context_oob_coll_t *oob, - ucc_base_lib_t *lib) + ucc_context_t *core_ctx, + ucc_base_lib_t *lib) { char *rbuf; char sbuf; @@ -284,6 +285,7 @@ static void ucc_tl_mlx5_context_barrier(ucc_context_oob_coll_t *oob, oob->allgather(&sbuf, rbuf, sizeof(char), oob->coll_info, &req)) { ucc_assert(req != NULL); while (UCC_OK != (status = oob->req_test(req))) { + ucc_context_progress(core_ctx); if (status < 0) { tl_debug(lib, "failed to test oob req"); break; @@ -303,7 +305,8 @@ ucc_status_t ucc_tl_mlx5_remove_shared_ctx_pd(ucc_tl_mlx5_context_t *ctx) if (ctx->shared_pd && ctx->is_imported) { ibv_unimport_pd(ctx->shared_pd); } - ucc_tl_mlx5_context_barrier(&UCC_TL_CTX_OOB(ctx), lib); + ucc_tl_mlx5_context_barrier(&UCC_TL_CTX_OOB(ctx), + ctx->super.super.ucc_context, lib); if (ctx->shared_pd && !ctx->is_imported) { err = ibv_dealloc_pd(ctx->shared_pd); if (err) { diff --git a/src/components/tl/mlx5/tl_mlx5_team.c b/src/components/tl/mlx5/tl_mlx5_team.c index 5d9f7560cc..b326166674 100644 --- a/src/components/tl/mlx5/tl_mlx5_team.c +++ b/src/components/tl/mlx5/tl_mlx5_team.c @@ -66,7 +66,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_team_t, ucc_base_context_t *tl_context, } self->a2a = NULL; - status = ucc_tl_mlx5_team_init_alltoall(self); + status = ucc_tl_mlx5_team_init_alltoall(self); if (UCC_OK != status) { return status; }