Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

TL/MLX5: fix context create hang #887

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/clang-tidy-nvidia.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
sudo dpkg -i cuda-keyring_1.0-1_all.deb
sudo apt-get update
sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER}
sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER}
- name: Get UCX
run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
- name: Build UCX
Expand Down
19 changes: 13 additions & 6 deletions src/components/tl/mlx5/tl_mlx5_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "tl_mlx5_ib.h"

#define PD_OWNER_RANK 0
#define TL_MLX5_IB_PORT_INVALID -1

UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t,
const ucc_base_context_params_t *params,
Expand Down Expand Up @@ -210,7 +211,8 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context)
if (!ctx->is_imported) {
status = ucc_tl_mlx5_ib_ctx_pd_init(ctx);
if (UCC_OK != status) {
goto err_ib_ctx_pd_init;
ctx->ib_port = TL_MLX5_IB_PORT_INVALID;
goto start_bcast;
}
if (UCC_SBGP_NOT_EXISTS == sbgp->status) {
goto topo_ppn_1;
Expand All @@ -228,21 +230,20 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context)
tl_debug(context->lib, "failed to create tmp file for socket path");
sock_path[0] = '\0';
}
sbcast_data->ib_port = ctx->ib_port;
memcpy(sbcast_data->sock_path, sock_path, sizeof(sock_path));
}
start_bcast:
sbcast_data->ib_port = ctx->ib_port;
steam = core_ctx->service_team;

s.map = sbgp->map;
s.myrank = sbgp->group_rank;
status = UCC_TL_TEAM_IFACE(steam)->scoll.bcast(
status = UCC_TL_TEAM_IFACE(steam)->scoll.bcast(
&steam->super, sbcast_data, sbcast_data_length, PD_OWNER_RANK, s, &req);

if (UCC_OK != status) {
tl_debug(context->lib, "failed to start mlx5 ctx bcast");
goto err;
}

while (UCC_INPROGRESS == (status = ucc_collective_test(&req->super))) {
ucc_context_progress(core_ctx);
}
Expand All @@ -256,9 +257,15 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context)
ctx->ib_port = sbcast_data->ib_port;
memcpy(sock_path, sbcast_data->sock_path, sizeof(sock_path));

if (ctx->ib_port == TL_MLX5_IB_PORT_INVALID) {
tl_debug(context->lib, "invalid ib port received");
status = UCC_ERR_NO_RESOURCE;
goto err_ib_ctx_pd_init;
}

if (strlen(sock_path) == 0) {
tl_debug(context->lib, "failed to share ctx and pd");
status = UCC_ERR_NO_MESSAGE;
status = UCC_ERR_NO_RESOURCE;
goto err;
}
status = ucc_tl_mlx5_share_ctx_pd(ctx, sock_path, sbgp->group_size,
Expand Down
7 changes: 5 additions & 2 deletions src/components/tl/mlx5/tl_mlx5_pd.c
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,8 @@ ucc_status_t ucc_tl_mlx5_share_ctx_pd(ucc_tl_mlx5_context_t *ctx,
}

static void ucc_tl_mlx5_context_barrier(ucc_context_oob_coll_t *oob,
ucc_base_lib_t *lib)
ucc_context_t *core_ctx,
ucc_base_lib_t *lib)
{
char *rbuf;
char sbuf;
Expand All @@ -284,6 +285,7 @@ static void ucc_tl_mlx5_context_barrier(ucc_context_oob_coll_t *oob,
oob->allgather(&sbuf, rbuf, sizeof(char), oob->coll_info, &req)) {
ucc_assert(req != NULL);
while (UCC_OK != (status = oob->req_test(req))) {
ucc_context_progress(core_ctx);
if (status < 0) {
tl_debug(lib, "failed to test oob req");
break;
Expand All @@ -303,7 +305,8 @@ ucc_status_t ucc_tl_mlx5_remove_shared_ctx_pd(ucc_tl_mlx5_context_t *ctx)
if (ctx->shared_pd && ctx->is_imported) {
ibv_unimport_pd(ctx->shared_pd);
}
ucc_tl_mlx5_context_barrier(&UCC_TL_CTX_OOB(ctx), lib);
ucc_tl_mlx5_context_barrier(&UCC_TL_CTX_OOB(ctx),
ctx->super.super.ucc_context, lib);
if (ctx->shared_pd && !ctx->is_imported) {
err = ibv_dealloc_pd(ctx->shared_pd);
if (err) {
Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/mlx5/tl_mlx5_team.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_team_t, ucc_base_context_t *tl_context,
}

self->a2a = NULL;
status = ucc_tl_mlx5_team_init_alltoall(self);
status = ucc_tl_mlx5_team_init_alltoall(self);
if (UCC_OK != status) {
return status;
}
Expand Down
Loading