Skip to content

Commit

Permalink
TL/MLX5: fix bug with socket
Browse files Browse the repository at this point in the history
  • Loading branch information
samnordmann committed Aug 7, 2023
1 parent cbe3abb commit 9008049
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 11 deletions.
1 change: 1 addition & 0 deletions src/components/tl/mlx5/tl_mlx5.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ typedef struct ucc_tl_mlx5_context {
ucc_rcache_t *rcache;
int is_imported;
int ib_port;
int sock;
ucc_mpool_t req_mp;
ucc_tl_mlx5_mcast_context_t mcast;
} ucc_tl_mlx5_context_t;
Expand Down
19 changes: 9 additions & 10 deletions src/components/tl/mlx5/tl_mlx5_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_mlx5_context_t)
};

ucc_mpool_cleanup(&self->req_mp, 1);

close(self->sock);
}

UCC_CLASS_DEFINE(ucc_tl_mlx5_context_t, ucc_tl_context_t);
Expand Down Expand Up @@ -161,7 +163,6 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context)
const char * sockname = "/sock";
size_t sock_dir_len = strlen(template) + 1;
size_t sock_path_len = sock_dir_len + strlen(sockname);
int sock = 0;
size_t sbcast_data_length = sizeof(int) + sock_path_len;
char sock_path[sock_path_len];
ucc_subset_t s;
Expand Down Expand Up @@ -214,7 +215,7 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context)
ucc_strncpy_safe(sock_path, template, sock_dir_len);
if (mkdtemp(sock_path) != NULL) {
strncat(sock_path, sockname, sizeof(sock_path) - strlen(sock_path) - 1);
status = ucc_tl_mlx5_socket_init(ctx, sbgp->group_size, &sock,
status = ucc_tl_mlx5_socket_init(ctx, sbgp->group_size, &ctx->sock,
sock_path);
if (UCC_OK != status) {
sock_path[0] = '\0';
Expand Down Expand Up @@ -257,18 +258,15 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context)
status = UCC_ERR_NO_MESSAGE;
goto err;
}

status = ucc_tl_mlx5_share_ctx_pd(ctx, sock_path, sbgp->group_size,
!ctx->is_imported, sock);
if (!ctx->is_imported) {
sock_path[sock_dir_len - 1] = '\0';
rmdir(sock_path);
}
!ctx->is_imported, ctx->sock);

if (UCC_OK != status) {
goto err;
}

close(sock);
rmdir(sock_path);
// close(sock);
topo_ppn_1:
ucc_free(sbcast_data);
ucc_topo_cleanup(topo);
Expand All @@ -277,7 +275,8 @@ ucc_status_t ucc_tl_mlx5_context_ib_ctx_pd_setup(ucc_base_context_t *context)

err:
ucc_tl_mlx5_remove_shared_ctx_pd(ctx);
close(sock);
rmdir(sock_path);
close(ctx->sock);
err_ib_ctx_pd_init:
ucc_topo_cleanup(topo);
err_topo:
Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/mlx5/tl_mlx5_pd.c
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ ucc_status_t ucc_tl_mlx5_remove_shared_ctx_pd(ucc_tl_mlx5_context_t *ctx)
if (ctx->shared_ctx) {
if (ibv_close_device(ctx->shared_ctx)) {
tl_debug(lib, "failed to close ib ctx");
status |= UCC_ERR_NO_MESSAGE;
status = UCC_ERR_NO_MESSAGE;
}
}

Expand Down

0 comments on commit 9008049

Please sign in to comment.