Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TL/MLX5: rcache #753

Merged
merged 8 commits into from
May 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/components/tl/mlx5/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ sources = \
tl_mlx5_wqe.h \
tl_mlx5_wqe.c \
tl_mlx5_pd.h \
tl_mlx5_pd.c
tl_mlx5_pd.c \
tl_mlx5_rcache.c

module_LTLIBRARIES = libucc_tl_mlx5.la
libucc_tl_mlx5_la_SOURCES = $(sources)
Expand Down
11 changes: 11 additions & 0 deletions src/components/tl/mlx5/tl_mlx5.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,17 @@ typedef struct ucc_tl_mlx5_team {
UCC_CLASS_DECLARE(ucc_tl_mlx5_team_t, ucc_base_context_t *,
const ucc_base_team_params_t *);

/**
 * Create the memory registration cache of a TL/MLX5 context.
 *
 * @param ctx  TL/MLX5 context that will own the cache.
 *
 * @return ucc_status_t describing whether the cache was created.
 */
ucc_status_t tl_mlx5_rcache_create(ucc_tl_mlx5_context_t *ctx);

/* Registration data attached to a cached memory region. */
typedef struct ucc_tl_mlx5_reg {
    struct ibv_mr *mr; /* verbs memory region handle of the registration */
} ucc_tl_mlx5_reg_t;

/*
 * Region type used with the TL/MLX5 registration cache: the generic rcache
 * region followed by the TL/MLX5 registration data.
 * NOTE(review): `super` presumably has to stay the first member so that a
 * ucc_rcache_region_t pointer can be converted with ucc_derived_of() - confirm
 * before reordering.
 */
typedef struct ucc_tl_mlx5_rcache_region {
ucc_rcache_region_t super; /* generic rcache region */
ucc_tl_mlx5_reg_t reg;     /* TL/MLX5 registration data of this region */
} ucc_tl_mlx5_rcache_region_t;

/* Collective types handled by TL/MLX5: currently alltoall only. */
#define UCC_TL_MLX5_SUPPORTED_COLLS (UCC_COLL_TYPE_ALLTOALL)

#define UCC_TL_MLX5_TEAM_LIB(_team) \
Expand Down
14 changes: 12 additions & 2 deletions src/components/tl/mlx5/tl_mlx5_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t,
status = ucc_mpool_init(
&self->req_mp, 0,
ucc_max(sizeof(ucc_tl_mlx5_task_t), sizeof(ucc_tl_mlx5_schedule_t)), 0,
UCC_CACHE_LINE_SIZE, 8, UINT_MAX, NULL, params->thread_mode,
"tl_mlx5_req_mp");
UCC_CACHE_LINE_SIZE, 8, UINT_MAX, &ucc_coll_task_mpool_ops,
params->thread_mode, "tl_mlx5_req_mp");
if (UCC_OK != status) {
tl_error(self->super.super.lib,
"failed to initialize tl_mlx5_req mpool");
Expand All @@ -48,6 +48,9 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t,
UCC_CLASS_CLEANUP_FUNC(ucc_tl_mlx5_context_t)
{
tl_debug(self->super.super.lib, "finalizing tl context: %p", self);
if (self->rcache) {
ucc_rcache_destroy(self->rcache);
}

if (ucc_tl_mlx5_remove_shared_ctx_pd(self) != UCC_OK) {
tl_error(self->super.super.lib, "failed to free ib ctx and pd");
Expand Down Expand Up @@ -245,8 +248,15 @@ ucc_status_t ucc_tl_mlx5_context_create_epilog(ucc_base_context_t *context)
goto err;
}

status = tl_mlx5_rcache_create(ctx);
if (UCC_OK != status) {
tl_error(context->lib, "failed to create rcache");
goto err;
}

ucc_free(sbcast_data);
ucc_topo_cleanup(topo);

return UCC_OK;

err:
Expand Down
75 changes: 75 additions & 0 deletions src/components/tl/mlx5/tl_mlx5_rcache.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/**
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/

#include "tl_mlx5.h"

/**
 * rcache registration callback: registers the address range of a region.
 *
 * The range [start, end) of @a rregion is registered on the context's shared
 * protection domain with local-write and remote-write access, and the
 * resulting memory region handle is stored in the TL/MLX5 part of the region.
 *
 * @param context  TL/MLX5 context (ucc_tl_mlx5_context_t *).
 * @param rcache   Unused.
 * @param arg      Optional pointer to an int that is set to 1 whenever this
 *                 callback runs (also when the registration then fails).
 *                 May be NULL, in which case nothing is reported.
 * @param rregion  Region to register.
 * @param flags    Unused.
 *
 * @return UCS_OK on success, UCS_ERR_NO_MESSAGE if ibv_reg_mr() fails.
 */
static ucs_status_t
rcache_reg_mr(void *context, ucc_rcache_t *rcache, //NOLINT: rcache is unused
              void *arg, ucc_rcache_region_t *rregion,
              uint16_t flags) //NOLINT: flags is unused
{
    ucc_tl_mlx5_context_t       *ctx          =
        (ucc_tl_mlx5_context_t *)context;
    void                        *addr         = (void *)rregion->super.start;
    size_t                       length       = (size_t)(rregion->super.end
                                                         - rregion->super.start);
    int                         *change_flag  = (int *)arg;
    ucc_tl_mlx5_rcache_region_t *mlx5_rregion = ucc_derived_of(rregion,
                                                  ucc_tl_mlx5_rcache_region_t);

    /* The custom argument is optional: do not dereference a NULL pointer. */
    if (change_flag) {
        *change_flag = 1;
    }

    mlx5_rregion->reg.mr =
        ibv_reg_mr(ctx->shared_pd, addr, length,
                   IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
    if (!mlx5_rregion->reg.mr) {
        tl_error(ctx->super.super.lib, "failed to register memory");
        return UCS_ERR_NO_MESSAGE;
    }
    return UCS_OK;
}

/**
 * rcache deregistration callback: releases the memory region of a region.
 *
 * @param context  TL/MLX5 context (ucc_tl_mlx5_context_t *); used for logging.
 * @param rcache   Unused.
 * @param rregion  Region whose memory region handle is deregistered.
 */
static void rcache_dereg_mr(void *context,
                            ucc_rcache_t *rcache, //NOLINT: rcache is unused
                            ucc_rcache_region_t *rregion)
{
    ucc_tl_mlx5_context_t       *ctx          =
        (ucc_tl_mlx5_context_t *)context;
    ucc_tl_mlx5_rcache_region_t *mlx5_rregion =
        ucc_derived_of(rregion, ucc_tl_mlx5_rcache_region_t);
    int                          ret;

    /* The callback cannot return a status, so at least report a failure
     * instead of dropping it silently. */
    ret = ibv_dereg_mr(mlx5_rregion->reg.mr);
    if (ret) {
        tl_error(ctx->super.super.lib,
                 "failed to deregister memory, ret %d", ret);
    }
}

/**
 * rcache dump callback: writes a short description of a region into @a buf.
 *
 * @param context  Unused.
 * @param rcache   Unused.
 * @param rregion  Region to describe.
 * @param buf      Output buffer.
 * @param max      Size of @a buf in bytes; output is truncated to fit.
 */
static void ucc_tl_mlx5_rcache_dump_region_cb(void *context, //NOLINT
                                              ucc_rcache_t *rcache, //NOLINT
                                              ucs_rcache_region_t *rregion,
                                              char *buf, size_t max)
{
    ucc_tl_mlx5_rcache_region_t *mlx5_rregion =
        ucc_derived_of(rregion, ucc_tl_mlx5_rcache_region_t);

    /* %p requires a pointer to void; the value printed is the memory region
     * handle of the region. */
    snprintf(buf, max, "bar ptr:%p", (void *)mlx5_rregion->reg.mr);
}

/* Callbacks handed to the rcache created by tl_mlx5_rcache_create(). */
static ucc_rcache_ops_t ucc_rcache_ops = {
    /* debug description of a region */
    .dump_region = ucc_tl_mlx5_rcache_dump_region_cb,
    /* release the memory region of a region */
    .mem_dereg   = rcache_dereg_mr,
    /* register the address range of a region */
    .mem_reg     = rcache_reg_mr
};

/**
 * Create the registration cache named "MLX5" and store it in ctx->rcache.
 *
 * The cache is configured to call back into this file for registration,
 * deregistration and region dumps, with @a ctx as the callback context.
 *
 * @param ctx  TL/MLX5 context that receives the cache.
 *
 * @return Status returned by ucc_rcache_create().
 */
ucc_status_t tl_mlx5_rcache_create(ucc_tl_mlx5_context_t *ctx)
{
    /* NOTE(review): only the members assigned below are initialized; if
     * ucc_rcache_params_t has further members they are indeterminate here -
     * confirm against the rcache API in use. */
    ucc_rcache_params_t params;

    /* Callbacks and the context they receive. */
    params.ops                = &ucc_rcache_ops;
    params.context            = (void *)ctx;

    /* Region layout and alignment. */
    params.region_struct_size = sizeof(ucc_tl_mlx5_rcache_region_t);
    params.max_alignment      = ucc_get_page_size();
    params.alignment          = UCS_PGT_ADDR_ALIGN;

    /* Memory events the cache subscribes to, and their priority. */
    params.ucm_events         = UCM_EVENT_VM_UNMAPPED |
                                UCM_EVENT_MEM_TYPE_FREE;
    params.ucm_event_priority = 1000;

    return ucc_rcache_create(&params, "MLX5", &ctx->rcache);
}