Skip to content

Commit

Permalink
TL/SHARP: Prevent SHARP team creation when team max PPN > 1
Browse files (browse the repository at this point in the history)
  • Loading branch information
bureddy committed Apr 12, 2023
1 parent 11111a9 commit 8dbbab7
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 13 deletions.
5 changes: 5 additions & 0 deletions src/components/tl/sharp/tl_sharp.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ static ucc_config_field_t ucc_tl_sharp_context_config_table[] = {
ucc_offsetof(ucc_tl_sharp_context_config_t, rand_seed),
UCC_CONFIG_TYPE_UINT},

{"TEAM_MAX_PPN", "1",
"SHARP team max PPN threshold",
ucc_offsetof(ucc_tl_sharp_context_config_t, team_max_ppn),
UCC_CONFIG_TYPE_UINT},

{NULL}};

UCC_CLASS_DEFINE_NEW_FUNC(ucc_tl_sharp_lib_t, ucc_base_lib_t,
Expand Down
2 changes: 2 additions & 0 deletions src/components/tl/sharp/tl_sharp.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ typedef struct ucc_tl_sharp_context_config {
unsigned int uprogress_num_polls;
int context_per_team;
int enable_lazy_group_alloc;
int team_max_ppn;
} ucc_tl_sharp_context_config_t;

typedef struct ucc_tl_sharp_lib {
Expand Down Expand Up @@ -96,6 +97,7 @@ typedef struct ucc_tl_sharp_team {
ucc_rcache_t *rcache;
struct sharp_coll_comm *sharp_comm;
ucc_tl_sharp_oob_ctx_t oob_ctx;
ucc_topo_t *topo;
} ucc_tl_sharp_team_t;

typedef struct ucc_tl_sharp_task {
Expand Down
31 changes: 18 additions & 13 deletions src/components/tl/sharp/tl_sharp_team.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context,
struct sharp_coll_comm_init_spec comm_spec;
int ret;
ucc_status_t status;
ucc_topo_t *topo;
ucc_subset_t set;

if (!(params->params.mask & UCC_TEAM_PARAM_FIELD_OOB)) {
Expand All @@ -42,21 +41,25 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context,
self->oob_ctx.oob = &UCC_TL_TEAM_OOB(self);
}

if (sharp_ctx == NULL) {
status = ucc_topo_init(set, ctx->super.super.ucc_context->topo, &topo);
if (UCC_OK != status) {
tl_error(ctx->super.super.lib, "failed to init team topo");
return status;
}
status = ucc_topo_init(set, ctx->super.super.ucc_context->topo, &self->topo);
if (UCC_OK != status) {
tl_error(ctx->super.super.lib, "failed to init team topo");
return status;
}

if (ucc_topo_max_ppn(self->topo) != ctx->cfg.team_max_ppn) {
tl_debug(ctx->super.super.lib, "sharp team not supported with ppn > 1");
status = UCC_ERR_NOT_SUPPORTED;
goto cleanup;
}

if (sharp_ctx == NULL) {
status = ucc_tl_sharp_context_init(ctx, &self->sharp_context,
&self->oob_ctx, topo);
&self->oob_ctx, self->topo);
if (status != UCC_OK) {
return status;
goto cleanup;
}

ucc_topo_cleanup(topo);

if (ctx->cfg.use_rcache) {
status = ucc_tl_sharp_rcache_create(self->sharp_context, &self->rcache);
if (status != UCC_OK) {
Expand Down Expand Up @@ -136,7 +139,8 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context,
goto cleanup;
}

tl_debug(self->super.super.context->lib, "initialized tl team: %p", self);
tl_debug(self->super.super.context->lib,
"initialized tl team: %p size:%d", self, UCC_TL_TEAM_SIZE(self));
return UCC_OK;
cleanup:
if (ctx->cfg.context_per_team) {
Expand All @@ -151,7 +155,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context,
sharp_coll_finalize(self->sharp_context);
}
}

ucc_topo_cleanup(self->topo);
return status;
}

Expand All @@ -161,6 +165,7 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_sharp_team_t)

tl_debug(self->super.super.context->lib, "finalizing tl team: %p", self);
sharp_coll_comm_destroy(self->sharp_comm);
ucc_topo_cleanup(self->topo);

if (ctx->cfg.context_per_team) {
if (UCC_TL_SHARP_TEAM_LIB(self)->cfg.use_internal_oob) {
Expand Down

0 comments on commit 8dbbab7

Please sign in to comment.