From b8e3c450373435c5f4a5f27c4f43e503f833cd49 Mon Sep 17 00:00:00 2001 From: Devendar Bureddy Date: Fri, 31 Mar 2023 01:36:03 +0300 Subject: [PATCH] TL/SHARP: Prevent sharp team with team max ppn > 1 --- src/components/tl/sharp/tl_sharp.c | 5 ++++ src/components/tl/sharp/tl_sharp.h | 2 ++ src/components/tl/sharp/tl_sharp_team.c | 31 ++++++++++++++----------- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/components/tl/sharp/tl_sharp.c b/src/components/tl/sharp/tl_sharp.c index ab6e84e870..1da51f860e 100644 --- a/src/components/tl/sharp/tl_sharp.c +++ b/src/components/tl/sharp/tl_sharp.c @@ -67,6 +67,11 @@ static ucc_config_field_t ucc_tl_sharp_context_config_table[] = { ucc_offsetof(ucc_tl_sharp_context_config_t, rand_seed), UCC_CONFIG_TYPE_UINT}, + {"TEAM_MAX_PPN", "1", + "SHARP team max PPN threshold", + ucc_offsetof(ucc_tl_sharp_context_config_t, team_max_ppn), + UCC_CONFIG_TYPE_UINT}, + {NULL}}; UCC_CLASS_DEFINE_NEW_FUNC(ucc_tl_sharp_lib_t, ucc_base_lib_t, diff --git a/src/components/tl/sharp/tl_sharp.h b/src/components/tl/sharp/tl_sharp.h index cfb2832f6d..cc44e9e1f4 100644 --- a/src/components/tl/sharp/tl_sharp.h +++ b/src/components/tl/sharp/tl_sharp.h @@ -52,6 +52,7 @@ typedef struct ucc_tl_sharp_context_config { unsigned int uprogress_num_polls; int context_per_team; int enable_lazy_group_alloc; + int team_max_ppn; } ucc_tl_sharp_context_config_t; typedef struct ucc_tl_sharp_lib { @@ -96,6 +97,7 @@ typedef struct ucc_tl_sharp_team { ucc_rcache_t *rcache; struct sharp_coll_comm *sharp_comm; ucc_tl_sharp_oob_ctx_t oob_ctx; + ucc_topo_t *topo; } ucc_tl_sharp_team_t; typedef struct ucc_tl_sharp_task { diff --git a/src/components/tl/sharp/tl_sharp_team.c b/src/components/tl/sharp/tl_sharp_team.c index 26b89800fe..dadce26f5e 100644 --- a/src/components/tl/sharp/tl_sharp_team.c +++ b/src/components/tl/sharp/tl_sharp_team.c @@ -19,7 +19,6 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context, struct sharp_coll_comm_init_spec comm_spec; int ret; ucc_status_t status; - ucc_topo_t *topo; ucc_subset_t set; if (!(params->params.mask & UCC_TEAM_PARAM_FIELD_OOB)) { @@ -42,21 +41,25 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context, self->oob_ctx.oob = &UCC_TL_TEAM_OOB(self); } - if (sharp_ctx == NULL) { - status = ucc_topo_init(set, ctx->super.super.ucc_context->topo, &topo); - if (UCC_OK != status) { - tl_error(ctx->super.super.lib, "failed to init team topo"); - return status; - } + status = ucc_topo_init(set, ctx->super.super.ucc_context->topo, &self->topo); + if (UCC_OK != status) { + tl_error(ctx->super.super.lib, "failed to init team topo"); + return status; + } + if (ucc_topo_max_ppn(self->topo) > ctx->cfg.team_max_ppn) { + tl_debug(ctx->super.super.lib, "sharp team not supported with ppn > 1"); + status = UCC_ERR_NOT_SUPPORTED; + goto cleanup; + } + + if (sharp_ctx == NULL) { status = ucc_tl_sharp_context_init(ctx, &self->sharp_context, - &self->oob_ctx, topo); + &self->oob_ctx, self->topo); if (status != UCC_OK) { - return status; + goto cleanup; } - ucc_topo_cleanup(topo); - if (ctx->cfg.use_rcache) { status = ucc_tl_sharp_rcache_create(self->sharp_context, &self->rcache); if (status != UCC_OK) { @@ -136,7 +139,8 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context, goto cleanup; } - tl_debug(self->super.super.context->lib, "initialized tl team: %p", self); + tl_debug(self->super.super.context->lib, + "initialized tl team: %p size:%d", self, UCC_TL_TEAM_SIZE(self)); return UCC_OK; cleanup: if (ctx->cfg.context_per_team) { @@ -151,7 +155,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_sharp_team_t, ucc_base_context_t *tl_context, sharp_coll_finalize(self->sharp_context); } } - + ucc_topo_cleanup(self->topo); return status; } @@ -161,6 +165,7 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_sharp_team_t) tl_debug(self->super.super.context->lib, "finalizing tl team: %p", self); sharp_coll_comm_destroy(self->sharp_comm); + ucc_topo_cleanup(self->topo); if (ctx->cfg.context_per_team) { if (UCC_TL_SHARP_TEAM_LIB(self)->cfg.use_internal_oob) {