From 5d07eaadf618775955784fee822e5b5c4779476f Mon Sep 17 00:00:00 2001 From: Devendar Bureddy Date: Wed, 22 Feb 2023 19:07:35 +0200 Subject: [PATCH] TL/SHARP: SHARP OOB fixes - hide sharp lib errors - disable lazy init by default --- config/m4/sharp.m4 | 2 ++ src/components/tl/sharp/tl_sharp.c | 7 ++++++- src/components/tl/sharp/tl_sharp.h | 1 + src/components/tl/sharp/tl_sharp_context.c | 12 ++++++++++++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/config/m4/sharp.m4 b/config/m4/sharp.m4 index da98c13f49..bedc550476 100644 --- a/config/m4/sharp.m4 +++ b/config/m4/sharp.m4 @@ -42,6 +42,8 @@ AS_IF([test "x$with_sharp" != "xno"], [ AC_SUBST(SHARP_CPPFLAGS, "-I$check_sharp_dir/include/ ") AC_SUBST(SHARP_LDFLAGS, "-lsharp_coll -L$check_sharp_dir/lib") + AC_CHECK_DECLS([SHARP_COLL_HIDE_ERRORS], [], [], [[#include <sharp/api/sharp_coll.h>]]) + AC_CHECK_DECLS([SHARP_COLL_DISABLE_LAZY_GROUP_RESOURCE_ALLOC], [], [], [[#include <sharp/api/sharp_coll.h>]]) ], [ AS_IF([test "x$with_sharp" != "xguess"], diff --git a/src/components/tl/sharp/tl_sharp.c b/src/components/tl/sharp/tl_sharp.c index ddc45b77c8..1f49559b17 100644 --- a/src/components/tl/sharp/tl_sharp.c +++ b/src/components/tl/sharp/tl_sharp.c @@ -30,7 +30,7 @@ static ucc_config_field_t ucc_tl_sharp_context_config_table[] = { {"", "", NULL, ucc_offsetof(ucc_tl_sharp_context_config_t, super), UCC_CONFIG_TYPE_TABLE(ucc_tl_context_config_table)}, - {"DEVICES", "mlx5_0:1", + {"DEVICES", "", "SHARP device list", ucc_offsetof(ucc_tl_sharp_context_config_t, dev_list), UCC_CONFIG_TYPE_STRING}, @@ -55,6 +55,11 @@ static ucc_config_field_t ucc_tl_sharp_context_config_table[] = { ucc_offsetof(ucc_tl_sharp_context_config_t, context_per_team), UCC_CONFIG_TYPE_BOOL}, + {"ENABLE_LAZY_GROUP_ALLOC", "n", + "Enable lazy group resource allocation", + ucc_offsetof(ucc_tl_sharp_context_config_t, enable_lazy_group_alloc), + UCC_CONFIG_TYPE_BOOL}, + {"RAND_SEED", "0", "Seed for random sharp job ID. 
(0 - use default).", ucc_offsetof(ucc_tl_sharp_context_config_t, rand_seed), diff --git a/src/components/tl/sharp/tl_sharp.h b/src/components/tl/sharp/tl_sharp.h index 2a355a28c4..61e786b86f 100644 --- a/src/components/tl/sharp/tl_sharp.h +++ b/src/components/tl/sharp/tl_sharp.h @@ -51,6 +51,7 @@ typedef struct ucc_tl_sharp_context_config { unsigned int rand_seed; unsigned int uprogress_num_polls; int context_per_team; + int enable_lazy_group_alloc; } ucc_tl_sharp_context_config_t; typedef struct ucc_tl_sharp_lib { diff --git a/src/components/tl/sharp/tl_sharp_context.c b/src/components/tl/sharp/tl_sharp_context.c index 512a3cac8c..de2b3481c6 100644 --- a/src/components/tl/sharp/tl_sharp_context.c +++ b/src/components/tl/sharp/tl_sharp_context.c @@ -300,6 +300,18 @@ ucc_status_t ucc_tl_sharp_context_init(ucc_tl_sharp_context_t *sharp_ctx, init_spec.config = sharp_coll_default_config; init_spec.config.user_progress_num_polls = sharp_ctx->cfg.uprogress_num_polls; init_spec.config.ib_dev_list = sharp_ctx->cfg.dev_list; + init_spec.config.flags = 0; +#if HAVE_DECL_SHARP_COLL_HIDE_ERRORS + if (lib->super.super.log_component.log_level < UCC_LOG_LEVEL_DEBUG) { + init_spec.config.flags |= SHARP_COLL_HIDE_ERRORS; + } +#endif +#if HAVE_DECL_SHARP_COLL_DISABLE_LAZY_GROUP_RESOURCE_ALLOC + if(!sharp_ctx->cfg.enable_lazy_group_alloc) { + init_spec.config.flags |= SHARP_COLL_DISABLE_LAZY_GROUP_RESOURCE_ALLOC; + } +#endif + init_spec.job_id = ((getpid() ^ pthread_self()) ^ rand_r(&sharp_ctx->cfg.rand_seed)); init_spec.enable_thread_support =