From 7ac4529b9e0fe3ee1ab50176eddf92d991a10fa7 Mon Sep 17 00:00:00 2001 From: Naga Kishore Kommuri Date: Wed, 31 Aug 2022 23:16:40 -0600 Subject: [PATCH 1/3] CORTX-33899 : Delayed Motr service start makes cortx-rgw container fail on startup Issue: During startup, if data PODs are delayed, then server PODs are restarting with error "Couldn't init storage provider" Root Cause: During startup, radosgw calls the m0_client_init() API to initialize the motr client. This API initializes different components in order, one of which is IL_IDX_SERVICE. Initialization of this service is failing with -EIO or -EPROTO, as data PODs are not yet up. Solution: During startup, if PODs are started out of order, then server POD should not crash. Added a retry mechanism in initialization of IL_IDX_SERVICE. Signed-off-by: Naga Kishore Kommuri --- motr/client_init.c | 41 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/motr/client_init.c b/motr/client_init.c index c7fbd14b116..fcb8198d5c8 100644 --- a/motr/client_init.c +++ b/motr/client_init.c @@ -190,7 +190,8 @@ struct m0_sm_state_descr initlift_phases[] = { [IL_IDX_SERVICE] = { .sd_name = "init/fini-resource-manager", .sd_allowed = M0_BITS(IL_ROOT_FID, - IL_LAYOUT_DB), + IL_LAYOUT_DB, + IL_IDX_SERVICE), .sd_in = initlift_idx_service, }, [IL_ROOT_FID] = { @@ -247,6 +248,9 @@ struct m0_sm_trans_descr initlift_trans[] = { {"initialising-index-service", IL_LAYOUT_DB, IL_IDX_SERVICE}, + {"retry-initialising-index-service", + IL_IDX_SERVICE, + IL_IDX_SERVICE}, {"retrieving-root-fid", IL_IDX_SERVICE, IL_ROOT_FID}, {"initialising-addb2", IL_ROOT_FID, @@ -356,6 +360,18 @@ static int initlift_get_next_floor(struct m0_client *m0c) return M0_RC(rc); } +/** + * Helper function to get the value of the current floor. + * + * @param m0c the client instance we are working with. + * @return the current state/floor.
+ */ +static int initlift_get_cur_floor(struct m0_client *m0c) +{ + M0_PRE(m0c != NULL); + return M0_RC(m0c->m0c_initlift_sm.sm_state); +} + /** * Helper function to move the initlift onto its next state in the * direction of travel. @@ -1222,6 +1238,8 @@ static int initlift_layouts(struct m0_sm *mach) return M0_RC(initlift_get_next_floor(m0c)); } +#define MAX_CLIENT_INIT_RETRIES 1000 +int retry_count = 0; static int initlift_idx_service(struct m0_sm *mach) { int rc = 0; @@ -1250,8 +1268,25 @@ static int initlift_idx_service(struct m0_sm *mach) rc = service->is_svc_ops->iso_init((void *)ctx); m0_sm_group_lock(&m0c->m0c_sm_group); - if (rc != 0) - initlift_fail(rc, m0c); + if (rc != 0) { + /* + * Added retry logic to handle out of + * order startup of data and server + * PODs. Ref: Jira ID Cortx-33899 + */ + if (retry_count < MAX_CLIENT_INIT_RETRIES + && (rc == -EIO || rc == -EPROTO)) { + retry_count += 1; + M0_LOG(M0_ERROR, "client init \ + failed with %d. Retrying.", rc); + return M0_RC(initlift_get_cur_floor(m0c)); + } else { + retry_count = 0; + initlift_fail(rc, m0c); + } + } else { + retry_count = 0; + } } else { service = ctx->isc_service; M0_ASSERT(service != NULL && From da08523a9cf407bf0fe60d2ee1937c9a5c7b6b85 Mon Sep 17 00:00:00 2001 From: Naga Kishore Kommuri Date: Tue, 6 Sep 2022 02:58:35 -0600 Subject: [PATCH 2/3] CORTX-33899 : Made retry_count static, instead of global. Used M0_IN macro, instead of manual comparison.
Signed-off-by: Naga Kishore Kommuri --- motr/client_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/motr/client_init.c b/motr/client_init.c index fcb8198d5c8..1a8d12b4f68 100644 --- a/motr/client_init.c +++ b/motr/client_init.c @@ -1239,13 +1239,13 @@ static int initlift_layouts(struct m0_sm *mach) } #define MAX_CLIENT_INIT_RETRIES 1000 -int retry_count = 0; static int initlift_idx_service(struct m0_sm *mach) { int rc = 0; struct m0_client *m0c; struct m0_idx_service *service; struct m0_idx_service_ctx *ctx; + static int retry_count = 0; M0_ENTRY(); M0_PRE(mach != NULL); @@ -1275,7 +1275,7 @@ static int initlift_idx_service(struct m0_sm *mach) * PODs. Ref: Jira ID Cortx-33899 */ if (retry_count < MAX_CLIENT_INIT_RETRIES - && (rc == -EIO || rc == -EPROTO)) { + && M0_IN(rc, (-EIO, -EPROTO))) { retry_count += 1; M0_LOG(M0_ERROR, "client init \ failed with %d. Retrying.", rc); From c91085fed241fb80c33ff9c7373b170f7bb4da0a Mon Sep 17 00:00:00 2001 From: Naga Kishore Kommuri Date: Wed, 7 Sep 2022 06:06:38 -0600 Subject: [PATCH 3/3] CORTX-33899 : Introduced sleep between retries in exponential backoff manner to retry for at most 4min Signed-off-by: Naga Kishore Kommuri --- motr/client_init.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/motr/client_init.c b/motr/client_init.c index a8498246f80..68ce2b05c5e 100644 --- a/motr/client_init.c +++ b/motr/client_init.c @@ -1251,7 +1251,10 @@ static int initlift_layouts(struct m0_sm *mach) return M0_RC(initlift_get_next_floor(m0c)); } -#define MAX_CLIENT_INIT_RETRIES 1000 +/* + * Retry for at most 4min in exponential backoff manner; the backoff shift is done on a 64-bit value because retry_count reaches 37, past the width of a 32-bit int + */ +#define MAX_CLIENT_INIT_RETRIES 38 static int initlift_idx_service(struct m0_sm *mach) { int rc = 0; @@ -1289,9 +1292,10 @@ static int initlift_idx_service(struct m0_sm *mach) */ if (retry_count < MAX_CLIENT_INIT_RETRIES && M0_IN(rc, (-EIO, -EPROTO))) { - retry_count += 1; M0_LOG(M0_ERROR, "client init \ failed with %d. 
Retrying.", rc); + m0_nanosleep(1ULL << retry_count, NULL); + retry_count += 1; return M0_RC(initlift_get_cur_floor(m0c)); } else { retry_count = 0;