diff --git a/motr/client_init.c b/motr/client_init.c index 58faf48b96b..68ce2b05c5e 100644 --- a/motr/client_init.c +++ b/motr/client_init.c @@ -193,7 +193,8 @@ struct m0_sm_state_descr initlift_phases[] = { [IL_IDX_SERVICE] = { .sd_name = "init/fini-resource-manager", .sd_allowed = M0_BITS(IL_ROOT_FID, - IL_LAYOUT_DB), + IL_LAYOUT_DB, + IL_IDX_SERVICE), .sd_in = initlift_idx_service, }, [IL_ROOT_FID] = { @@ -256,6 +257,9 @@ struct m0_sm_trans_descr initlift_trans[] = { {"initialising-index-service", IL_LAYOUT_DB, IL_IDX_SERVICE}, + {"retry-initialising-index-service", + IL_IDX_SERVICE, + IL_IDX_SERVICE}, {"retrieving-root-fid", IL_IDX_SERVICE, IL_ROOT_FID}, {"initialising-fis", IL_ROOT_FID, @@ -369,6 +373,18 @@ static int initlift_get_next_floor(struct m0_client *m0c) return M0_RC(rc); } +/** + * Helper function to get the value of the current floor. + * + * @param m0c the client instance we are working with. + * @return the current state/floor. + */ +static int initlift_get_cur_floor(struct m0_client *m0c) +{ + M0_PRE(m0c != NULL); + return M0_RC(m0c->m0c_initlift_sm.sm_state); +} + /** * Helper function to move the initlift onto its next state in the * direction of travel. @@ -1235,12 +1251,17 @@ static int initlift_layouts(struct m0_sm *mach) return M0_RC(initlift_get_next_floor(m0c)); } +/* + * Retry for at most 4min in exponential backoff manner + */ +#define MAX_CLIENT_INIT_RETRIES 38 static int initlift_idx_service(struct m0_sm *mach) { int rc = 0; struct m0_client *m0c; struct m0_idx_service *service; struct m0_idx_service_ctx *ctx; + static int retry_count = 0; M0_ENTRY(); M0_PRE(mach != NULL); @@ -1263,8 +1284,26 @@ static int initlift_idx_service(struct m0_sm *mach) rc = service->is_svc_ops->iso_init((void *)ctx); m0_sm_group_lock(&m0c->m0c_sm_group); - if (rc != 0) - initlift_fail(rc, m0c); + if (rc != 0) { + /* + * Added retry logic to handle out of + * order startup of data and server + * PODs. Ref: Jira ID Cortx-33899 + */ + if (retry_count < MAX_CLIENT_INIT_RETRIES + && M0_IN(rc, (-EIO, -EPROTO))) { + M0_LOG(M0_ERROR, "client init \ + failed with %d. Retrying.", rc); + m0_nanosleep(1 << retry_count, NULL); + retry_count += 1; + return M0_RC(initlift_get_cur_floor(m0c)); + } else { + retry_count = 0; + initlift_fail(rc, m0c); + } + } else { + retry_count = 0; + } } else { service = ctx->isc_service; M0_ASSERT(service != NULL &&