diff --git a/addb2/dump.c b/addb2/dump.c index c9bd0717e2a..b306a74e55d 100644 --- a/addb2/dump.c +++ b/addb2/dump.c @@ -861,6 +861,17 @@ static void dtx0_state_counter(struct m0_addb2__context *ctx, char *buf) sm_trans(&m0_dtx_sm_conf, "dtx0", ctx, buf); } +extern struct m0_sm_conf m0_drm_sm_conf; +static void drm_state(struct m0_addb2__context *ctx, const uint64_t *v, + char *buf) +{ + sm_state(&m0_drm_sm_conf, ctx, v, buf); +} + +static void drm_state_counter(struct m0_addb2__context *ctx, char *buf) +{ + sm_trans(&m0_drm_sm_conf, "drm", ctx, buf); +} extern struct m0_sm_conf op_states_conf; static void beop_state_counter(struct m0_addb2__context *ctx, char *buf) @@ -1149,6 +1160,11 @@ struct m0_addb2__id_intrp ids[] = { .ii_repeat = M0_AVI_DTX0_SM_COUNTER_END - M0_AVI_DTX0_SM_COUNTER, .ii_spec = &dtx0_state_counter }, + { M0_AVI_DRM_SM_STATE, "drm-state", { &drm_state, SKIP2 } }, + { M0_AVI_DRM_SM_COUNTER, "", + .ii_repeat = M0_AVI_DRM_SM_COUNTER_END - M0_AVI_DRM_SM_COUNTER, + .ii_spec = &drm_state_counter }, + { M0_AVI_BE_TX_STATE, "tx-state", { &tx_state, SKIP2 } }, { M0_AVI_BE_TX_COUNTER, "", .ii_repeat = M0_AVI_BE_TX_COUNTER_END - M0_AVI_BE_TX_COUNTER, diff --git a/be/dtm0_log.c b/be/dtm0_log.c index 813222f1dfe..fae419ab20b 100644 --- a/be/dtm0_log.c +++ b/be/dtm0_log.c @@ -116,15 +116,12 @@ M0_INTERNAL void m0_be_dtm0_log_fini(struct m0_be_dtm0_log *log) log->dl_cs = NULL; } -M0_INTERNAL void m0_be_dtm0_log_free(struct m0_be_dtm0_log **in_log) +M0_INTERNAL void m0_be_dtm0_log_free(struct m0_be_dtm0_log *log) { - struct m0_be_dtm0_log *log = *in_log; - M0_PRE(!log->dl_is_persistent); m0_free(log->u.dl_inmem); m0_free(log); - *in_log = NULL; } /** @@ -394,8 +391,8 @@ static int dtm0_log__insert(struct m0_be_dtm0_log *log, struct m0_buf *payload) { int rc; - struct m0_dtm0_log_rec *rec; - struct m0_be_seg *seg = log->dl_seg; + struct m0_dtm0_log_rec *rec; + struct m0_be_seg *seg = log->dl_seg; if (log->dl_is_persistent) { rc = plog_rec_init(&rec, tx, seg, txd, payload); @@ -454,7 +451,7 @@ M0_INTERNAL int m0_be_dtm0_log_update(struct m0_be_dtm0_log *log, struct m0_dtm0_tx_desc *txd, struct m0_buf *payload) { - struct m0_dtm0_log_rec *rec; + struct m0_dtm0_log_rec *rec; M0_PRE(payload != NULL); M0_PRE(m0_be_dtm0_log__invariant(log)); @@ -653,8 +650,8 @@ static bool be_dtm0_log_iter_is_first(const struct m0_be_dtm0_log_iter *iter) static bool be_dtm0_log_iter_invariant(const struct m0_be_dtm0_log_iter *iter) { return _0C(m0_be_dtm0_log__invariant(iter->dli_log)) && - _0C(be_dtm0_log_iter_is_first(iter) || - m0_dtm0_tid__invariant(&iter->dli_current_tid)); + _0C(ergo(!be_dtm0_log_iter_is_first(iter), + m0_dtm0_tid__invariant(&iter->dli_current_tid))); } M0_INTERNAL void m0_be_dtm0_log_iter_init(struct m0_be_dtm0_log_iter *iter, @@ -671,10 +668,10 @@ M0_INTERNAL void m0_be_dtm0_log_iter_fini(struct m0_be_dtm0_log_iter *iter) } M0_INTERNAL int m0_be_dtm0_log_iter_next(struct m0_be_dtm0_log_iter *iter, - struct m0_dtm0_log_rec *out) + struct m0_dtm0_log_rec *out) { struct m0_dtm0_log_rec *rec; - int rc; + int rc; M0_PRE(m0_mutex_is_locked(&iter->dli_log->dl_lock)); M0_PRE(be_dtm0_log_iter_invariant(iter)); @@ -699,7 +696,7 @@ M0_INTERNAL int m0_be_dtm0_log_iter_next(struct m0_be_dtm0_log_iter *iter, return M0_ERR(rc); } - return M0_RC(rec == NULL ? 0 : +1); + return M0_RC(rec != NULL ? 
0 : -ENOENT); } M0_INTERNAL int m0_dtm0_log_rec_copy(struct m0_dtm0_log_rec *dst, diff --git a/be/dtm0_log.h b/be/dtm0_log.h index 4e5751bdf5f..70763aebd6f 100644 --- a/be/dtm0_log.h +++ b/be/dtm0_log.h @@ -179,7 +179,7 @@ struct m0_dtm0_log_rec { }; /** - * @b m0_be_dtm0_log_rec structure + * @b m0_be_dtm0_log structure * * A DTM0 log is represented by m0_be_dtm0_log structure. The important * fields in this structure are: @@ -277,13 +277,21 @@ M0_INTERNAL void m0_be_dtm0_log_fini(struct m0_be_dtm0_log *log); * Free the memory allocated by m0_be_dtm0_log_alloc * * @pre m0_be_dtm0_log->dl_is_persistent needs to be false. - * @post *log is set to NULL. + * @post None * * @param log Pointer to a log structure that has been previously allocated. * * @return None */ -M0_INTERNAL void m0_be_dtm0_log_free(struct m0_be_dtm0_log **log); +M0_INTERNAL void m0_be_dtm0_log_free(struct m0_be_dtm0_log *log); + +/** Frees memory and zeroes the pointer. */ +#define m0_be_dtm0_log_free0(pptr) \ + do { \ + typeof(pptr) __pptr = (pptr); \ + m0_be_dtm0_log_free(*__pptr); \ + *__pptr = NULL; \ + } while (0) /** * For performing an operation on a persistent log, we need to take the @@ -553,11 +561,11 @@ M0_INTERNAL void m0_be_dtm0_log_iter_fini(struct m0_be_dtm0_log_iter *iter); * @param out returned record which is copied and needs to be freed with * m0_dtm0_log_iter_rec_fini() * - * @return +1 when the iterator was successfully moved to the next record. - * @return 0 when the end of the log has been reached. + * @return 0 when the iterator was successfully moved to the next record. + * @return -ENOENT when the end of the log has been reached. */ M0_INTERNAL int m0_be_dtm0_log_iter_next(struct m0_be_dtm0_log_iter *iter, - struct m0_dtm0_log_rec *out); + struct m0_dtm0_log_rec *out); /** @} */ /* end DTM0Internals */ diff --git a/be/ut/dtm0_log_ut.c b/be/ut/dtm0_log_ut.c index 88f9ec77127..281a8654e12 100644 --- a/be/ut/dtm0_log_ut.c +++ b/be/ut/dtm0_log_ut.c @@ -281,7 +281,7 @@ void test_volatile_dtm0_log(void) m0_mutex_unlock(&log->dl_lock); m0_be_dtm0_log_fini(log); - m0_be_dtm0_log_free(&log); + m0_be_dtm0_log_free0(&log); m0_dtm0_clk_src_fini(&cs); } @@ -625,7 +625,7 @@ static void m0_be_ut_dtm0_log_init_fini(void) m0_be_dtm0_log_iter_fini(&iter); m0_be_dtm0_log_fini(log); - m0_be_dtm0_log_free(&log); + m0_be_dtm0_log_free0(&log); m0_dtm0_clk_src_fini(&cs); } @@ -635,10 +635,10 @@ static void m0_be_ut_dtm0_log_next(void) struct m0_buf buf = {}; struct m0_be_dtm0_log_iter iter; - struct m0_dtm0_log_rec out; + struct m0_dtm0_log_rec out; struct m0_dtm0_clk_src cs; struct m0_be_dtm0_log *log; - int rc; + int rc; m0_dtm0_clk_src_init(&cs, M0_DTM0_CS_PHYS); @@ -660,11 +660,12 @@ static void m0_be_ut_dtm0_log_next(void) m0_be_dtm0_log_iter_init(&iter, log); rc = m0_be_dtm0_log_iter_next(&iter, &out); - M0_UT_ASSERT(rc == 1); + M0_UT_ASSERT(rc == 0); + M0_UT_ASSERT(ut_dl_verify_log_rec(&out, 42)); m0_dtm0_log_iter_rec_fini(&out); rc = m0_be_dtm0_log_iter_next(&iter, &out); - M0_UT_ASSERT(rc == 0); + M0_UT_ASSERT(rc != 0); m0_be_dtm0_log_iter_fini(&iter); /* make log finalisation happy */ @@ -672,7 +673,7 @@ static void m0_be_ut_dtm0_log_next(void) M0_UT_ASSERT(rc == 0); m0_mutex_unlock(&log->dl_lock); m0_be_dtm0_log_fini(log); - m0_be_dtm0_log_free(&log); + m0_be_dtm0_log_free0(&log); m0_dtm0_clk_src_fini(&cs); } diff --git a/cas/cas.h b/cas/cas.h index fb09975e798..7e9e92fb4b9 100644 --- a/cas/cas.h +++ b/cas/cas.h @@ -471,6 +471,9 @@ M0_INTERNAL int m0_cas_fom_spawn( struct m0_fop *cas_fop, void 
(*on_fom_complete)(struct m0_fom_thralldom *, struct m0_fom *));
+M0_INTERNAL uint32_t m0_cas_svc_device_id_get(
+	const struct m0_reqh_service_type *stype,
+	const struct m0_reqh              *reqh);
 #else
 #define m0_cas_svc_init()
 #define m0_cas_svc_fini()
diff --git a/cas/service.c b/cas/service.c
index 515bff9675f..2458b824855 100644
--- a/cas/service.c
+++ b/cas/service.c
@@ -308,9 +308,21 @@ enum {
 	STATS_NR
 };
 
+enum {
+	INVALID_CAS_SDEV_ID = UINT32_MAX
+};
+
 struct cas_service {
 	struct m0_reqh_service  c_service;
 	struct m0_be_domain    *c_be_domain;
+
+	/**
+	 * The sdev used by this CAS service instance. There should be
+	 * exactly one in the standard configuration, but the current unit
+	 * tests attach more than one storage device to emulate several CAS
+	 * services. In that case the special invalid value is used.
+	 */
+	uint32_t                c_sdev_id;
 };
 
 struct cas_kv {
@@ -556,6 +568,60 @@ m0_cas__ut_svc_be_get(struct m0_reqh_service *svc)
 	return service->c_be_domain;
 }
 
+static int cas_service_sdev_id_set(struct cas_service *cas_svc)
+{
+	struct m0_reqh         *reqh;
+	struct m0_conf_cache   *cache;
+	struct m0_conf_obj     *obj;
+	struct m0_fid          *cas_svc_fid;
+	struct m0_conf_service *service;
+	struct m0_conf_obj     *sdevs_dir;
+	struct m0_conf_sdev    *sdev = NULL;
+	bool                    found = false;
+	int                     rc = 0;
+
+	M0_ASSERT(cas_svc->c_sdev_id == INVALID_CAS_SDEV_ID);
+
+	if (cas_in_ut())
+		return M0_RC(0);
+
+	reqh = cas_svc->c_service.rs_reqh;
+	cache = &m0_reqh2confc(reqh)->cc_cache;
+
+	cas_svc_fid = &cas_svc->c_service.rs_service_fid;
+	obj = m0_conf_cache_lookup(cache, cas_svc_fid);
+	M0_ASSERT(obj != NULL);
+
+	service = M0_CONF_CAST(obj, m0_conf_service);
+	M0_ASSERT(service != NULL);
+
+	sdevs_dir = &service->cs_sdevs->cd_obj;
+	rc = m0_confc_open_sync(&sdevs_dir, sdevs_dir, M0_FID0);
+	if (rc != 0)
+		return M0_RC(rc);
+
+	obj = NULL;
+	while ((rc = m0_confc_readdir_sync(sdevs_dir, &obj)) > 0) {
+		sdev = M0_CONF_CAST(obj, m0_conf_sdev);
+		if (!found) {
+			cas_svc->c_sdev_id = sdev->sd_dev_idx;
+			found = true;
+		} else {
+			/*
+			 * Several devices attached to a single CAS service
+			 * are possible if we are running in UT. In this case
+			 * we use the invalid value for the attached storage
+			 * device. Also, we cannot break out of the loop until
+			 * the whole directory has been iterated, since doing
+			 * so leads to a crash during request handler
+			 * finalisation.
+			 */
+			cas_svc->c_sdev_id = INVALID_CAS_SDEV_ID;
+		}
+	}
+
+	m0_confc_close(sdevs_dir);
+	return M0_RC(rc);
+}
 
 static int cas_service_start(struct m0_reqh_service *svc)
 {
@@ -568,6 +634,7 @@ static int cas_service_start(struct m0_reqh_service *svc)
 	/* XXX It's a workaround. It's needed until we have a better way. */
 	service->c_be_domain = ut_dom != NULL ?
 		ut_dom : svc->rs_reqh_ctx->rc_beseg->bs_domain;
+	service->c_sdev_id = INVALID_CAS_SDEV_ID;
 	rc = m0_ctg_store_init(service->c_be_domain);
 	if (rc == 0) {
 		/*
		 *
@@ -576,6 +643,7 @@ static int cas_service_start(struct m0_reqh_service *svc)
 		 * If no pending index drop, it finishes soon.
*/ m0_cas_gc_start(svc); + rc = cas_service_sdev_id_set(service); } return rc; } @@ -1832,6 +1900,7 @@ static int cas_device_check(const struct cas_fom *fom, pm = &pver->pv_mach; rc = cas_sdev_state(pm, device_id, &state); if (rc == 0 && !M0_IN(state, (M0_PNDS_ONLINE, + M0_PNDS_OFFLINE, M0_PNDS_SNS_REBALANCING))) rc = M0_ERR(-EBADFD); } else @@ -2610,6 +2679,26 @@ M0_INTERNAL int m0_cas_fom_spawn( return M0_RC(rc); } +M0_INTERNAL uint32_t m0_cas_svc_device_id_get( + const struct m0_reqh_service_type *stype, + const struct m0_reqh *reqh) +{ + struct m0_reqh_service *svc; + struct cas_service *cas_svc; + + M0_PRE(stype != NULL); + M0_PRE(reqh != NULL); + + svc = m0_reqh_service_find(stype, reqh); + M0_ASSERT(svc != NULL); + M0_ASSERT(m0_reqh_service_state_get(svc) == M0_RST_STARTED); + + cas_svc = M0_AMB(cas_svc, svc, c_service); + + M0_ASSERT(cas_svc->c_sdev_id != INVALID_CAS_SDEV_ID); + return cas_svc->c_sdev_id; +} + static int cas_ctg_crow_handle(struct cas_fom *fom, const struct m0_cas_id *cid) { struct m0_fop *fop; diff --git a/dix/fid_convert.c b/dix/fid_convert.c index 5a221d89de0..c326158c554 100644 --- a/dix/fid_convert.c +++ b/dix/fid_convert.c @@ -35,6 +35,15 @@ * @{ */ +/* Set device id in a DIX fid. */ +M0_INTERNAL void m0_dix_fid__device_id_set(struct m0_fid *fid, + uint32_t dev_id) +{ + M0_PRE(fid != NULL && (dev_id <= M0_DIX_FID_DEVICE_ID_MAX)); + fid->f_container = (fid->f_container & ~M0_DIX_FID_DEVICE_ID_MASK) | + (((uint64_t)dev_id) << M0_DIX_FID_DEVICE_ID_OFFSET); +} + /* extract bits [32, 56) from fid->f_container */ M0_INTERNAL uint32_t m0_dix_fid__device_id_extract(const struct m0_fid *fid) { diff --git a/dix/fid_convert.h b/dix/fid_convert.h index 548636e0e6a..3508201e450 100644 --- a/dix/fid_convert.h +++ b/dix/fid_convert.h @@ -78,6 +78,8 @@ M0_INTERNAL bool m0_dix_fid_validate_cctg(const struct m0_fid *cctg_fid); M0_INTERNAL uint32_t m0_dix_fid__device_id_extract(const struct m0_fid *fid); +M0_INTERNAL void m0_dix_fid__device_id_set(struct m0_fid *fid, + uint32_t dev_id); /** @} end of dix group */ #endif /* __MOTR_DIX_FID_CONVERT_H__ */ diff --git a/dix/req.c b/dix/req.c index 8f797311ab9..f5cb4b113fd 100644 --- a/dix/req.c +++ b/dix/req.c @@ -1666,33 +1666,6 @@ static void dix_rop_completed(struct m0_sm_group *grp, struct m0_sm_ast *ast) } } -static void dix_rop_one_completed(struct m0_dix_cas_rop *crop) -{ - struct m0_dix_req *dreq = crop->crp_parent; - struct m0_dix_rop_ctx *rop; - - M0_ENTRY(); - M0_PRE(!dreq->dr_is_meta); - M0_PRE(M0_IN(dreq->dr_type, (DIX_PUT, DIX_DEL))); - M0_PRE(dreq->dr_dtx != NULL); - M0_PRE(dix_req_smgrp(dreq) == dreq->dr_dtx->tx_dtx->dd_sm.sm_grp); - - rop = crop->crp_parent->dr_rop; - dix_cas_rop_rc_update(crop, 0); - - m0_dtx0_executed(dreq->dr_dtx, crop->crp_pa_idx); - - if (rop->dg_completed_nr == rop->dg_cas_reqs_nr) { - rop->dg_ast = (struct m0_sm_ast) { - .sa_cb = dix_rop_completed, - .sa_datum = dreq, - }; - m0_sm_ast_post(dix_req_smgrp(dreq), &rop->dg_ast); - } - - M0_LEAVE(); -} - static bool dix_cas_rop_clink_cb(struct m0_clink *cl) { struct m0_dix_cas_rop *crop = container_of(cl, struct m0_dix_cas_rop, @@ -1714,28 +1687,20 @@ static bool dix_cas_rop_clink_cb(struct m0_clink *cl) dreq, crop->crp_creq.ccr_sess, &crop->crp_creq.ccr_remid); - m0_clink_del(cl); m0_clink_fini(cl); rop = crop->crp_parent->dr_rop; rop->dg_completed_nr++; M0_PRE(rop->dg_completed_nr <= rop->dg_cas_reqs_nr); - if (dreq->dr_dtx != NULL) { - M0_ASSERT(dix_req_smgrp(dreq) == - dreq->dr_dtx->tx_dtx->dd_sm.sm_grp); - dix_rop_one_completed(crop); - } 
else { - if (rop->dg_completed_nr == rop->dg_cas_reqs_nr) { - rop->dg_ast = (struct m0_sm_ast) { - .sa_cb = dix_rop_completed, - .sa_datum = dreq, - }; - m0_sm_ast_post(dix_req_smgrp(dreq), - &rop->dg_ast); - } + if (rop->dg_completed_nr == rop->dg_cas_reqs_nr) { + rop->dg_ast = (struct m0_sm_ast) { + .sa_cb = dix_rop_completed, + .sa_datum = dreq, + }; + m0_sm_ast_post(dix_req_smgrp(dreq), + &rop->dg_ast); } - } return true; } @@ -1747,7 +1712,6 @@ static int dix_cas_rops_send(struct m0_dix_req *req) struct m0_dix_cas_rop *cas_rop; struct m0_cas_req *creq; uint32_t sdev_idx; - uint32_t pa_idx; struct m0_cas_id cctg_id; struct m0_reqh_service_ctx *cas_svc; struct m0_dix_layout *layout = &req->dr_indices[0].dd_layout; @@ -1812,17 +1776,6 @@ static int dix_cas_rops_send(struct m0_dix_req *req) } if (rc != 0) { - /* - * Treat failed and not sent CAS requests as executed - * to unblock the EXECUTED-ALL logic. It allows to move - * transaction to the stable state once the persistent - * message received (EXECUTED state required for all - * participants). So EXECUTED participant state is - * reused in case of failure. - */ - if (req->dr_dtx != NULL) - m0_dtx0_executed(req->dr_dtx, - cas_rop->crp_pa_idx); m0_clink_del(&cas_rop->crp_clink); m0_clink_fini(&cas_rop->crp_clink); m0_cas_req_fini(&cas_rop->crp_creq); @@ -1849,16 +1802,6 @@ static int dix_cas_rops_send(struct m0_dix_req *req) rc = m0_dtx0_close(req->dr_dtx); if (rc != 0) return M0_ERR(rc); - /* - * It is safe to set EXECUTED dtx state for those - * participants that experience transient failure, - * it allows to trigger EXECUTED-ALL logic. See - * the similar comment above for details. - */ - for (pa_idx = cas_rop_tlist_length(&rop->dg_cas_reqs); - pa_idx < req->dr_dtx->tx_dtx->dd_txd.dtd_ps.dtp_nr; - pa_idx++) - m0_dtx0_executed(req->dr_dtx, pa_idx); } return M0_RC(0); diff --git a/doc/dld/dld-index.c b/doc/dld/dld-index.c index a1a53a2d44d..a5f11c764e8 100644 --- a/doc/dld/dld-index.c +++ b/doc/dld/dld-index.c @@ -53,6 +53,7 @@ file.c --> - @subpage spiel-dld "SPIEL API DLD" - @subpage cas-dld "The catalogue service (CAS)" - @subpage fis-dld "Fault Injection at run time" +- @subpage dtm0br-dld "DTM0 basic recovery SM" Detailed designs should use the @subpage DLD "Motr DLD Template" as a style guide. 
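With dix_rop_one_completed() removed, the dtx and non-dtx paths in dix/req.c
share one completion scheme: dix_cas_rop_clink_cb() counts finished CAS
requests and posts a single dix_rop_completed() AST when the last one arrives,
while stabilisation of the dtx itself is driven purely by PERSISTENT messages
(see the dtm0/dtx.c changes below). A minimal sketch of that scheme follows;
the sub_op/parent types and parent_completed() callback are hypothetical
stand-ins for m0_dix_cas_rop/m0_dix_rop_ctx, not Motr API:

/*
 * Sketch: post exactly one completion AST when the last sub-operation
 * finishes. Only the m0_clink/m0_sm primitives are real; everything
 * else is hypothetical.
 */
#include "lib/chan.h" /* m0_clink */
#include "lib/misc.h" /* M0_AMB */
#include "sm/sm.h"    /* m0_sm_ast, m0_sm_group */

struct parent;

struct sub_op {
	struct m0_clink  so_clink;  /* signalled when this sub-op completes */
	struct parent   *so_parent;
};

struct parent {
	uint32_t            p_completed_nr; /* finished sub-ops            */
	uint32_t            p_total_nr;     /* launched sub-ops            */
	struct m0_sm_ast    p_ast;          /* posted once, by the last op */
	struct m0_sm_group *p_grp;
};

static void parent_completed(struct m0_sm_group *grp, struct m0_sm_ast *ast);

static bool sub_op_done_cb(struct m0_clink *cl)
{
	struct sub_op *sop = M0_AMB(sop, cl, so_clink);
	struct parent *p   = sop->so_parent;

	m0_clink_del(cl);
	m0_clink_fini(cl);
	/* The last completed sub-op schedules the parent's completion. */
	if (++p->p_completed_nr == p->p_total_nr) {
		p->p_ast = (struct m0_sm_ast) {
			.sa_cb    = parent_completed,
			.sa_datum = p,
		};
		m0_sm_ast_post(p->p_grp, &p->p_ast);
	}
	return true;
}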
diff --git a/dtm0/Makefile.sub b/dtm0/Makefile.sub
index 965dbbd2dee..3d9a3ffee64 100644
--- a/dtm0/Makefile.sub
+++ b/dtm0/Makefile.sub
@@ -15,6 +15,7 @@ nobase_motr_include_HEADERS += \
 	dtm0/remach.h \
 	dtm0/service.h \
 	dtm0/svc_internal.h \
+	dtm0/recovery.h \
 	dtm0/tx_desc.h
 
@@ -33,6 +34,7 @@ motr_libmotr_la_SOURCES += \
 	dtm0/pruner.c \
 	dtm0/remach.c \
 	dtm0/service.c \
+	dtm0/recovery.c \
 	dtm0/tx_desc.c
 
diff --git a/dtm0/addb2.h b/dtm0/addb2.h
index 79f1918295d..7203c0ff812 100644
--- a/dtm0/addb2.h
+++ b/dtm0/addb2.h
@@ -34,9 +34,15 @@
 #include "addb2/identifier.h"
 
 enum m0_avi_dtm0_labels {
-	M0_AVI_DTX0_SM_STATE = M0_AVI_DTM0_RANGE_START,
+	/* dtx0 */
+	M0_AVI_DTX0_SM_STATE = M0_AVI_DTM0_RANGE_START + 1,
 	M0_AVI_DTX0_SM_COUNTER,
 	M0_AVI_DTX0_SM_COUNTER_END = M0_AVI_DTX0_SM_COUNTER + 0x100,
+
+	/* Recovery Machine */
+	M0_AVI_DRM_SM_STATE = M0_AVI_DTX0_SM_COUNTER_END + 1,
+	M0_AVI_DRM_SM_COUNTER,
+	M0_AVI_DRM_SM_COUNTER_END = M0_AVI_DRM_SM_COUNTER + 0x100,
 };
 
 /** @} end of dtm0 group */
diff --git a/dtm0/drlink.c b/dtm0/drlink.c
index 9e5dc46d443..e29eeb75d1b 100644
--- a/dtm0/drlink.c
+++ b/dtm0/drlink.c
@@ -23,6 +23,7 @@
 #include "lib/trace.h"
 #include "addb2/identifier.h"  /* M0_AVI_FOM_TO_TX */
 #include "dtm0/fop.h"          /* dtm0_req_fop */
+#include "dtm0/fop_xc.h"       /* dtm0_req_fop_xc */
 #include "dtm0/service.h"      /* m0_dtm0_service */
 #include "dtm0/svc_internal.h" /* dtm0_process */
 #include "lib/coroutine.h"     /* m0_co API */
@@ -31,6 +32,18 @@
 #include "rpc/rpc.h"           /* m0_rpc_item_post */
 #include "rpc/rpc_machine.h"   /* m0_rpc_machine */
 #include "rpc/rpc_opcodes.h"   /* M0_DTM0_{RLINK,REQ}_OPCODE */
+#include "lib/string.h"        /* m0_streq */
+
+enum {
+	/*
+	 * TODO: The DTM model assumes infinite timeouts, but this is too
+	 * scary at the moment: we cannot yet rely on an infinite timeout
+	 * working without issues in the connect/disconnect case. These
+	 * timeouts will be adjusted/reworked when we work more on
+	 * stabilising the DTM.
+	 */
+	DRLINK_CONNECT_TIMEOUT_SEC = 1,
+	DRLINK_DISCONN_TIMEOUT_SEC = DRLINK_CONNECT_TIMEOUT_SEC,
+};
 
 struct drlink_fom {
 	struct m0_fom df_gen;
@@ -71,7 +84,9 @@ static const struct m0_fom_ops drlink_fom_ops = {
 };
 
 static struct m0_fom_type drlink_fom_type;
-static const struct m0_fom_type_ops drlink_fom_type_ops = {};
+static const struct m0_fom_type_ops drlink_fom_type_ops = {
+	.fto_create = NULL
+};
 const static struct m0_sm_conf drlink_fom_conf;
 
 M0_INTERNAL int m0_dtm0_rpc_link_mod_init(void)
@@ -88,33 +103,26 @@ M0_INTERNAL void m0_dtm0_rpc_link_mod_fini(void)
 {
 }
 
-/* Create a deep copy of the given request. */
+/** Create a deep copy of the given request. */
 static struct dtm0_req_fop *dtm0_req_fop_dup(const struct dtm0_req_fop *src)
 {
-	int                  rc;
-	struct dtm0_req_fop *dst;
-
-	M0_ALLOC_PTR(dst);
-	if (dst == NULL)
-		return NULL;
-
-	rc = m0_dtm0_tx_desc_copy(&src->dtr_txr, &dst->dtr_txr);
-	if (rc != 0) {
-		M0_ASSERT(rc == -ENOMEM);
-		m0_free(dst);
-		return NULL;
-	}
-
-	rc = m0_buf_copy(&dst->dtr_payload, &src->dtr_payload);
-	if (rc != 0) {
-		m0_dtm0_tx_desc_fini(&dst->dtr_txr);
-		m0_free(dst);
-		return NULL;
-	}
-
-	dst->dtr_msg = src->dtr_msg;
-
-	return dst;
+	int                  rc = 0;
+	struct m0_xcode_obj  src_obj;
+	struct m0_xcode_obj  dest_obj;
+	struct m0_xcode_ctx  sctx;
+	struct m0_xcode_ctx  dctx;
+	struct dtm0_req_fop *dest = NULL;
+
+	/* There is no const version of xcode objects, we'll have to cast it.
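+	 * (Casting away const should be safe here: m0_xcode_dup() only
+	 * reads from the source xcode context.)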
*/ + src_obj = M0_XCODE_OBJ(dtm0_req_fop_xc, (struct dtm0_req_fop *)src); + dest_obj = M0_XCODE_OBJ(dtm0_req_fop_xc, NULL); + m0_xcode_ctx_init(&sctx, &src_obj); + m0_xcode_ctx_init(&dctx, &dest_obj); + dctx.xcx_alloc = m0_xcode_alloc; + rc = m0_xcode_dup(&dctx, &sctx); + if (rc == 0) + dest = dctx.xcx_it.xcu_stack[0].s_obj.xo_ptr; + return dest; } static void dtm0_req_fop_fini(struct dtm0_req_fop *req) @@ -189,7 +197,8 @@ static int drlink_fom_init(struct drlink_fom *fom, static void drlink_fom_fini(struct m0_fom *fom) { struct drlink_fom *df = fom2drlink_fom(fom); - m0_fop_put_lock(df->df_rfop); + if (df->df_rfop != NULL) + m0_fop_put_lock(df->df_rfop); m0_co_op_fini(&df->df_co_op); m0_fom_fini(fom); m0_free(fom); @@ -207,23 +216,6 @@ static void co_long_write_lock(struct m0_co_context *context, M0_CO_YIELD_RC(context, outcome); } -static void co_rpc_link_connect(struct m0_co_context *context, - struct m0_rpc_link *rlink, - struct m0_fom *fom, - int next_phase) -{ - M0_CO_REENTER(context); - - m0_chan_lock(&rlink->rlk_wait); - m0_fom_wait_on(fom, &rlink->rlk_wait, &fom->fo_cb); - m0_chan_unlock(&rlink->rlk_wait); - - m0_rpc_link_connect_async(rlink, M0_TIME_NEVER, NULL); - m0_fom_phase_set(fom, next_phase); - - M0_CO_YIELD_RC(context, M0_FSO_WAIT); -} - static int find_or_add(struct m0_dtm0_service *dtms, const struct m0_fid *tgt, struct dtm0_process **out) @@ -254,6 +246,7 @@ enum drlink_fom_state { DRF_INIT = M0_FOM_PHASE_INIT, DRF_DONE = M0_FOM_PHASE_FINISH, DRF_LOCKING = M0_FOM_PHASE_NR, + DRF_DISCONNECTING, DRF_CONNECTING, DRF_SENDING, DRF_WAITING_FOR_REPLY, @@ -286,10 +279,15 @@ static struct m0_sm_state_descr drlink_fom_states[] = { .sd_name = #name, \ .sd_allowed = allowed \ } - _ST(DRF_LOCKING, M0_BITS(DRF_CONNECTING, + _ST(DRF_LOCKING, M0_BITS(DRF_DISCONNECTING, + DRF_CONNECTING, DRF_SENDING, DRF_FAILED)), + _ST(DRF_DISCONNECTING, M0_BITS(DRF_CONNECTING, + DRF_FAILED)), _ST(DRF_CONNECTING, M0_BITS(DRF_SENDING, + DRF_CONNECTING, + DRF_DISCONNECTING, DRF_FAILED)), _ST(DRF_SENDING, M0_BITS(DRF_DONE, DRF_WAITING_FOR_REPLY, @@ -345,6 +343,7 @@ static int dtm0_process_rlink_reinit(struct dtm0_process *proc, const int max_in_flight = DTM0_MAX_RPCS_IN_FLIGHT; if (!M0_IS0(&proc->dop_rlink)) { + m0_rpc_conn_sessions_cancel(&proc->dop_rlink.rlk_conn); m0_rpc_link_fini(&proc->dop_rlink); M0_SET0(&proc->dop_rlink); } @@ -360,6 +359,9 @@ static int dtm0_process_rlink_send(struct dtm0_process *proc, struct m0_rpc_session *session = &proc->dop_rlink.rlk_sess; struct m0_rpc_item *item = &fop->f_item; + M0_ENTRY("remote: proc=" FID_F ", ep=%s", + FID_P(&proc->dop_rproc_fid), proc->dop_rep); + item->ri_ops = &dtm0_req_fop_rlink_rpc_item_ops; item->ri_session = session; item->ri_prio = M0_RPC_ITEM_PRIO_MID; @@ -368,43 +370,7 @@ static int dtm0_process_rlink_send(struct dtm0_process *proc, if (drf->df_wait_for_ack) m0_co_op_active(&drf->df_co_op); - return m0_rpc_post(item); -} - -/** An aggregated status of a dtm0_process:dop_rlink */ -enum dpr_state { - /** Link is not alive but we can resurrect it. */ - DPR_TRANSIENT, - /** Link is alive and ready to transfer items. */ - DPR_ONLINE, - /** Link is permanently dead. */ - DPR_FAILED, -}; - -static enum dpr_state dpr_state_infer(struct dtm0_process *proc) -{ - /* - * TODO: - * Observe the states of the following enitities: - * RPC connection - * RPC session - * Conf obj - * and then decide whether it is alive, dead or permanently dead. 
- *
- * @verbatim
- *     if (conf_obj is ONLINE) {
- *         if (conn is ACTIVE && session is in (IDLE, BUSY))
- *             return ONLINE;
- *         else
- *             return TRANSIENT;
- *     } else
- *         return FAILED;
- * @endverbatim
- */
-	if (m0_rpc_link_is_connected(&proc->dop_rlink))
-		return DPR_ONLINE;
-
-	return DPR_TRANSIENT;
+	return M0_RC(m0_rpc_post(item));
 }
 
 /*
@@ -430,12 +396,96 @@ static void drlink_addb_drf2parent_relate(struct drlink_fom *drf)
 			 m0_sm_id_get(&drf->df_gen.fo_sm_phase));
 }
 
+
 #define F M0_CO_FRAME_DATA
-static void drlink_coro_fom_tick(struct m0_co_context *context)
+
+/**
+ * Connect-disconnect context for an rpc link.
+ * The context ties together the clink-based notification from the rpc link
+ * channel and a co_op.
+ */
+struct rlink_cd_ctx {
+	struct m0_clink dc_clink;
+	struct m0_co_op dc_op;
+};
+
+static bool rlink_cd_cb(struct m0_clink *clink)
+{
+	struct rlink_cd_ctx *dc = M0_AMB(dc, clink, dc_clink);
+	m0_co_op_done(&dc->dc_op);
+	return false;
+}
+
+static void co_rlink_do(struct m0_co_context *context,
+			struct dtm0_process  *proc,
+			enum drlink_fom_state what)
 {
-	int                rc = 0;
 	struct drlink_fom *drf = M0_AMB(drf, context, df_co);
 	struct m0_fom     *fom = &drf->df_gen;
+	int                timeout_sec = 0;
+	void             (*action)(struct m0_rpc_link *, m0_time_t,
+				   struct m0_clink *) = NULL;
+
+	M0_CO_REENTER(context, struct rlink_cd_ctx dc;);
+
+	M0_SET0(&F(dc));
+	m0_clink_init(&F(dc).dc_clink, rlink_cd_cb);
+	F(dc).dc_clink.cl_is_oneshot = true;
+	m0_co_op_init(&F(dc).dc_op);
+	m0_co_op_active(&F(dc).dc_op);
+
+	switch (what) {
+	case DRF_CONNECTING:
+		timeout_sec = DRLINK_CONNECT_TIMEOUT_SEC;
+		action = m0_rpc_link_connect_async;
+		break;
+	case DRF_DISCONNECTING:
+		m0_rpc_conn_sessions_cancel(&proc->dop_rlink.rlk_conn);
+		timeout_sec = DRLINK_DISCONN_TIMEOUT_SEC;
+		action = m0_rpc_link_disconnect_async;
+		break;
+	default:
+		M0_IMPOSSIBLE("%d is something that cannot be done", what);
+		break;
+	}
+
+	action(&proc->dop_rlink, m0_time_from_now(timeout_sec, 0),
+	       &F(dc).dc_clink);
+
+	M0_CO_YIELD_RC(context, m0_co_op_tick_ret(&F(dc).dc_op, fom, what));
+	m0_chan_wait(&F(dc).dc_clink);
+
+	m0_co_op_fini(&F(dc).dc_op);
+	m0_clink_fini(&F(dc).dc_clink);
+}
+
+static bool has_volatile_param(struct m0_conf_obj *obj)
+{
+	struct m0_conf_service  *svc;
+	const char             **param;
+
+	svc = M0_CONF_CAST(obj, m0_conf_service);
+	M0_ASSERT(svc->cs_params != NULL);
+
+	for (param = svc->cs_params; *param != NULL; ++param) {
+		if (m0_streq(*param, "origin:in-volatile"))
+			return true;
+		else if (m0_streq(*param, "origin:in-persistent"))
+			return false;
+	}
+
+	M0_IMPOSSIBLE("Service origin is not defined in the config?");
+}
+
+static void drlink_coro_fom_tick(struct m0_co_context *context)
+{
+	int                 rc = 0;
+	struct drlink_fom  *drf = M0_AMB(drf, context, df_co);
+	struct m0_fom      *fom = &drf->df_gen;
+	const char         *reason = "Unknown";
+	struct m0_conf_obj *obj = NULL;
+	struct m0_confc    *confc = m0_reqh2confc(m0_fom2reqh(fom));
+	bool                always_reconnect = false;
 
 	M0_CO_REENTER(context,
 		      struct m0_long_lock_link llink;
@@ -455,8 +505,10 @@ static void drlink_coro_fom_tick(struct m0_co_context *context)
 	 */
 	m0_mutex_unlock(&drf->df_svc->dos_generic.rs_mutex);
 
-	if (rc != 0)
+	if (rc != 0) {
+		reason = "Cannot find-or-add remote process";
 		goto out;
+	}
 
 	m0_long_lock_link_init(&F(llink), fom, &F(llock_addb2));
 
@@ -466,27 +518,73 @@ static void drlink_coro_fom_tick(struct m0_co_context *context)
 					 DRF_LOCKING));
 	M0_ASSERT(m0_long_is_write_locked(&F(proc)->dop_llock, fom));
 
-	if (dpr_state_infer(F(proc)) == DPR_TRANSIENT) {
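+
+	/*
+	 * A service with volatile origin (e.g. a client) loses its state on
+	 * restart, so HA may still report it as non-ONLINE. Force the conf
+	 * object ONLINE and re-establish the RPC link from scratch.
+	 */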
+	m0_conf_cache_lock(&confc->cc_cache);
+	obj = m0_conf_cache_lookup(&confc->cc_cache, &F(proc)->dop_rserv_fid);
+	M0_ASSERT(obj != NULL);
+	if (has_volatile_param(obj) && obj->co_ha_state != M0_NC_ONLINE) {
+		M0_LOG(M0_DEBUG, "Force state transition %s -> %s for " FID_F,
+		       m0_ha_state2str(obj->co_ha_state),
+		       m0_ha_state2str(M0_NC_ONLINE),
+		       FID_P(&F(proc)->dop_rserv_fid));
+		obj->co_ha_state = M0_NC_ONLINE;
+		m0_chan_broadcast(&obj->co_ha_chan);
+		always_reconnect = true;
+	}
+	m0_conf_cache_unlock(&confc->cc_cache);
+
+	/* Reconnect if the session was cancelled. */
+	if (m0_rpc_link_is_connected(&F(proc)->dop_rlink) &&
+	    F(proc)->dop_rlink.rlk_sess.s_cancelled) {
+		always_reconnect = true;
+	}
+
+	/* XXX:
+	 * At this moment we cannot detect client failures.
+	 * Because of that, we cannot detect the case where
+	 * the client drops RPC items because it cannot
+	 * find the corresponding connection.
+	 * As a workaround, we force drlink to re-connect
+	 * whenever it tries to send a message.
+	 */
+
+	if (always_reconnect) {
+		if (m0_rpc_link_is_connected(&F(proc)->dop_rlink)) {
+			M0_CO_FUN(context, co_rlink_do(context, F(proc),
+						       DRF_DISCONNECTING));
+		}
+
 		rc = dtm0_process_rlink_reinit(F(proc), drf);
-		if (rc != 0)
+		if (rc != 0) {
+			reason = "Cannot reinit RPC link.";
 			goto unlock;
+		}
 		/*
 		 * TODO handle network failure after link is connected, but
 		 * before the message is successfully sent
 		 */
-		M0_CO_FUN(context, co_rpc_link_connect(context,
-						       &F(proc)->dop_rlink,
-						       fom, DRF_CONNECTING));
+		M0_CO_FUN(context, co_rlink_do(context, F(proc),
+					       DRF_CONNECTING));
+	} else {
+		if (!m0_rpc_link_is_connected(&F(proc)->dop_rlink)) {
+			rc = dtm0_process_rlink_reinit(F(proc), drf);
+			if (rc != 0) {
+				reason = "Cannot reinit RPC link.";
+				goto unlock;
+			}
+			M0_CO_FUN(context, co_rlink_do(context, F(proc),
+						       DRF_CONNECTING));
+		}
 	}
 
-	if (dpr_state_infer(F(proc)) == DPR_FAILED)
-		goto unlock;
+	M0_ASSERT(ergo(m0_rpc_link_is_connected(&F(proc)->dop_rlink),
+		       !F(proc)->dop_rlink.rlk_sess.s_cancelled));
 
-	M0_ASSERT(dpr_state_infer(F(proc)) == DPR_ONLINE);
 	m0_fom_phase_set(fom, DRF_SENDING);
 	rc = dtm0_process_rlink_send(F(proc), drf);
-	if (rc != 0)
+	if (rc != 0) {
+		reason = "Failed to post a message to RPC layer.";
 		goto unlock;
+	}
 
 	/* Safety: FOP (and item) can be released only in ::drlink_fom_fini.
*/ drlink_addb_drf2item_relate(drf); @@ -497,15 +595,27 @@ static void drlink_coro_fom_tick(struct m0_co_context *context) DRF_WAITING_FOR_REPLY)); m0_co_op_reset(&drf->df_co_op); rc = m0_rpc_item_error(&drf->df_rfop->f_item); + if (rc != 0) { + reason = "Rpc item error"; + goto unlock; + } } unlock: + if (drf->df_rfop != NULL) { + m0_fop_put_lock(drf->df_rfop); + drf->df_rfop = NULL; + } + m0_long_write_unlock(&F(proc)->dop_llock, &F(llink)); m0_long_lock_link_fini(&F(llink)); out: /* TODO handle the error */ - if (rc != 0) + if (rc != 0) { + M0_LOG(M0_ERROR, "Failed to send a message to" FID_F ", rc=%d, " + "reason=%s", FID_P(&drf->df_tgt), rc, reason); m0_fom_phase_move(fom, rc, DRF_FAILED); + } if (drf->df_op != NULL) m0_be_op_done(drf->df_op); diff --git a/dtm0/dtx.c b/dtm0/dtx.c index b9825b769cd..f81009e0bde 100644 --- a/dtm0/dtx.c +++ b/dtm0/dtx.c @@ -46,15 +46,7 @@ static struct m0_sm_state_descr dtx_states[] = { }, [M0_DDS_INPROGRESS] = { .sd_name = "inprogress", - .sd_allowed = M0_BITS(M0_DDS_EXECUTED, M0_DDS_FAILED), - }, - [M0_DDS_EXECUTED] = { - .sd_name = "executed", - .sd_allowed = M0_BITS(M0_DDS_EXECUTED_ALL), - }, - [M0_DDS_EXECUTED_ALL] = { - .sd_name = "executed-all", - .sd_allowed = M0_BITS(M0_DDS_STABLE), + .sd_allowed = M0_BITS(M0_DDS_STABLE, M0_DDS_FAILED), }, [M0_DDS_STABLE] = { .sd_name = "stable", @@ -72,11 +64,9 @@ static struct m0_sm_state_descr dtx_states[] = { static struct m0_sm_trans_descr dtx_trans[] = { { "populated", M0_DDS_INIT, M0_DDS_INPROGRESS }, - { "executed", M0_DDS_INPROGRESS, M0_DDS_EXECUTED }, - { "exec-all", M0_DDS_EXECUTED, M0_DDS_EXECUTED_ALL }, - { "exec-fail", M0_DDS_INPROGRESS, M0_DDS_FAILED }, - { "stable", M0_DDS_EXECUTED_ALL, M0_DDS_STABLE }, - { "prune", M0_DDS_STABLE, M0_DDS_DONE } + { "stabilised", M0_DDS_INPROGRESS, M0_DDS_STABLE }, + { "stab-fail", M0_DDS_INPROGRESS, M0_DDS_FAILED }, + { "prune-it", M0_DDS_STABLE, M0_DDS_DONE } }; struct m0_sm_conf m0_dtx_sm_conf = { @@ -109,7 +99,6 @@ static int dtx_log_insert(struct m0_dtm0_dtx *dtx) struct m0_dtm0_log_rec *record = M0_AMB(record, dtx, dlr_dtx); int rc; - M0_PRE(m0_dtm0_tx_desc_state_eq(&dtx->dd_txd, M0_DTPS_INPROGRESS)); M0_PRE(dtx->dd_dtms != NULL); log = dtx->dd_dtms->dos_log; M0_PRE(log != NULL); @@ -285,12 +274,20 @@ static int dtx_fid_assign(struct m0_dtm0_dtx *dtx, static int dtx_close(struct m0_dtm0_dtx *dtx) { int rc; + int i; + struct m0_dtm0_tx_pa *pa; M0_ENTRY("dtx=%p", dtx); M0_PRE(dtx != NULL); M0_PRE(m0_sm_group_is_locked(dtx->dd_sm.sm_grp)); + for (i = 0; i < dtx->dd_txd.dtd_ps.dtp_nr; ++i) { + pa = &dtx->dd_txd.dtd_ps.dtp_pa[i]; + pa->p_state = max_check(pa->p_state, + (uint32_t)M0_DTPS_EXECUTED); + } + /* * TODO:REDO: We may want to capture the fop contents here. * See ::fol_record_pack and ::m0_fop_encdec for the details. @@ -299,14 +296,11 @@ static int dtx_close(struct m0_dtm0_dtx *dtx) */ rc = dtx_log_insert(dtx); - M0_ASSERT(rc == 0); + if (rc == 0) + m0_sm_state_set(&dtx->dd_sm, M0_DDS_INPROGRESS); + else + m0_sm_move(&dtx->dd_sm, rc, M0_DDS_FAILED); - /* - * Once a dtx is closed, the FOP (or FOPs) has to be serialized - * into the log, so that we should no longer hold any references to it. 
-	 */
-	dtx->dd_fop = NULL;
 
-	m0_sm_state_set(&dtx->dd_sm, M0_DDS_INPROGRESS);
 	return M0_RC(rc);
 }
 
@@ -322,24 +316,6 @@ static void dtx_done(struct m0_dtm0_dtx *dtx)
 	M0_LEAVE();
 }
 
-static void dtx_exec_all_ast_cb(struct m0_sm_group *grp, struct m0_sm_ast *ast)
-{
-	struct m0_dtm0_dtx *dtx = ast->sa_datum;
-
-	M0_ENTRY("dtx=%p", dtx);
-
-	M0_ASSERT(dtx->dd_sm.sm_state == M0_DDS_EXECUTED_ALL);
-	M0_ASSERT(dtx->dd_nr_executed == dtx->dd_txd.dtd_ps.dtp_nr);
-
-	if (m0_dtm0_tx_desc_state_exists(&dtx->dd_txd, M0_DTPS_PERSISTENT)) {
-		m0_sm_state_set(&dtx->dd_sm, M0_DDS_STABLE);
-		M0_LOG(M0_DEBUG, "dtx " DTID0_F "is stable (EXEC_ALL)",
-		       DTID0_P(&dtx->dd_txd.dtd_id));
-	}
-
-	M0_LEAVE();
-}
-
 static void dtx_persistent_ast_cb(struct m0_sm_group *grp,
 				  struct m0_sm_ast   *ast)
 {
@@ -355,21 +331,20 @@
 
 	m0_mutex_lock(&log->dl_lock);
 	rec = m0_be_dtm0_log_find(log, &txd->dtd_id);
+	dtx = rec == NULL ? NULL : &rec->dlr_dtx;
 
-	if (rec != NULL) {
+	if (dtx != NULL && dtx->dd_sm.sm_state < M0_DDS_STABLE) {
 		dtx = &rec->dlr_dtx;
 		m0_dtm0_tx_desc_apply(&dtx->dd_txd, txd);
 		dtx_log_update(dtx);
-		if (dtx->dd_sm.sm_state == M0_DDS_EXECUTED_ALL &&
-		    m0_dtm0_tx_desc_state_exists(&dtx->dd_txd,
-						 M0_DTPS_PERSISTENT)) {
-			M0_ASSERT(dtx->dd_nr_executed ==
-				  dtx->dd_txd.dtd_ps.dtp_nr);
-			M0_LOG(M0_DEBUG, "dtx " DTID0_F "is stable (PMA)",
-			       DTID0_P(&dtx->dd_txd.dtd_id));
-			m0_sm_state_set(&dtx->dd_sm, M0_DDS_STABLE);
-		}
+		M0_LOG(M0_DEBUG, "dtx " DTID0_F " is stable.",
+		       DTID0_P(&dtx->dd_txd.dtd_id));
+		/*
+		 * At this moment, DTX is considered to be stable if
+		 * at least one Pmsg was received.
+		 */
+		m0_sm_state_set(&dtx->dd_sm, M0_DDS_STABLE);
 	}
 
 	m0_mutex_unlock(&log->dl_lock);
@@ -404,9 +379,8 @@ M0_INTERNAL void m0_dtm0_dtx_pmsg_post(struct m0_be_dtm0_log *log,
 	m0_mutex_lock(&log->dl_lock);
 	rec = m0_be_dtm0_log_find(log, &txd->dtd_id);
 	dtx_sm_grp = rec != NULL ? rec->dlr_dtx.dd_sm.sm_grp : NULL;
-	m0_mutex_unlock(&log->dl_lock);
 
-	if (rec != NULL) {
+	if (dtx_sm_grp != NULL) {
 		M0_ASSERT(fop->f_opaque != NULL);
 		pma = fop->f_opaque;
 		*pma = (struct m0_dtm0_pmsg_ast) {
@@ -423,61 +397,8 @@ M0_INTERNAL void m0_dtm0_dtx_pmsg_post(struct m0_be_dtm0_log *log,
 		m0_fop_get(fop);
 		m0_sm_ast_post(dtx_sm_grp, &pma->p_ast);
 	}
-	M0_LEAVE();
-}
-
-static void dtx_executed(struct m0_dtm0_dtx *dtx, uint32_t idx)
-{
-	struct m0_dtm0_tx_pa *pa;
-
-	M0_ENTRY("dtx=%p, idx=%"PRIu32, dtx, idx);
-
-	M0_PRE(dtx != NULL);
-	M0_PRE(m0_sm_group_is_locked(dtx->dd_sm.sm_grp));
-
-	pa = &dtx->dd_txd.dtd_ps.dtp_pa[idx];
-
-	M0_ASSERT(pa->p_state >= M0_DTPS_INPROGRESS);
-
-	pa->p_state = max_check(pa->p_state, (uint32_t)M0_DTPS_EXECUTED);
-
-	dtx->dd_nr_executed++;
-
-	if (dtx->dd_sm.sm_state < M0_DDS_EXECUTED) {
-		M0_ASSERT(dtx->dd_sm.sm_state == M0_DDS_INPROGRESS);
-		m0_sm_state_set(&dtx->dd_sm, M0_DDS_EXECUTED);
-	}
-
-	if (dtx->dd_nr_executed == dtx->dd_txd.dtd_ps.dtp_nr) {
-		M0_ASSERT(dtx->dd_sm.sm_state == M0_DDS_EXECUTED);
-		M0_ASSERT_INFO(dtds_forall(&dtx->dd_txd, >= M0_DTPS_EXECUTED),
-			       "Non-executed PAs should not exist "
-			       "at this point.");
-		m0_sm_state_set(&dtx->dd_sm, M0_DDS_EXECUTED_ALL);
-
-		/*
-		 * EXECUTED and STABLE should not be triggered within the
-		 * same ast tick. This ast helps us to enforce it.
-		 * XXX: there is a catch22-like problem with DIX states:
-		 * DIX request should already be in "FINAL" state when
-		 * the corresponding dtx reaches STABLE.
However, the dtx - * cannot transit from EXECUTED to STABLE (through EXEC_ALL) - * if DIX reached FINAL already (the list of CAS rops has been - * destroyed). So that the EXEC_ALL ast cuts this knot by - * scheduling the transition EXEC_ALL -> STABLE in a separate - * tick where DIX request reached FINAL. - */ - dtx->dd_exec_all_ast = (struct m0_sm_ast) { - .sa_cb = dtx_exec_all_ast_cb, - .sa_datum = dtx, - }; - m0_sm_ast_post(dtx->dd_sm.sm_grp, &dtx->dd_exec_all_ast); - } - - m0_mutex_lock(&dtx->dd_dtms->dos_log->dl_lock); - dtx_log_update(dtx); - m0_mutex_unlock(&dtx->dd_dtms->dos_log->dl_lock); + m0_mutex_unlock(&log->dl_lock); M0_LEAVE(); } @@ -516,19 +437,12 @@ M0_INTERNAL void m0_dtx0_fop_assign(struct m0_dtx *dtx, dtx_fop_assign(dtx->tx_dtx, pa_idx, pa_fop); } - M0_INTERNAL int m0_dtx0_close(struct m0_dtx *dtx) { M0_PRE(dtx != NULL); return dtx_close(dtx->tx_dtx); } -M0_INTERNAL void m0_dtx0_executed(struct m0_dtx *dtx, uint32_t pa_idx) -{ - M0_PRE(dtx != NULL); - dtx_executed(dtx->tx_dtx, pa_idx); -} - M0_INTERNAL void m0_dtx0_done(struct m0_dtx *dtx) { struct m0_be_dtm0_log *log; diff --git a/dtm0/dtx.h b/dtm0/dtx.h index ac3dfc8448d..f8907579b7b 100644 --- a/dtm0/dtx.h +++ b/dtm0/dtx.h @@ -33,10 +33,6 @@ enum m0_dtm0_dtx_state { M0_DDS_INIT, /* dtx has a valid tx record. */ M0_DDS_INPROGRESS, - /* dtx got one reply. */ - M0_DDS_EXECUTED, - /* dtx got all replies. */ - M0_DDS_EXECUTED_ALL, /* dtx got enough PERSISTENT messages. */ M0_DDS_STABLE, /* dtx can be released when this state reached. */ @@ -61,8 +57,6 @@ struct m0_dtm0_dtx { struct m0_sm dd_sm; struct m0_dtm0_tx_desc dd_txd; struct m0_dtm0_service *dd_dtms; - uint32_t dd_nr_executed; - struct m0_sm_ast dd_exec_all_ast; /* * XXX: The implementation is very simple and it relies on the idea @@ -129,13 +123,6 @@ M0_INTERNAL void m0_dtx0_fop_assign(struct m0_dtx *dtx, */ M0_INTERNAL int m0_dtx0_close(struct m0_dtx *dtx); -/** - * Notifies DTM0 that DTX is executed on the particular participant. - * @param dtx A DTX that is executed on the particular participant. - * @param pa_idx Index of the participant. - */ -M0_INTERNAL void m0_dtx0_executed(struct m0_dtx *dtx, uint32_t pa_idx); - /** * Marks a transaction as "no longer in-use". 
* The user does not have exclusive ownership on a dtx after it has diff --git a/dtm0/fop.c b/dtm0/fop.c index 3fd5b1ace6d..7b88c6bdefd 100644 --- a/dtm0/fop.c +++ b/dtm0/fop.c @@ -24,6 +24,7 @@ #include "lib/trace.h" /* M0_LOG */ #include "cas/cas.h" #include "cas/cas_xc.h" +#include "dix/fid_convert.h" /* m0_dix_fid__device_id_set */ #include "dtm0/fop.h" #include "dtm0/fop_xc.h" #include "dtm0/addb2.h" @@ -58,7 +59,8 @@ static int dtm0_fom_create(struct m0_fop *fop, struct m0_fom **out, struct m0_reqh *reqh); static void dtm0_fom_fini(struct m0_fom *fom); static size_t dtm0_fom_locality(const struct m0_fom *fom); -static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req, +static int dtm0_cas_fop_prepare(struct m0_reqh *reqh, + struct dtm0_req_fop *req, struct m0_fop_type *cas_fopt, struct m0_fop **cas_fop_out); static int dtm0_cas_fom_spawn( @@ -119,6 +121,7 @@ struct m0_sm_state_descr dtm0_phases[] = { .sd_name = "dtm0-entry", .sd_allowed = M0_BITS(M0_FOPH_DTM0_LOGGING, M0_FOPH_DTM0_TO_CAS, + M0_FOPH_DTM0_CAS_DONE, M0_FOPH_SUCCESS, M0_FOPH_FAILURE) }, @@ -150,6 +153,8 @@ struct m0_sm_trans_descr dtm0_phases_trans[] = { {"dtm0-to-cas", M0_FOPH_DTM0_ENTRY, M0_FOPH_DTM0_TO_CAS}, + {"dtm0-empty-rec", M0_FOPH_DTM0_ENTRY, M0_FOPH_DTM0_CAS_DONE}, + {"dtm0-to-cas-fail", M0_FOPH_DTM0_TO_CAS, M0_FOPH_FAILURE}, {"dtm0-cas-done", M0_FOPH_DTM0_TO_CAS, M0_FOPH_DTM0_CAS_DONE}, @@ -202,8 +207,10 @@ M0_INTERNAL int m0_dtm0_fop_init(void) .rpc_flags = M0_RPC_ITEM_TYPE_REPLY, .fom_ops = &dtm0_req_fom_type_ops); - return m0_fop_type_addb2_instrument(&dtm0_req_fop_fopt) ?: + return m0_fop_type_addb2_instrument(&dtm0_req_fop_fopt); + /* m0_fop_type_addb2_instrument(&dtm0_redo_fop_fopt); + */ } @@ -358,7 +365,7 @@ M0_INTERNAL int m0_dtm0_on_committed(struct m0_fom *fom, if (m0_fid_eq(target, source)) target = &txd->dtd_id.dti_fid; - rc = m0_dtm0_req_post(dtms, NULL, &req, target, fom, false); + rc = m0_dtm0_req_post(dtms, NULL, &req, target, fom, true); if (rc != 0) { M0_LOG(M0_WARN, "Failed to send PERSISTENT msg " FID_F " -> " FID_F " (%d).", @@ -532,7 +539,35 @@ static int dtm0_cas_fom_spawn( #endif } -static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req, +static void dtm0_cas_sdev_id_set( + struct m0_reqh *reqh, + const struct m0_reqh_service_type *stype, + struct m0_cas_op *cas_op) +{ +#ifndef __KERNEL__ + struct m0_cas_id *cid = &cas_op->cg_id; + uint32_t sdev_id; + + M0_PRE(reqh != NULL); + M0_PRE(stype != NULL); + M0_PRE(cas_op != NULL); + + M0_ENTRY(); + + sdev_id = m0_cas_svc_device_id_get(stype, reqh); + M0_LOG(M0_DEBUG, "Replace device ID: %d -> %d", + m0_dix_fid_cctg_device_id(&cid->ci_fid), sdev_id); + m0_dix_fid__device_id_set(&cid->ci_fid, sdev_id); + + M0_LEAVE(); +#else + /* CAS service is not compiled for kernel. 
*/ + return; +#endif +} + +static int dtm0_cas_fop_prepare(struct m0_reqh *reqh, + struct dtm0_req_fop *req, struct m0_fop_type *cas_fopt, struct m0_fop **cas_fop_out) { @@ -540,6 +575,8 @@ static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req, struct m0_cas_op *cas_op; struct m0_fop *cas_fop; + M0_ENTRY(); + *cas_fop_out = NULL; M0_ALLOC_PTR(cas_op); @@ -552,9 +589,12 @@ static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req, &M0_XCODE_OBJ(m0_cas_op_xc, cas_op), req->dtr_payload.b_addr, req->dtr_payload.b_nob); - if (rc == 0) - m0_fop_init(cas_fop, cas_fopt, cas_op, &m0_fop_release); - else + if (rc == 0) { + dtm0_cas_sdev_id_set( + reqh, cas_fopt->ft_fom_type.ft_rstype, cas_op); + m0_fop_init(cas_fop, cas_fopt, cas_op, + &m0_fop_release); + } else M0_LOG(M0_ERROR, "Could not decode the REDO payload"); } @@ -565,7 +605,7 @@ static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req, m0_free(cas_fop); } - return rc; + return M0_RC(rc); } static int dtm0_rmsg_fom_tick(struct m0_fom *fom) @@ -576,6 +616,12 @@ static int dtm0_rmsg_fom_tick(struct m0_fom *fom) struct dtm0_fom *dfom = M0_AMB(dfom, fom, dtf_fom); struct dtm0_req_fop *req = m0_fop_data(fom->fo_fop); struct m0_fop *cas_fop = NULL; + struct m0_dtm0_service *svc = m0_dtm0_fom2service(fom); + struct m0_dtm0_recovery_machine *m = &svc->dos_remach; + const struct m0_dtm0_tx_desc *txd = &req->dtr_txr; + + M0_PRE(ergo(m0_dtm0_tx_desc_is_none(txd), + !!(req->dtr_flags & M0_BITS(M0_DMF_EOL)))); M0_ENTRY("fom %p phase %d", fom, phase); @@ -584,7 +630,9 @@ static int dtm0_rmsg_fom_tick(struct m0_fom *fom) result = m0_fom_tick_generic(fom); break; case M0_FOPH_DTM0_ENTRY: - m0_fom_phase_set(fom, M0_FOPH_DTM0_TO_CAS); + m0_fom_phase_set(fom, + m0_dtm0_tx_desc_is_none(txd) ? + M0_FOPH_DTM0_CAS_DONE : M0_FOPH_DTM0_TO_CAS); break; case M0_FOPH_DTM0_TO_CAS: /* REDO_END()s from all recovering processes received, send @@ -593,7 +641,8 @@ static int dtm0_rmsg_fom_tick(struct m0_fom *fom) cs_ha_process_event(m0_cs_ctx_get(m0_fom_reqh(fom)), M0_CONF_HA_PROCESS_DTM_RECOVERED); */ - rc = dtm0_cas_fop_prepare(req, &cas_put_fopt, &cas_fop); + rc = dtm0_cas_fop_prepare(dfom->dtf_fom.fo_service->rs_reqh, + req, &cas_put_fopt, &cas_fop); if (rc == 0) { rc = dtm0_cas_fom_spawn(dfom, cas_fop, &dtm0_cas_done_cb); @@ -621,6 +670,25 @@ static int dtm0_rmsg_fom_tick(struct m0_fom *fom) M0_FOPH_FAILURE); } else { m0_fom_phase_set(fom, M0_FOPH_SUCCESS); + /* + * TODO: make it async when recovery machine starts + * using a larger sliding window or the amount of + * participants gets bigger than the length of + * the EOL queue. + * At this moment EOLQ_MAX_LEN is 100, and we may + * have at most 3 concurrent EOL. It leaves room + * for 97 pending HA state transitions which is + * far more than enough for a 3 node cluster with + * one single failure. + */ + /* + * TODO: Consider propagating FOM or its sm_id + * (directly or indirectly) down to the recovery + * machine, so that we may relate REDO FOM + * and the corresponding recovery FOM. + */ + M0_BE_OP_SYNC(op, + m0_dtm0_recovery_machine_redo_post(m, req, &op)); } break; default: diff --git a/dtm0/fop.h b/dtm0/fop.h index 38f00cc15bf..357aac4a572 100644 --- a/dtm0/fop.h +++ b/dtm0/fop.h @@ -47,16 +47,34 @@ enum m0_dtm0s_msg { DTM_EXECUTE, DTM_EXECUTED, DTM_PERSISTENT, - DTM_REDO + DTM_REDO, } M0_XCA_ENUM; +enum m0_dtm0_msg_flags { + M0_DMF_EOL, + M0_DMF_EVICTION, +}; + /** A DTM0 message sent as an RPC request to remote DTM0 services. 
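 *
 * A request with an all-zero ("none") transaction descriptor is only valid
 * when M0_DMF_EOL is set in dtr_flags: such EOL messages mark the end of the
 * sender's log during recovery (see the precondition in
 * dtm0_rmsg_fom_tick()).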
*/ struct dtm0_req_fop { uint32_t dtr_msg M0_XCA_FENUM(m0_dtm0s_msg); struct m0_dtm0_tx_desc dtr_txr; struct m0_buf dtr_payload; + uint64_t dtr_flags; + /** + * The participant (DTM0 service) that sent this message. + * The initiator is set for DTM_REDO messages. + */ + struct m0_fid dtr_initiator; } M0_XCA_RECORD M0_XCA_DOMAIN(rpc); + +#define REDO_F "redo={ txid=" DTID0_F ", ini=" FID_F ", is_eol=%d }" +#define REDO_P(_redo) \ + DTID0_P(&(_redo)->dtr_txr.dtd_id), \ + FID_P(&(_redo)->dtr_initiator), \ + !!((_redo)->dtr_flags & M0_BITS(M0_DMF_EOL)) + struct dtm0_rep_fop { /** Status code of dtm operation. */ int32_t dr_rc; diff --git a/dtm0/it/all2all/.gitignore b/dtm0/it/all2all/.gitignore new file mode 100644 index 00000000000..5e548163dff --- /dev/null +++ b/dtm0/it/all2all/.gitignore @@ -0,0 +1,4 @@ +m0trace.* +addb_*/ +addb-stobs-*/ +m0crate.yaml diff --git a/dtm0/it/all2all/all2all b/dtm0/it/all2all/all2all index f659b62b9d7..b6db8f8985a 100755 --- a/dtm0/it/all2all/all2all +++ b/dtm0/it/all2all/all2all @@ -15,6 +15,10 @@ LOOP_IMG_DIR=$TEST_ROOT CLIENT_PID= M0D_DIR_COMMON=$MOTR_VAR_DIR/m0d-0x720000000000000 ADDB_DUMP_DIR="/tmp/a2a-addb-out" +CTGDUMP="$MOTR_ROOT/cas/m0ctgdump" +DINDEX="1:5" +CTGDUMP_DIR="/tmp/a2a-ctgdump" + M0D_ENDPOINTS=() M0D_FIDS_DEC=() @@ -23,10 +27,16 @@ M0D_PIDS=() M0D_CLI_FID_DEC= M0D_CLI_FID_HEX= +MOD_CLI_EP= POOL_WIDTH=4 IPOOL_WIDTH=2 +# m0d that got killed +VICTIM=1 +# m0ds that are alive +WITNESSES=(0 2) + . ${MOTR_ROOT}/scripts/addb-py/chronometry/common/common_funcs @@ -37,7 +47,12 @@ function stop_cluster() function bootstrap_cluster() { - hctl bootstrap --mkfs $CURRENT_CDF + # XXX: For some reason Hare does not like localhost. + hctl bootstrap --mkfs <(sed "s/localhost/$(hostname -f)/" $CURRENT_CDF) + if [[ $? -ne 0 ]]; then + _err "Cluster bootstrap failed" + exit 1 + fi } function get_m0d_pids() @@ -46,7 +61,7 @@ function get_m0d_pids() local pid for fid in ${M0D_FIDS_HEX[@]} ; do - pid=$(ps ax | grep m0d | grep $fid | awk '{ print $1; }') + pid=$(pgrep -a lt-m0d | grep $fid | awk '{ print $1; }') M0D_PIDS+=($pid) pids+="$pid " done @@ -60,10 +75,12 @@ function create_m0crate_cfg() local svcs_json_out=$(echo $hctl_json_out | jq -r '.nodes[] | .svcs[]') local PROF=$(echo $hctl_json_out | jq -r '.profiles[] | .fid') - local MOTR_LOCAL_ADDR=$(echo $svcs_json_out | jq -r 'select( .name | contains("m0_client")) | .ep') - local PROCESS_FID=$(echo $svcs_json_out | jq -r 'select( .name | contains("m0_client")) | .fid') + local MOTR_LOCAL_ADDR=$(echo $svcs_json_out | jq -r 'select( .name | contains("motr_client")) | .ep') + local PROCESS_FID=$(echo $svcs_json_out | jq -r 'select( .name | contains("motr_client")) | .fid') local MOTR_HA_ADDR=$(echo $svcs_json_out | jq -r 'select( .name | contains("hax")) | .ep') + MOD_CLI_EP=$(echo "$MOTR_LOCAL_ADDR") + local M0CRATE_CFG_TMP=m0crate_cfg.tmp cp $M0CRATE_CFG_IN $M0CRATE_CFG_TMP sed -i "s/###__PROF__###/$PROF/g" $M0CRATE_CFG_TMP @@ -77,64 +94,175 @@ function get_params_for_ha_msgs() { local svcs_json_out=$(hctl status --json | jq -r '.nodes[] | .svcs[]') local svc_json_out=$(echo $svcs_json_out | jq -r 'select( .name | contains("ioservice"))') - local cli_json_out=$(echo $svcs_json_out | jq -r 'select( .name | contains("m0_client"))') - M0D_ENDPOINTS=($(echo $svc_json_out | jq -r '.ep' | sed -E 's/.*@tcp[:](.*)/\1/')) + local cli_json_out=$(echo $svcs_json_out | jq -r 'select( .name | contains("motr_client"))') + M0D_ENDPOINTS=($(echo $svc_json_out | jq -r '.ep')) M0D_FIDS_HEX=($(echo $svc_json_out | jq -r '.fid' | 
sed -E 's/0x720+([0-9][:]0x[A-Za-z0-9]+)/\1/')) M0D_FIDS_DEC=($(echo $svc_json_out | jq -r '.fid' | sed -E 's/0x720+([0-9][:])(0x[A-Za-z0-9]+)/printf "%s%d" \1 \2/e')) M0D_CLI_FID_DEC=$(echo $cli_json_out | jq -r '.fid' | sed -E 's/0x720+([0-9][:])(0x[A-Za-z0-9]+)/printf "%s%d" \1 \2/e') M0D_CLI_FID_HEX=$(echo $cli_json_out | jq -r '.fid' | sed -E 's/0x720+([0-9][:]0x[A-Za-z0-9]+)/\1/') } -function ha_msg_send_transient() +# params +# (i) whom to send the message (endpoint). Use "cli" for the client; 0, 1, 2, etc. for m0ds. +# (ii) who has changed its state. Use "cli" or 0, 1, 2. +# (iii) what happened. Use "transient", "online" etc. +function ha_msg_send() { - # Here we send "TRANSIENT" messages to trigger start of - # HA messages handling on the m0d side as dtm0 doesn't - # handle them until "TRANSIENT" received due to incomplete - # implementation on the Hare side. - for i in $(seq 0 $((${#M0D_ENDPOINTS[@]}-1))) ; do - for j in $(seq 0 $((${#M0D_FIDS_DEC[@]}-1))) ; do - if [[ $i -ne $j ]]; then - $MOTR_ST_UTILS_DIR/ha_msg_send.sh "${M0D_ENDPOINTS[$i]}" "^r|${M0D_FIDS_DEC[$j]}" "transient" - break - fi - done - done + local whom="$1" + local who="$2" + local what="$3" + + if [[ "x$whom" == "xcli" ]]; then + whom="${MOD_CLI_EP}" + else + whom="${M0D_ENDPOINTS[$whom]}" + fi + + if [[ "x$who" == "xcli" ]]; then + who="${M0D_CLI_FID_DEC}" + else + who="${M0D_FIDS_DEC[$who]}" + fi + + _info "ha_msg_send: whom='${whom}', who='${who}', what='${what}'" + $MOTR_ST_UTILS_DIR/ha_msg_send.sh "$whom" "^r|$who" "$what" } -function ha_msg_send_online() +function ha_msg_send_cli() { - # Here we send "ONLINE" messages to trigger connections logic. - for i in $(seq 0 $((${#M0D_ENDPOINTS[@]}-1))) ; do - for j in $(seq 0 $((${#M0D_FIDS_DEC[@]}-1))) ; do - if [[ $i -ne $j ]]; then - $MOTR_ST_UTILS_DIR/ha_msg_send.sh "${M0D_ENDPOINTS[$i]}" "^r|${M0D_FIDS_DEC[$j]}" "online" - fi - done - done + local whom="cli" + local who="$1" + local what="$2" + ha_msg_send $whom $who $what } -function ha_msg_send_cli_online() +# params +# (i) who Who changed its state? +# (ii) what What is the new state? +function ha_msg_send_m0ds() { - # Here we send "ONLINE" messages to connect servers to client. - for i in $(seq 0 $((${#M0D_ENDPOINTS[@]}-1))) ; do - $MOTR_ST_UTILS_DIR/ha_msg_send.sh "${M0D_ENDPOINTS[$i]}" "^r|${M0D_CLI_FID_DEC}" "online" + local who="$1" + local what="$2" + + if [[ ${#M0D_ENDPOINTS[@]} -ne ${#M0D_FIDS_DEC[@]} ]]; then + echo "The number of endpoints is not equal to the number of fids. How is that possible?" + exit 1 + fi + + for i in $(seq 0 $((${#M0D_ENDPOINTS[@]}-1))); do + ha_msg_send $i $who $what done } -function expected_trace_lines_num() +function ha_msg_send_all() +{ + local who="$1" + local what="$2" + ha_msg_send_cli "$who" "$what" + ha_msg_send_m0ds "$who" "$what" +} + +# param +# (i) trace_path Path to the trace file +# (ii) pattern String to grep. +# (iii) exp_cnt Expected number of lines that match the pattern. 
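+# Example (hypothetical fid and pid):
+#   expect_trace_lines "${M0D_DIR_COMMON}<fid>/m0trace.<pid>" "ALL2ALL_STARTED" 1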
+function expect_trace_lines +{ + local trace_path="$1" + local pattern="$2" + local exp_cnt="$3" + local cnt + local cmd; + + _info "expect trace: path=$trace_path, pattern=$pattern, exp_cnt=$exp_cnt" + cnt=$($MOTR_ROOT/utils/trace/m0trace -i "$trace_path*" | grep -c "$pattern") + if [[ $cnt -ne $exp_cnt ]]; then + _info "Unexpected number of trace lines: $cnt != $exp_cnt" + return 1 + fi + + _info "Found" + return 0 +} + +function expect_trace_lines_from_m0d() +{ + local index="$1" + local pattern="$2" + local exp_cnt="$3" + local path="${M0D_DIR_COMMON}${M0D_FIDS_HEX[index]}/m0trace.${M0D_PIDS[index]}" + + if expect_trace_lines "$path" "$pattern" "$exp_cnt"; then + return 0; + fi + + return 1; +} + +function expect_trace_lines_from_m0ds() { local pattern="$1" local exp_cnt=$2 - local cnt for i in ${!M0D_PIDS[@]} ; do - cnt=$($MOTR_ROOT/utils/trace/m0trace -i "${M0D_DIR_COMMON}${M0D_FIDS_HEX[i]}/m0trace.${M0D_PIDS[i]}" | grep "$pattern" | wc -l) - if [[ $cnt -ne $exp_cnt ]]; then + if ! expect_trace_lines_from_m0d "$i" "$pattern" "$exp_cnt"; then + _info "Not found in ${M0D_PIDS[i]}" return 1 fi done - return 0 + return 0; +} + +function expect_trace_lines_from_cli() +{ + local pattern="$1" + local exp_cnt="$2" + local path="$PWD/m0trace.$CLIENT_PID" + local cnt + + if ! expect_trace_lines "$path" "$pattern" "$exp_cnt"; then + return 1; + fi + + return 0; +} + +function expect_trace_lines_from_all() +{ + local pattern="$1" + local exp_cnt="$2" + + if expect_trace_lines_from_m0ds "$pattern" "$exp_cnt" && \ + expect_trace_lines_from_cli "$pattern" "$exp_cnt"; then + return 0; + fi + + return 1 +} + +function addb2dump() +{ + local inpfile=$1 + local outfile=$2 + local a2d=$MOTR_ROOT/utils/m0addb2dump + local size + + _info "Dumping ${inpfile} -> ${outfile} ..." + + if ! $a2d -f "${inpfile}" > "${outfile}"; then + if [[ "$(stat -c %s ${inpfile})" == "0" ]]; then + _info "Skipping empty ADDB file." + return 0; + else + _info "Addb file is not empty but cannot dump it" + return 1; + fi + else + echo "OK." + return 0; + fi + } function addb_dump() @@ -143,7 +271,6 @@ function addb_dump() local outfile local inpfile local fid - local a2d=$MOTR_ROOT/utils/m0addb2dump rm -fR "${outdir}" mkdir "${outdir}" @@ -152,15 +279,17 @@ function addb_dump() fid=$(echo "${M0D_FIDS_HEX[i]}" | awk -F'x' '{ print $2; }') outfile="${outdir}/addb_${fid}.dump" inpfile="${M0D_DIR_COMMON}${M0D_FIDS_HEX[i]}/addb-stobs-${M0D_PIDS[i]}/o/100000000000000:2" - _info "Dumping ${inpfile} -> ${outfile} ..." - $a2d -f "${inpfile}" > "${outfile}" + addb2dump ${inpfile} ${outfile} done - inpfile="$PWD/addb_${CLIENT_PID}/o/100000000000000:2" - fid=$(echo "$M0D_CLI_FID_HEX" | awk -F'x' '{ print $2; }') - outfile="${outdir}/addb_${fid}.dump" - _info "Dumping ${inpfile} -> ${outfile} ..." - $a2d -f "${inpfile}" > "${outfile}" + if [[ "x$CLIENT_PID" != "x" ]]; then + inpfile="$PWD/addb_${CLIENT_PID}/o/100000000000000:2" + fid=$(echo "$M0D_CLI_FID_HEX" | awk -F'x' '{ print $2; }') + outfile="${outdir}/addb_${fid}.dump" + addb2dump ${inpfile} ${outfile} + else + _info "Skipping client addb dumps." + fi } function processes_status_check() @@ -186,10 +315,135 @@ function fail() exit 1 } -function main() +function client_run() +{ + _info "Launching the client..." + $MOTR_ROOT/motr/m0crate/m0crate -S $M0CRATE_CFG & + CLIENT_PID=$! + _info "Client pid: ${CLIENT_PID}" +} + +function client_run_gdb() +{ + _info "Launching the client under gdb ..." 
+	libtool --mode=execute gdb --args $MOTR_ROOT/motr/m0crate/m0crate -S $M0CRATE_CFG
+}
+
+function client_wait()
+{
+	wait ${CLIENT_PID}
+	rc="$?"
+	if [[ "$rc" -gt 127 ]]; then
+		_err "Client terminated, rc = $rc"
+		exit 1
+	fi
+	return "$rc"
+}
+
+function m0kv_async()
+{
+	local args
+	args=$(python3 -c "import yaml; \
+		conf = yaml.load(open('$M0CRATE_CFG'), Loader=yaml.FullLoader); \
+		m = conf['MOTR_CONFIG']; \
+		print('-l %s -h %s -p %s -f %s' % (m['MOTR_LOCAL_ADDR'], m['MOTR_HA_ADDR'], m['PROF'], m['PROCESS_FID']));")
+
+	_info "m0kv $args $@"
+	${MOTR_UTILS_DIR}/m0kv $args $@ &
+	CLIENT_PID=$!
+}
+
+function m0kv()
+{
+	m0kv_async $@
+	client_wait
+}
+
+function m0kv_run_sync()
+{
+	m0kv index create "${DINDEX}"
+	m0kv -s index put "${DINDEX}" mykey myvalue
+	m0kv -s index get "${DINDEX}" mykey
+	m0kv index drop "${DINDEX}"
+}
+
+function client_run_sync()
+{
+	client_run
+	client_wait
+}
+
+function ctgdump()
+{
+	local svc_num=$1
+	local proc_fid="0x720000000000000${M0D_FIDS_HEX[$svc_num]}"
+	local index=$DINDEX
+	local svc_ep="${M0D_ENDPOINTS[$svc_num]}"
+	local stobs="/var/motr/m0d-${proc_fid}/stobs"
+	local db="/var/motr/m0d-${proc_fid}/db"
+	local rest="-m 524288 -q 16 -w 8 -U -r 134217728 -c /etc/motr/confd.xc"
+	local out_dir=$CTGDUMP_DIR
+	local fid_hex=$(echo "${M0D_FIDS_HEX[$svc_num]}" | awk -F'x' '{ print $2; }')
+	local out_file="${out_dir}/cas-${fid_hex}.dump"
+	local cmd=$CTGDUMP
+
+	local cmd_args="-e libfab:${svc_ep} -A linuxstob:addb-stobs -f ${proc_fid} -T ad -S ${stobs} -D ${db} ${rest} str ${index}"
+
+	mkdir -p $out_dir
+
+	_info "$cmd $cmd_args"
+	$cmd $cmd_args | sort > "$out_file"
+}
+
+function ctg_eq()
+{
+	local left=$1
+	local right=$2
+	local left_hex=$(echo "${M0D_FIDS_HEX[$left]}" | awk -F'x' '{ print $2; }')
+	local right_hex=$(echo "${M0D_FIDS_HEX[$right]}" | awk -F'x' '{ print $2; }')
+
+	local left_file="${CTGDUMP_DIR}/cas-${left_hex}.dump"
+	local right_file="${CTGDUMP_DIR}/cas-${right_hex}.dump"
+
+	if ! diff -u $left_file $right_file; then
+		_info "Catalogues are different. Run 'diff -u $left_file $right_file' to see the difference."
+		return 1
+	else
+		local nr_records=$(wc -l $left_file | cut -f1 -d' ')
+		_info "Compared ${left_hex} with ${right_hex}. Found ${nr_records} identical records."
+		return 0;
+	fi
+}
+
+function check_ctg_consistency()
+{
+	local ref=$VICTIM
+	local targets=(${WITNESSES[@]})
+
+	rm -fR "$CTGDUMP_DIR"
+
+	for i in $(seq 0 $((${#M0D_FIDS_HEX[@]}-1))); do
+		ctgdump $i
+	done
+
+	for i in ${targets[@]}; do
+		if ! ctg_eq $ref $i; then
+			local ref_pid=${M0D_PIDS[$ref]}
+			local tgt_pid=${M0D_PIDS[$i]}
+			_info "Inconsistency detected between processes $ref_pid and $tgt_pid."
+			exit 1
+		fi
+	done
+}
+
+
+# Phase where we power on the cluster.
+function boot_phase()
 {
 	local cli_pid
 
+	_info "Phase recovery:boot"
+
 	${MOTR_UTILS_DIR}/m0setup --init-loop-only -s 1 -d ${TEST_ROOT} --pool-width ${POOL_WIDTH} --ipool-width ${IPOOL_WIDTH}
 
 	_info "Bootstrapping the cluster using Hare..."
@@ -200,16 +454,112 @@
 	_info "Create m0crate configuration..."
 	create_m0crate_cfg
+}
 
-	_info "Run the client..."
-	$MOTR_ROOT/motr/m0crate/m0crate -S $M0CRATE_CFG &
-	cli_pid=$!
-	wait ${cli_pid}
-	_info "Client pid: ${cli_pid}"
-	CLIENT_PID=${cli_pid}
 
-	stop_cluster
+# Phase where we ensure that all m0ds are online
+# from DTM0 perspective.
+function online_phase()
+{
+	_info "Phase recovery:online"
+
+	_info "Wait until every m0d started"
expect_trace_lines_from_m0ds "ALL2ALL_STARTED" 1; do + :; + done + + _info "Wait until every m0d has recovered" + while ! expect_trace_lines_from_m0ds "ALL2ALL_DTM_RECOVERED" 1; do + :; + done +} + +# Phase where we perform recovery. +function recovery_phase() +{ + # 2 * (3 + 1) where: + # 2 is the number of witnesses; + # 3 is the number of records (mykey1, mykey2, mykey3); + # 1 is the EOL. + local nr_redo=8 + local redo_pattern="m0_dtm0_recovery_machine_redo_post > in-redo" + local dtx_done_pattern="dtx_done >" + local m0kv_wait_file="/tmp/m0kv_wait" + + _info "Phase recovery:recovery" + local svc_name="m0d@0x720000000000000${M0D_FIDS_HEX[VICTIM]}.service" + + _info "Create an index" + m0kv index create "${DINDEX}" + + _info "Populate the index" + m0kv -s index put "${DINDEX}" mykey1 myvalue1 + + _info "PUT one key, hang on" + m0kv_async -s index put "${DINDEX}" mykey2 myvalue2 \ + wait "${m0kv_wait_file}" put "${DINDEX}" mykey3 myvalue3 + + _info "Wait until mykey2 reached the persistent storage on at least one node." + while ! expect_trace_lines_from_cli "$dtx_done_pattern" 1; do + :; + done + + _info "Kill the victim (${M0D_PIDS[VICTIM]})." + kill -9 "${M0D_PIDS[VICTIM]}" + + # XXX We may use the pool machine state change to identify this point. + # However, it may not be enough: we need to get to the point + # where the even was consumed _fully_ (i.e., Motr sent "ack" back to Hare). + _info "Wait a few seconds to ensure everyone learned about it." + sleep 10 + + # Let the client create a non-fully replicated record. + touch "$m0kv_wait_file" + + client_wait + + # TODO: Run ctgcmp to ensure mykey3 does not exist in the victim's storage. + + _info "Resurrect the victim: $svc_name" + systemctl start "$svc_name" + + # Victim's PID should be updated. + M0D_PIDS=() + get_m0d_pids + _info "Wait until the victim gets started." + while ! expect_trace_lines_from_m0d ${VICTIM} "ALL2ALL_STARTED" 1; do + :; + done + + _info "Wait until the victim gets recovered." + while ! expect_trace_lines_from_m0d ${VICTIM} "ALL2ALL_DTM_RECOVERED" 1; do + :; + done + + _info "Wait until the victim gets recovered." + while ! expect_trace_lines_from_m0d ${VICTIM} "ALL2ALL_DTM_RECOVERED" 1; do + :; + done + + _info "Ensure we got enough REDO messages." + while ! expect_trace_lines_from_m0d ${VICTIM} "$redo_pattern" $nr_redo; do + :; + done + + # TODO: Run ctgcmp to ensure mykey3 exists in the victim's storage. +} + +# Phase where we are gathering artifacts +# and shutting down the cluster. +function shutdown_phase() +{ + _info "Phase recovery:shutdown" + + _info "m0d pids: ${M0D_PIDS[@]}" + + stop_cluster addb_dump + check_ctg_consistency _info "Checking processes exit status..." processes_status_check || { @@ -220,4 +570,144 @@ function main() _info "TEST STATUS: PASSED" } -main +function recovery_cycle() +{ + _info "Starting the recovery cycle" + + boot_phase && \ + online_phase && \ + recovery_phase && \ + shutdown_phase +} + +function simple_boot_cycle() +{ + _info "Starting the simple boot cycle" + + boot_phase && + client_run_sync && + shutdown_phase +} + +function print_help() +{ + echo -en " + The script allows you to check one of the following cases: + '$0 ss' - 'Simple' bootstrap/shutdown of the 3-process cluster. + '$0 rec' - 'Recovery procedures' where the cluster is trying to + recover a failed participant. + Each case is called a 'cycle', and there are several + phases withing each cycle. You can use these phases to manually + run pieces of the cycles or individual commands. 
For example: + $0 rec boot + $0 rec m0kv index create \"1:5\" + $0 rec m0kv -s index put \"1:5\" mykey myvalue + $0 rec m0kv -s index put \"1:5\" mykey myvalue + $0 rec m0kv -s index get \"1:5\" mykey + $0 rec m0kv -s index next \"1:5\" \"\\\0\" 1 + $0 rec m0kv -s index del \"1:5\" mykey + $0 rec stop + + Debugging and testing: + 1. Use DTM0 UTs to ensure recovery machine works + with your changes: + sudo ./utils/m0run -d -- m0ut -t dtm0-ut + 2. Use 'dtm0-remach' gdb command to print backtraces + of the coroutines: + $ m0trace -i | grep recovery_machine_init + .... m=0xabcd + $ gdb -p + (gdb) p (struct m0_dtm0_recovery_machine *) 0xabcd + \$1 = (struct m0_dtm0_recovery_machine *) 0xabcd + (gdb) dtm0-remach $1 + < ... prints information about machine ...> + + Known issues: + 1. FOM HUNG warnings for recovery FOM. They have + one single state for waiting on something, and because + of that you may see this warning. + \n"; +} + +# Prints various variables used by the +# script. The function is used only +# for debugging of the script. +function print_env() +{ + get_params_for_ha_msgs + get_m0d_pids + echo "M0D_ENDPOINTS: ${M0D_ENDPOINTS[@]}" + echo "M0D_FIDS_DEC: ${M0D_FIDS_DEC[@]}" + echo "M0D_FIDS_HEX: ${M0D_FIDS_HEX[@]}" + echo "M0D_PIDS: ${M0D_PIDS[@]}" + + echo "M0D_CLI_FID_DEC: ${M0D_CLI_FID_DEC}" + echo "M0D_CLI_FID_HEX: ${M0D_CLI_FID_HEX}" + echo "MOD_CLI_EP: ${MOD_CLI_EP}" +} + +# params: +# (i) cycle_name Name of the test (cycle) +# (ii) phase_name Name of a phase in the cycle. +function main() +{ + local cycle_name="$1" + local phase_name="$2" + + # Run the whole cycle if phase was not + # specified. + if [ "x$phase_name" == "x" ]; then + case $cycle_name in + "ss") + simple_boot_cycle;; + "rec") + recovery_cycle;; + "-h") + print_help;; + "--help") + print_help;; + "--print-env") + print_env;; + *) + # Run ordinary all2al by default. + simple_boot_cycle;; + esac + else + if [[ "x$cycle_name" == "xrec" ]]; then + case $phase_name in + "boot") + boot_phase;; + "online") + online_phase;; + "recovery") + recovery_phase;; + "shutdown") + shutdown_phase;; + "m0kv") + shift; + shift; + m0kv $@;; + "stop") + killall -9 lt-m0ham || true; + killall -9 lt-m0d || true; + killall -9 hax || true; + killall -9 lt-m0crate || true; + killall -9 lt-m0kv || true; + killall -9 lt-m0mkfs || true; + hctl shutdown;; + *) + echo "Wrong phase: $phase_name" + echo "Use one of the following: " + echo " boot online recovery shutdown stop" + exit 1;; + esac + else + echo "Unsupported cycle: $cycle_name." + echo "Use 'r' or 's' (recovery, simple)." 
+ exit 1 + fi + fi + +} + +main $@ diff --git a/dtm0/it/all2all/cdf.yaml b/dtm0/it/all2all/cdf.yaml index 85122b73bf1..aea11c4a3ff 100644 --- a/dtm0/it/all2all/cdf.yaml +++ b/dtm0/it/all2all/cdf.yaml @@ -2,25 +2,26 @@ nodes: - hostname: localhost data_iface: eth0 data_iface_type: tcp + transport_type: libfab m0_servers: - runs_confd: true io_disks: data: [] - io_disks: data: - - /dev/loop0 - - /dev/loop1 + - path: /dev/loop0 + - path: /dev/loop1 - io_disks: data: - - /dev/loop2 - - /dev/loop3 + - path: /dev/loop2 + - path: /dev/loop3 - io_disks: data: - - /dev/loop4 - - /dev/loop5 + - path: /dev/loop4 + - path: /dev/loop5 m0_clients: - s3: 0 - other: 1 + - name: motr_client + instances: 1 pools: - name: SNS pool @@ -32,6 +33,7 @@ pools: - name: DIX pool type: dix # optional; supported values: "sns" (default), "dix", "md" data_units: 1 - parity_units: 1 + parity_units: 2 + spare_units: 0 allowed_failures: { site: 0, rack: 0, encl: 0, ctrl: 1, disk: 1 } diff --git a/dtm0/linux_kernel/stubs.c b/dtm0/linux_kernel/stubs.c index da6518142e8..6b5d629633e 100644 --- a/dtm0/linux_kernel/stubs.c +++ b/dtm0/linux_kernel/stubs.c @@ -57,3 +57,54 @@ M0_INTERNAL int m0_dtm0_req_post(struct m0_dtm0_service *svc, (void) wait_for_ack; return 0; } + +#include "dtm0/recovery.h" + +M0_INTERNAL int m0_drm_domain_init(void) +{ + return 0; +} + +M0_INTERNAL void m0_drm_domain_fini(void) +{ + +} + +M0_INTERNAL void +m0_dtm0_recovery_machine_init(struct m0_dtm0_recovery_machine *m, + const struct m0_dtm0_recovery_machine_ops *ops, + struct m0_dtm0_service *svc) +{ + (void) m; + (void) svc; + (void) ops; +} + +M0_INTERNAL int +m0_dtm0_recovery_machine_start(struct m0_dtm0_recovery_machine *m) +{ + (void) m; + return 0; +} + +M0_INTERNAL void +m0_dtm0_recovery_machine_stop(struct m0_dtm0_recovery_machine *m) +{ + (void) m; +} + +M0_INTERNAL void +m0_dtm0_recovery_machine_fini(struct m0_dtm0_recovery_machine *m) +{ + (void) m; +} + +M0_INTERNAL void +m0_dtm0_recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m, + struct dtm0_req_fop *redo, + struct m0_be_op *op) +{ + (void) m; + (void) redo; + (void) op; +} diff --git a/dtm0/recovery.c b/dtm0/recovery.c new file mode 100644 index 00000000000..1a60a36f10b --- /dev/null +++ b/dtm0/recovery.c @@ -0,0 +1,2203 @@ +/* -*- C -*- */ +/* + * Copyright (c) 2022 Seagate Technology LLC and/or its Affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * For any questions about this software or licensing, + * please email opensource@seagate.com or cortx-questions@seagate.com. 
+ * + */ + + +/** + @page dtm0br-dld DLD of DTM0 basic recovery + + - @ref DTM0BR-ovw + - @ref DTM0BR-def + - @ref DTM0BR-req + - @ref DTM0BR-depends + - @ref DTM0BR-highlights + - @subpage DTM0BR-fspec "Functional Specification" + - @ref DTM0BR-lspec + - @ref DTM0BR-lspec-comps + - @ref DTM0BR-lspec-rem-fom + - @ref DTM0BR-lspec-loc-fom + - @ref DTM0BR-lspec-evc-fom + - @ref DTM0BR-lspec-state + - @ref DTM0BR-lspec-thread + - @ref DTM0BR-lspec-numa + - @ref DTM0BR-conformance + - @ref DTM0BR-usecases + - @ref DTM0BR-ref + - @ref DTM0BR-impl-plan +--->>> +Max: [* defect *] ut and st sections are missing. +IvanA: Agree. I will add it once we are more-or-less confident in use-cases and + requirements. +<<<--- +--->>> +Max: [* style *] usecases section should go to .h file, along with the rest of + the sections in .h in DLD template. +<<<--- + + +
+   @section DTM0BR-ovw Overview
+   All specifications must start with an Overview section that
+   briefly describes the document and provides any additional
+   instructions or hints on how to best read the specification.
+
+   The document describes how a DTM0 service is supposed to restore
+   consistency of the data replicated across a certain kind of Motr-based
+   cluster. The term "basic" implies that consistency is restored only
+   for a limited number of use-cases. In the rest of the document,
+   the term "basic" is omitted for simplicity (there are no other kinds of
+   DTM0-based recovery at the moment).
+
+--->>>
+Max: [* defect *] This is an obvious definition, which has very small value.
+  Please make an overview which would bring as much useful information to the
+  reader as it can in just a few lines.
+IvanA: [fixed] I tried to stick to the point, and avoid any "new" terms in
+  the paragraph.
+<<<---
+
+
+   @section DTM0BR-def Definitions
+   Mandatory.
+   The DLD shall provide definitions of the terms and concepts
+   introduced by the design, as well as the relevant terms used by the
+   specification but described elsewhere. References to the
+   M0 Glossary and the component's HLD are permitted and encouraged.
+   Agreed upon terminology should be incorporated in the glossary.
+
+--->>>
+Max: [* question *] Are those all the terms that are used here? If no, please
+  add the rest of the terms. Please also consider adding terms that are being
+  used, but which are not in DTM0 HLD.
+IvanA: [fixed] No, it seems like they are not used. I removed this part.
+  I'll populate the "new terms" first, and if there are any redundant
+  information, I'll add references to the HLD.
+<<<---
+
+   Terms and definitions used in the document:
+   - User service is a Motr service that is capable of replicating
+   its data ("user data") across the cluster. The document is focused on
+   CAS (see @ref cas-dld) but this term could be applied to any Motr service
+   that supports CRDT[2] and has behavior similar to CAS.
+   Note, this document does not differentiate between actual "clients" and
+   "servers". For example, the Motr client (including DIX) is also considered
+   to be a "User service".
+   - DTM0 service is a Motr service that helps a user service restore
+   consistency of its replicated data.
+   - Recoverable process is a Motr process that has exactly one user
+   service and exactly one DTM0 service in it. Note that in a UT environment,
+   a single OS process may have several user/DTM0 services, thus it may
+   have multiple recoverable processes.
+--->>>
+IvanA: [* question *] @Max, "Recoverable process" does not sound good to me.
+   What do you think about "DTM0 process"? or "DTM0-backed process"?
+   I just want to avoid confusion with confd and any other Motr process that
+   does not have a DTM0 service in it. Any ideas on that?
+<<<---
+   - Recovery procedures is a broad term used to point at DTM0 services'
+   reactions to HA notifications about states of DTM0 services. In other
+   words, it refers to the actions performed by DTM0 services that
+   help to restore consistency of replicated data.
+   - Recovery machine is a state machine running within a DTM0 service
+   that performs recovery procedures. Each recoverable process
+   has a single instance of the recovery machine.
+   - Recovery FOM is a long-lived FOM responsible for reacting to
+   HA-provided state changes of a recoverable process. The FOM knows the
+   id (FID) of this process (Recovery FOM id). If the recovery
+   FOM id matches the id of the process it is running on, then
+   this FOM is called "local". Otherwise, it is called "remote".
+   If a Motr cluster has N recoverable processes in the configuration, then
+   each recoverable process has N recovery FOMs (one per remote counterpart
+   plus one local FOM).
+   - Recovery FOM role is a sub-set of states of a recovery FOM. Each
+   recovery FOM may have one of the following roles: remote recovery,
+   local recovery, eviction. The term "role" is often omitted.
+   - Recovery FOM reincarnation is a transition between the roles
+   of a recovery FOM. Reincarnation happens as a reaction to HA notifications
+   about state changes of the corresponding process.
+   - Remote recovery role defines a sub-set of states of a recovery FOM
+   that performs recovery of a remote process (by sending REDO messages).
+   - Local recovery role defines a sub-set of states of a recovery FOM
+   that is responsible for starting and stopping recovery procedures on
+   the local recoverable process.
+   - Eviction role defines a sub-set of states of a recovery FOM that
+   restores consistency of up to N-1 recoverable processes (where N is the
+   number of recoverable processes in the configuration) after one of the
+   recoverable processes of the cluster experienced a permanent failure
+   (see HLD[1]).
+   - W-request is a FOP sent from one user service to another that
+   causes modifications of persistent storage. Such a request always
+   contains DTM0-related meta-data (transaction descriptor). W-requests
+   are used to replicate data across the cluster. They are stored in the
+   DTM0 log and replayed by the DTM0 service in the form of REDO messages.
+   In case of CAS, W-requests are PUT and DEL operations.
+   - R-request is a user service FOP that does not modify persistent
+   storage. R-requests are allowed to be sent only to the processes that
+   have ONLINE state. In case of CAS, R-requests are GET and NEXT operations.
+--->>>
+Max: [* defect *] The terms are not defined. Please define them.
+IvanA: I populated the list. I'll update the rest of the document
+  in a separate round of fixes.
+<<<---
+
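+   For illustration, this is how a logged W-request later reappears on the
+   wire: a DTM0 log record keeps the original request payload together with
+   its transaction descriptor, and both are copied into a REDO message.
+   The snippet below is condensed from dtm0_restore() in this file and is
+   purely illustrative:
+
+   @verbatim
+   struct m0_dtm0_log_rec record;   /* a record taken from the DTM0 log */
+   struct dtm0_req_fop    redo;
+
+   redo = (struct dtm0_req_fop) {
+           .dtr_msg     = DTM_REDO,
+           .dtr_payload = record.dlr_payload, /* the W-request itself,
+                                                 e.g. a CAS PUT */
+           .dtr_txr     = record.dlr_txd,     /* its tx descriptor */
+   };
+   @endverbatim
+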
+ @section DTM0BR-req Requirements + Mandatory. + The DLD shall state the requirements that it attempts to meet. + + Recovery machine shall meet the requirements defined in the DTM0 HLD[1]. +--->>> +Max: [* question *] All of them? If no, please put a list here. If so, please + put the list anyway. +<<<--- + Additionally, it has the following list of low-level requirements: +--->>> +Max: [* question *] Why do we have those requirements? Please provide a way to + track them to the top-level requirements or requirements for other DTM0 + subcomponents. +<<<--- + + - @b R.DTM0BR.HA-Aligned State transitions of the recovery SM shall be + aligned with the state transitions of Motr processes delivered by + the HA subsystem to the recovery machine. + - @b R.DTM0BR.Basic Recovery machine shall support only a subset of + possible use-cases. The subset is defined in @ref DTM0BR-usecases. +--->>> +Max: [* defect *] Usecases are not defined. +<<<--- + - @b R.DTM0BR.No-Batching Recovery machine shall replay logs in + a synchronous manner one-by-one. +--->>> +Max: [* question *] One-by-one log or one-by-one log record? In the latter case: + what about records from logs on different nodes? +<<<--- +--->>> +Max: [* question *] How those logs or records are ordered (to be able to process + them one-by-one)? +<<<--- +--->>> +Max: [* question *] What about performance: is it possible for this design to + meet performance requirements? In any case please clearly state if it is + and why. +<<<--- + +
+   @section DTM0BR-depends Dependencies
+   Mandatory. Identify other components on which this specification
+   depends.
+
+   The basic recovery machine depends on the following components:
+   - Hare. This component is supposed to own an ordered log of all HA events
+   in the system. Note, this dependency is optional for a certain set
+   of use-cases where events are delivered in the right way even without
+   being backed up by a log.
+--->>>
+Max: [* defect *] This is not true at the moment. Please specify if it's
+  possible to implement this design without this being true or what is the
+  plan to deal with this.
+IvanA: [fixed] I added a note that this thing is optional for some cases.
+<<<---
+   - DTM0 service. The service provides access to DTM0 log and DTM0 RPC link.
+   - DTM0 log. The log is used to fill in REDO messages.
+   - DTM0 RPC link. The link is used as transport for REDO messages.
+   - Motr HA/conf. HA states of Motr conf objects provide the information
+   about state transitions of local and remote Motr processes.
+   - Motr conf. The conf module provides the list of remote DTM0 services.
+
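+   The machine reaches these dependencies through an ops vtable rather than
+   calling them directly, which also lets unit tests substitute mocks. The
+   sketch below is reconstructed from the recovery_machine_*() wrappers
+   further down in this file; the authoritative declaration lives in
+   dtm0/recovery.h:
+
+   @verbatim
+   struct m0_dtm0_recovery_machine_ops {
+           /* Attach an iterator to the local DTM0 log. */
+           int  (*log_iter_init)(struct m0_dtm0_recovery_machine *m,
+                                 struct m0_be_dtm0_log_iter      *iter);
+           /* Fetch the next record to be replayed to tgt_svc. */
+           int  (*log_iter_next)(struct m0_dtm0_recovery_machine *m,
+                                 struct m0_be_dtm0_log_iter      *iter,
+                                 const struct m0_fid             *tgt_svc,
+                                 const struct m0_fid             *origin_svc,
+                                 struct m0_dtm0_log_rec          *record);
+           void (*log_iter_fini)(struct m0_dtm0_recovery_machine *m,
+                                 struct m0_be_dtm0_log_iter      *iter);
+           /* Send a REDO message over the DTM0 RPC link. */
+           void (*redo_post)(struct m0_dtm0_recovery_machine *m,
+                             struct m0_fom                   *fom,
+                             const struct m0_fid             *tgt_proc,
+                             const struct m0_fid             *tgt_svc,
+                             struct dtm0_req_fop             *redo,
+                             struct m0_be_op                 *op);
+           /* Post a process event (e.g., DTM_RECOVERED) to HA. */
+           void (*ha_event_post)(struct m0_dtm0_recovery_machine *m,
+                                 const struct m0_fid             *tgt_proc,
+                                 const struct m0_fid             *tgt_svc,
+                                 enum m0_conf_ha_process_event    event);
+   };
+   @endverbatim
+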
+   @section DTM0BR-highlights Design Highlights
+   Mandatory. This section briefly summarizes the key design
+   decisions that are important for understanding the functional and
+   logical specifications, and enumerates topics that need special
+   attention.
+
+   Each DTM0 service has a recovery machine. Each recovery machine contains
+   N recovery FOMs (one per counterpart + one local). Recovery FOMs start
+   listening to various events (HA events, DTM0 log events, RPC replies) and
+   act accordingly (sending out M0_CONF_HA_PROCESS_* events and REDO messages).
+
+   @verbatim
+
+   +-------------------------------------------+
+   | Recoverable process, process fid = 1      |
+   | +---------------------------------------+ |
+   | | DTM0 Service                          | |
+   | |                                       | |
+   | | +-----------------------------------+ | |
+   | | | Recovery machine                  | | |
+   | | |                                   | | |
+   | | | +--------+  +--------+ +--------+ | | |
+   | | | | R_FOM1 |  | R_FOM2 | | R_FOM3 | | | |
+   | | | | FID=1  |  | FID=2  | | FID=3  | | | |
+   | | | +--------+  +--------+ +--------+ | | |
+   | | |                                   | | |
+   | | +-----------------------------------+ | |
+   | +---------------------------------------+ |
+   +-------------------------------------------+
+
+   Recovery machine of one of the processes of a 3-process cluster.
+   The other two processes have the same set of FOMs. FID=N means that
+   the id of this recovery FOM is N. The first one (R_FOM1) is the local
+   recovery FOM of this DTM0 service. The other two (R_FOM2, R_FOM3) are
+   remote recovery FOMs that would send REDOs to the processes 2 and 3.
+
+   @endverbatim
+
+   Each recovery FOM may await on external events:
+
+   @verbatim
+
+   +-----------------------------------------------------+
+   | Recovery FOM inputs                                 |
+   |                                                     |
+   |    Polling                                          |
+   |   ----------------------------------------------    |
+   |   | HA queue |  | DTM0 log |    | RPC replies  |    |
+   |   |          |  | watcher  |    |              |    |
+   |   |  /|\     |  |          |    |     /|\      |    |
+   |   |   |      |  |   /|\    |    |      |       |    |
+   |   |   |      |  |    |     |    |      |       |    |
+   |   | conf obj |  | DTM0 log |    | DTM0 RPC link|    |
+   +-----------------------------------------------------+
+
+   @endverbatim
+
+   For example, when a recovery FOM performs recovery of a remote
+   process, it may await on the HA queue (to halt sending of REDOs), on
+   the DTM0 log watcher (to learn if there are new entries in the log),
+   or on the DTM0 RPC link (to learn if the next REDO message can be sent).
+
+   Whenever a recovery FOM receives an event from the HA queue, it gets
+   "reincarnated" to serve its particular role in recovery procedures
+   (see the "Recovery FOM reincarnation" definition).
+   Incarnation of a recovery FOM is just a sub-state-machine of the FOM.
+   For example, here is the full state machine of R_FOM2:
+
+   @verbatim
+
+   INIT -> AWAIT_HEQ (HA event queue)
+
+   AWAIT_HEQ -> DECIDE_WHAT_TO_DO
+   DECIDE_WHAT_TO_DO -> AWAIT_HEQ
+   DECIDE_WHAT_TO_DO -> RESET
+   DECIDE_WHAT_TO_DO -> FINAL
+
+   RESET -> SEND_REDO
+
+   SEND_REDO -> AWAIT_HEQ_OR_LOG
+   SEND_REDO -> AWAIT_HEQ_OR_RPC
+
+   AWAIT_HEQ_OR_LOG -> SEND_REDO
+   AWAIT_HEQ_OR_LOG -> HEQ
+
+   AWAIT_HEQ_OR_RPC -> SEND_REDO
+   AWAIT_HEQ_OR_RPC -> HEQ
+
+   INIT --------------> AWAIT_HEQ
+                          |  /|\
+                         \|/  |
+                   DECIDE_WHAT_TO_DO --------------> FINAL
+                          |  /|\
+                          |   |                  (remote recovery
+                          |   |                   sub-states)
+   //=====================|===|===================================//
+   //   RESET <-----------+   +---< HEQ      (Transitions that    //
+   //     |                                   may happen as       //
+   //     |                                   a reaction to       //
+   //     |   +------------------------+      RECOVERING HA event)//
+   //    \|/ \|/                       |                          //
+   //   SEND_REDO ---------------> AWAIT_ON_HEQ_OR_RPC ---> HEQ   //
+   //     |                            /|\                        //
+   //    \|/                            |                         //
+   //   AWAIT_ON_HEQ_OR_LOG -----> HEQ                            //
+   //==============================================================//
+
+       ...                  ...
+        |                    |               (eviction sub-states)
+   //===|====================|====================================//
+   //   RESET <--------------+    +----< HEQ (Transitions that    //
+   //     |                       |           may happen as       //
+   //     |   +-------------------+           a reaction to FAILED//
+   //    \|/ \|/                              HA event)           //
+   //   SEND_REDO -----> AWAIT_ON_HEQ_OR_RPC ----> HEQ            //
+   //==============================================================//
+   @endverbatim
+
+   The first frame highlighted with "===//" shows the sub-state machine of
+   a recovery FOM that is used to recover a participant that entered the
+   RECOVERING HA state. If the same participant enters the FAILED state
+   (permanent failure), then another set of sub-states would be involved
+   in recovery procedures (the second frame).
+
+   On the diagram above, the transition DECIDE_WHAT_TO_DO -> RESET
+   marks the point where the recovery FOM enters a new incarnation, and
+   the transition HEQ -> DECIDE_WHAT_TO_DO marks the end of the life of this
+   incarnation. The set of sub-states is called "role".
+   For example, if a recovery FOM is somewhere inside the first frame
+   (remote recovery) then we say that it has the "remote recovery role" and
+   it performs "remote recovery". Correspondingly, the second frame describes
+   the "eviction role" or just "eviction".
+
+   The notion of incarnation is used to emphasise that a recovery FOM must
+   always "clean up" its volatile state before processing the next HA event.
+   For example, it may await on a pending RPC reply or it may have to reset
+   the DTM0 log iterator.
+
+   The local recovery FOM (R_FOM1 in the example) is just a volatile
+   state that captures information about certain points in the log and sends
+   out ::M0_CONF_HA_PROCESS_RECOVERED when it believes that the local process
+   has been fully recovered (see the recovery stop condition below).
+   The local recovery FOM cannot change its role (it cannot recover itself
+   from transient or permanent failures).
+
+   UPD: The design described above has actually been implemented using
+   coroutines, not an FSM. So we do not have these states in the code. But
+   the coroutines are implemented in the same spirit as outlined above, that
+   is: a coroutine executes all actions needed for a given incarnation,
+   cleans up, and only after this reads the next event from the HEQ (HA
+   event queue).
+
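+   To make this concrete, the skeleton of such a coroutine looks as follows.
+   The snippet is condensed from remote_recovery_fom_coro() further down in
+   this file (frame-data boilerplate elided); it is an illustration, not the
+   verbatim implementation:
+
+   @verbatim
+   while (!eoq) {
+           /* Pick the action that corresponds to the current
+            * incarnation (role) of this recovery FOM. */
+           action = (state == M0_NC_DTM_RECOVERING) ? dtm0_restore
+                                                    : heq_await;
+           /* Run it to completion: the action replays the log and/or
+            * awaits, cleans up, and returns the next HA event (or
+            * end-of-queue) taken from the HEQ. */
+           M0_CO_FUN(CO(fom), action(fom, &state, &eoq));
+   }
+   m0_fom_phase_set(fom, RFS_DONE);
+   @endverbatim
+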
+   @section DTM0BR-lspec Logical Specification
+   Mandatory. This section describes the internal design of the component,
+   explaining how the functional specification is met. Sub-components and
+   diagrams of their interaction should go into this section. The section has
+   mandatory subsections created using the Doxygen @@subsection command. The
+   designer should feel free to use additional sub-sectioning if needed, though
+   if there is significant additional sub-sectioning, provide a table of
+   contents here.
+
+   - @ref DTM0BR-lspec-comps
+   - @ref DTM0BR-lspec-rem-fom
+   - @ref DTM0BR-lspec-loc-fom
+   - @ref DTM0BR-lspec-evc-fom
+   - @ref DTM0BR-lspec-state
+   - @ref DTM0BR-lspec-thread
+   - @ref DTM0BR-lspec-numa
+
+   @subsection DTM0BR-lspec-comps Component Overview
+   Mandatory.
+   This section describes the internal logical decomposition.
+   A diagram of the interaction between internal components and
+   between external consumers and the internal components is useful.
+
+   The following diagram shows the connections between the recovery machine
+   and the other components of the system:
+
+   @dot
+   digraph {
+     rankdir = LR;
+     bgcolor="lightblue"
+     node[shape=box];
+     label = "DTM0 Basic recovery machine components and dependencies";
+     subgraph cluster_recovery_sm {
+       label = "Recovery SM";
+       bgcolor="lightyellow"
+       node[shape=box];
+       rfom[label="Recovery FOM"];
+       E1[shape=none label="⋮" fontsize=30]
+       rfom_another[label="Another Recovery FOM"];
+     }
+
+     subgraph cluster_ha_conf {
+       node[shape=box];
+       label ="HA and Conf";
+       conf_obj[label="Conf object"];
+       ha[label="Motr HA"];
+     }
+
+     drlink[label="DTM0 RPC link"];
+     dlog[label="DTM0 log"];
+
+     conf_obj -> rfom[label="HA decisions"];
+     rfom -> ha [label="Process events"];
+     rfom -> drlink [label="REDO msgs"];
+     dlog -> rfom[label="records"];
+   }
+   @enddot
+--->>>
+Max: [* defect *] Communication with other recovery foms in the same process
+  and in different processes is missing. Please add.
+<<<---
+
+   The following sequence diagram represents an example of linearisation of
+   HA decisions processing:
+   @msc
+   ha,rfom, remote;
+   ha -> rfom [label="RECOVERING"];
+   rfom -> remote [label="REDO(1)"];
+   ha -> rfom [label="TRANSIENT"];
+   remote -> rfom [label="REDO(1).reply(-ETIMEDOUT)"];
+   ha -> rfom [label="RECOVERING"];
+   rfom -> rfom [label="Cancel recovery"];
+   rfom -> remote [label="REDO(1)"];
+   remote -> rfom [label="REDO(1).reply(ok)"];
+   @endmsc
+
+   In this example the recovery machine receives a sequence of events
+   (RECOVERING, TRANSIENT, RECOVERING), and reacts to them as follows:
+   - Starts replaying the log by sending REDO(1).
+   - Awaits on the reply.
+   - Cancels recovery.
+   - Re-starts the replay by sending REDO(1) again.
+   - Gets the reply.
+--->>>
+Max: [* defect *] One example is not enough to describe the algorithm. Please
+  explain how it's done exactly. Maybe not in this section, but somewhere in
+  DLD.
+<<<---
+
+   @subsection DTM0BR-lspec-rem-fom Remote recovery FOM
+   Such sections briefly describe the purpose and design of each
+   sub-component.
+--->>>
+Max: [* defect *] State machine is there, but what the fom actually does is
+  missing. Please add.
+<<<---
+
+   When a recovery FOM detects a state transition to RECOVERING of a remote
+   participant, it transits to the sub-SM called "Remote recovery FOM".
+   In other words, it re-incarnates as "Remote recovery FOM" as soon as
+   the previous incarnation is ready to reach its "terminal" state.
+
+   Remote recovery FOM reacts to the following kinds of events:
+   - Process state transitions (from the HA subsystem);
+   - DTM0 log getting new records;
+   - RPC replies (from the DTM0 RPC link component).
+
+   Remote recovery FOM has the following states and transitions:
+
+   @verbatim
+   REMOTE_RECOVERY(initial) -> WAITING_ON_HA_OR_RPC
+   REMOTE_RECOVERY(initial) -> WAITING_ON_HA_OR_LOG
+   WAITING_ON_HA_OR_RPC -> WAITING_ON_HA_OR_LOG
+   WAITING_ON_HA_OR_LOG -> WAITING_ON_HA_OR_RPC
+   WAITING_ON_HA_OR_RPC -> NEXT_HA_STATE (terminal)
+   WAITING_ON_HA_OR_LOG -> NEXT_HA_STATE (terminal)
+   WAITING_ON_HA_OR_RPC -> SHUTDOWN (terminal)
+   WAITING_ON_HA_OR_LOG -> SHUTDOWN (terminal)
+
+   Where:
+   REMOTE_RECOVERY is a local initial state (local to the sub-SM);
+   NEXT_HA_STATE is a local terminal state;
+   SHUTDOWN is a global terminal state.
+   @endverbatim
+
+   @subsection DTM0BR-lspec-loc-fom Local recovery FOM
+   Such sections briefly describe the purpose and design of each
+   sub-component.
+
+   Local recovery is used to ensure fairness of the overall recovery
+   procedure.
+--->>>
+Max: [* defect *] Definition of "fairness" in this context is missing. Please
+  add.
+<<<---
+--->>>
+Max: [* defect *] Definition of how the "fairness" is "ensured" is missing.
+  Please add.
+<<<---
+   Whenever a participant learns that it has received all the log records it
+   missed, it sends the M0_CONF_HA_PROCESS_RECOVERED process event to the HA
+   subsystem.
+
+   The point where this event has to be sent is called the "recovery stop
+   condition". The local participant (i.e., the one that is in
+   the RECOVERING state) uses the information about new incoming W-requests
+   and about the most recent (txid-wise) log entries on the other
+   participants to decide whether recovery shall be stopped or not.
+
+   TODO: describe the details on when and how it should stop.
+--->>>
+Max: [* defect *] You're right, the details are missing.
+<<<---
+
+
+   @subsection DTM0BR-lspec-evc-fom Eviction FOM
+   Such sections briefly describe the purpose and design of each
+   sub-component.
+--->>>
+Max: [* defect *] The purpose of the eviction fom is not described.
+<<<---
+
+   A recovery FOM re-incarnates as an eviction FOM (i.e., enters the initial
+--->>>
+Max: [* defect *] A definition of "re-incarnation" is missing.
+<<<---
+   state of the corresponding sub-SM) when the HA subsystem notifies about a
+--->>>
+Max: [* defect *] A definition of sub-SM is missing.
+<<<---
+   permanent failure (FAILED) on the corresponding participant.
+
+   The local DTM0 log shall be scanned for any record in which the FAILED
+--->>>
+Max: [* doc *] Please add a reference to the component which does the scanning.
+<<<---
+--->>>
+Max: [* doc *] Please explain how the scanning is done:
+   - sequentially?
+   - from least recent to most recent?
+   - one fom/thread/whatever or multiple? How the work is distributed?
+   - locks also have to be described somewhere in DLD.
+<<<---
+   participant participated. Such a record shall be replayed to the
+   other participants that are capable of receiving REDO messages.
+--->>>
+Max: [* defect *] What to do with the message if a participant is in TRANSIENT
+  state is not described.
+<<<---
+
+   When the log has been replayed completely, the eviction FOM notifies
+--->>>
+Max: [* defect *] Criteria of "log has been replayed completely" is missing.
+  Please add.
+<<<---
+   the HA subsystem about completion and leaves the sub-SM.
+--->>>
+Max: [* question *] How? Please also describe what is expected from HA.
+<<<---
+
+   TODO: GC of FAILED processes and handling of client restarts.
+
+   @subsection DTM0BR-lspec-state State Specification
+   Mandatory.
+   This section describes any formal state models used by the component,
+   whether externally exposed or purely internal.
+
+   The state of a recovery FOM is defined as a collection of the sub-SM
+   states and a few global states.
+
+   TODO: state diagram for the overall recovery FOM.
+--->>>
+Max: [* doc *] Also distributed state diagram.
+<<<---
+
+   @subsection DTM0BR-lspec-thread Threading and Concurrency Model
+   Mandatory.
+   This section describes the threading and concurrency model.
+   It describes the various asynchronous threads of operation, identifies
+   the critical sections and synchronization primitives used
+   (such as semaphores, locks, mutexes and condition variables).
+
+   Recovery machine revolves around the following kinds of threads:
+--->>>
+Max: [* defect *] Definition of "revolves" is missing.
+<<<---
+   - Locality threads (Recovery FOMs, DTM0 log updates).
+   - RPC machine thread (RPC replies).
+
+   Each recovery FOM is implemented using the Motr coroutines library.
+   Within the coroutine ::m0_be_op and/or ::m0_co_op are used to await
+--->>>
+Max: [* question *] RE: "the coroutine": which one?
+<<<---
+   on external events.
+--->>>
+Max: [* suggestion *] A list of such events would help to understand why be/co
+  ops are needed.
+<<<---
+
+   DTM0 log is locked using a mutex (TBD: the log mutex or a separate mutex?)
+--->>>
+Max: [* question *] Entire DTM0 log is locked using a mutex? If no, please add
+  an unambiguous sentence. Example: "<what> is protected with a mutex" or
+  "a separate mutex protects concurrent access to <what>".
+<<<---
+   whenever the recovery machine sets up or cleans up its watcher.
+--->>>
+Max: [* defect *] The watcher description/workflow/etc. is missing. Please add.
+<<<---
+
+   Interaction with the RPC machine is wrapped by the Motr RPC library and
+--->>>
+Max: [* typo *] s/Interation/Interaction/
+<<<---
+   the DTM0 RPC link component. It helps to unify the type of completion
+   objects across the FOM code.
+
+   TODO: describe be/co op poll/select.
+
+   @subsection DTM0BR-lspec-numa NUMA optimizations
+   Mandatory for components with programmatic interfaces.
+   This section describes if optimal behavior can be supported by
+   associating the utilizing thread to a single processor.
+
+   There are not so many shared resources outside of the DTM0 log and
+   the localities. Scaling and batching are outside of the scope
+--->>>
+Max: [* defect *] The number of localities is equal to the number of CPU cores
+  in our default configuration, so whatever uses more than one locality has to
+  have some kind of synchronisation.
+<<<---
+--->>>
+Max: [* defect *] It's not clear what "scaling" and "batching" mean in this
+  context. Please explain and/or add a reference where they are explained.
+<<<---
+   of this document.
+--->>>
+Max: [* defect *] FOM locality assignment is missing.
+<<<---
+
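+   Returning to the threading model: the be-op await pattern mentioned
+   above, condensed from heq_await() further down in this file (purely
+   illustrative; error handling elided):
+
+   @verbatim
+   M0_CO_REENTER(CO(fom),
+                 struct m0_be_op op;
+                 bool            got;
+                 uint64_t        state;);
+
+   m0_be_op_init(&F(op));
+   m0_be_queue_lock(&rf->rf_heq);
+   /* Ask the queue for the next HA event; F(op) becomes "done" when
+    * an item (or the end of the queue) is available. */
+   M0_BE_QUEUE_GET(&rf->rf_heq, &F(op), &F(state), &F(got));
+   m0_be_queue_unlock(&rf->rf_heq);
+   /* Park the FOM in RFS_WAITING until the op completes. */
+   M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(op), fom, RFS_WAITING));
+   m0_be_op_fini(&F(op));
+   @endverbatim
+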
+
+   @section DTM0BR-conformance Conformance
+   Mandatory.
+   This section cites each requirement in the @ref DTM0BR-req section,
+   and explains briefly how the DLD meets the requirement.
+--->>>
+Max: [* defect *] Top-level requirements from HLD are missing.
+<<<---
+
+   - @b I.DTM0BR.HA-Aligned Recovery machine provides an event queue that
+   is being consumed in order by the corresponding FOM.
+--->>>
+Max: [* defect *] The word "queue" is present in this document only here.
+  Please describe the event queue and events somewhere in this DLD.
+<<<---
+--->>>
+Max: [* defect *] It's not clear how "HA subsystem" from the requirement is met
+  by the implementation.
+<<<---
+   - @b I.DTM0BR.Basic Recovery machine supports only basic log replay defined
+   by the remote recovery FOM and its counterpart.
+--->>>
+Max: [* defect *] "basic" term is not defined. Please define.
+<<<---
+   - @b I.DTM0BR.No-Batching Recovery machine achieves this by awaiting on
+   the RPC reply to every REDO message sent.
+--->>>
+Max: [* defect *] Maybe I should wait for I.* for performance and availability
+  requirements, but as of now it's not clear how DTM recovery would catch up
+  with the rest of the nodes creating new DTM0 transactions in parallel with
+  DTM recovery.
+<<<---
+
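+   The synchronous REDO/reply loop behind I.DTM0BR.No-Batching, condensed
+   from dtm0_restore() further down in this file (REDO construction and
+   error handling elided; illustrative only):
+
+   @verbatim
+   do {
+           rc = recovery_machine_log_iter_next(rf->rf_m, &rf->rf_log_iter,
+                                               &rf->rf_tgt_svc, NULL,
+                                               &record);
+           next = (rc == 0); /* a non-zero rc means: send EOL and stop */
+           /* Post one REDO, then block until its reply arrives;
+            * no other REDO is in flight in the meantime. */
+           recovery_machine_redo_post(rf->rf_m, &rf->rf_base,
+                                      &rf->rf_tgt_proc, &rf->rf_tgt_svc,
+                                      &redo, &F(reply_op));
+           M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(reply_op),
+                                                     fom, RFS_WAITING));
+           m0_be_op_reset(&F(reply_op));
+   } while (next);
+   @endverbatim
+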
+   @section DTM0BR-usecases Use-cases
+   Mandatory. This section describes use-cases for the recovery machine.
+
+
+   TODO: Add use-cases.
+--->>>
+Max: [* defect *] Right, they are missing.
+<<<---
+
+
+ @section DLD-O Analysis + This section estimates the performance of the component, in terms of + resource (memory, processor, locks, messages, etc.) consumption, + ideally described in big-O notation. +--->>> +Max: [* doc *] Please fill this section with references to the requirements. +<<<--- + +
+ @section DLD-ref References + Mandatory. Provide references to other documents and components that + are cited or used in the design. + In particular a link to the HLD for the DLD should be provided. + + - [1]
+ DTM0 HLD + - [2] + +
+ @section DLD-impl-plan Implementation Plan + Mandatory. Describe the steps that should be taken to implement this + design. + + - Develop pseudocode-based description of recovery activities. + - Identify and add missing concurrency primitives (select/poll for be op). + - Use the pseudocode and new primitives to implement a skeleton of the FOMs. + - Incorporate RECOVERING and TRANSIENT states with their + new meaning as soon as they are available in the upstream code. + - Populate the list of use-cases based on available tools. + - Populate the list of system tests. + - Improve the stop condition based on the use-cases and tests. + - Add implementation of eviction FOM. +--->>> +Max: [* defect *] No mention of tests being done. Please clearly state when and + how the tests are going to be done or where and when it would be possible + to find this information. +<<<--- + */ + + + +#define M0_TRACE_SUBSYSTEM M0_TRACE_SUBSYS_DTM0 +#include "lib/trace.h" +#include "dtm0/recovery.h" /* m0_dtm0_recovery_machine */ +#include "dtm0/service.h" /* m0_dtm0_service */ +#include "dtm0/drlink.h" /* m0_dtm0_req_post */ +#include "dtm0/fop.h" /* dtm0_req_fop */ +#include "be/op.h" /* m0_be_op */ +#include "be/queue.h" /* m0_be_queue */ +#include "conf/diter.h" /* diter */ +#include "conf/helpers.h" /* m0_confc_root_open */ +#include "conf/obj.h" /* m0_conf_obj */ +#include "fop/fom.h" /* m0_fom */ +#include "lib/coroutine.h" /* m0_co_context */ +#include "lib/memory.h" /* M0_ALLOC_PTR */ +#include "reqh/reqh.h" /* m0_reqh2confc */ +#include "rpc/rpc_opcodes.h" /* M0_DTM0_RECOVERY_FOM_OPCODE */ +#include "lib/string.h" /* m0_streq */ +#include "be/dtm0_log.h" /* m0_dtm0_log_rec */ +#include "motr/setup.h" /* m0_cs_reqh_context */ +#include "addb2/identifier.h" /* M0_AVI_FOM_TO_TX */ +#include "dtm0/addb2.h" /* M0_AVI_DORM_SM_STATE */ +#include "motr/client_internal.h" /* struct m0client::m0c_reqh */ + +enum { + /* + * Number of HA event that could be submitted by the HA subsystem + * at once; where: + * "HA event" means state transition of a single DTM0 process; + * "at once" means the maximal duration of a tick of any FOM + * running on the same locality as the recovery FOM that + * handles HA events. + * XXX: The number was chosen randomly. Update the previous sentence + * if you want to change the number. + */ + HEQ_MAX_LEN = 32, + + /* + * Number of HA events and EOL messages that could be submitted + * at once. + * TODO: Modify this comment once we stop putting HA events in this + * queue (with help of be-op-or-set). It shall be EOL-only queue. + */ + EOLQ_MAX_LEN = 100, +}; + +struct recovery_fom { + struct m0_fom rf_base; + + /* Recovery machine instance that owns this FOM. */ + struct m0_dtm0_recovery_machine *rf_m; + + /** Subscription to conf obj HA state. */ + struct m0_clink rf_ha_clink; + + /** HA event queue populated by the clink and consumed by the FOM. */ + struct m0_be_queue rf_heq; + + struct m0_co_context rf_coro; + + /** Linkage for m0_dtm0_recovery_machine::rm_foms */ + struct m0_tlink rf_linkage; + + /** Magic for rfom tlist entry. */ + uint64_t rf_magic; + + struct m0_be_queue rf_eolq; + + struct m0_be_dtm0_log_iter rf_log_iter; + + /* Target DTM0 service FID (id of this FOM within the machine). */ + struct m0_fid rf_tgt_svc; + + struct m0_fid rf_tgt_proc; + + /** Is target DTM0 service the service ::rf_m belongs to? */ + bool rf_is_local; + + /** Is target DTM0 volatile? */ + bool rf_is_volatile; + + + /** The most recent HA state of this remote DTM0 service. 
*/ + enum m0_ha_obj_state rf_last_known_ha_state; + + /** + * The most recent known state of the log on a remote DTM0 service. + * Note, it is impossible to guarantee that this stat is "in-sync" + * with ::rf_last_known_ha_state unless we have HA epochs. + */ + bool rf_last_known_eol; +}; + +enum eolq_item_type { + EIT_EOL, + EIT_HA, + EIT_END, +}; + +struct eolq_item { + enum eolq_item_type ei_type; + struct m0_fid ei_source; + enum m0_ha_obj_state ei_ha_state; +}; + +/* + * A global variable to set off parts of the code that were added + * specifically for the integration (all2all) test script. + * Later on, they need to be removed (once we get a better + * way of testing). + */ +const bool ALL2ALL = true; + +M0_TL_DESCR_DEFINE(rfom, "recovery_fom", + static, struct recovery_fom, rf_linkage, + rf_magic, M0_DTM0_RMACH_MAGIC, M0_DTM0_RMACH_HEAD_MAGIC); +M0_TL_DEFINE(rfom, static, struct recovery_fom); + +static int populate_foms (struct m0_dtm0_recovery_machine *m); +static void unpopulate_foms(struct m0_dtm0_recovery_machine *m); + +static bool recovery_fom_ha_clink_cb(struct m0_clink *clink); + +static void recovery_fom_self_fini(struct m0_fom *fom); +static int recovery_fom_tick(struct m0_fom *fom); + +static void recovery_machine_lock(struct m0_dtm0_recovery_machine *m); +static void recovery_machine_unlock(struct m0_dtm0_recovery_machine *m); +static struct recovery_fom * +recovery_fom_local(struct m0_dtm0_recovery_machine *m); + +static bool ha_event_invariant(uint64_t event) +{ + return event < M0_NC_NR; +} + +static void addb2_relate(const struct m0_sm *left, const struct m0_sm *right) +{ + M0_ADDB2_ADD(M0_AVI_FOM_TO_TX, m0_sm_id_get(left), m0_sm_id_get(right)); +} + + +static struct m0_sm_state_descr recovery_machine_states[] = { + [M0_DRMS_INIT] = { + .sd_name = "M0_DRMS_INIT", + .sd_allowed = M0_BITS(M0_DRMS_STOPPED, M0_DRMS_STARTED), + .sd_flags = M0_SDF_INITIAL, + }, + [M0_DRMS_STARTED] = { + .sd_name = "M0_DRMS_STARTED", + .sd_allowed = M0_BITS(M0_DRMS_STOPPED), + .sd_flags = 0, + }, + [M0_DRMS_STOPPED] = { + .sd_name = "M0_DRMS_STOPPED", + .sd_allowed = 0, + .sd_flags = M0_SDF_TERMINAL, + }, +}; + +static struct m0_sm_trans_descr recovery_machine_trans[] = { + { "started", M0_DRMS_INIT, M0_DRMS_STARTED }, + { "stop-running", M0_DRMS_STARTED, M0_DRMS_STOPPED }, + { "stop-idle", M0_DRMS_INIT, M0_DRMS_STOPPED }, +}; + +struct m0_sm_conf m0_drm_sm_conf = { + .scf_name = "recovery_machine", + .scf_nr_states = ARRAY_SIZE(recovery_machine_states), + .scf_state = recovery_machine_states, + .scf_trans_nr = ARRAY_SIZE(recovery_machine_trans), + .scf_trans = recovery_machine_trans, +}; + +M0_INTERNAL void +m0_dtm0_recovery_machine_init(struct m0_dtm0_recovery_machine *m, + const struct m0_dtm0_recovery_machine_ops *ops, + struct m0_dtm0_service *svc) +{ + M0_PRE(m != NULL); + M0_ENTRY("m=%p, svc=%p", m, svc); + M0_PRE(m0_sm_conf_is_initialized(&m0_drm_sm_conf)); + + m->rm_svc = svc; + if (ops) + m->rm_ops = ops; + else + m->rm_ops = &m0_dtm0_recovery_machine_default_ops; + rfom_tlist_init(&m->rm_rfoms); + m0_sm_group_init(&m->rm_sm_group); + m0_sm_init(&m->rm_sm, &m0_drm_sm_conf, + M0_DRMS_INIT, &m->rm_sm_group); + m0_sm_addb2_counter_init(&m->rm_sm); + + M0_POST(m->rm_ops != NULL); + M0_LEAVE(); +} + +M0_INTERNAL void +m0_dtm0_recovery_machine_fini(struct m0_dtm0_recovery_machine *m) +{ + M0_ENTRY("m=%p", m); + recovery_machine_lock(m); + M0_ASSERT(M0_IN(m->rm_sm.sm_state, (M0_DRMS_STOPPED, M0_DRMS_INIT))); + if (m->rm_sm.sm_state == M0_DRMS_INIT) + m0_sm_state_set(&m->rm_sm, 
M0_DRMS_STOPPED); + m0_sm_fini(&m->rm_sm); + recovery_machine_unlock(m); + /* + * Question: This uses the sm group lock and immediately finalises it, + * which is, in general, wrong (if there can be other threads waiting on + * the lock, then finalisation is incorrect, if no other threads are + * possible at this point, the lock-unlock are not needed). Please add a + * comment explaining why this is correct. + * + * Answer: There are indeed no other threads waiting on the lock. + * Lock-unlock is needed because m0_sm_fini requires it (otherwise + * asserts fail). The reason there are no other threads below: + * + * See ASSERT above: this function only runs when sm_state is + * M0_DRMS_INIT or M0_DRMS_STOPPED. When M0_DRMS_INIT, there are no + * other threads _yet_. When other threads are created -- state becomes + * M0_DRMS_STARTED (threads are recovery FOMs, see + * m0_dtm0_recovery_machine_start()). When M0_DRMS_STOPPED, there are + * no other threads _already_. We go to M0_DRMS_STOPPED when the last + * FOM is finalized -- see recovery_fom_self_fini(). + */ + m0_sm_group_fini(&m->rm_sm_group); + M0_ASSERT(rfom_tlist_is_empty(&m->rm_rfoms)); + rfom_tlist_fini(&m->rm_rfoms); + M0_LEAVE(); +} + +M0_INTERNAL int +m0_dtm0_recovery_machine_start(struct m0_dtm0_recovery_machine *m) +{ + struct recovery_fom *rf; + int rc; + + /* TODO: Skip initialisation of recovery foms during mkfs. */ + rc = populate_foms(m); + if (rc < 0) + return M0_RC(rc); + + M0_ASSERT(rfom_tlist_is_empty(&m->rm_rfoms) == + (m->rm_local_rfom == NULL)); + + m0_tl_for(rfom, &m->rm_rfoms, rf) { + m0_fom_queue(&rf->rf_base); + } m0_tlist_endfor; + + if (m->rm_local_rfom != NULL) { + recovery_machine_lock(m); + m0_sm_state_set(&m->rm_sm, M0_DRMS_STARTED); + recovery_machine_unlock(m); + } + + if (ALL2ALL) + M0_LOG(M0_DEBUG, "ALL2ALL_STARTED"); + + return M0_RC(rc); +} + +M0_INTERNAL void +m0_dtm0_recovery_machine_stop(struct m0_dtm0_recovery_machine *m) +{ + int rc; + struct recovery_fom *rf; + + recovery_machine_lock(m); + + m0_tl_for(rfom, &m->rm_rfoms, rf) { + m0_be_queue_lock(&rf->rf_heq); + M0_LOG(M0_DEBUG, "heq_end " FID_F, FID_P(&rf->rf_tgt_svc)); + /* + * The recovery fom will finalize itself when the queue ends. + */ + m0_be_queue_end(&rf->rf_heq); + m0_be_queue_unlock(&rf->rf_heq); + } m0_tlist_endfor; + + /* + * This sm wait will release the sm lock before waiting if needed. + * The same lock is used as the sm lock and machine lock. Please see + * recovery_machine_lock() and m0_dtm0_recovery_machine_init(). + * So any other thread that wants to acquire the recovery machine + * lock will have a chance to continue. 
+ */ + rc = m0_sm_timedwait(&m->rm_sm, + M0_BITS(M0_DRMS_INIT, M0_DRMS_STOPPED), + M0_TIME_NEVER); + M0_ASSERT_INFO(rc == 0, "rc=%d", rc); + recovery_machine_unlock(m); + unpopulate_foms(m); +} + +static struct m0_reqh * +m0_dtm0_recovery_machine_reqh(struct m0_dtm0_recovery_machine *m) +{ + return m->rm_svc->dos_generic.rs_reqh; +} + +static const struct m0_fid * +recovery_machine_local_id(const struct m0_dtm0_recovery_machine *m) +{ + return &m->rm_svc->dos_generic.rs_service_fid; +} + +static int recovery_machine_log_iter_next(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter, + const struct m0_fid *tgt_svc, + const struct m0_fid *origin_svc, + struct m0_dtm0_log_rec *record) +{ + M0_PRE(m->rm_ops->log_iter_next != NULL); + return m->rm_ops->log_iter_next(m, iter, tgt_svc, origin_svc, record); +} + +static int recovery_machine_log_iter_init(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter) +{ + M0_PRE(m->rm_ops->log_iter_init != NULL); + return m->rm_ops->log_iter_init(m, iter); + +} + +static void recovery_machine_log_iter_fini(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter) +{ + M0_PRE(m->rm_ops->log_iter_fini != NULL); + m->rm_ops->log_iter_fini(m, iter); +} + +static void recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m, + struct m0_fom *fom, + const struct m0_fid *tgt_proc, + const struct m0_fid *tgt_svc, + struct dtm0_req_fop *redo, + struct m0_be_op *op) +{ + M0_PRE(m->rm_ops->redo_post != NULL); + m->rm_ops->redo_post(m, fom, tgt_proc, tgt_svc, redo, op); +} + +static void recovery_machine_recovered(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *tgt_proc, + const struct m0_fid *tgt_svc) +{ + M0_PRE(m->rm_ops->ha_event_post != NULL); + m->rm_ops->ha_event_post(m, tgt_proc, tgt_svc, + M0_CONF_HA_PROCESS_DTM_RECOVERED); +} + +static void recovery_machine_lock(struct m0_dtm0_recovery_machine *m) +{ + m0_sm_group_lock(&m->rm_sm_group); +} + +static void recovery_machine_unlock(struct m0_dtm0_recovery_machine *m) +{ + m0_sm_group_unlock(&m->rm_sm_group); +} + +enum recovery_fom_state { + RFS_INIT = M0_FOM_PHASE_INIT, + RFS_DONE = M0_FOM_PHASE_FINISH, + RFS_WAITING, + RFS_FAILED, + RFS_NR, +}; + +static struct m0_sm_state_descr recovery_fom_states[] = { + [RFS_INIT] = { + .sd_name = "RFS_INIT", + .sd_allowed = M0_BITS(RFS_WAITING, RFS_DONE, RFS_FAILED), + .sd_flags = M0_SDF_INITIAL, + }, + /* terminal states */ + [RFS_DONE] = { + .sd_name = "RFS_DONE", + .sd_allowed = 0, + .sd_flags = M0_SDF_TERMINAL, + }, + /* failure states */ + [RFS_FAILED] = { + .sd_name = "RFS_FAILED", + .sd_allowed = M0_BITS(RFS_DONE), + .sd_flags = M0_SDF_FAILURE, + }, + + /* intermediate states */ + [RFS_WAITING] = { + .sd_name = "RFS_WAITING", + .sd_allowed = M0_BITS(RFS_DONE, + RFS_FAILED, RFS_WAITING), + }, +}; + +const static struct m0_sm_conf recovery_fom_conf = { + .scf_name = "recovery_fom", + .scf_nr_states = ARRAY_SIZE(recovery_fom_states), + .scf_state = recovery_fom_states, +}; + +static size_t recovery_fom_locality(const struct m0_fom *fom) +{ + return 1; +} + +static const struct m0_fom_ops recovery_fom_ops = { + .fo_fini = recovery_fom_self_fini, + .fo_tick = recovery_fom_tick, + .fo_home_locality = recovery_fom_locality +}; + +static struct m0_fom_type recovery_fom_type; +static const struct m0_fom_type_ops recovery_fom_type_ops = {}; + +M0_INTERNAL int m0_drm_domain_init(void) +{ + int rc = 0; + + M0_PRE(!m0_sm_conf_is_initialized(&m0_drm_sm_conf)); + + m0_fom_type_init(&recovery_fom_type, + 
M0_DTM0_RECOVERY_FOM_OPCODE, + &recovery_fom_type_ops, + &dtm0_service_type, + &recovery_fom_conf); + + m0_sm_conf_init(&m0_drm_sm_conf); + rc = m0_sm_addb2_init(&m0_drm_sm_conf, + M0_AVI_DRM_SM_STATE, + M0_AVI_DRM_SM_COUNTER); + + M0_POST(m0_sm_conf_is_initialized(&m0_drm_sm_conf)); + + return M0_RC(rc); +} + +M0_INTERNAL void m0_drm_domain_fini(void) +{ + if (m0_sm_conf_is_initialized(&m0_drm_sm_conf)) { + m0_sm_addb2_fini(&m0_drm_sm_conf); + m0_sm_conf_fini(&m0_drm_sm_conf); + M0_POST(!m0_sm_conf_is_initialized(&m0_drm_sm_conf)); + } +} + +static void eolq_post(struct m0_dtm0_recovery_machine *m, + struct eolq_item *item) +{ + struct recovery_fom *rf = recovery_fom_local(m); + m0_be_queue_lock(&rf->rf_eolq); + /* + * Do not post any events if we have already recovered this DTM0 + * service (eoq is an indicator here). + */ + if (!rf->rf_eolq.bq_the_end) { + /* + * Assumption: the queue never gets full. + * XXX: We could panic in this case. Right now, the HA/REDO FOM + * should get stuck in the tick. Probably, panic is + * a better solution here. + */ + M0_BE_OP_SYNC(op, M0_BE_QUEUE_PUT(&rf->rf_eolq, &op, item)); + } + m0_be_queue_unlock(&rf->rf_eolq); +} + +static void heq_post(struct recovery_fom *rf, enum m0_ha_obj_state state) +{ + uint64_t event = state; + m0_be_queue_lock(&rf->rf_heq); + /* + * Assumption: the queue never gets full. + * XXX: We could panic in this case. Right now, the HA FOM should get + * stuck in the tick. Probably, panic is a better solution + * here. + */ + M0_BE_OP_SYNC(op, M0_BE_QUEUE_PUT(&rf->rf_heq, &op, &event)); + m0_be_queue_unlock(&rf->rf_heq); + + M0_LOG(M0_DEBUG, "heq_enq " FID_F " %s ", + FID_P(&rf->rf_tgt_svc), + m0_ha_state2str(state)); + + /* + * Recovery machine uses two kinds of queues: HEQ and EOLQ. HEQ is HA + * Events Queue, every recovery FOM has its own instance; it is used to + * track HA events of a specific participant (tied to this given rfom). + * EOLQ is EOL Queue, queue to track end-of-log events during recovery. + * When we receive EOL from all participants -- we know recovery is + * completed. There is a nuance though. If some remote participant + * goes TRANSIENT/FAILED during recovery, we do not need to wait for EOL + * from that side. The easiest way to deliver this information to local + * recovery FOM is to add an entry in EOLQ (since local RFOM already + * 'listens' on that queue). So that's what happens below. heq_post() + * is called when there is an HA event on some participant. We add the + * HA event itself in HEQ above, and then add it to EOLQ below. Since + * local recovery FOM is not expecting EOL from itself, we only do + * eolq_post() if HA event does not come from local participant. 
+ */ + if (!rf->rf_is_local) + eolq_post(rf->rf_m, + &(struct eolq_item) { + .ei_type = EIT_HA, + .ei_ha_state = state, + .ei_source = rf->rf_tgt_svc, }); +} + +static bool recovery_fom_ha_clink_cb(struct m0_clink *clink) +{ + struct recovery_fom *rf = M0_AMB(rf, clink, rf_ha_clink); + struct m0_conf_obj *proc_conf = container_of(clink->cl_chan, + struct m0_conf_obj, + co_ha_chan); + heq_post(rf, proc_conf->co_ha_state); + return false; +} + +static int recovery_fom_init(struct recovery_fom *rf, + struct m0_dtm0_recovery_machine *m, + struct m0_conf_process *proc_conf, + const struct m0_fid *target, + bool is_volatile) +{ + int rc; + bool is_local = m0_fid_eq(recovery_machine_local_id(m), target); + + M0_ENTRY("m=%p, rf=%p, tgt=" FID_F ", is_vol=%d, is_local=%d", + m, rf, FID_P(target), !!is_volatile, !!is_local); + + M0_PRE(ergo(is_local, m->rm_local_rfom == NULL)); + + rc = m0_be_queue_init(&rf->rf_heq, &(struct m0_be_queue_cfg){ + .bqc_q_size_max = HEQ_MAX_LEN, + /* + * Two producers: + * 1. Conf-obj HA state updates (heq_post()). + * 2. Stop-and-wait-when-finalising + * (m0_dtm0_recovery_machine_stop()). + */ + .bqc_producers_nr_max = 2, + /* + * Single consumer - recovery machine FOM (recovery_fom_coro()). + */ + .bqc_consumers_nr_max = 1, + .bqc_item_length = sizeof(uint64_t), + }); + if (rc != 0) + return M0_ERR(rc); + + if (is_local) { + rc = m0_be_queue_init(&rf->rf_eolq, &(struct m0_be_queue_cfg){ + .bqc_q_size_max = EOLQ_MAX_LEN, + /* + * Consumers and producers are the same as for + * rf_heq above. + */ + .bqc_producers_nr_max = 2, + .bqc_consumers_nr_max = 1, + .bqc_item_length = sizeof(struct eolq_item), + }); + if (rc != 0) { + m0_be_queue_fini(&rf->rf_heq); + return M0_ERR(rc); + } + M0_ASSERT(!rf->rf_eolq.bq_the_end); + m->rm_local_rfom = rf; + + /* + * This is local recovery FOM. We don't need to wait for EOL + * from ourselves. + */ + rf->rf_last_known_eol = true; + /* + * Local recovery FOM is responsible for accepting REDO + * messages. We're starting up, we know for sure that we will + * go through RECOVERING phase, so we are defaulting to + * M0_NC_DTM_RECOVERING. + */ + rf->rf_last_known_ha_state = M0_NC_DTM_RECOVERING; + } + + rf->rf_m = m; + rf->rf_tgt_svc = *target; + rf->rf_tgt_proc = proc_conf->pc_obj.co_id; + rf->rf_is_local = is_local; + rf->rf_is_volatile = is_volatile; + + rfom_tlink_init(rf); + m0_co_context_init(&rf->rf_coro); + + m0_clink_init(&rf->rf_ha_clink, recovery_fom_ha_clink_cb); + m0_clink_add_lock(&proc_conf->pc_obj.co_ha_chan, &rf->rf_ha_clink); + + M0_LOG(M0_DEBUG, "Subscribed to " FID_F " with initial state: %s", + FID_P(target), m0_ha_state2str(proc_conf->pc_obj.co_ha_state)); + + m0_fom_init(&rf->rf_base, &recovery_fom_type, + &recovery_fom_ops, NULL, NULL, + m0_dtm0_recovery_machine_reqh(m)); + + rfom_tlist_add_tail(&m->rm_rfoms, rf); + + return M0_RC(rc); +} + +/* + * Mark the queue as ended and drain it until the end. 
+ */ +static void m0_be_queue__finish(struct m0_be_queue *bq, struct m0_buf *item) +{ + bool got = true; + + m0_be_queue_lock(bq); + if (!bq->bq_the_end) { + m0_be_queue_end(bq); + while (got) + M0_BE_OP_SYNC(op, m0_be_queue_get(bq, &op, item, &got)); + } + M0_POST(bq->bq_the_end); + m0_be_queue_unlock(bq); +} +#define M0_BE_QUEUE__FINISH(bq, item_type) ({ \ + item_type item; \ + m0_be_queue__finish(bq, &M0_BUF_INIT_PTR(&item)); \ +}) + +static void recovery_fom_fini(struct recovery_fom *rf) +{ + M0_ENTRY("m=%p, rf= %p", rf->rf_m, rf); + m0_clink_del_lock(&rf->rf_ha_clink); + m0_clink_fini(&rf->rf_ha_clink); + m0_co_context_fini(&rf->rf_coro); + rfom_tlink_fini(rf); + if (rf->rf_is_local) { + rf->rf_m->rm_local_rfom = NULL; + m0_be_queue_fini(&rf->rf_eolq); + } + m0_be_queue_fini(&rf->rf_heq); + M0_LEAVE(); +} + +static void recovery_fom_self_fini(struct m0_fom *fom) +{ + struct recovery_fom *rf = M0_AMB(rf, fom, rf_base); + struct m0_dtm0_recovery_machine *m = rf->rf_m; + bool is_stopped; + + M0_ENTRY("fom=%p, m=%p, rf=%p", fom, m, rf); + recovery_machine_lock(m); + rfom_tlist_remove(rf); + is_stopped = rfom_tlist_is_empty(&m->rm_rfoms); + recovery_fom_fini(rf); + m0_fom_fini(fom); + m0_free(rf); + if (is_stopped) + m0_sm_move(&m->rm_sm, 0, M0_DRMS_STOPPED); + recovery_machine_unlock(m); + M0_LEAVE("is_stopped=%d", !!is_stopped); +} + +static int recovery_fom_add(struct m0_dtm0_recovery_machine *m, + struct m0_conf_process *proc_conf, + const struct m0_fid *target, + bool is_volatile) +{ + struct recovery_fom *rf; + int rc; + + M0_ALLOC_PTR(rf); + if (rf != NULL) { + recovery_machine_lock(m); + rc = recovery_fom_init(rf, m, proc_conf, target, is_volatile); + recovery_machine_unlock(m); + } else + rc = M0_ERR(-ENOMEM); + + return M0_RC(rc); +} + +static struct recovery_fom * +recovery_fom_by_svc_find(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *tgt_svc) +{ + return m0_tl_find(rfom, rf, &m->rm_rfoms, + m0_fid_eq(tgt_svc, &rf->rf_tgt_svc)); +} + +static struct recovery_fom * +recovery_fom_by_svc_find_lock(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *tgt_svc) +{ + struct recovery_fom *rf; + recovery_machine_lock(m); + rf = recovery_fom_by_svc_find(m, tgt_svc); + recovery_machine_unlock(m); + return rf; +} + +static struct recovery_fom * +recovery_fom_local(struct m0_dtm0_recovery_machine *m) +{ + M0_PRE(m != NULL); + return m->rm_local_rfom; +} + +static void unpopulate_foms(struct m0_dtm0_recovery_machine *m) +{ + struct recovery_fom *rf; + + M0_ENTRY("m=%p", m); + + recovery_machine_lock(m); + /* + * TODO: assert that list is empty instead of this teardown + */ + m0_tl_teardown(rfom, &m->rm_rfoms, rf) { + recovery_fom_fini(rf); + m0_free(rf); + } + recovery_machine_unlock(m); + + M0_LEAVE(); +} + +static bool conf_obj_is_process(const struct m0_conf_obj *obj) +{ + return m0_conf_obj_type(obj) == &M0_CONF_PROCESS_TYPE; +} + +static bool is_svc_volatile(const struct m0_confc *confc, + const struct m0_fid *svc_fid) +{ + struct m0_conf_service *svc; + struct m0_conf_obj *obj; + const char **param; + + if (M0_FI_ENABLED("always_false")) + return false; + + obj = m0_conf_cache_lookup(&confc->cc_cache, svc_fid); + M0_ASSERT(obj != NULL); + + svc = M0_CONF_CAST(obj, m0_conf_service); + + M0_ASSERT(svc->cs_params != NULL); + + for (param = svc->cs_params; *param != NULL; ++param) { + if (m0_streq(*param, "origin:in-volatile")) + return true; + else if (m0_streq(*param, "origin:in-persistent")) + return false; + } + + M0_IMPOSSIBLE("Service origin is not defined in the 
config?"); +} + +static int populate_foms(struct m0_dtm0_recovery_machine *m) +{ + struct m0_reqh_service *service = &m->rm_svc->dos_generic; + struct m0_confc *confc = m0_reqh2confc(service->rs_reqh); + struct m0_conf_obj *obj; + struct m0_conf_root *root; + struct m0_conf_diter it; + struct m0_conf_process *proc_conf; + struct m0_fid svc_fid; + int rc; + struct recovery_fom *rf; + + M0_ENTRY("recovery machine=%p", m); + + /** UT workaround */ + if (!m0_confc_is_inited(confc)) { + M0_LOG(M0_WARN, "confc is not initiated!"); + return M0_RC(0); + } + + rc = m0_confc_root_open(confc, &root) ?: + m0_conf_diter_init(&it, confc, + &root->rt_obj, + M0_CONF_ROOT_NODES_FID, + M0_CONF_NODE_PROCESSES_FID); + if (rc != 0) + goto out; + + while ((rc = m0_conf_diter_next_sync(&it, conf_obj_is_process)) > 0) { + obj = m0_conf_diter_result(&it); + proc_conf = M0_CONF_CAST(obj, m0_conf_process); + rc = m0_conf_process2service_get(confc, + &proc_conf->pc_obj.co_id, + M0_CST_DTM0, &svc_fid); + if (rc != 0) + continue; + + rc = recovery_fom_add(m, proc_conf, &svc_fid, + is_svc_volatile(confc, &svc_fid)); + if (rc != 0) + break; + } + + m0_conf_diter_fini(&it); + + /* + * It is a workaround to propagate the current HA state. + */ + m0_confc_close(&root->rt_obj); + rc = m0_confc_root_open(confc, &root) ?: + m0_conf_diter_init(&it, confc, + &root->rt_obj, + M0_CONF_ROOT_NODES_FID, + M0_CONF_NODE_PROCESSES_FID); + if (rc != 0) + goto out; + + while ((rc = m0_conf_diter_next_sync(&it, conf_obj_is_process)) > 0) { + obj = m0_conf_diter_result(&it); + proc_conf = M0_CONF_CAST(obj, m0_conf_process); + rc = m0_conf_process2service_get(confc, + &proc_conf->pc_obj.co_id, + M0_CST_DTM0, &svc_fid); + if (rc != 0) + continue; + + rf = recovery_fom_by_svc_find(m, &svc_fid); + if (rf == NULL) + break; + + if (!rf->rf_is_local) + heq_post(rf, proc_conf->pc_obj.co_ha_state); + } + + m0_conf_diter_fini(&it); + /* end of workaround */ + +out: + if (root != NULL) + m0_confc_close(&root->rt_obj); + if (rc != 0) + unpopulate_foms(m); + return M0_RC(rc); +} + +static struct m0_co_context *CO(struct m0_fom *fom) +{ + struct recovery_fom *rf = M0_AMB(rf, fom, rf_base); + return &rf->rf_coro; +} + +#define F M0_CO_FRAME_DATA + +static void heq_await(struct m0_fom *fom, enum m0_ha_obj_state *out, bool *eoq) +{ + struct recovery_fom *rf = M0_AMB(rf, fom, rf_base); + + M0_CO_REENTER(CO(fom), + struct m0_be_op op; + bool got; + uint64_t state;); + + F(got) = false; + F(state) = 0; + M0_SET0(&F(op)); + m0_be_op_init(&F(op)); + m0_be_queue_lock(&rf->rf_heq); + M0_BE_QUEUE_GET(&rf->rf_heq, &F(op), &F(state), &F(got)); + m0_be_queue_unlock(&rf->rf_heq); + M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(op), fom, RFS_WAITING)); + m0_be_op_fini(&F(op)); + + + if (F(got)) { + M0_LOG(M0_DEBUG, "heq_deq " FID_F " %s" , + FID_P(&rf->rf_tgt_svc), + m0_ha_state2str(F(state))); + M0_ASSERT(ha_event_invariant(F(state))); + *out = F(state); + } else { + M0_LOG(M0_DEBUG, "heq_deq " FID_F, FID_P(&rf->rf_tgt_svc)); + *eoq = true; + } +} + +static void eolq_await(struct m0_fom *fom, struct eolq_item *out) +{ + struct recovery_fom *rf = M0_AMB(rf, fom, rf_base); + + M0_CO_REENTER(CO(fom), + struct m0_be_op op; + bool got; + struct eolq_item item;); + + F(got) = false; + M0_SET0(&F(item)); + M0_SET0(&F(op)); + m0_be_op_init(&F(op)); + m0_be_queue_lock(&rf->rf_eolq); + M0_BE_QUEUE_GET(&rf->rf_eolq, &F(op), &F(item), &F(got)); + m0_be_queue_unlock(&rf->rf_eolq); + M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(op), fom, RFS_WAITING)); + m0_be_op_fini(&F(op)); + + 
+        *out = F(got) ? F(item) : (struct eolq_item) { .ei_type = EIT_END };
+}
+
+/**
+ * Restore missing transactions on a remote participant.
+ *
+ * Implements part of the recovery process: a healthy ONLINE participant
+ * iterates through the local DTM0 log and sends all needed REDOs to a
+ * remote peer.
+ */
+static void dtm0_restore(struct m0_fom *fom,
+                         enum m0_ha_obj_state *out,
+                         bool *eoq)
+{
+        struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+        struct m0_dtm0_log_rec record;
+        struct dtm0_req_fop redo;
+        int rc = 0;
+
+        M0_CO_REENTER(CO(fom),
+                      struct m0_fid initiator;
+                      struct m0_be_op reply_op;
+                      bool next;
+                      );
+
+        recovery_machine_log_iter_init(rf->rf_m, &rf->rf_log_iter);
+
+        /* XXX: race condition in the case where we are stopping the FOM. */
+        F(initiator) = recovery_fom_local(rf->rf_m)->rf_tgt_svc;
+
+        M0_SET0(&F(reply_op));
+        m0_be_op_init(&F(reply_op));
+
+        do {
+                M0_SET0(&record);
+                rc = recovery_machine_log_iter_next(rf->rf_m, &rf->rf_log_iter,
+                                                    &rf->rf_tgt_svc, NULL,
+                                                    &record);
+                /* Any value except zero means that we should stop recovery. */
+                F(next) = rc == 0;
+                redo = (struct dtm0_req_fop) {
+                        .dtr_msg = DTM_REDO,
+                        .dtr_initiator = F(initiator),
+                        .dtr_payload = record.dlr_payload,
+                        .dtr_txr = record.dlr_txd,
+                        .dtr_flags = F(next) ? 0 : M0_BITS(M0_DMF_EOL),
+                };
+
+                /*
+                 * TODO: there is an extra memcpy happening here -- the first
+                 * copy is done in recovery_machine_log_iter_next() (it clones
+                 * the record), the second copy is done in
+                 * recovery_machine_redo_post() below. If this proves to be
+                 * too inefficient, we can eliminate the extra copies.
+                 */
+                recovery_machine_redo_post(rf->rf_m, &rf->rf_base,
+                                           &rf->rf_tgt_proc, &rf->rf_tgt_svc,
+                                           &redo, &F(reply_op));
+
+                M0_LOG(M0_DEBUG, "out-redo: (m=%p) " REDO_F,
+                       rf->rf_m, REDO_P(&redo));
+                M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(reply_op),
+                                                          fom, RFS_WAITING));
+                m0_be_op_reset(&F(reply_op));
+        } while (F(next));
+
+        m0_be_op_fini(&F(reply_op));
+
+        recovery_machine_log_iter_fini(rf->rf_m, &rf->rf_log_iter);
+        M0_SET0(&rf->rf_log_iter);
+
+        M0_CO_FUN(CO(fom), heq_await(fom, out, eoq));
+}
+
+static void remote_recovery_fom_coro(struct m0_fom *fom)
+{
+        struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+
+        M0_CO_REENTER(CO(fom),
+                      struct m0_be_op op;
+                      bool eoq;
+                      enum m0_ha_obj_state state;
+                      void (*action)(struct m0_fom *fom,
+                                     enum m0_ha_obj_state *out, bool *eoq);
+                      );
+
+        F(eoq) = false;
+        F(state) = M0_NC_UNKNOWN;
+
+        while (!F(eoq)) {
+                M0_LOG(M0_DEBUG, "remote recovery fom=%p, svc_fid=" FID_F ", "
+                       "proc_fid=" FID_F " handles %s state.", fom,
+                       FID_P(&rf->rf_tgt_svc), FID_P(&rf->rf_tgt_proc),
+                       m0_ha_state2str(F(state)));
+
+                switch (F(state)) {
+                case M0_NC_DTM_RECOVERING:
+                        F(action) = rf->rf_is_volatile ? heq_await :
+                                                         dtm0_restore;
+                        break;
+                case M0_NC_FAILED:
+                        if (ALL2ALL)
+                                M0_LOG(M0_WARN, "Eviction is not supported.");
+                        else
+                                M0_IMPOSSIBLE("Eviction is not supported.");
+                        /*
+                         * Fall-through is intentional here. Eviction code
+                         * will be added here in the future.
+                         */
+                default:
+                        F(action) = heq_await;
+                        break;
+                }
+
+                M0_CO_FUN(CO(fom), F(action)(fom, &F(state), &F(eoq)));
+        }
+
+        m0_fom_phase_set(fom, RFS_DONE);
+}
+
+static bool was_log_replayed(struct recovery_fom *rf)
+{
+        bool outcome;
+        /*
+         * XXX: Clients are ignored by the recovery stop condition.
+         * Once HA and Motr are able to properly handle client
+         * restart, they can be brought back into the condition.
+         */
+        bool is_na = rf->rf_is_volatile;
+
+        /*
+         * Some unit tests need to temporarily avoid the "ignore" described
+         * above. 
In particular, we have a test where client is sending EOL to + * server, and we want it to be received and handled (e.g. + * remach-reboot-server). + */ + if (m0_dtm0_is_expecting_redo_from_client()) { + M0_LOG(M0_DEBUG, "Expect EOL from client"); + is_na = false; + } + + outcome = ergo(!rf->rf_is_local && !is_na, + M0_IN(rf->rf_last_known_ha_state, + (M0_NC_ONLINE, M0_NC_DTM_RECOVERING)) && + rf->rf_last_known_eol); + return outcome; +} + +static void rec_cond_trace(struct m0_dtm0_recovery_machine *m) +{ + struct recovery_fom *rf; + + if (!m0_tl_exists(rfom, rf, &m->rm_rfoms, !rf->rf_is_local)) + M0_LOG(M0_WARN, "Recovery cannot be completed because there " + "are no remote DTM0 services."); + + m0_tl_for(rfom, &m->rm_rfoms, rf) { + M0_LOG(M0_DEBUG, + "id=" FID_F ", is_volatile=%d, " + "is_local=%d, state=%s, got_eol=%d => %d", + FID_P(&rf->rf_tgt_svc), + (int) rf->rf_is_volatile, + (int) rf->rf_is_local, + m0_ha_state2str(rf->rf_last_known_ha_state), + (int) rf->rf_last_known_eol, + (int) was_log_replayed(rf)); + } m0_tl_endfor; +} + +static bool is_local_recovery_completed(struct m0_dtm0_recovery_machine *m) +{ + M0_ENTRY(); + rec_cond_trace(m); + return M0_RC(m0_tl_exists(rfom, r, &m->rm_rfoms, !r->rf_is_local) && + m0_tl_forall(rfom, r, &m->rm_rfoms, was_log_replayed(r))); +} + +static void remote_state_update(struct recovery_fom *rf, + const struct eolq_item *item) +{ + M0_ENTRY("rf=%p, " FID_F " state: %s, eol: %d", + rf, FID_P(&rf->rf_tgt_svc), + m0_ha_state2str(rf->rf_last_known_ha_state), + (int) rf->rf_last_known_eol); + + switch (item->ei_type) { + case EIT_HA: + M0_LOG(M0_DEBUG, "new_state=%s", + m0_ha_state2str(item->ei_ha_state)); + rf->rf_last_known_ha_state = item->ei_ha_state; + /* Clear the EOL flag if the remote is dead. */ + if (M0_IN(item->ei_ha_state, (M0_NC_TRANSIENT, M0_NC_FAILED))) + rf->rf_last_known_eol = false; + break; + case EIT_EOL: + M0_LOG(M0_DEBUG, "new_eol=1"); + rf->rf_last_known_eol = true; + break; + default: + M0_IMPOSSIBLE("Wrong eolq item type %d?", item->ei_type); + break; + } + + M0_LEAVE(); +} + +static void local_recovery_fom_coro(struct m0_fom *fom) +{ + struct recovery_fom *rf = M0_AMB(rf, fom, rf_base); + struct recovery_fom *remote_rf; + + M0_CO_REENTER(CO(fom), + struct m0_be_op eolq_op; + bool eoq; + bool recovered; + struct eolq_item item; + enum m0_ha_obj_state state;); + + F(eoq) = false; + /* + * A DTM0 service without persistent storage does not need + * REDOs. + * mkfs does not require DTM0 support as well. + */ + F(recovered) = rf->rf_is_volatile; + + /* Wait until the moment where we should start recovery. */ + do { + M0_CO_FUN(CO(fom), heq_await(fom, &F(state), &F(eoq))); + if (F(eoq)) + goto out; + } while (F(state) != M0_NC_DTM_RECOVERING); + + while (!F(recovered)) { + M0_CO_FUN(CO(fom), eolq_await(fom, &F(item))); + M0_ASSERT(F(item).ei_type != EIT_END); + + recovery_machine_lock(rf->rf_m); + remote_rf = recovery_fom_by_svc_find(rf->rf_m, + &F(item).ei_source); + if (remote_rf == NULL) { + /* XXX: machine is stopping? */ + recovery_machine_unlock(rf->rf_m); + break; + } + M0_ASSERT(remote_rf != rf); + remote_state_update(remote_rf, &F(item)); + + F(recovered) = is_local_recovery_completed(rf->rf_m); + recovery_machine_unlock(rf->rf_m); + } + + /* + * At this point we do not expect any more EOL messages, so we 'end' + * and flush the queue to ensure it. + */ + M0_BE_QUEUE__FINISH(&rf->rf_eolq, typeof(F(item))); + + /* + * When F(recovered) is true, we know we received all needed EOL + * messages and have recovered everything. 
If it is false, then at this
+         * point it can only mean that the recovery machine is shutting down,
+         * and there is nothing further we need to do.
+         */
+        if (F(recovered)) {
+                /*
+                 * Emit "RECOVERED". It shall cause HA to tell us to
+                 * transition from RECOVERING to ONLINE.
+                 */
+                recovery_machine_recovered(rf->rf_m,
+                                           &rf->rf_tgt_proc, &rf->rf_tgt_svc);
+
+                do {
+                        M0_CO_FUN(CO(fom), heq_await(fom, &F(state),
+                                                     &F(eoq)));
+                        M0_ASSERT(ergo(!F(eoq), M0_IN(F(state),
+                                                      (M0_NC_TRANSIENT,
+                                                       M0_NC_ONLINE))));
+                } while (!F(eoq));
+        }
+
+out:
+        /*
+         * This is not a duplicate of the similar call above -- FINISH is an
+         * idempotent call, and we want to ensure that it is called on all
+         * paths leading out of this function.
+         */
+        M0_BE_QUEUE__FINISH(&rf->rf_eolq, typeof(F(item)));
+        m0_fom_phase_set(fom, RFS_DONE);
+}
+
+static void recovery_fom_coro(struct m0_fom *fom)
+{
+        struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+
+        M0_CO_REENTER(CO(fom));
+
+        addb2_relate(&rf->rf_m->rm_sm, &fom->fo_sm_phase);
+
+        if (rf->rf_is_local)
+                M0_CO_FUN(CO(fom), local_recovery_fom_coro(fom));
+        else
+                M0_CO_FUN(CO(fom), remote_recovery_fom_coro(fom));
+}
+
+static int recovery_fom_tick(struct m0_fom *fom)
+{
+        int rc;
+        M0_CO_START(CO(fom));
+        recovery_fom_coro(fom);
+        rc = M0_CO_END(CO(fom));
+        M0_POST(M0_IN(rc, (0, M0_FSO_AGAIN, M0_FSO_WAIT)));
+        return rc ?: M0_FSO_WAIT;
+}
+
+#undef F
+
+M0_INTERNAL void
+m0_ut_remach_heq_post(struct m0_dtm0_recovery_machine *m,
+                      const struct m0_fid *tgt_svc,
+                      enum m0_ha_obj_state state)
+{
+        struct recovery_fom *rf = recovery_fom_by_svc_find_lock(m, tgt_svc);
+        M0_ASSERT_INFO(rf != NULL,
+                       "Trying to post an HA event to a wrong service?");
+        heq_post(rf, state);
+}
+
+M0_INTERNAL void
+m0_ut_remach_populate(struct m0_dtm0_recovery_machine *m,
+                      struct m0_conf_process *procs,
+                      const struct m0_fid *svcs,
+                      bool *is_volatile,
+                      uint64_t objs_nr)
+{
+        uint64_t i;
+        int rc;
+
+        for (i = 0; i < objs_nr; ++i) {
+                rc = recovery_fom_add(m, procs + i, svcs + i, is_volatile[i]);
+                M0_ASSERT(rc == 0);
+        }
+}
+
+M0_INTERNAL void
+m0_dtm0_recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m,
+                                   struct dtm0_req_fop *redo,
+                                   struct m0_be_op *op)
+{
+        bool is_eol =
+                !!(redo->dtr_flags & M0_BITS(M0_DMF_EOL));
+        bool is_eviction =
+                !!(redo->dtr_flags & M0_BITS(M0_DMF_EVICTION));
+        const struct m0_fid *initiator = &redo->dtr_initiator;
+        struct eolq_item item = {};
+        struct recovery_fom *rf;
+
+        M0_ENTRY("in-redo (m=%p): " REDO_F, m, REDO_P(redo));
+
+        if (is_eol) {
+                M0_ASSERT_INFO(!is_eviction,
+                               "TODO: Eviction is not handled yet.");
+
+                rf = recovery_fom_local(m);
+                if (rf != NULL) {
+                        M0_ASSERT_INFO(equi(is_eviction, !rf->rf_is_local),
+                                       "Participant cannot evict itself.");
+                        item = (struct eolq_item) {
+                                .ei_type = EIT_EOL,
+                                .ei_source = *initiator,
+                        };
+                        m0_be_queue_lock(&rf->rf_eolq);
+                        M0_ASSERT_INFO(!rf->rf_eolq.bq_the_end,
+                                       "REDOs are not allowed if local recovery"
+                                       " has already finished.");
+                        M0_BE_QUEUE_PUT(&rf->rf_eolq, op, &item);
+                        m0_be_queue_unlock(&rf->rf_eolq);
+                } else {
+                        M0_LOG(M0_WARN,
+                               "REDO received but svc is not RECOVERING yet");
+                        m0_be_op_active(op);
+                        m0_be_op_done(op);
+                }
+        } else {
+                M0_LOG(M0_DEBUG, "A non-EOL REDO was ignored.");
+                m0_be_op_active(op);
+                m0_be_op_done(op);
+        }
+
+        M0_LEAVE();
+}
+
+static int default_log_iter_init(struct m0_dtm0_recovery_machine *m,
+                                 struct m0_be_dtm0_log_iter *iter)
+{
+        m0_be_dtm0_log_iter_init(iter, m->rm_svc->dos_log);
+        return 0;
+}
+
+static void default_log_iter_fini(struct m0_dtm0_recovery_machine 
*m, + struct m0_be_dtm0_log_iter *iter) +{ + struct recovery_fom *rf = M0_AMB(rf, iter, rf_log_iter); + M0_ASSERT(rf->rf_m == m); + m0_be_dtm0_log_iter_fini(iter); +} + +static bool participated(const struct m0_dtm0_log_rec *record, + const struct m0_fid *svc) +{ + return m0_exists(i, record->dlr_txd.dtd_ps.dtp_nr, + m0_fid_eq(&record->dlr_txd.dtd_ps.dtp_pa[i].p_fid, + svc)); +} + +static int default_log_iter_next(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter, + const struct m0_fid *tgt_svc, + const struct m0_fid *origin_svc, + struct m0_dtm0_log_rec *record) +{ + struct m0_be_dtm0_log *log = m->rm_svc->dos_log; + int rc; + + /* XXX: not supported yet */ + M0_ASSERT(origin_svc == NULL); + + m0_mutex_lock(&log->dl_lock); + + /* Filter out records where tgt_svc is not a participant. */ + do { + M0_SET0(record); + rc = m0_be_dtm0_log_iter_next(iter, record); + if (rc == 0) { + if (participated(record, tgt_svc)) + break; + else + m0_dtm0_log_iter_rec_fini(record); + } + } while (rc == 0); + + m0_mutex_unlock(&log->dl_lock); + + /* XXX: error codes will be adjusted separately. */ + switch (rc) { + case 0: + return 0; + default: + return M0_ERR(rc); + } +} + +/* + * TODO: It was copy-pasted from setup.c (see cs_ha_process_event)! + * Export cs_ha_process_event instead of using this thing. + */ +static void server_process_event(struct m0_motr *cctx, + enum m0_conf_ha_process_event event) +{ + enum m0_conf_ha_process_type type; + + type = cctx->cc_mkfs ? M0_CONF_HA_PROCESS_M0MKFS : + M0_CONF_HA_PROCESS_M0D; + if (cctx->cc_ha_is_started && !cctx->cc_no_conf && + cctx->cc_motr_ha.mh_link != NULL) { + m0_conf_ha_process_event_post(&cctx->cc_motr_ha.mh_ha, + cctx->cc_motr_ha.mh_link, + &cctx->cc_reqh_ctx.rc_fid, + m0_process(), event, type); + } +} + +/* TODO: copy-pasted from client_init.c (see ha_process_event)! + * Actually, clients should not send RECOVERED but at this moment, + * the SM on the Hare side cannot properly handle this. + */ +static void client_process_event(struct m0_client *m0c, + enum m0_conf_ha_process_event event) +{ + const enum m0_conf_ha_process_type type = M0_CONF_HA_PROCESS_OTHER; + if (m0c->m0c_motr_ha.mh_link != NULL) + m0_conf_ha_process_event_post(&m0c->m0c_motr_ha.mh_ha, + m0c->m0c_motr_ha.mh_link, + &m0c->m0c_process_fid, + m0_process(), event, type); +} + +static void default_ha_event_post(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *tgt_proc, + const struct m0_fid *tgt_svc, + enum m0_conf_ha_process_event event) +{ + struct m0_reqh *reqh; + struct m0_client *m0c; + (void) tgt_proc; + (void) tgt_svc; + + if (ALL2ALL) { + M0_LOG(M0_DEBUG, "ALL2ALL_DTM_RECOVERED"); + } + + M0_ASSERT_INFO(m->rm_local_rfom != NULL, + "It is impossible to emit an HA event without local " + "recovery FOM up and running."); + reqh = m0_fom_reqh(&m->rm_local_rfom->rf_base); + + if (!m->rm_local_rfom->rf_is_volatile) { + M0_ASSERT_INFO(m0_cs_reqh_context(reqh) != NULL, + "A fully-functional motr process must " + "have a reqh ctx."); + server_process_event(m0_cs_ctx_get(reqh), event); + } else { + m0c = M0_AMB(m0c, reqh, m0c_reqh); + client_process_event(m0c, event); + } + +} + +static void default_redo_post(struct m0_dtm0_recovery_machine *m, + struct m0_fom *fom, + const struct m0_fid *tgt_proc, + const struct m0_fid *tgt_svc, + struct dtm0_req_fop *redo, + struct m0_be_op *op) +{ + int rc; + + rc = m0_dtm0_req_post(m->rm_svc, op, redo, tgt_svc, fom, true); + /* + * We assume "perfect" links between ONLINE/RECOVERING processes. 
+ * If the link is not perfect then let's just kill the process + * that is not able to send out REDOs. + */ + M0_ASSERT(rc == 0); +} + +M0_INTERNAL const struct m0_dtm0_recovery_machine_ops + m0_dtm0_recovery_machine_default_ops = { + .log_iter_init = default_log_iter_init, + .log_iter_fini = default_log_iter_fini, + .log_iter_next = default_log_iter_next, + + .redo_post = default_redo_post, + .ha_event_post = default_ha_event_post, +}; + +#undef M0_TRACE_SUBSYSTEM +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ +/* + * vim: tabstop=8 shiftwidth=8 noexpandtab textwidth=80 nowrap + */ diff --git a/dtm0/recovery.h b/dtm0/recovery.h new file mode 100644 index 00000000000..d83f74482fa --- /dev/null +++ b/dtm0/recovery.h @@ -0,0 +1,183 @@ +/* -*- C -*- */ +/* + * Copyright (c) 2021 Seagate Technology LLC and/or its Affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * For any questions about this software or licensing, + * please email opensource@seagate.com or cortx-questions@seagate.com. + * + */ + + +#pragma once + +#ifndef __MOTR_DTM0_RECOVERY_H__ +#define __MOTR_DTM0_RECOVERY_H__ + +#include "lib/tlist.h" /* m0_tl */ +#include "ha/note.h" /* m0_ha_obj_state */ +#include "conf/ha.h" /* m0_conf_ha_process_event */ +#include "sm/sm.h" /* m0_sm, m0_sm_group */ + +/* imports */ +struct m0_dtm0_service; +struct m0_conf_process; +struct dtm0_req_fop; +struct m0_be_dtm0_log_iter; +struct m0_dtm0_log_rec; +struct m0_be_op; +struct m0_fom; + +/* exports */ +struct m0_dtm0_recovery_machine_ops; +struct m0_dtm0_recovery_machine; +struct recovery_fom; + +enum m0_dtm0_recovery_machine_states { + M0_DRMS_INIT, + M0_DRMS_STARTED, + M0_DRMS_STOPPED, + M0_DRMS_NR, +}; + +struct m0_dtm0_recovery_machine_ops { + /** + * Post a REDO message to the target DTM0 service. + * + * The expectation is that `redo` structure is either processed + * immediately, before function completes, or it is cloned and stored + * for future use. Caller will destroy all content of `redo` structure + * right after the call to redo_post(). + */ + void (*redo_post)(struct m0_dtm0_recovery_machine *m, + struct m0_fom *fom, + const struct m0_fid *tgt_proc, + const struct m0_fid *tgt_svc, + struct dtm0_req_fop *redo, + struct m0_be_op *op); + + /** + * Post a conf ha process event. + */ + void (*ha_event_post)(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *tgt_proc, + const struct m0_fid *tgt_svc, + enum m0_conf_ha_process_event event); + + int (*log_iter_init)(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter); + + /** + * Get next log record (or -ENOENT) from the local DTM0 log. + * @param[in] tgt_svc DTM0 service this REDO shall be sent to. + * @param[in,opt] origin_svc DTM0 service to be selected. When + * this parameter is set to non-NULL, + * the iter is supposed to select only + * the log records that were originated + * on this particular service. 
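+ *
+ * A typical consumption loop (a sketch only; error handling is elided, and
+ * send_redo_built_from() is a placeholder for the caller's REDO-sending
+ * logic):
+ * @code
+ * if (m->rm_ops->log_iter_init(m, &iter) == 0) {
+ *         while (m->rm_ops->log_iter_next(m, &iter, tgt_svc, NULL,
+ *                                         &rec) == 0)
+ *                 send_redo_built_from(&rec);
+ *         m->rm_ops->log_iter_fini(m, &iter);
+ * }
+ * @endcode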
+ */ + int (*log_iter_next)(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter, + const struct m0_fid *tgt_svc, + const struct m0_fid *origin_svc, + struct m0_dtm0_log_rec *record); + + void (*log_iter_fini)(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter); +}; + + +struct m0_dtm0_recovery_machine { + struct m0_dtm0_service *rm_svc; + struct m0_tl rm_rfoms; + const struct m0_dtm0_recovery_machine_ops *rm_ops; + struct recovery_fom *rm_local_rfom; + + struct m0_sm_group rm_sm_group; + struct m0_sm rm_sm; +}; + +M0_EXTERN const struct m0_dtm0_recovery_machine_ops + m0_dtm0_recovery_machine_default_ops; + +M0_INTERNAL int m0_drm_domain_init(void); + +M0_INTERNAL void m0_drm_domain_fini(void); + +/** + * Initialise a recovery machine. + * @param ops Recovery machine operations (such as sending of REDOs and some + * others) that have to be used instead of the default ops. + * These ops may be used to alter the effects of recovery machine + * decisions on the system (for example, in UTs). + */ +M0_INTERNAL void +m0_dtm0_recovery_machine_init(struct m0_dtm0_recovery_machine *m, + const struct m0_dtm0_recovery_machine_ops *ops, + struct m0_dtm0_service *svc); + +M0_INTERNAL int +m0_dtm0_recovery_machine_start(struct m0_dtm0_recovery_machine *m); + +M0_INTERNAL void +m0_dtm0_recovery_machine_stop(struct m0_dtm0_recovery_machine *m); + +M0_INTERNAL void +m0_dtm0_recovery_machine_fini(struct m0_dtm0_recovery_machine *m); + +/** + * Post a REDO message into recovery machine. + * + * Recovery machine needs to know what REDO messages were received by the local + * Motr process. It helps to properly advance the state of the local process. + * For example, the transition RECOVERING -> ONLINE may happen only when it + * receives enough REDO messages with EOL flag set (or simply EOL messages). + * + * Note, at this moment recovery machine sends out REDO messages but it does not + * apply incoming REDO messages. It must be done elsewhere. However, there is an + * ongoing effort to change this by getting rid of self-sufficient REDO FOMs. + * After these changes are done, recovery machine as a module will be fully + * responsible for sending and executing REDO messages. + */ +M0_INTERNAL void +m0_dtm0_recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m, + struct dtm0_req_fop *redo, + struct m0_be_op *op); + +/* UT-related API */ +M0_INTERNAL void +m0_ut_remach_heq_post(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *tgt_svc, + enum m0_ha_obj_state state); +M0_INTERNAL void +m0_ut_remach_populate(struct m0_dtm0_recovery_machine *m, + struct m0_conf_process *procs, + const struct m0_fid *svcs, + bool *is_volatile, + uint64_t objs_nr); + +#endif /* __MOTR_DTM0_RECOVERY_H__ */ + +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ +/* + * vim: tabstop=8 shiftwidth=8 noexpandtab textwidth=80 nowrap + */ diff --git a/dtm0/service.c b/dtm0/service.c index 75e979abcc3..a046a48cf28 100644 --- a/dtm0/service.c +++ b/dtm0/service.c @@ -179,8 +179,12 @@ static int dtm0_process_disconnect(struct dtm0_process *process) if (M0_IS0(&process->dop_rlink)) return M0_RC(0); - rc = m0_rpc_link_is_connected(&process->dop_rlink) ? 
- m0_rpc_link_disconnect_sync(&process->dop_rlink, timeout) : 0; + if (m0_rpc_link_is_connected(&process->dop_rlink)) { + m0_rpc_conn_sessions_cancel(&process->dop_rlink.rlk_conn); + rc = m0_rpc_link_disconnect_sync(&process->dop_rlink, timeout); + } else + rc = 0; + if (M0_IN(rc, (0, -ETIMEDOUT, -ECANCELED))) { /* @@ -263,7 +267,7 @@ static int volatile_log_init(struct m0_dtm0_service *dtm0) rc = m0_be_dtm0_log_init(dtm0->dos_log, NULL, &dtm0->dos_clk_src, false); if (rc != 0) - m0_be_dtm0_log_free(&dtm0->dos_log); + m0_be_dtm0_log_free0(&dtm0->dos_log); } return rc; } @@ -330,10 +334,33 @@ static int dtm_service__origin_fill(struct m0_reqh_service *service) return M0_RC_INFO(rc, "origin=%d", dtm0->dos_origin); } +/* + * Certain UTs manually control the lifetime of recovery machine. + * When manual start-stop is disabled, DTM0 service automatically + * starts-stops the machine. + */ +static bool is_manual_ss_enabled(void) +{ + return M0_FI_ENABLED("ut"); +} + static int dtm0_service_start(struct m0_reqh_service *service) { + struct m0_dtm0_service *dtms = to_dtm(service); + int rc; + M0_PRE(service != NULL); - return dtm_service__origin_fill(service); + rc = dtm_service__origin_fill(service); + if (rc != 0) + return M0_ERR(rc); + + if (ENABLE_DTM0 && !is_manual_ss_enabled()) { + m0_dtm0_recovery_machine_init(&dtms->dos_remach, + NULL, dtms); + rc = m0_dtm0_recovery_machine_start(&dtms->dos_remach); + } + + return M0_RC(rc); } static void dtm0_service_prepare_to_stop(struct m0_reqh_service *reqh_rs) @@ -342,6 +369,8 @@ static void dtm0_service_prepare_to_stop(struct m0_reqh_service *reqh_rs) M0_PRE(reqh_rs != NULL); dtms = M0_AMB(dtms, reqh_rs, dos_generic); + if (ENABLE_DTM0 && !is_manual_ss_enabled()) + m0_dtm0_recovery_machine_stop(&dtms->dos_remach); dtm0_service_conns_term(dtms); } @@ -358,8 +387,11 @@ static void dtm0_service_stop(struct m0_reqh_service *service) if (dtm0->dos_origin == DTM0_ON_VOLATILE && dtm0->dos_log != NULL) { m0_be_dtm0_log_clear(dtm0->dos_log); m0_be_dtm0_log_fini(dtm0->dos_log); - m0_be_dtm0_log_free(&dtm0->dos_log); + m0_be_dtm0_log_free0(&dtm0->dos_log); } + + if (ENABLE_DTM0 && !is_manual_ss_enabled()) + m0_dtm0_recovery_machine_fini(&dtm0->dos_remach); } static void dtm0_service_fini(struct m0_reqh_service *service) @@ -377,12 +409,14 @@ M0_INTERNAL int m0_dtm0_stype_init(void) M0_AVI_DTX0_SM_STATE, M0_AVI_DTX0_SM_COUNTER) ?: m0_dtm0_fop_init() ?: m0_reqh_service_type_register(&dtm0_service_type) ?: - m0_dtm0_rpc_link_mod_init(); + m0_dtm0_rpc_link_mod_init() ?: + m0_drm_domain_init(); } M0_INTERNAL void m0_dtm0_stype_fini(void) { extern struct m0_sm_conf m0_dtx_sm_conf; + m0_drm_domain_fini(); m0_dtm0_rpc_link_mod_fini(); m0_reqh_service_type_unregister(&dtm0_service_type); m0_dtm0_fop_fini(); @@ -411,6 +445,11 @@ m0_dtm0_service_find(const struct m0_reqh *reqh) return rh_srv == NULL ? 
NULL : to_dtm(rh_srv); } +M0_INTERNAL bool m0_dtm0_is_expecting_redo_from_client(void) +{ + return M0_FI_ENABLED("ut"); +} + M0_INTERNAL bool m0_dtm0_in_ut(void) { return M0_FI_ENABLED("ut"); diff --git a/dtm0/service.h b/dtm0/service.h index 087f0b91b4c..6801bd21e21 100644 --- a/dtm0/service.h +++ b/dtm0/service.h @@ -27,6 +27,7 @@ #include "reqh/reqh_service.h" #include "dtm0/clk_src.h" +#include "dtm0/recovery.h" struct m0_be_dtm0_log; struct dtm0_req_fop; @@ -42,19 +43,20 @@ enum m0_dtm0_service_origin { * DTM0 service structure */ struct m0_dtm0_service { - struct m0_reqh_service dos_generic; - struct m0_tl dos_processes; - enum m0_dtm0_service_origin dos_origin; - uint64_t dos_magix; - struct m0_dtm0_clk_src dos_clk_src; - struct m0_be_dtm0_log *dos_log; - /* + struct m0_reqh_service dos_generic; + struct m0_tl dos_processes; + enum m0_dtm0_service_origin dos_origin; + uint64_t dos_magix; + struct m0_dtm0_clk_src dos_clk_src; + struct m0_be_dtm0_log *dos_log; + struct m0_dtm0_recovery_machine dos_remach; + /** * A queue for DTM_TEST message for drlink UTs. * The UTs are fully responsible for the queue init/fini/get. * DTM_TEST fom puts dtm0_req_fop::dtr_txr::dtd_id::dti_fid to the * queue. */ - struct m0_be_queue *dos_ut_queue; + struct m0_be_queue *dos_ut_queue; }; extern struct m0_reqh_service_type dtm0_service_type; @@ -89,6 +91,7 @@ m0_dtm0_service_find(const struct m0_reqh *reqh); M0_INTERNAL struct m0_dtm0_service *m0_dtm0_fom2service(struct m0_fom *fom); M0_INTERNAL bool m0_dtm0_in_ut(void); +M0_INTERNAL bool m0_dtm0_is_expecting_redo_from_client(void); #endif /* __MOTR_DTM0_SERVICE_H__ */ diff --git a/dtm0/ut/drlink.c b/dtm0/ut/drlink.c index 7c3b5cd7064..a26aae6ae7a 100644 --- a/dtm0/ut/drlink.c +++ b/dtm0/ut/drlink.c @@ -60,6 +60,7 @@ void m0_dtm0_ut_drlink_simple(void) int rc; int i; int j; + struct m0_be_queue *q; M0_ALLOC_PTR(udh); M0_ASSERT(udh != NULL); @@ -93,16 +94,16 @@ void m0_dtm0_ut_drlink_simple(void) for (i = 0; i < DTM0_UT_DRLINK_SIMPLE_POST_NR; ++i) m0_be_op_init(&op[i]); - M0_ALLOC_PTR(svc->dos_ut_queue); - M0_UT_ASSERT(svc->dos_ut_queue != 0); - rc = m0_be_queue_init(svc->dos_ut_queue, - &(struct m0_be_queue_cfg){ + M0_ALLOC_PTR(q); + M0_UT_ASSERT(q != 0); + rc = m0_be_queue_init(q, &(struct m0_be_queue_cfg){ .bqc_q_size_max = DTM0_UT_DRLINK_SIMPLE_POST_NR, .bqc_producers_nr_max = DTM0_UT_DRLINK_SIMPLE_POST_NR, .bqc_consumers_nr_max = 1, .bqc_item_length = sizeof fid[0], }); M0_UT_ASSERT(rc == 0); + svc->dos_ut_queue = q; for (i = 0; i < DTM0_UT_DRLINK_SIMPLE_POST_NR; ++i) { rc = m0_dtm0_req_post(udh->udh_server_dtm0_service, @@ -131,8 +132,6 @@ void m0_dtm0_ut_drlink_simple(void) M0_UT_ASSERT(found); } m0_be_op_fini(&op_out); - m0_be_queue_fini(svc->dos_ut_queue); - m0_free(svc->dos_ut_queue); for (i = 0; i < DTM0_UT_DRLINK_SIMPLE_POST_NR; ++i) M0_UT_ASSERT(m0_fid_eq(&fid[i], &M0_FID0)); m0_free(fid); @@ -144,6 +143,8 @@ void m0_dtm0_ut_drlink_simple(void) m0_free(fop); m0_ut_dtm0_helper_fini(udh); + m0_be_queue_fini(q); + m0_free(q); m0_free(udh); } diff --git a/dtm0/ut/main.c b/dtm0/ut/main.c index 5ce0bdd2e8e..349d4e36eb1 100644 --- a/dtm0/ut/main.c +++ b/dtm0/ut/main.c @@ -30,7 +30,9 @@ #include "ut/ut.h" #include "cas/cas.h" #include "cas/cas_xc.h" +#include "dtm0/recovery.h" +#include "dtm0/ut/helper.h" enum { NUM_CAS_RECS = 10, @@ -99,18 +101,728 @@ static void cas_xcode_test(void) buf, len); M0_UT_ASSERT(rc == 0); - m0_xcode_free_obj(&M0_XCODE_OBJ(m0_cas_op_xc, op_out)); + m0_xcode_free_obj(&M0_XCODE_OBJ(m0_cas_op_xc, op_out)); +} + + +enum 
ut_sides { + UT_SIDE_SRV, + UT_SIDE_CLI, + UT_SIDE_NR +}; + +enum ut_client_persistence { + UT_CP_UNSPECIFIED, + UT_CP_VOLATILE_CLIENT, + UT_CP_PERSISTENT_CLIENT, +}; + +struct m0_fid g_service_fids[UT_SIDE_NR]; + +struct ut_remach { + bool use_real_log; + enum ut_client_persistence cp; + + struct m0_ut_dtm0_helper udh; + + struct m0_dtm0_service *svcs[UT_SIDE_NR]; + struct m0_be_op recovered[UT_SIDE_NR]; + + /* Client-side stubs for conf objects. */ + struct m0_conf_process cli_procs[UT_SIDE_NR]; + struct m0_mutex cli_proc_guards[UT_SIDE_NR]; +}; + +struct ha_thought { + enum ut_sides who; + enum m0_ha_obj_state what; +}; +#define HA_THOUGHT(_who, _what) (struct ha_thought) { \ + .who = _who, .what = _what \ +} + +static struct m0_dtm0_service *ut_remach_svc_get(struct ut_remach *um, + enum ut_sides side) +{ + M0_UT_ASSERT(side < UT_SIDE_NR); + M0_UT_ASSERT(um->svcs[side] != NULL); + return um->svcs[side]; +} + +static struct m0_dtm0_recovery_machine *ut_remach_get(struct ut_remach *um, + enum ut_sides side) +{ + return &ut_remach_svc_get(um, side)->dos_remach; +} + +static struct m0_fid *ut_remach_fid_get(enum ut_sides side) +{ + M0_UT_ASSERT(side < UT_SIDE_NR); + return &g_service_fids[side]; +} + +static enum ut_sides ut_remach_side_get(const struct m0_fid *svc) +{ + enum ut_sides side; + + for (side = 0; side < UT_SIDE_NR; ++side) { + if (m0_fid_eq(ut_remach_fid_get(side), svc)) + break; + } + + M0_UT_ASSERT(side < UT_SIDE_NR); + return side; +} + +static struct ut_remach *ut_remach_from(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *svc_fid) +{ + struct ut_remach *um = NULL; + struct m0_dtm0_service *svc = M0_AMB(svc, m, dos_remach); + struct m0_reqh *reqh = svc->dos_generic.rs_reqh; + struct m0_reqh_context *rx = svc->dos_generic.rs_reqh_ctx; + struct m0_rpc_client_ctx *cli_ctx; + struct m0_motr *motr_ctx; + struct m0_rpc_server_ctx *srv_ctx; + struct m0_ut_dtm0_helper *udh; + enum ut_sides side = ut_remach_side_get(svc_fid); + + if (rx == NULL) { + M0_UT_ASSERT(side == UT_SIDE_CLI); + cli_ctx = M0_AMB(cli_ctx, reqh, rcx_reqh); + udh = M0_AMB(udh, cli_ctx, udh_cctx); + um = M0_AMB(um, udh, udh); + } else { + M0_UT_ASSERT(side == UT_SIDE_SRV); + motr_ctx = M0_AMB(motr_ctx, rx, cc_reqh_ctx); + srv_ctx = M0_AMB(srv_ctx, motr_ctx, rsx_motr_ctx); + udh = M0_AMB(udh, srv_ctx, udh_sctx); + um = M0_AMB(um, udh, udh); + } + + M0_UT_ASSERT(um != NULL); + M0_UT_ASSERT(ut_remach_get(um, side) == m); + return um; +} + +static void ut_remach_log_add_sync(struct ut_remach *um, + enum ut_sides side, + struct m0_dtm0_tx_desc *txd, + struct m0_buf *payload) +{ + struct m0_dtm0_recovery_machine *m = ut_remach_get(um, side); + struct m0_dtm0_service *svc = m->rm_svc; + struct m0_be_dtm0_log *log = svc->dos_log; + struct m0_be_tx *tx = NULL; + struct m0_be_seg *seg = log->dl_seg; + struct m0_be_tx_credit cred = {}; + struct m0_be_ut_backend *ut_be; + int rc; + + if (log->dl_is_persistent) { + M0_UT_ASSERT(svc->dos_generic.rs_reqh_ctx != NULL); + ut_be = &svc->dos_generic.rs_reqh_ctx->rc_be; + m0_be_dtm0_log_credit(M0_DTML_EXECUTED, txd, payload, seg, + NULL, &cred); + M0_ALLOC_PTR(tx); + M0_UT_ASSERT(tx != NULL); + m0_be_ut_tx_init(tx, ut_be); + m0_be_tx_prep(tx, &cred); + rc = m0_be_tx_open_sync(tx); + M0_UT_ASSERT(rc == 0); + } + + m0_mutex_lock(&log->dl_lock); + rc = m0_be_dtm0_log_update(log, tx, txd, payload); + M0_UT_ASSERT(rc == 0); + m0_mutex_unlock(&log->dl_lock); + + if (log->dl_is_persistent) { + m0_be_tx_close_sync(tx); + m0_be_tx_fini(tx); + m0_free(tx); + } +} + + +static void 
um_dummy_log_redo_post(struct m0_dtm0_recovery_machine *m,
+                       struct m0_fom *fom,
+                       const struct m0_fid *tgt_proc,
+                       const struct m0_fid *tgt_svc,
+                       struct dtm0_req_fop *redo,
+                       struct m0_be_op *op)
+{
+        struct ut_remach *um = NULL;
+        struct m0_dtm0_recovery_machine *counterpart = NULL;
+
+        um = ut_remach_from(m, &redo->dtr_initiator);
+        counterpart = ut_remach_get(um, ut_remach_side_get(tgt_svc));
+        m0_dtm0_recovery_machine_redo_post(counterpart, redo, op);
+        M0_UT_ASSERT(m0_be_op_is_done(op));
+}
+
+static void um_real_log_redo_post(struct m0_dtm0_recovery_machine *m,
+                                  struct m0_fom *fom,
+                                  const struct m0_fid *tgt_proc,
+                                  const struct m0_fid *tgt_svc,
+                                  struct dtm0_req_fop *redo,
+                                  struct m0_be_op *op)
+{
+        struct ut_remach *um = NULL;
+        struct m0_dtm0_recovery_machine *counterpart = NULL;
+        enum ut_sides tgt_side = ut_remach_side_get(tgt_svc);
+        struct m0_dtm0_service *svc;
+        struct m0_be_ut_backend *ut_be;
+
+        um = ut_remach_from(m, &redo->dtr_initiator);
+        counterpart = ut_remach_get(um, tgt_side);
+
+        /* Empty REDOs are allowed only when EOL is set. */
+        M0_UT_ASSERT(ergo(m0_dtm0_tx_desc_is_none(&redo->dtr_txr),
+                          !!(redo->dtr_flags & M0_BITS(M0_DMF_EOL))));
+
+        /* Emulate REDO FOM: update the log */
+        if (!m0_dtm0_tx_desc_is_none(&redo->dtr_txr))
+                ut_remach_log_add_sync(um, tgt_side, &redo->dtr_txr,
+                                       &redo->dtr_payload);
+
+        m0_dtm0_recovery_machine_redo_post(counterpart, redo, op);
+        M0_UT_ASSERT(m0_be_op_is_done(op));
+
+        /*
+         * It is a sordid but simple way of making ::ut_remach_log_add_sync
+         * work:
+         * the RPC client does not have a fully-functional context, so that
+         * sm-based BE logic cannot progress because there is no BE associated
+         * with the corresponding FOM (fom -> reqh -> context -> be).
+         * However, both sides share the same set of localities,
+         * so that we can sit down right here and wait until everything
+         * is completed.
+         * It might be slow and dangerous but it is enough for a simple test.
+         */
+        if (!m0_dtm0_tx_desc_is_none(&redo->dtr_txr) &&
+            tgt_side == UT_SIDE_SRV) {
+                svc = counterpart->rm_svc;
+                ut_be = &svc->dos_generic.rs_reqh_ctx->rc_be;
+                m0_be_ut_backend_sm_group_asts_run(ut_be);
+                m0_be_ut_backend_thread_exit(ut_be);
+        }
+}
+
+static int um_dummy_log_iter_next(struct m0_dtm0_recovery_machine *m,
+                                  struct m0_be_dtm0_log_iter *iter,
+                                  const struct m0_fid *tgt_svc,
+                                  const struct m0_fid *origin_svc,
+                                  struct m0_dtm0_log_rec *record)
+{
+        M0_SET0(record);
+        return -ENOENT;
+}
+
+static int um_dummy_log_iter_init(struct m0_dtm0_recovery_machine *m,
+                                  struct m0_be_dtm0_log_iter *iter)
+{
+        (void) m;
+        (void) iter;
+        return 0;
+}
+
+static void um_dummy_log_iter_fini(struct m0_dtm0_recovery_machine *m,
+                                   struct m0_be_dtm0_log_iter *iter)
+{
+        (void) m;
+        (void) iter;
+        /* nothing to do */
+}
+
+void um_ha_event_post(struct m0_dtm0_recovery_machine *m,
+                      const struct m0_fid *tgt_proc,
+                      const struct m0_fid *tgt_svc,
+                      enum m0_conf_ha_process_event event)
+{
+        struct ut_remach *um = ut_remach_from(m, tgt_svc);
+        enum ut_sides side = ut_remach_side_get(tgt_svc);
+
+        switch (event) {
+        case M0_CONF_HA_PROCESS_DTM_RECOVERED:
+                m0_be_op_done(&um->recovered[side]);
+                break;
+        default:
+                M0_UT_ASSERT(false);
+        }
+}
+
+/*
+ * Unicast an HA thought to a particular side.
+ */
+static void ut_remach_ha_tells(struct ut_remach *um,
+                               const struct ha_thought *t,
+                               enum ut_sides whom)
+{
+        m0_ut_remach_heq_post(ut_remach_get(um, whom),
+                              ut_remach_fid_get(t->who), t->what);
+}
+
+/*
+ * Multicast an HA thought to all the sides.
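+ * For example, to let every side know that the server went ONLINE
+ * (illustrative; this mirrors the calls made by the tests below):
+ * @code
+ *         ut_remach_ha_thinks(um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE));
+ * @endcode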
+ */ +static void ut_remach_ha_thinks(struct ut_remach *um, + const struct ha_thought *t) +{ + enum ut_sides side; + + for (side = 0; side < UT_SIDE_NR; ++side) + ut_remach_ha_tells(um, t, side); +} + +static const struct m0_dtm0_recovery_machine_ops* +ut_remach_ops_get(struct ut_remach *um) +{ + static struct m0_dtm0_recovery_machine_ops dummy_log_ops = {}; + static struct m0_dtm0_recovery_machine_ops real_log_ops = {}; + static bool initialized = false; + + if (!initialized) { + /* Dummy log operations */ + dummy_log_ops = m0_dtm0_recovery_machine_default_ops; + + dummy_log_ops.log_iter_next = um_dummy_log_iter_next; + dummy_log_ops.log_iter_init = um_dummy_log_iter_init; + dummy_log_ops.log_iter_fini = um_dummy_log_iter_fini; + + dummy_log_ops.redo_post = um_dummy_log_redo_post; + dummy_log_ops.ha_event_post = um_ha_event_post; + + /* Real log operations */ + real_log_ops = m0_dtm0_recovery_machine_default_ops; + + real_log_ops.redo_post = um_real_log_redo_post; + real_log_ops.ha_event_post = um_ha_event_post; + /* + * Do not reassign log ops, as we need to deal with real DTM0 + * log. + */ + + initialized = true; + } + + return um->use_real_log ? &real_log_ops : &dummy_log_ops; +} + +static void ut_srv_remach_init(struct ut_remach *um) +{ + m0_dtm0_recovery_machine_init(ut_remach_get(um, UT_SIDE_SRV), + ut_remach_ops_get(um), + ut_remach_svc_get(um, UT_SIDE_SRV)); +} + +static void ut_cli_remach_conf_obj_init(struct ut_remach *um) +{ + int i; + + for (i = 0; i < UT_SIDE_NR; ++i) { + m0_mutex_init(&um->cli_proc_guards[i]); + m0_chan_init(&um->cli_procs[i].pc_obj.co_ha_chan, + &um->cli_proc_guards[i]); + } +} + +static void ut_cli_remach_conf_obj_fini(struct ut_remach *um) +{ + int i; + + for (i = 0; i < UT_SIDE_NR; ++i) { + m0_chan_fini_lock(&um->cli_procs[i].pc_obj.co_ha_chan); + m0_mutex_fini(&um->cli_proc_guards[i]); + } +} + + +static void ut_cli_remach_init(struct ut_remach *um) +{ + ut_cli_remach_conf_obj_init(um); + + m0_dtm0_recovery_machine_init(ut_remach_get(um, UT_SIDE_CLI), + ut_remach_ops_get(um), + um->svcs[UT_SIDE_CLI]); +} + +static void ut_srv_remach_fini(struct ut_remach *um) +{ + m0_dtm0_recovery_machine_fini(ut_remach_get(um, UT_SIDE_SRV)); +} + +static void ut_cli_remach_fini(struct ut_remach *um) +{ + m0_dtm0_recovery_machine_fini(ut_remach_get(um, UT_SIDE_CLI)); + ut_cli_remach_conf_obj_fini(um); +} + +static void ut_remach_start(struct ut_remach *um) +{ + enum ut_sides side; + int rc; + bool is_volatile[UT_SIDE_NR] = { + [UT_SIDE_SRV] = false, + [UT_SIDE_CLI] = um->cp == UT_CP_VOLATILE_CLIENT, + }; + + m0_ut_remach_populate(ut_remach_get(um, UT_SIDE_CLI), um->cli_procs, + g_service_fids, is_volatile, UT_SIDE_NR); + for (side = 0; side < UT_SIDE_NR; ++side) { + rc = m0_dtm0_recovery_machine_start(ut_remach_get(um, side)); + M0_ASSERT(rc == 0); + } +} + +static void ut_remach_stop(struct ut_remach *um) +{ + enum ut_sides side; + for (side = 0; side < UT_SIDE_NR; ++side) + m0_dtm0_recovery_machine_stop(ut_remach_get(um, side)); +} + +static void ut_remach_init(struct ut_remach *um) +{ + int i; + + M0_UT_ASSERT(M0_IN(um->cp, (UT_CP_PERSISTENT_CLIENT, + UT_CP_VOLATILE_CLIENT))); + + for (i = 0; i < ARRAY_SIZE(um->recovered); ++i) { + m0_be_op_init(um->recovered + i); + m0_be_op_active(um->recovered + i); + } + + m0_fi_enable("m0_dtm0_in_ut", "ut"); + m0_fi_enable("is_manual_ss_enabled", "ut"); + m0_fi_enable("m0_dtm0_is_expecting_redo_from_client", "ut"); + if (um->cp == UT_CP_PERSISTENT_CLIENT) + m0_fi_enable("is_svc_volatile", "always_false"); + + 
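+        /*
+         * The helper below boots a server/client reqh pair. The fault
+         * injection points enabled above keep the recovery machine under
+         * manual (UT-driven) start/stop control.
+         */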
+        m0_ut_dtm0_helper_init(&um->udh);
+
+        g_service_fids[UT_SIDE_SRV] = um->udh.udh_server_dtm0_fid;
+        g_service_fids[UT_SIDE_CLI] = um->udh.udh_client_dtm0_fid;
+
+        M0_UT_ASSERT(um->udh.udh_server_dtm0_service != NULL);
+        um->svcs[UT_SIDE_SRV] = um->udh.udh_server_dtm0_service;
+        M0_UT_ASSERT(um->udh.udh_client_dtm0_service != NULL);
+        um->svcs[UT_SIDE_CLI] = um->udh.udh_client_dtm0_service;
+
+        ut_srv_remach_init(um);
+        ut_cli_remach_init(um);
+}
+
+static void ut_remach_fini(struct ut_remach *um)
+{
+        int i;
+
+        ut_cli_remach_fini(um);
+        ut_srv_remach_fini(um);
+        m0_ut_dtm0_helper_fini(&um->udh);
+        if (um->cp == UT_CP_PERSISTENT_CLIENT)
+                m0_fi_disable("is_svc_volatile", "always_false");
+        m0_fi_disable("m0_dtm0_is_expecting_redo_from_client", "ut");
+        m0_fi_disable("is_manual_ss_enabled", "ut");
+        m0_fi_disable("m0_dtm0_in_ut", "ut");
+        for (i = 0; i < ARRAY_SIZE(um->recovered); ++i) {
+                if (!m0_be_op_is_done(um->recovered + i))
+                        m0_be_op_done(um->recovered + i);
+                m0_be_op_fini(um->recovered + i);
+        }
+
+        M0_SET_ARR0(g_service_fids);
+}
+
+static void ut_remach_reset_srv(struct ut_remach *um)
+{
+        int rc;
+        struct m0_dtm0_recovery_machine *m = ut_remach_get(um, UT_SIDE_SRV);
+
+        m0_dtm0_recovery_machine_stop(m);
+        m0_dtm0_recovery_machine_fini(m);
+        m0_dtm0_recovery_machine_init(m, ut_remach_ops_get(um),
+                                      ut_remach_svc_get(um, UT_SIDE_SRV));
+        rc = m0_dtm0_recovery_machine_start(m);
+        M0_UT_ASSERT(rc == 0);
+}
+
+static void ut_remach_log_gen_sync(struct ut_remach *um,
+                                   enum ut_sides side,
+                                   uint64_t ts_start,
+                                   uint64_t records_nr)
+{
+        struct m0_dtm0_tx_desc txd = {};
+        struct m0_buf payload = {};
+        int rc;
+        int i;
+
+        rc = m0_dtm0_tx_desc_init(&txd, 1);
+        M0_UT_ASSERT(rc == 0);
+        txd.dtd_ps.dtp_pa[0] = (struct m0_dtm0_tx_pa) {
+                .p_state = M0_DTPS_EXECUTED,
+                .p_fid = *ut_remach_fid_get(UT_SIDE_SRV),
+        };
+        txd.dtd_id = (struct m0_dtm0_tid) {
+                .dti_ts.dts_phys = 0,
+                .dti_fid = *ut_remach_fid_get(UT_SIDE_CLI),
+        };
+
+        for (i = 0; i < records_nr; ++i) {
+                txd.dtd_id.dti_ts.dts_phys = ts_start + i;
+                ut_remach_log_add_sync(um, side, &txd, &payload);
+        }
+
+        m0_dtm0_tx_desc_fini(&txd);
+}
+
+/*
+ * Ensures that DTM0 log A is a subset of DTM0 log B; and,
+ * optionally, that A has exactly "expected_records_nr" log records
+ * (if expected_records_nr < 0 then this check is omitted).
+ * Note: pairs (tid, payload) are used as comparison keys. The states
+ * of participants and the other fields are ignored.
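+ *
+ * For instance, after a successful replay the client log must be fully
+ * contained in the server log; remach_real_log_replay() below checks this
+ * with:
+ * @code
+ *         log_subset_verify(&um, records_nr, UT_SIDE_CLI, UT_SIDE_SRV);
+ * @endcode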
+ */
+static void log_subset_verify(struct ut_remach *um,
+                              int expected_records_nr,
+                              enum ut_sides a_side,
+                              enum ut_sides b_side)
+{
+        struct m0_be_dtm0_log *a_log =
+                ut_remach_get(um, a_side)->rm_svc->dos_log;
+        struct m0_be_dtm0_log *b_log =
+                ut_remach_get(um, b_side)->rm_svc->dos_log;
+        struct m0_be_dtm0_log_iter a_iter;
+        struct m0_dtm0_log_rec a_record;
+        struct m0_dtm0_log_rec *b_record;
+        struct m0_buf *a_buf;
+        struct m0_buf *b_buf;
+        struct m0_dtm0_tid *tid;
+        int rc;
+        uint64_t actual_records_nr = 0;
+
+        m0_mutex_lock(&a_log->dl_lock);
+        m0_mutex_lock(&b_log->dl_lock);
+
+        m0_be_dtm0_log_iter_init(&a_iter, a_log);
+
+        while (true) {
+                rc = m0_be_dtm0_log_iter_next(&a_iter, &a_record);
+                M0_UT_ASSERT(M0_IN(rc, (0, -ENOENT)));
+                if (rc == -ENOENT)
+                        break;
+                M0_UT_ASSERT(rc == 0);
+                tid = &a_record.dlr_txd.dtd_id;
+                b_record = m0_be_dtm0_log_find(b_log, tid);
+                M0_UT_ASSERT(b_record != NULL);
+                a_buf = &a_record.dlr_payload;
+                b_buf = &b_record->dlr_payload;
+                M0_UT_ASSERT(equi(m0_buf_is_set(a_buf), m0_buf_is_set(b_buf)));
+                M0_UT_ASSERT(ergo(m0_buf_is_set(a_buf),
+                                  m0_buf_eq(a_buf, b_buf)));
+                m0_dtm0_log_iter_rec_fini(&a_record);
+                actual_records_nr++;
+        }
+
+        M0_UT_ASSERT(ergo(expected_records_nr >= 0,
+                          expected_records_nr == actual_records_nr));
+
+        m0_mutex_unlock(&b_log->dl_lock);
+        m0_mutex_unlock(&a_log->dl_lock);
+}
+
+/* Case: Ensure the machine is initialised properly. */
+static void remach_init_fini(void)
+{
+        struct ut_remach um = { .cp = UT_CP_PERSISTENT_CLIENT };
+        ut_remach_init(&um);
+        ut_remach_fini(&um);
+}
+
+/* Case: Ensure the machine is able to start/stop. */
+static void remach_start_stop(void)
+{
+        struct ut_remach um = { .cp = UT_CP_PERSISTENT_CLIENT };
+        ut_remach_init(&um);
+        ut_remach_start(&um);
+        ut_remach_stop(&um);
+        ut_remach_fini(&um);
+}
+
+static void ut_remach_boot(struct ut_remach *um)
+{
+        const struct ha_thought starting[] = {
+                HA_THOUGHT(UT_SIDE_CLI, M0_NC_TRANSIENT),
+                HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT),
+
+                HA_THOUGHT(UT_SIDE_CLI, M0_NC_DTM_RECOVERING),
+                HA_THOUGHT(UT_SIDE_SRV, M0_NC_DTM_RECOVERING),
+        };
+        const struct ha_thought started[] = {
+                HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+                HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE),
+        };
+        int i;
+
+        ut_remach_init(um);
+        ut_remach_start(um);
+
+        for (i = 0; i < ARRAY_SIZE(starting); ++i)
+                ut_remach_ha_thinks(um, starting + i);
+
+        for (i = 0; i < ARRAY_SIZE(um->recovered); ++i)
+                m0_be_op_wait(um->recovered + i);
+
+        for (i = 0; i < ARRAY_SIZE(started); ++i)
+                ut_remach_ha_thinks(um, started + i);
+}
+
+static void ut_remach_shutdown(struct ut_remach *um)
+{
+        ut_remach_stop(um);
+        M0_UT_ASSERT(m0_be_op_is_done(&um->recovered[UT_SIDE_SRV]));
+        M0_UT_ASSERT(m0_be_op_is_done(&um->recovered[UT_SIDE_CLI]));
+        ut_remach_fini(um);
+}
+
+/* Use-case: graceful boot and shutdown of a 2-node cluster. */
+static void remach_boot_cluster(enum ut_client_persistence cp)
+{
+        struct ut_remach um = { .cp = cp };
+
+        ut_remach_boot(&um);
+        ut_remach_shutdown(&um);
+}
+
+static void remach_boot_cluster_ss(void)
+{
+        remach_boot_cluster(UT_CP_PERSISTENT_CLIENT);
+}
+
+static void remach_boot_cluster_cs(void)
+{
+        remach_boot_cluster(UT_CP_VOLATILE_CLIENT);
+}
+
+/* Use-case: reboot an ONLINE node.
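+ * Scenario: boot the cluster, reset the server-side machine, then replay
+ * the HA sequence TRANSIENT -> DTM_RECOVERING -> ONLINE and wait for the
+ * RECOVERED event.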
+ */
+static void remach_reboot_server(void)
+{
+        struct ut_remach um = { .cp = UT_CP_PERSISTENT_CLIENT };
+
+        ut_remach_boot(&um);
+
+        m0_be_op_reset(um.recovered + UT_SIDE_SRV);
+        m0_be_op_active(um.recovered + UT_SIDE_SRV);
+        ut_remach_reset_srv(&um);
+
+        ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT));
+        ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+                           UT_SIDE_SRV);
+        ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV,
+                                             M0_NC_DTM_RECOVERING));
+        m0_be_op_wait(um.recovered + UT_SIDE_SRV);
+        ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE));
+
+        ut_remach_shutdown(&um);
+}
+
+/* Use-case: reboot a node when it has just started to recover. */
+static void remach_reboot_twice(void)
+{
+        struct ut_remach um = { .cp = UT_CP_PERSISTENT_CLIENT };
+
+        ut_remach_boot(&um);
+
+        m0_be_op_reset(um.recovered + UT_SIDE_SRV);
+        m0_be_op_active(um.recovered + UT_SIDE_SRV);
+        ut_remach_reset_srv(&um);
+
+        /*
+         * Do not tell the client about the failure.
+         * No REDOs would be sent, so that we can see what happens
+         * in the case where the recovery machine has to be stopped
+         * in the middle of waiting for REDOs.
+         */
+        ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT),
+                           UT_SIDE_SRV);
+        ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+                           UT_SIDE_SRV);
+        ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_DTM_RECOVERING),
+                           UT_SIDE_SRV);
+        ut_remach_reset_srv(&um);
+        M0_UT_ASSERT(!m0_be_op_is_done(&um.recovered[UT_SIDE_SRV]));
+
+        ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT));
+        ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+                           UT_SIDE_SRV);
+        ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV,
+                                             M0_NC_DTM_RECOVERING));
+        m0_be_op_wait(um.recovered + UT_SIDE_SRV);
+        ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE));
+
+        ut_remach_shutdown(&um);
+}
+
+/* Use-case: replay an empty DTM0 log. */
+static void remach_boot_real_log(void)
+{
+        struct ut_remach um = {
+                .cp = UT_CP_PERSISTENT_CLIENT,
+                .use_real_log = true
+        };
+        ut_remach_boot(&um);
+        ut_remach_shutdown(&um);
+}
+
+/* Use-case: replay a non-empty client log to the server.
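+ * The client log is pre-populated with ut_remach_log_gen_sync(); once the
+ * rebooted server reports RECOVERED, log_subset_verify() checks that every
+ * generated record has reached the server log.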
*/ +static void remach_real_log_replay(void) +{ + struct ut_remach um = { + .cp = UT_CP_PERSISTENT_CLIENT, + .use_real_log = true + }; + /* cafe bell */ + const uint64_t since = 0xCAFEBELL; + const uint64_t records_nr = 10; + + ut_remach_boot(&um); + + ut_remach_log_gen_sync(&um, UT_SIDE_CLI, since, records_nr); + + m0_be_op_reset(um.recovered + UT_SIDE_SRV); + m0_be_op_active(um.recovered + UT_SIDE_SRV); + ut_remach_reset_srv(&um); + + ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT)); + ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE), + UT_SIDE_SRV); + ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, + M0_NC_DTM_RECOVERING)); + m0_be_op_wait(um.recovered + UT_SIDE_SRV); + log_subset_verify(&um, records_nr, UT_SIDE_CLI, UT_SIDE_SRV); + ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE)); + + ut_remach_shutdown(&um); } extern void m0_dtm0_ut_drlink_simple(void); extern void m0_dtm0_ut_domain_init_fini(void); struct m0_ut_suite dtm0_ut = { - .ts_name = "dtm0-ut", - .ts_tests = { - { "xcode", &cas_xcode_test }, - { "drlink-simple", &m0_dtm0_ut_drlink_simple }, - { "domain_init-fini", &m0_dtm0_ut_domain_init_fini }, + .ts_name = "dtm0-ut", + .ts_tests = { + { "xcode", cas_xcode_test }, + { "drlink-simple", &m0_dtm0_ut_drlink_simple }, + { "domain_init-fini", &m0_dtm0_ut_domain_init_fini }, + { "remach-init-fini", remach_init_fini }, + { "remach-start-stop", remach_start_stop }, + { "remach-boot-cluster-ss", remach_boot_cluster_ss }, + { "remach-boot-cluster-cs", remach_boot_cluster_cs }, + { "remach-reboot-server", remach_reboot_server }, + { "remach-reboot-twice", remach_reboot_twice }, + { "remach-boot-real-log", remach_boot_real_log }, + { "remach-real-log-replay", remach_real_log_replay }, { NULL, NULL }, } }; @@ -124,3 +836,6 @@ struct m0_ut_suite dtm0_ut = { * scroll-step: 1 * End: */ +/* + * vim: tabstop=8 shiftwidth=8 noexpandtab textwidth=80 nowrap + */ diff --git a/lib/bitmap.c b/lib/bitmap.c index 309160c73e5..fcd43532e96 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -200,10 +200,11 @@ M0_INTERNAL void m0_bitmap_onwire_fini(struct m0_bitmap_onwire *ow_map) M0_INTERNAL void m0_bitmap_store(const struct m0_bitmap *im_map, struct m0_bitmap_onwire *ow_map) { - size_t s = M0_BITMAP_WORDS(im_map->b_nr); + size_t s; M0_PRE(im_map != NULL && ow_map != NULL); M0_PRE(im_map->b_words != NULL); + s = M0_BITMAP_WORDS(im_map->b_nr); M0_PRE(s == ow_map->bo_size); memcpy(ow_map->bo_words, im_map->b_words, diff --git a/lib/user_space/timer.c b/lib/user_space/timer.c index de098357149..8c72f3115ef 100644 --- a/lib/user_space/timer.c +++ b/lib/user_space/timer.c @@ -52,6 +52,10 @@ static const m0_time_t zero_time = M0_MKTIME(0, 0); /** Clock source for M0_TIMER_HARD. @see timer_posix_set() */ static int clock_source_timer = -1; +/* Number of allocated HARD timers */ +static unsigned long long hard_timer_alloc = 0; +static unsigned long long hard_timer_free = 0; + /** Typed list of m0_timer_tid structures. */ @@ -193,9 +197,19 @@ static int timer_posix_init(struct m0_timer *timer) } rc = timer_create(clock_source_timer, &se, &ptimer); /* preserve timer->t_ptimer if timer_create() isn't succeeded */ - if (rc == 0) + if (rc == 0) { timer->t_ptimer = ptimer; - return M0_RC(rc); + hard_timer_alloc++; + } else { + rc = M0_ERR(-errno); + M0_LOG(M0_ERROR, "Failed to allocate HARD timer (%d), " + "alloc=%lld, free=%lld, inuse=%lld", + rc, + hard_timer_alloc, + hard_timer_free, + hard_timer_alloc - hard_timer_free); + } + return rc != 0 ? 
M0_ERR(rc) : 0; } /** @@ -210,6 +224,7 @@ static void timer_posix_fini(timer_t posix_timer) * timer_delete() can fail iff timer->t_ptimer isn't valid timer ID. */ M0_ASSERT_INFO(rc == 0, "rc = %d", rc); + hard_timer_free++; } static m0_time_t timer_time_to_realtime(m0_time_t expire) diff --git a/lib/ut/cookie.c b/lib/ut/cookie.c index 2c9ab7c0e33..45e1ebdb7e3 100644 --- a/lib/ut/cookie.c +++ b/lib/ut/cookie.c @@ -141,7 +141,7 @@ static void addr_sanity(const uint64_t *addr, bool sane, bool aligned) void test_cookie(void) { int insane_cnt = 0; - uint64_t automatic; + uint64_t automatic = 0; uint64_t *dynamic; uint64_t i; struct m0_cookie cookie_test; diff --git a/motr/idx_dix.c b/motr/idx_dix.c index bcf71de729a..7bee805a564 100644 --- a/motr/idx_dix.c +++ b/motr/idx_dix.c @@ -71,6 +71,14 @@ struct dix_req { struct m0_op_idx *idr_oi; struct m0_sm_ast idr_ast; struct m0_clink idr_clink; + + /** + * Clink that connects STABLE/FAILURE states of DTX with this dix_req. + * Note, the clink is always initialised but not always attached + * to a dtx. + */ + struct m0_clink idr_dtx_clink; + /** * Starting key for NEXT operation. * It's allocated internally and key value is copied from user. @@ -614,6 +622,7 @@ static int dix_mreq_create(struct m0_op_idx *oi, M0_ALLOC_PTR(req); if (req == NULL) return M0_ERR(-ENOMEM); + m0_clink_init(&req->idr_dtx_clink, dixreq_clink_dtx_cb); if (idx_is_distributed(oi)) { m0_dix_meta_req_init(&req->idr_mreq, op_dixc(oi), oi->oi_sm_grp); @@ -651,14 +660,13 @@ static int dix_req_create(struct m0_op_idx *oi, M0_ALLOC_PTR(req); if (req != NULL) { + m0_clink_init(&req->idr_dtx_clink, dixreq_clink_dtx_cb); if (idx_is_distributed(oi)) { m0_dix_req_init(&req->idr_dreq, op_dixc(oi), oi->oi_sm_grp); to_dix_map(&oi->oi_oc.oc_op, &req->idr_dreq); req->idr_dreq.dr_dtx = oi->oi_dtx; - m0_clink_init(&req->idr_clink, - oi->oi_dtx != NULL ? - dixreq_clink_dtx_cb : dixreq_clink_cb); + m0_clink_init(&req->idr_clink, dixreq_clink_cb); /* Store oi for dix callbacks to update SYNC records. */ if (M0_IN(oi->oi_oc.oc_op.op_code, @@ -690,6 +698,7 @@ static void dix_req_destroy(struct dix_req *req) } else { m0_cas_req_fini(&req->idr_creq); } + m0_clink_fini(&req->idr_dtx_clink); m0_free(req); M0_LEAVE(); } @@ -718,6 +727,7 @@ static void dixreq_completed_post(struct dix_req *req, int rc) struct m0_op_idx *oi = req->idr_oi; M0_ENTRY(); + M0_ASSERT(req->idr_ast.sa_cb != dixreq_completed_ast); oi->oi_ar.ar_rc = rc; req->idr_ast.sa_cb = dixreq_completed_ast; req->idr_ast.sa_datum = req; @@ -826,108 +836,42 @@ static bool dix_meta_req_clink_cb(struct m0_clink *cl) return false; } -static void dixreq_stable_ast(struct m0_sm_group *grp, struct m0_sm_ast *ast) -{ - struct dix_req *req = ast->sa_datum; - struct m0_op_idx *oi = req->idr_oi; - int rc = oi->oi_ar.ar_rc; - - M0_ENTRY(); - oi->oi_ar.ar_ast.sa_cb = (rc == 0) ? 
idx_op_ast_stable : NULL; - M0_ASSERT(grp == oi->oi_sm_grp); - dix_req_destroy(req); - idx_op_ast_stable(oi->oi_sm_grp, &oi->oi_ar.ar_ast); - M0_LEAVE(); -} - -static void dixreq_stable_post(struct dix_req *req, int rc) +static bool dix_req_is_completed(struct dix_req *dix_req) { - struct m0_op_idx *oi = req->idr_oi; + struct m0_dix_req *dreq = &dix_req->idr_dreq; + struct m0_op_idx *oi = dix_req->idr_oi; + struct m0_dtx *dtx = oi->oi_dtx; + bool is_executed = + M0_IN(dreq->dr_sm.sm_state, (DIXREQ_FAILURE, DIXREQ_FINAL)); + bool is_stable = dtx == NULL || + M0_IN(m0_dtx0_sm_state(dtx), (M0_DDS_STABLE, M0_DDS_FAILED)); - M0_ENTRY(); - oi->oi_ar.ar_rc = rc; - req->idr_ast.sa_cb = dixreq_stable_ast; - req->idr_ast.sa_datum = req; - M0_ASSERT_INFO(req->idr_ast.sa_next == NULL, - "Stable() ast cannot be armed before Executed() " - "is completed. Ensure EXECUTED_ALL -> STABLE transition" - "does not happen within the same ast tick"); - m0_sm_ast_post(oi->oi_sm_grp, &req->idr_ast); - M0_LEAVE(); -} - -static void dixreq_executed_post(struct dix_req *req, int rc) -{ - struct m0_op_idx *oi = req->idr_oi; - - M0_ENTRY(); - - M0_ASSERT_INFO(rc == 0, "TODO: Failures are not handled here."); - oi->oi_ar.ar_rc = rc; - - /* XXX: DTX cannot be canceled (as per the current design), - * so that once we got a reply, we prohibit any kind of cancelation. - * The originator should m0_panic itself in case if something needs - * to be canceled. It will be re-started and continue its execution. - */ - oi->oi_in_completion = true; - oi->oi_ar.ar_ast.sa_cb = idx_op_ast_executed; - M0_ASSERT(req->idr_dreq.dr_dtx->tx_dtx->dd_sm.sm_grp == oi->oi_sm_grp); - M0_ASSERT(m0_sm_group_is_locked(oi->oi_sm_grp)); - idx_op_ast_executed(oi->oi_sm_grp, &oi->oi_ar.ar_ast); - M0_LEAVE(); + M0_ENTRY("is_executed=%d, is_stable=%d", !!is_executed, !!is_stable); + return M0_RC(is_executed && is_stable); } static bool dixreq_clink_dtx_cb(struct m0_clink *cl) { - struct dix_req *dix_req = M0_AMB(dix_req, cl, idr_clink); + struct dix_req *dix_req = M0_AMB(dix_req, cl, idr_dtx_clink); struct m0_op_idx *oi = dix_req->idr_oi; - struct m0_sm *req_sm = M0_AMB(req_sm, cl->cl_chan, sm_chan); struct m0_dix_req *dreq = &dix_req->idr_dreq; - struct m0_dtx *dtx = oi->oi_dtx; - enum m0_dtm0_dtx_state state; - int i; - - M0_PRE(M0_IN(oi->oi_oc.oc_op.op_code, (M0_IC_PUT, M0_IC_DEL))); - M0_PRE(dtx != NULL); - - state = m0_dtx0_sm_state(dtx); - if (!M0_IN(state, (M0_DDS_EXECUTED_ALL, M0_DDS_STABLE, M0_DDS_FAILED))) + /* + * Filter out the situations where dtx exists but m0_op_idx + * is not yet attached to dix_req. + */ + if (oi == NULL) return false; - switch (state) { - case M0_DDS_EXECUTED_ALL: - /* TODO: we have a single kv pair; probably, it does not have - * to be a loop. - */ - for (i = 0; i < m0_dix_req_nr(dreq); i++) { - oi->oi_rcs[i] = dreq->dr_items[i].dxi_rc; - } - /* XXX: We cannot use m0_dix_generic_rc here because the - * precondition fails in this case. At this point error - * handling is not covered here, and probably the error - * code needs to be first propogated from DIX to DTX and - * then it needs to be passed here as dtx.dd_sm.sm_rc. 
@@ -1119,9 +1068,10 @@ static void dix_dreq_prepare(struct dix_req *req,
 			     struct m0_op_idx *oi)
 {
 	dix_build(oi, dix);
-	m0_clink_add(oi->oi_dtx != NULL ?
-		     &oi->oi_dtx->tx_dtx->dd_sm.sm_chan :
-		     &req->idr_dreq.dr_sm.sm_chan, &req->idr_clink);
+	m0_clink_add(&req->idr_dreq.dr_sm.sm_chan, &req->idr_clink);
+	if (oi->oi_dtx != NULL)
+		m0_clink_add(&oi->oi_dtx->tx_dtx->dd_sm.sm_chan,
+			     &req->idr_dtx_clink);
 }

 static void dix_put_ast(struct m0_sm_group *grp, struct m0_sm_ast *ast)
@@ -1271,8 +1221,13 @@ M0_INTERNAL int m0__idx_cancel(struct m0_op_idx *oi)
 }

 static void dix_set_idx_flags(struct m0_op_idx *oi)
-{
-	if (ENABLE_DTM0)
+{
+	/* XXX:
+	 * For some reason, "if(ENABLE_DTM0)" was added here, but
+	 * it wasn't properly tested. Disabling it until the code
+	 * is covered with tests.
+	 */
+	if (0)
 		oi->oi_flags |= M0_OIF_SKIP_LAYOUT;

 	if (!(oi->oi_flags & M0_OIF_SKIP_LAYOUT))
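
A note on the motr/idx_dix.c rework above: instead of attaching a single clink to either the DIX request SM or the DTX SM, the request now holds two always-initialised clinks, and completion is posted by whichever callback observes dix_req_is_completed() become true last. The following is a minimal, self-contained sketch of that completion rule in plain C; the names (two_source_req, on_event) are hypothetical and not the Motr API:

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Sketch of the predicate in dix_req_is_completed(): the operation
     * completes only when the DIX request reached a terminal state
     * (DIXREQ_FINAL/DIXREQ_FAILURE) AND the optional dtx reached
     * M0_DDS_STABLE/M0_DDS_FAILED.
     */
    struct two_source_req {
            bool has_dtx;      /* a dtx is attached (PUT/DEL with DTM0) */
            bool dix_terminal; /* DIX request SM reached a final state  */
            bool dtx_settled;  /* dtx became STABLE or FAILED           */
    };

    static bool req_is_completed(const struct two_source_req *req)
    {
            bool is_executed = req->dix_terminal;
            bool is_stable   = !req->has_dtx || req->dtx_settled;

            return is_executed && is_stable;
    }

    /* Both callbacks re-check the predicate; the last terminal event
     * is the one that completes the operation. */
    static void on_event(struct two_source_req *req, const char *src)
    {
            if (req_is_completed(req))
                    printf("completed after '%s' event\n", src);
    }

    int main(void)
    {
            struct two_source_req req = { .has_dtx = true };

            req.dix_terminal = true;
            on_event(&req, "dix");  /* no-op: dtx is not settled yet */
            req.dtx_settled = true;
            on_event(&req, "dtx");  /* prints the completion message */
            return 0;
    }
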
" + "file=%s, rc=%d", expected_file, rc); + sleep(1); + } + + m0_console_printf("File detected: '%s'\n", expected_file); + (void) remove(expected_file); + + return 0; +} + static void log_hex_val(const char *tag, void *buf, int size) { int i; @@ -274,6 +295,9 @@ static int cmd_exec(struct index_cmd *cmd) case GENV: rc = genv(cmd->ic_filename, cmd->ic_cnt, cmd->ic_len); break; + case WLF: + rc = wait_file(cmd->ic_filename); + break; default: rc = M0_ERR(-EINVAL); M0_ASSERT(0); diff --git a/motr/m0kv/index.h b/motr/m0kv/index.h index e24d067eb22..f75b66ccc86 100644 --- a/motr/m0kv/index.h +++ b/motr/m0kv/index.h @@ -43,7 +43,8 @@ enum { GET, /* Get record. */ NXT, /* Next record. */ GENF, /* Generate FID-file. */ - GENV /* Generate VAL-file. */ + GENV, /* Generate VAL-file. */ + WLF, /* Wait for a file to appear. */ }; enum { diff --git a/motr/m0kv/index_parser.c b/motr/m0kv/index_parser.c index 8865e614f2a..cc975fad001 100644 --- a/motr/m0kv/index_parser.c +++ b/motr/m0kv/index_parser.c @@ -61,6 +61,7 @@ static const struct command_descr commands[] = { { GENF, "genf", "genf CNT FILE, generate file with several FID" }, { GENV, "genv", "genv CNT SIZE FILE, generate file with several " "KEY_PARAM/VAL_PARAM. Note: SIZE > 16" }, + { WLF, "wait", "wait FILE, await a file to appear" }, }; static int command_id(const char *name) @@ -362,6 +363,11 @@ static int command_assign(struct index_cmd *cmd, int *argc, char ***argv) ++*params; *argc -= 3; break; + case WLF: + cmd->ic_filename = **params; + ++*params; + --*argc; + break; default: M0_IMPOSSIBLE("Wrong command"); } @@ -409,6 +415,9 @@ static bool command_is_valid(struct index_cmd *cmd) cmd->ic_cnt != 0 && cmd->ic_len != 0; break; + case WLF: + rc = cmd->ic_filename != NULL; + break; default: M0_IMPOSSIBLE("Wrong command."); } diff --git a/motr/magic.h b/motr/magic.h index ecc23779ed2..8b4eebc85d2 100644 --- a/motr/magic.h +++ b/motr/magic.h @@ -430,6 +430,10 @@ enum m0_magic_satchel { M0_DTM_CAT_MAGIX = 0x33acce551b1e4277, /* m0_dtm::d_excited::t_magic (flooded baboo) */ M0_DTM_EXC_MAGIX = 0x33f100dedbab0077, + /* recovery_fom::rf_magic (solidified 66) */ + M0_DTM0_RMACH_MAGIC = 0x335011D1F1ED6677, + /* rfom_tl::td_head_magic (seeded cobble) */ + M0_DTM0_RMACH_HEAD_MAGIC = 0x335EEDEDC0BB1E77, /* Failure Domains */ /* m0_fd_perm_cache::fpc_magic (fascia doodia) */ diff --git a/motr/ut/idx_dix.c b/motr/ut/idx_dix.c index 11d89873dac..252bef10863 100644 --- a/motr/ut/idx_dix.c +++ b/motr/ut/idx_dix.c @@ -1118,7 +1118,7 @@ static void dtm0_ut_cas_op_prepare(const struct m0_fid *cfid, } } -static void dtm0_ut_send_redo(const struct m0_fid *ifid, +static void dtm0_ut_send_redo(const struct m0_fid *ifid, uint32_t sdev_id, uint64_t *key, uint64_t *val) { int rc; @@ -1142,8 +1142,6 @@ static void dtm0_ut_send_redo(const struct m0_fid *ifid, * Ivan Alekhin. */ struct m0_fom zero_fom_to_be_deleted = {}; - /* Extreme hack to convert index fid to component catalogue fid. 
diff --git a/motr/magic.h b/motr/magic.h
index ecc23779ed2..8b4eebc85d2 100644
--- a/motr/magic.h
+++ b/motr/magic.h
@@ -430,6 +430,10 @@ enum m0_magic_satchel {
 	M0_DTM_CAT_MAGIX = 0x33acce551b1e4277,
 	/* m0_dtm::d_excited::t_magic (flooded baboo) */
 	M0_DTM_EXC_MAGIX = 0x33f100dedbab0077,
+	/* recovery_fom::rf_magic (solidified 66) */
+	M0_DTM0_RMACH_MAGIC = 0x335011D1F1ED6677,
+	/* rfom_tl::td_head_magic (seeded cobble) */
+	M0_DTM0_RMACH_HEAD_MAGIC = 0x335EEDEDC0BB1E77,

 	/* Failure Domains */
 	/* m0_fd_perm_cache::fpc_magic (fascia doodia) */
diff --git a/motr/ut/idx_dix.c b/motr/ut/idx_dix.c
index 11d89873dac..252bef10863 100644
--- a/motr/ut/idx_dix.c
+++ b/motr/ut/idx_dix.c
@@ -1118,7 +1118,7 @@ static void dtm0_ut_cas_op_prepare(const struct m0_fid *cfid,
 	}
 }

-static void dtm0_ut_send_redo(const struct m0_fid *ifid,
+static void dtm0_ut_send_redo(const struct m0_fid *ifid, uint32_t sdev_id,
 			      uint64_t *key, uint64_t *val)
 {
 	int rc;
@@ -1142,8 +1142,6 @@ static void dtm0_ut_send_redo(const struct m0_fid *ifid,
 	 * Ivan Alekhin.
 	 */
 	struct m0_fom zero_fom_to_be_deleted = {};
-	/* Extreme hack to convert index fid to component catalogue fid. */
-	uint32_t sdev_idx = 10;

 	m0_dtm0_clk_src_init(&dcs, M0_DTM0_CS_PHYS);
 	m0_dtm0_clk_src_now(&dcs, &now);
@@ -1162,7 +1160,7 @@ static void dtm0_ut_send_redo(const struct m0_fid *ifid,
 		.dti_fid = cli_dtm0_fid
 	};

-	m0_dix_fid_convert_dix2cctg(ifid, &cctg_fid, sdev_idx);
+	m0_dix_fid_convert_dix2cctg(ifid, &cctg_fid, sdev_id);
 	dtm0_ut_cas_op_prepare(&cctg_fid, &cas_op, &cas_rec, key, val,
 			       &txr);
@@ -1216,7 +1214,59 @@ static void dtm0_ut_read_and_check(uint64_t key, uint64_t val)
 	m0_free0(&rcs);
 }

-static void st_dtm0_r(void)
+static uint32_t dtm0_ut_cas_sdev_id_get(void)
+{
+	struct m0_fid           srv_cas_fid;
+	struct m0_fid           srv_proc_fid;
+	struct m0_confc        *confc = m0_reqh2confc(&ut_m0c->m0c_reqh);
+	struct m0_conf_obj     *obj;
+	struct m0_conf_service *service;
+	struct m0_conf_obj     *sdevs_dir;
+	struct m0_conf_sdev    *sdev;
+	bool                    found = false;
+	uint32_t                sdev_id = 0;
+	int                     rc;
+
+	rc = m0_fid_sscanf(ut_m0_config.mc_process_fid, &srv_proc_fid);
+	M0_UT_ASSERT(rc == 0);
+	rc = m0_conf_process2service_get(confc, &srv_proc_fid,
+					 M0_CST_CAS, &srv_cas_fid);
+	M0_UT_ASSERT(rc == 0);
+
+	obj = m0_conf_cache_lookup(&confc->cc_cache, &srv_cas_fid);
+	M0_ASSERT(obj != NULL);
+
+	service = M0_CONF_CAST(obj, m0_conf_service);
+	M0_ASSERT(service != NULL);
+
+	sdevs_dir = &service->cs_sdevs->cd_obj;
+	rc = m0_confc_open_sync(&sdevs_dir, sdevs_dir, M0_FID0);
+	M0_ASSERT(rc == 0);
+
+	obj = NULL;
+	while ((rc = m0_confc_readdir_sync(sdevs_dir, &obj)) > 0) {
+		sdev = M0_CONF_CAST(obj, m0_conf_sdev);
+		if (!found) {
+			sdev_id = sdev->sd_dev_idx;
+			found = true;
+		} else {
+			/*
+			 * A single device attached to the CAS service is
+			 * the standard configuration for now; several
+			 * attached devices are not supported in the UT.
+			 */
+			M0_IMPOSSIBLE("Do not support several CAS devices.");
+		}
+	}
+
+	m0_confc_close(sdevs_dir);

+	M0_ASSERT(found);
+
+	return sdev_id;
+}
+
+static void st_dtm0_r_common(uint32_t sdev_id)
 {
 	m0_time_t rem;
 	uint64_t  key = 111;
@@ -1227,7 +1277,7 @@ static void st_dtm0_r(void)
 	idx_setup();
 	exec_one_by_one(1, M0_IC_PUT);

-	dtm0_ut_send_redo(&duc.duc_ifid, &key, &val);
+	dtm0_ut_send_redo(&duc.duc_ifid, sdev_id, &key, &val);

 	/* XXX dirty hack, but now we don't have completion notification */
 	rem = 2ULL * M0_TIME_ONE_SECOND;
@@ -1238,6 +1288,27 @@
 	idx_teardown();
 }

+static void st_dtm0_r(void)
+{
+	/* CAS sdev id from the configuration. */
+	uint32_t sdev_id = dtm0_ut_cas_sdev_id_get();
+	st_dtm0_r_common(sdev_id);
+}
+
+static void st_dtm0_r_wrong_sdev(void)
+{
+	/*
+	 * A random CAS sdev id; it should be fixed up by the REDO handler.
+	 * In a real system, participants send REDO messages with their
+	 * own CAS device IDs during recovery, so the REDO handler of the
+	 * process being recovered needs to get the CAS device ID attached
+	 * to the local CAS service and set it in the operation's CAS ID.
+	 */
+	uint32_t sdev_id = dtm0_ut_cas_sdev_id_get() + 12345;
+	st_dtm0_r_common(sdev_id);
+}
+
 struct m0_ut_suite ut_suite_mt_idx_dix = {
 	.ts_name = "idx-dix-mt",
 	.ts_owners = "Anatoliy",
@@ -1252,6 +1323,7 @@ struct m0_ut_suite ut_suite_mt_idx_dix = {
 		{ "dtm0_e_then_s",     st_dtm0_e_then_s,     "Ivan"   },
 		{ "dtm0_c",            st_dtm0_c,            "Ivan"   },
 		{ "dtm0_r",            st_dtm0_r,            "Sergey" },
+		{ "dtm0_r_wrong_sdev", st_dtm0_r_wrong_sdev, "Sergey" },
 		{ NULL, NULL }
 	}
 };
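
A note on st_dtm0_r_wrong_sdev above: the test deliberately embeds a bogus device index in the component-catalogue fid and relies on the REDO handler to re-derive the local one. Conceptually, the fix-up is the round-trip sketched below. This is not the actual handler code: local_sdev_id is a hypothetical parameter (the UT obtains the real value via dtm0_ut_cas_sdev_id_get()), and the two converters are assumed to be the ones declared in dix/fid_convert.h:

    /* Sketch: replace the sender's device index in a cctg fid with the
     * device index of the local CAS service. */
    static void redo_fixup_cctg_fid(struct m0_fid *cctg_fid,
                                    uint32_t local_sdev_id)
    {
            struct m0_fid ifid;

            /* Strip the sender's device index, recovering the index fid. */
            m0_dix_fid_convert_cctg2dix(cctg_fid, &ifid);
            /* Re-embed the local device index. */
            m0_dix_fid_convert_dix2cctg(&ifid, cctg_fid, local_sdev_id);
    }
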
+ */ + uint32_t sdev_id = dtm0_ut_cas_sdev_id_get() + 12345; + st_dtm0_r_common(sdev_id); +} + struct m0_ut_suite ut_suite_mt_idx_dix = { .ts_name = "idx-dix-mt", .ts_owners = "Anatoliy", @@ -1252,6 +1323,7 @@ struct m0_ut_suite ut_suite_mt_idx_dix = { { "dtm0_e_then_s", st_dtm0_e_then_s, "Ivan" }, { "dtm0_c", st_dtm0_c, "Ivan" }, { "dtm0_r", st_dtm0_r, "Sergey" }, + { "dtm0_r_wrong_sdev", st_dtm0_r_wrong_sdev, "Sergey" }, { NULL, NULL } } }; diff --git a/rpc/conn.c b/rpc/conn.c index 57b2ddefd56..27aff8dbd79 100644 --- a/rpc/conn.c +++ b/rpc/conn.c @@ -1349,10 +1349,18 @@ static bool rpc_conn__on_service_event_cb(struct m0_clink *clink) M0_PRE(m0_rpc_conn2svc(conn) == obj); M0_LOG(M0_DEBUG, "obj->co_ha_state = %d", obj->co_ha_state); /* + * At this moment, M0_NC_TRANSIENT means that some volatile state + * associated with that process was lost (its memory, packets, + * connections -- anything). + * In this case, the best thing we can do is to cancel + * all the sessions and let the user to re-connect. + * M0_NC_FAILED means the same thing plus some persistent state + * was lost. + * Previous outdated comment: * Ignore M0_NC_TRANSIENT state to keep items re-sending until service * gets M0_NC_ONLINE or becomes M0_NC_FAILED finally. */ - if (obj->co_ha_state == M0_NC_FAILED) + if (M0_IN(obj->co_ha_state, (M0_NC_FAILED, M0_NC_TRANSIENT))) m0_rpc_conn_sessions_cancel(conn); /** * @todo See if to __conn_ha_unsubscribe() right now, but not wait until diff --git a/rpc/rpc_opcodes.h b/rpc/rpc_opcodes.h index 0310fa2123d..cb0ba05b60f 100644 --- a/rpc/rpc_opcodes.h +++ b/rpc/rpc_opcodes.h @@ -358,6 +358,7 @@ enum M0_RPC_OPCODES { M0_ISCSERVICE_EXEC_OPCODE = 1072, M0_DTM0_RLINK_OPCODE = 1073, M0_FDMI_SOURCE_DOCK_TIMER_OPCODE = 1074, + M0_DTM0_RECOVERY_FOM_OPCODE = 1075, M0_OPCODES_NR = 2048 } M0_XCA_ENUM; diff --git a/scripts/gdb/gdb-extensions b/scripts/gdb/gdb-extensions index 40790d4d9e6..c40ef48b9ad 100644 --- a/scripts/gdb/gdb-extensions +++ b/scripts/gdb/gdb-extensions @@ -113,3 +113,37 @@ define runever end end end + +define coro-bt + set $c = ((struct m0_co_context *) ($arg0)) + set $i = 0 + while ($i < $c->mc_frame) + list *$c->mc_stack[$i] + set $i = $i + 1 + end +end + +document coro-bt + Prints the backtrace of a coroutine context. +end + +define rfom-print + set $s = ((struct recovery_fom *) ($arg0)) + printf "p=%p, v=%d, l=%d, ", $s, (int) $s->rf_is_volatile, (int) $s->rf_is_local + printf "eol=%d, ", (int) $s->rf_last_known_eol + printf "ha=%d, end=%d, ", (int) $s->rf_last_known_ha_state, (int) $s->rf_eolq.bq_the_end + printf "coro=%p \n", ($s->rf_coro.mc_stack[$s->rf_coro.mc_frame - 1]) + coro-bt &$s->rf_coro +end + +define drm-print + set $r = (struct m0_dtm0_recovery_machine *) ($arg0) + m0-list-print &($arg0)->rm_rfoms struct recovery_fom rf_linkage rfom-print +end + +document drm-print + Prints parts of DTM0 recovery machine state in human-readable format. + The only argument is a pointer to a recovery machine. + The function prints some properties of recovery FOMs and the + backtraces of their coroutines. 
diff --git a/scripts/install/opt/seagate/cortx/motr/bin/motr_mini_prov.py b/scripts/install/opt/seagate/cortx/motr/bin/motr_mini_prov.py
index 3db5129eba0..696c416b12a 100644
--- a/scripts/install/opt/seagate/cortx/motr/bin/motr_mini_prov.py
+++ b/scripts/install/opt/seagate/cortx/motr/bin/motr_mini_prov.py
@@ -59,6 +59,7 @@ CMD_RETRY_COUNT = 5
 MEM_THRESHOLD = 4*1024*1024*1024
 CVG_COUNT_KEY = "num_cvg"
+MOTR_M0D_MIN_RPC_RECVQ_LEN = 64


 class MotrError(Exception):
     """ Generic Exception with error code and output """
@@ -494,6 +495,7 @@ def update_copy_motr_config_file(self):
                   ("MOTR_M0D_DATA_DIR", f"{MOTR_M0D_DATA_DIR}"),
                   ("MOTR_M0D_CONF_XC", f"{MOTR_M0D_CONF_XC}"),
                   ("MOTR_M0D_ADDB_STOB_DIR", f"{MOTR_M0D_ADDB_STOB_DIR}"),
+                  ("MOTR_M0D_MIN_RPC_RECVQ_LEN", f"{MOTR_M0D_MIN_RPC_RECVQ_LEN}"),
                   ("MOTR_M0D_TRACE_DIR", f"{MOTR_M0D_TRACE_DIR}")]
     update_config_file(self, f"{MOTR_SYS_CFG}", config_kvs)

     # Copy config file to new path
diff --git a/utils/m0hagen b/utils/m0hagen
index 08210b4466e..966e370add9 100755
--- a/utils/m0hagen
+++ b/utils/m0hagen
@@ -295,7 +295,7 @@ yaml.add_implicit_resolver(Fid.yaml_tag, Fid.re_conf,

 # enum m0_ha_obj_state
 ha_states = ['unknown', 'online', 'failed', 'transient', 'repair', 'repaired',
-             'rebalance']
+             'rebalance', 'dtm-recovering']


 class HAMsgNVec(XCodeable):