diff --git a/addb2/dump.c b/addb2/dump.c
index c9bd0717e2a..b306a74e55d 100644
--- a/addb2/dump.c
+++ b/addb2/dump.c
@@ -861,6 +861,17 @@ static void dtx0_state_counter(struct m0_addb2__context *ctx, char *buf)
sm_trans(&m0_dtx_sm_conf, "dtx0", ctx, buf);
}
+extern struct m0_sm_conf m0_drm_sm_conf;
+static void drm_state(struct m0_addb2__context *ctx, const uint64_t *v,
+ char *buf)
+{
+ sm_state(&m0_drm_sm_conf, ctx, v, buf);
+}
+
+static void drm_state_counter(struct m0_addb2__context *ctx, char *buf)
+{
+ sm_trans(&m0_drm_sm_conf, "drm", ctx, buf);
+}
extern struct m0_sm_conf op_states_conf;
static void beop_state_counter(struct m0_addb2__context *ctx, char *buf)
@@ -1149,6 +1160,11 @@ struct m0_addb2__id_intrp ids[] = {
.ii_repeat = M0_AVI_DTX0_SM_COUNTER_END - M0_AVI_DTX0_SM_COUNTER,
.ii_spec = &dtx0_state_counter },
+ { M0_AVI_DRM_SM_STATE, "drm-state", { &drm_state, SKIP2 } },
+ { M0_AVI_DRM_SM_COUNTER, "",
+ .ii_repeat = M0_AVI_DRM_SM_COUNTER_END - M0_AVI_DRM_SM_COUNTER,
+ .ii_spec = &drm_state_counter },
+
{ M0_AVI_BE_TX_STATE, "tx-state", { &tx_state, SKIP2 } },
{ M0_AVI_BE_TX_COUNTER, "",
.ii_repeat = M0_AVI_BE_TX_COUNTER_END - M0_AVI_BE_TX_COUNTER,
diff --git a/be/dtm0_log.c b/be/dtm0_log.c
index 813222f1dfe..fae419ab20b 100644
--- a/be/dtm0_log.c
+++ b/be/dtm0_log.c
@@ -116,15 +116,12 @@ M0_INTERNAL void m0_be_dtm0_log_fini(struct m0_be_dtm0_log *log)
log->dl_cs = NULL;
}
-M0_INTERNAL void m0_be_dtm0_log_free(struct m0_be_dtm0_log **in_log)
+M0_INTERNAL void m0_be_dtm0_log_free(struct m0_be_dtm0_log *log)
{
- struct m0_be_dtm0_log *log = *in_log;
-
M0_PRE(!log->dl_is_persistent);
m0_free(log->u.dl_inmem);
m0_free(log);
- *in_log = NULL;
}
/**
@@ -394,8 +391,8 @@ static int dtm0_log__insert(struct m0_be_dtm0_log *log,
struct m0_buf *payload)
{
int rc;
- struct m0_dtm0_log_rec *rec;
- struct m0_be_seg *seg = log->dl_seg;
+ struct m0_dtm0_log_rec *rec;
+ struct m0_be_seg *seg = log->dl_seg;
if (log->dl_is_persistent) {
rc = plog_rec_init(&rec, tx, seg, txd, payload);
@@ -454,7 +451,7 @@ M0_INTERNAL int m0_be_dtm0_log_update(struct m0_be_dtm0_log *log,
struct m0_dtm0_tx_desc *txd,
struct m0_buf *payload)
{
- struct m0_dtm0_log_rec *rec;
+ struct m0_dtm0_log_rec *rec;
M0_PRE(payload != NULL);
M0_PRE(m0_be_dtm0_log__invariant(log));
@@ -653,8 +650,8 @@ static bool be_dtm0_log_iter_is_first(const struct m0_be_dtm0_log_iter *iter)
static bool be_dtm0_log_iter_invariant(const struct m0_be_dtm0_log_iter *iter)
{
return _0C(m0_be_dtm0_log__invariant(iter->dli_log)) &&
- _0C(be_dtm0_log_iter_is_first(iter) ||
- m0_dtm0_tid__invariant(&iter->dli_current_tid));
+ _0C(ergo(!be_dtm0_log_iter_is_first(iter),
+ m0_dtm0_tid__invariant(&iter->dli_current_tid)));
}
M0_INTERNAL void m0_be_dtm0_log_iter_init(struct m0_be_dtm0_log_iter *iter,
@@ -671,10 +668,10 @@ M0_INTERNAL void m0_be_dtm0_log_iter_fini(struct m0_be_dtm0_log_iter *iter)
}
M0_INTERNAL int m0_be_dtm0_log_iter_next(struct m0_be_dtm0_log_iter *iter,
- struct m0_dtm0_log_rec *out)
+ struct m0_dtm0_log_rec *out)
{
struct m0_dtm0_log_rec *rec;
- int rc;
+ int rc;
M0_PRE(m0_mutex_is_locked(&iter->dli_log->dl_lock));
M0_PRE(be_dtm0_log_iter_invariant(iter));
@@ -699,7 +696,7 @@ M0_INTERNAL int m0_be_dtm0_log_iter_next(struct m0_be_dtm0_log_iter *iter,
return M0_ERR(rc);
}
- return M0_RC(rec == NULL ? 0 : +1);
+ return M0_RC(rec != NULL ? 0 : -ENOENT);
}
M0_INTERNAL int m0_dtm0_log_rec_copy(struct m0_dtm0_log_rec *dst,
diff --git a/be/dtm0_log.h b/be/dtm0_log.h
index 4e5751bdf5f..70763aebd6f 100644
--- a/be/dtm0_log.h
+++ b/be/dtm0_log.h
@@ -179,7 +179,7 @@ struct m0_dtm0_log_rec {
};
/**
- * @b m0_be_dtm0_log_rec structure
+ * @b m0_be_dtm0_log structure
*
* A DTM0 log is represented by m0_be_dtm0_log structure. The important
* fields in this structure are:
@@ -277,13 +277,21 @@ M0_INTERNAL void m0_be_dtm0_log_fini(struct m0_be_dtm0_log *log);
* Free the memory allocated by m0_be_dtm0_log_alloc
*
* @pre m0_be_dtm0_log->dl_is_persistent needs to be false.
- * @post *log is set to NULL.
+ * @post None
*
* @param log Pointer to a log structure that has been previously allocated.
*
* @return None
*/
-M0_INTERNAL void m0_be_dtm0_log_free(struct m0_be_dtm0_log **log);
+M0_INTERNAL void m0_be_dtm0_log_free(struct m0_be_dtm0_log *log);
+
+/** Frees memory and zeroes the pointer. */
+#define m0_be_dtm0_log_free0(pptr) \
+ do { \
+ typeof(pptr) __pptr = (pptr); \
+ m0_be_dtm0_log_free(*__pptr); \
+ *__pptr = NULL; \
+ } while (0)
/**
* For performing an operation on a persistent log, we need to take the
@@ -553,11 +561,11 @@ M0_INTERNAL void m0_be_dtm0_log_iter_fini(struct m0_be_dtm0_log_iter *iter);
* @param out returned record which is copied and needs to be freed with
* m0_dtm0_log_iter_rec_fini()
*
- * @return +1 when the iterator was successfully moved to the next record.
- * @return 0 when the end of the log has been reached.
+ * @return 0 when the iterator was successfully moved to the next record.
+ * @return -ENOENT when the end of the log has been reached.
*/
M0_INTERNAL int m0_be_dtm0_log_iter_next(struct m0_be_dtm0_log_iter *iter,
- struct m0_dtm0_log_rec *out);
+ struct m0_dtm0_log_rec *out);
/** @} */ /* end DTM0Internals */
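
Note: with the new return convention (0 on success, -ENOENT at the end of the
log), a typical scan of the log becomes a plain while-loop. A minimal sketch
built only from the declarations above, holding dl_lock as
m0_be_dtm0_log_iter_next() requires:

    struct m0_be_dtm0_log_iter iter;
    struct m0_dtm0_log_rec     out;
    int                        rc;

    m0_mutex_lock(&log->dl_lock);
    m0_be_dtm0_log_iter_init(&iter, log);
    while ((rc = m0_be_dtm0_log_iter_next(&iter, &out)) == 0) {
            /* ... consume the copied record ... */
            m0_dtm0_log_iter_rec_fini(&out);
    }
    M0_ASSERT(rc == -ENOENT);
    m0_be_dtm0_log_iter_fini(&iter);
    m0_mutex_unlock(&log->dl_lock);

The m0_be_dtm0_log_free0() wrapper introduced above then releases the log and
resets the caller's pointer in one step, as the updated unit tests do.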
diff --git a/be/ut/dtm0_log_ut.c b/be/ut/dtm0_log_ut.c
index 88f9ec77127..281a8654e12 100644
--- a/be/ut/dtm0_log_ut.c
+++ b/be/ut/dtm0_log_ut.c
@@ -281,7 +281,7 @@ void test_volatile_dtm0_log(void)
m0_mutex_unlock(&log->dl_lock);
m0_be_dtm0_log_fini(log);
- m0_be_dtm0_log_free(&log);
+ m0_be_dtm0_log_free0(&log);
m0_dtm0_clk_src_fini(&cs);
}
@@ -625,7 +625,7 @@ static void m0_be_ut_dtm0_log_init_fini(void)
m0_be_dtm0_log_iter_fini(&iter);
m0_be_dtm0_log_fini(log);
- m0_be_dtm0_log_free(&log);
+ m0_be_dtm0_log_free0(&log);
m0_dtm0_clk_src_fini(&cs);
}
@@ -635,10 +635,10 @@ static void m0_be_ut_dtm0_log_next(void)
struct m0_buf buf = {};
struct m0_be_dtm0_log_iter iter;
- struct m0_dtm0_log_rec out;
+ struct m0_dtm0_log_rec out;
struct m0_dtm0_clk_src cs;
struct m0_be_dtm0_log *log;
- int rc;
+ int rc;
m0_dtm0_clk_src_init(&cs, M0_DTM0_CS_PHYS);
@@ -660,11 +660,12 @@ static void m0_be_ut_dtm0_log_next(void)
m0_be_dtm0_log_iter_init(&iter, log);
rc = m0_be_dtm0_log_iter_next(&iter, &out);
- M0_UT_ASSERT(rc == 1);
+ M0_UT_ASSERT(rc == 0);
+ M0_UT_ASSERT(ut_dl_verify_log_rec(&out, 42));
m0_dtm0_log_iter_rec_fini(&out);
rc = m0_be_dtm0_log_iter_next(&iter, &out);
- M0_UT_ASSERT(rc == 0);
+ M0_UT_ASSERT(rc != 0);
m0_be_dtm0_log_iter_fini(&iter);
/* make log finalisation happy */
@@ -672,7 +673,7 @@ static void m0_be_ut_dtm0_log_next(void)
M0_UT_ASSERT(rc == 0);
m0_mutex_unlock(&log->dl_lock);
m0_be_dtm0_log_fini(log);
- m0_be_dtm0_log_free(&log);
+ m0_be_dtm0_log_free0(&log);
m0_dtm0_clk_src_fini(&cs);
}
diff --git a/cas/cas.h b/cas/cas.h
index fb09975e798..7e9e92fb4b9 100644
--- a/cas/cas.h
+++ b/cas/cas.h
@@ -471,6 +471,9 @@ M0_INTERNAL int m0_cas_fom_spawn(
struct m0_fop *cas_fop,
void (*on_fom_complete)(struct m0_fom_thralldom *,
struct m0_fom *));
+M0_INTERNAL uint32_t m0_cas_svc_device_id_get(
+ const struct m0_reqh_service_type *stype,
+ const struct m0_reqh *reqh);
#else
#define m0_cas_svc_init()
#define m0_cas_svc_fini()
diff --git a/cas/service.c b/cas/service.c
index 515bff9675f..2458b824855 100644
--- a/cas/service.c
+++ b/cas/service.c
@@ -308,9 +308,21 @@ enum {
STATS_NR
};
+enum {
+ INVALID_CAS_SDEV_ID = UINT32_MAX
+};
+
struct cas_service {
struct m0_reqh_service c_service;
struct m0_be_domain *c_be_domain;
+
+ /**
+	 * The storage device (sdev) used by this CAS service instance.
+	 * There should be exactly one in the standard configuration, but
+	 * current unit tests attach more than one storage device to
+	 * emulate several CAS services; in that case a special invalid
+	 * value is used.
+ */
+ uint32_t c_sdev_id;
};
struct cas_kv {
@@ -556,6 +568,60 @@ m0_cas__ut_svc_be_get(struct m0_reqh_service *svc)
return service->c_be_domain;
}
+static int cas_service_sdev_id_set(struct cas_service *cas_svc)
+{
+ struct m0_reqh *reqh;
+ struct m0_conf_cache *cache;
+ struct m0_conf_obj *obj;
+ struct m0_fid *cas_svc_fid;
+ struct m0_conf_service *service;
+ struct m0_conf_obj *sdevs_dir;
+ struct m0_conf_sdev *sdev = NULL;
+ bool found = false;
+ int rc = 0;
+
+ M0_ASSERT(cas_svc->c_sdev_id == INVALID_CAS_SDEV_ID);
+
+ if (cas_in_ut())
+ return M0_RC(0);
+
+ reqh = cas_svc->c_service.rs_reqh;
+ cache = &m0_reqh2confc(reqh)->cc_cache;
+
+ cas_svc_fid = &cas_svc->c_service.rs_service_fid;
+ obj = m0_conf_cache_lookup(cache, cas_svc_fid);
+ M0_ASSERT(obj != NULL);
+
+ service = M0_CONF_CAST(obj, m0_conf_service);
+ M0_ASSERT(service != NULL);
+
+ sdevs_dir = &service->cs_sdevs->cd_obj;
+ rc = m0_confc_open_sync(&sdevs_dir, sdevs_dir, M0_FID0);
+ if (rc != 0)
+ return M0_RC(rc);
+
+ obj = NULL;
+ while ((rc = m0_confc_readdir_sync(sdevs_dir, &obj)) > 0) {
+ sdev = M0_CONF_CAST(obj, m0_conf_sdev);
+ if (!found) {
+ cas_svc->c_sdev_id = sdev->sd_dev_idx;
+ found = true;
+ } else {
+ /*
+			 * Several devices attached to a single CAS service
+			 * are possible only when running in UT. In this case
+			 * the special invalid value is used as the attached
+			 * storage device id. Also, we cannot break out of
+			 * the loop before the whole directory has been
+			 * iterated, since doing so leads to a crash during
+			 * request handler finalisation.
+ */
+ cas_svc->c_sdev_id = INVALID_CAS_SDEV_ID;
+ }
+ }
+
+ m0_confc_close(sdevs_dir);
+ return M0_RC(rc);
+}
static int cas_service_start(struct m0_reqh_service *svc)
{
@@ -568,6 +634,7 @@ static int cas_service_start(struct m0_reqh_service *svc)
/* XXX It's a workaround. It's needed until we have a better way. */
service->c_be_domain = ut_dom != NULL ?
ut_dom : svc->rs_reqh_ctx->rc_beseg->bs_domain;
+ service->c_sdev_id = INVALID_CAS_SDEV_ID;
rc = m0_ctg_store_init(service->c_be_domain);
if (rc == 0) {
/*
@@ -576,6 +643,7 @@ static int cas_service_start(struct m0_reqh_service *svc)
* If no pending index drop, it finishes soon.
*/
m0_cas_gc_start(svc);
+ rc = cas_service_sdev_id_set(service);
}
return rc;
}
@@ -1832,6 +1900,7 @@ static int cas_device_check(const struct cas_fom *fom,
pm = &pver->pv_mach;
rc = cas_sdev_state(pm, device_id, &state);
if (rc == 0 && !M0_IN(state, (M0_PNDS_ONLINE,
+ M0_PNDS_OFFLINE,
M0_PNDS_SNS_REBALANCING)))
rc = M0_ERR(-EBADFD);
} else
@@ -2610,6 +2679,26 @@ M0_INTERNAL int m0_cas_fom_spawn(
return M0_RC(rc);
}
+M0_INTERNAL uint32_t m0_cas_svc_device_id_get(
+ const struct m0_reqh_service_type *stype,
+ const struct m0_reqh *reqh)
+{
+ struct m0_reqh_service *svc;
+ struct cas_service *cas_svc;
+
+ M0_PRE(stype != NULL);
+ M0_PRE(reqh != NULL);
+
+ svc = m0_reqh_service_find(stype, reqh);
+ M0_ASSERT(svc != NULL);
+ M0_ASSERT(m0_reqh_service_state_get(svc) == M0_RST_STARTED);
+
+ cas_svc = M0_AMB(cas_svc, svc, c_service);
+
+ M0_ASSERT(cas_svc->c_sdev_id != INVALID_CAS_SDEV_ID);
+ return cas_svc->c_sdev_id;
+}
+
static int cas_ctg_crow_handle(struct cas_fom *fom, const struct m0_cas_id *cid)
{
struct m0_fop *fop;
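
A note on the M0_AMB() call in m0_cas_svc_device_id_get() above: it resolves
the ambient cas_service from its embedded c_service member, i.e. it is
effectively a typed container_of(). A rough equivalent, shown only for
illustration:

    struct cas_service *cas_svc =
            (struct cas_service *)((char *)svc -
                                   offsetof(struct cas_service, c_service));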
diff --git a/dix/fid_convert.c b/dix/fid_convert.c
index 5a221d89de0..c326158c554 100644
--- a/dix/fid_convert.c
+++ b/dix/fid_convert.c
@@ -35,6 +35,15 @@
* @{
*/
+/* Set device id in a DIX fid. */
+M0_INTERNAL void m0_dix_fid__device_id_set(struct m0_fid *fid,
+ uint32_t dev_id)
+{
+ M0_PRE(fid != NULL && (dev_id <= M0_DIX_FID_DEVICE_ID_MAX));
+ fid->f_container = (fid->f_container & ~M0_DIX_FID_DEVICE_ID_MASK) |
+ (((uint64_t)dev_id) << M0_DIX_FID_DEVICE_ID_OFFSET);
+}
+
/* extract bits [32, 56) from fid->f_container */
M0_INTERNAL uint32_t m0_dix_fid__device_id_extract(const struct m0_fid *fid)
{
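
The setter clears bits [32, 56) of f_container (the same bits the extractor
reads), so a set followed by an extract round-trips. A small self-check
sketch (the helper function itself is hypothetical):

    static void device_id_roundtrip_check(struct m0_fid *fid)
    {
            m0_dix_fid__device_id_set(fid, 42);
            M0_ASSERT(m0_dix_fid__device_id_extract(fid) == 42);
    }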
diff --git a/dix/fid_convert.h b/dix/fid_convert.h
index 548636e0e6a..3508201e450 100644
--- a/dix/fid_convert.h
+++ b/dix/fid_convert.h
@@ -78,6 +78,8 @@ M0_INTERNAL bool m0_dix_fid_validate_cctg(const struct m0_fid *cctg_fid);
M0_INTERNAL uint32_t m0_dix_fid__device_id_extract(const struct m0_fid *fid);
+M0_INTERNAL void m0_dix_fid__device_id_set(struct m0_fid *fid,
+ uint32_t dev_id);
/** @} end of dix group */
#endif /* __MOTR_DIX_FID_CONVERT_H__ */
diff --git a/dix/req.c b/dix/req.c
index 8f797311ab9..f5cb4b113fd 100644
--- a/dix/req.c
+++ b/dix/req.c
@@ -1666,33 +1666,6 @@ static void dix_rop_completed(struct m0_sm_group *grp, struct m0_sm_ast *ast)
}
}
-static void dix_rop_one_completed(struct m0_dix_cas_rop *crop)
-{
- struct m0_dix_req *dreq = crop->crp_parent;
- struct m0_dix_rop_ctx *rop;
-
- M0_ENTRY();
- M0_PRE(!dreq->dr_is_meta);
- M0_PRE(M0_IN(dreq->dr_type, (DIX_PUT, DIX_DEL)));
- M0_PRE(dreq->dr_dtx != NULL);
- M0_PRE(dix_req_smgrp(dreq) == dreq->dr_dtx->tx_dtx->dd_sm.sm_grp);
-
- rop = crop->crp_parent->dr_rop;
- dix_cas_rop_rc_update(crop, 0);
-
- m0_dtx0_executed(dreq->dr_dtx, crop->crp_pa_idx);
-
- if (rop->dg_completed_nr == rop->dg_cas_reqs_nr) {
- rop->dg_ast = (struct m0_sm_ast) {
- .sa_cb = dix_rop_completed,
- .sa_datum = dreq,
- };
- m0_sm_ast_post(dix_req_smgrp(dreq), &rop->dg_ast);
- }
-
- M0_LEAVE();
-}
-
static bool dix_cas_rop_clink_cb(struct m0_clink *cl)
{
struct m0_dix_cas_rop *crop = container_of(cl, struct m0_dix_cas_rop,
@@ -1714,28 +1687,20 @@ static bool dix_cas_rop_clink_cb(struct m0_clink *cl)
dreq, crop->crp_creq.ccr_sess,
&crop->crp_creq.ccr_remid);
-
m0_clink_del(cl);
m0_clink_fini(cl);
rop = crop->crp_parent->dr_rop;
rop->dg_completed_nr++;
M0_PRE(rop->dg_completed_nr <= rop->dg_cas_reqs_nr);
- if (dreq->dr_dtx != NULL) {
- M0_ASSERT(dix_req_smgrp(dreq) ==
- dreq->dr_dtx->tx_dtx->dd_sm.sm_grp);
- dix_rop_one_completed(crop);
- } else {
- if (rop->dg_completed_nr == rop->dg_cas_reqs_nr) {
- rop->dg_ast = (struct m0_sm_ast) {
- .sa_cb = dix_rop_completed,
- .sa_datum = dreq,
- };
- m0_sm_ast_post(dix_req_smgrp(dreq),
- &rop->dg_ast);
- }
+ if (rop->dg_completed_nr == rop->dg_cas_reqs_nr) {
+ rop->dg_ast = (struct m0_sm_ast) {
+ .sa_cb = dix_rop_completed,
+ .sa_datum = dreq,
+ };
+ m0_sm_ast_post(dix_req_smgrp(dreq),
+ &rop->dg_ast);
}
-
}
return true;
}
@@ -1747,7 +1712,6 @@ static int dix_cas_rops_send(struct m0_dix_req *req)
struct m0_dix_cas_rop *cas_rop;
struct m0_cas_req *creq;
uint32_t sdev_idx;
- uint32_t pa_idx;
struct m0_cas_id cctg_id;
struct m0_reqh_service_ctx *cas_svc;
struct m0_dix_layout *layout = &req->dr_indices[0].dd_layout;
@@ -1812,17 +1776,6 @@ static int dix_cas_rops_send(struct m0_dix_req *req)
}
if (rc != 0) {
- /*
- * Treat failed and not sent CAS requests as executed
- * to unblock the EXECUTED-ALL logic. It allows to move
- * transaction to the stable state once the persistent
- * message received (EXECUTED state required for all
- * participants). So EXECUTED participant state is
- * reused in case of failure.
- */
- if (req->dr_dtx != NULL)
- m0_dtx0_executed(req->dr_dtx,
- cas_rop->crp_pa_idx);
m0_clink_del(&cas_rop->crp_clink);
m0_clink_fini(&cas_rop->crp_clink);
m0_cas_req_fini(&cas_rop->crp_creq);
@@ -1849,16 +1802,6 @@ static int dix_cas_rops_send(struct m0_dix_req *req)
rc = m0_dtx0_close(req->dr_dtx);
if (rc != 0)
return M0_ERR(rc);
- /*
- * It is safe to set EXECUTED dtx state for those
- * participants that experience transient failure,
- * it allows to trigger EXECUTED-ALL logic. See
- * the similar comment above for details.
- */
- for (pa_idx = cas_rop_tlist_length(&rop->dg_cas_reqs);
- pa_idx < req->dr_dtx->tx_dtx->dd_txd.dtd_ps.dtp_nr;
- pa_idx++)
- m0_dtx0_executed(req->dr_dtx, pa_idx);
}
return M0_RC(0);
diff --git a/doc/dld/dld-index.c b/doc/dld/dld-index.c
index a1a53a2d44d..a5f11c764e8 100644
--- a/doc/dld/dld-index.c
+++ b/doc/dld/dld-index.c
@@ -53,6 +53,7 @@ file.c -->
- @subpage spiel-dld "SPIEL API DLD"
- @subpage cas-dld "The catalogue service (CAS)"
- @subpage fis-dld "Fault Injection at run time"
+- @subpage dtm0br-dld "DTM0 basic recovery SM"
Detailed designs should use the @subpage DLD "Motr DLD Template"
as a style guide.
diff --git a/dtm0/Makefile.sub b/dtm0/Makefile.sub
index 965dbbd2dee..3d9a3ffee64 100644
--- a/dtm0/Makefile.sub
+++ b/dtm0/Makefile.sub
@@ -15,6 +15,7 @@ nobase_motr_include_HEADERS += \
dtm0/remach.h \
dtm0/service.h \
dtm0/svc_internal.h \
+ dtm0/recovery.h \
dtm0/tx_desc.h
@@ -33,6 +34,7 @@ motr_libmotr_la_SOURCES += \
dtm0/pruner.c \
dtm0/remach.c \
dtm0/service.c \
+ dtm0/recovery.c \
dtm0/tx_desc.c
diff --git a/dtm0/addb2.h b/dtm0/addb2.h
index 79f1918295d..7203c0ff812 100644
--- a/dtm0/addb2.h
+++ b/dtm0/addb2.h
@@ -34,9 +34,15 @@
#include "addb2/identifier.h"
enum m0_avi_dtm0_labels {
- M0_AVI_DTX0_SM_STATE = M0_AVI_DTM0_RANGE_START,
+ /* dtx0 */
+ M0_AVI_DTX0_SM_STATE = M0_AVI_DTM0_RANGE_START + 1,
M0_AVI_DTX0_SM_COUNTER,
M0_AVI_DTX0_SM_COUNTER_END = M0_AVI_DTX0_SM_COUNTER + 0x100,
+
+ /* Recovery Machine */
+ M0_AVI_DRM_SM_STATE = M0_AVI_DTX0_SM_COUNTER_END + 1,
+ M0_AVI_DRM_SM_COUNTER,
+ M0_AVI_DRM_SM_COUNTER_END = M0_AVI_DRM_SM_COUNTER + 0x100,
};
/** @} end of dtm0 group */
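
Each state machine reserves 0x100 identifiers for its per-transition
counters, and the matching dump-table entries (addb2/dump.c above) use
.ii_repeat = *_SM_COUNTER_END - *_SM_COUNTER to cover the whole range with a
single interpreter. If desired, the intended layout can be pinned down with
build-time asserts, e.g. (a sketch):

    M0_BASSERT(M0_AVI_DRM_SM_STATE > M0_AVI_DTX0_SM_COUNTER_END);
    M0_BASSERT(M0_AVI_DRM_SM_COUNTER_END - M0_AVI_DRM_SM_COUNTER == 0x100);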
diff --git a/dtm0/drlink.c b/dtm0/drlink.c
index 9e5dc46d443..e29eeb75d1b 100644
--- a/dtm0/drlink.c
+++ b/dtm0/drlink.c
@@ -23,6 +23,7 @@
#include "lib/trace.h"
#include "addb2/identifier.h" /* M0_AVI_FOM_TO_TX */
#include "dtm0/fop.h" /* dtm0_req_fop */
+#include "dtm0/fop_xc.h" /* dtm0_req_fop_xc */
#include "dtm0/service.h" /* m0_dtm0_service */
#include "dtm0/svc_internal.h" /* dtm0_process */
#include "lib/coroutine.h" /* m0_co API */
@@ -31,6 +32,18 @@
#include "rpc/rpc.h" /* m0_rpc_item_post */
#include "rpc/rpc_machine.h" /* m0_rpc_machine */
#include "rpc/rpc_opcodes.h" /* M0_DTM0_{RLINK,REQ}_OPCODE */
+#include "lib/string.h" /* m0_streq */
+
+enum {
+ /*
+	 * TODO: the DTM model assumes infinite timeouts, but that is too
+	 * scary at the moment: we cannot yet rely on an infinite timeout
+	 * working without issues in the connect/disconnect case. These
+	 * timeouts will be adjusted/reworked as the DTM is stabilised.
+ */
+ DRLINK_CONNECT_TIMEOUT_SEC = 1,
+ DRLINK_DISCONN_TIMEOUT_SEC = DRLINK_CONNECT_TIMEOUT_SEC,
+};
struct drlink_fom {
struct m0_fom df_gen;
@@ -71,7 +84,9 @@ static const struct m0_fom_ops drlink_fom_ops = {
};
static struct m0_fom_type drlink_fom_type;
-static const struct m0_fom_type_ops drlink_fom_type_ops = {};
+static const struct m0_fom_type_ops drlink_fom_type_ops = {
+ .fto_create = NULL
+};
const static struct m0_sm_conf drlink_fom_conf;
M0_INTERNAL int m0_dtm0_rpc_link_mod_init(void)
@@ -88,33 +103,26 @@ M0_INTERNAL void m0_dtm0_rpc_link_mod_fini(void)
{
}
-/* Create a deep copy of the given request. */
+/** Create a deep copy of the given request. */
static struct dtm0_req_fop *dtm0_req_fop_dup(const struct dtm0_req_fop *src)
{
- int rc;
- struct dtm0_req_fop *dst;
-
- M0_ALLOC_PTR(dst);
- if (dst == NULL)
- return NULL;
-
- rc = m0_dtm0_tx_desc_copy(&src->dtr_txr, &dst->dtr_txr);
- if (rc != 0) {
- M0_ASSERT(rc == -ENOMEM);
- m0_free(dst);
- return NULL;
- }
-
- rc = m0_buf_copy(&dst->dtr_payload, &src->dtr_payload);
- if (rc != 0) {
- m0_dtm0_tx_desc_fini(&dst->dtr_txr);
- m0_free(dst);
- return NULL;
- }
-
- dst->dtr_msg = src->dtr_msg;
-
- return dst;
+ int rc = 0;
+ struct m0_xcode_obj src_obj;
+ struct m0_xcode_obj dest_obj;
+ struct m0_xcode_ctx sctx;
+ struct m0_xcode_ctx dctx;
+ struct dtm0_req_fop *dest = NULL;
+
+	/* There is no const version of xcode objects, so we have to cast it. */
+ src_obj = M0_XCODE_OBJ(dtm0_req_fop_xc, (struct dtm0_req_fop *)src);
+ dest_obj = M0_XCODE_OBJ(dtm0_req_fop_xc, NULL);
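+	/*
+	 * The destination object pointer is NULL: m0_xcode_dup() allocates
+	 * the copy through dctx.xcx_alloc (set to m0_xcode_alloc below).
+	 */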
+ m0_xcode_ctx_init(&sctx, &src_obj);
+ m0_xcode_ctx_init(&dctx, &dest_obj);
+ dctx.xcx_alloc = m0_xcode_alloc;
+ rc = m0_xcode_dup(&dctx, &sctx);
+ if (rc == 0)
+ dest = dctx.xcx_it.xcu_stack[0].s_obj.xo_ptr;
+ return dest;
}
static void dtm0_req_fop_fini(struct dtm0_req_fop *req)
@@ -189,7 +197,8 @@ static int drlink_fom_init(struct drlink_fom *fom,
static void drlink_fom_fini(struct m0_fom *fom)
{
struct drlink_fom *df = fom2drlink_fom(fom);
- m0_fop_put_lock(df->df_rfop);
+ if (df->df_rfop != NULL)
+ m0_fop_put_lock(df->df_rfop);
m0_co_op_fini(&df->df_co_op);
m0_fom_fini(fom);
m0_free(fom);
@@ -207,23 +216,6 @@ static void co_long_write_lock(struct m0_co_context *context,
M0_CO_YIELD_RC(context, outcome);
}
-static void co_rpc_link_connect(struct m0_co_context *context,
- struct m0_rpc_link *rlink,
- struct m0_fom *fom,
- int next_phase)
-{
- M0_CO_REENTER(context);
-
- m0_chan_lock(&rlink->rlk_wait);
- m0_fom_wait_on(fom, &rlink->rlk_wait, &fom->fo_cb);
- m0_chan_unlock(&rlink->rlk_wait);
-
- m0_rpc_link_connect_async(rlink, M0_TIME_NEVER, NULL);
- m0_fom_phase_set(fom, next_phase);
-
- M0_CO_YIELD_RC(context, M0_FSO_WAIT);
-}
-
static int find_or_add(struct m0_dtm0_service *dtms,
const struct m0_fid *tgt,
struct dtm0_process **out)
@@ -254,6 +246,7 @@ enum drlink_fom_state {
DRF_INIT = M0_FOM_PHASE_INIT,
DRF_DONE = M0_FOM_PHASE_FINISH,
DRF_LOCKING = M0_FOM_PHASE_NR,
+ DRF_DISCONNECTING,
DRF_CONNECTING,
DRF_SENDING,
DRF_WAITING_FOR_REPLY,
@@ -286,10 +279,15 @@ static struct m0_sm_state_descr drlink_fom_states[] = {
.sd_name = #name, \
.sd_allowed = allowed \
}
- _ST(DRF_LOCKING, M0_BITS(DRF_CONNECTING,
+ _ST(DRF_LOCKING, M0_BITS(DRF_DISCONNECTING,
+ DRF_CONNECTING,
DRF_SENDING,
DRF_FAILED)),
+ _ST(DRF_DISCONNECTING, M0_BITS(DRF_CONNECTING,
+ DRF_FAILED)),
_ST(DRF_CONNECTING, M0_BITS(DRF_SENDING,
+ DRF_CONNECTING,
+ DRF_DISCONNECTING,
DRF_FAILED)),
_ST(DRF_SENDING, M0_BITS(DRF_DONE,
DRF_WAITING_FOR_REPLY,
@@ -345,6 +343,7 @@ static int dtm0_process_rlink_reinit(struct dtm0_process *proc,
const int max_in_flight = DTM0_MAX_RPCS_IN_FLIGHT;
if (!M0_IS0(&proc->dop_rlink)) {
+ m0_rpc_conn_sessions_cancel(&proc->dop_rlink.rlk_conn);
m0_rpc_link_fini(&proc->dop_rlink);
M0_SET0(&proc->dop_rlink);
}
@@ -360,6 +359,9 @@ static int dtm0_process_rlink_send(struct dtm0_process *proc,
struct m0_rpc_session *session = &proc->dop_rlink.rlk_sess;
struct m0_rpc_item *item = &fop->f_item;
+ M0_ENTRY("remote: proc=" FID_F ", ep=%s",
+ FID_P(&proc->dop_rproc_fid), proc->dop_rep);
+
item->ri_ops = &dtm0_req_fop_rlink_rpc_item_ops;
item->ri_session = session;
item->ri_prio = M0_RPC_ITEM_PRIO_MID;
@@ -368,43 +370,7 @@ static int dtm0_process_rlink_send(struct dtm0_process *proc,
if (drf->df_wait_for_ack)
m0_co_op_active(&drf->df_co_op);
- return m0_rpc_post(item);
-}
-
-/** An aggregated status of a dtm0_process:dop_rlink */
-enum dpr_state {
- /** Link is not alive but we can resurrect it. */
- DPR_TRANSIENT,
- /** Link is alive and ready to transfer items. */
- DPR_ONLINE,
- /** Link is permanently dead. */
- DPR_FAILED,
-};
-
-static enum dpr_state dpr_state_infer(struct dtm0_process *proc)
-{
- /*
- * TODO:
- * Observe the states of the following enitities:
- * RPC connection
- * RPC session
- * Conf obj
- * and then decide whether it is alive, dead or permanently dead.
- *
- * @verbatim
- * if (conf_obj is ONLINE) {
- * if (conn is ACTIVE && session is in (IDLE, BUSY))
- * return ONLINE;
- * else
- * return TRANSIENT;
- * } else
- * return FAILED;
- * @endverbatim
- */
- if (m0_rpc_link_is_connected(&proc->dop_rlink))
- return DPR_ONLINE;
-
- return DPR_TRANSIENT;
+ return M0_RC(m0_rpc_post(item));
}
/*
@@ -430,12 +396,96 @@ static void drlink_addb_drf2parent_relate(struct drlink_fom *drf)
m0_sm_id_get(&drf->df_gen.fo_sm_phase));
}
+
#define F M0_CO_FRAME_DATA
-static void drlink_coro_fom_tick(struct m0_co_context *context)
+
+/**
+ * Connect-disconnect context for an rpc link.
+ * The context tightens together clink-based notification from rpc link
+ * The context ties together the clink-based notification from the rpc
+ * link channel and a co_op.
+struct rlink_cd_ctx {
+ struct m0_clink dc_clink;
+ struct m0_co_op dc_op;
+};
+
+static bool rlink_cd_cb(struct m0_clink *clink)
+{
+ struct rlink_cd_ctx *dc = M0_AMB(dc, clink, dc_clink);
+ m0_co_op_done(&dc->dc_op);
+ return false;
+}
+
+static void co_rlink_do(struct m0_co_context *context,
+ struct dtm0_process *proc,
+ enum drlink_fom_state what)
{
- int rc = 0;
struct drlink_fom *drf = M0_AMB(drf, context, df_co);
struct m0_fom *fom = &drf->df_gen;
+ int timeout_sec = 0;
+ void (*action)(struct m0_rpc_link *, m0_time_t,
+ struct m0_clink *) = NULL;
+
+ M0_CO_REENTER(context, struct rlink_cd_ctx dc;);
+
+ M0_SET0(&F(dc));
+ m0_clink_init(&F(dc).dc_clink, rlink_cd_cb);
+ F(dc).dc_clink.cl_is_oneshot = true;
+ m0_co_op_init(&F(dc).dc_op);
+ m0_co_op_active(&F(dc).dc_op);
+
+ switch (what) {
+ case DRF_CONNECTING:
+ timeout_sec = DRLINK_CONNECT_TIMEOUT_SEC;
+ action = m0_rpc_link_connect_async;
+ break;
+ case DRF_DISCONNECTING:
+ m0_rpc_conn_sessions_cancel(&proc->dop_rlink.rlk_conn);
+ timeout_sec = DRLINK_DISCONN_TIMEOUT_SEC;
+ action = m0_rpc_link_disconnect_async;
+ break;
+ default:
+ M0_IMPOSSIBLE("%d is something that cannot be done", what);
+ break;
+ }
+
+ action(&proc->dop_rlink, m0_time_from_now(timeout_sec, 0),
+ &F(dc).dc_clink);
+
+ M0_CO_YIELD_RC(context, m0_co_op_tick_ret(&F(dc).dc_op, fom, what));
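+	/* The clink is one-shot; wait for it to fire before finalising it. */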
+ m0_chan_wait(&F(dc).dc_clink);
+
+ m0_co_op_fini(&F(dc).dc_op);
+ m0_clink_fini(&F(dc).dc_clink);
+}
+
+static bool has_volatile_param(struct m0_conf_obj *obj)
+{
+ struct m0_conf_service *svc;
+ const char **param;
+
+ svc = M0_CONF_CAST(obj, m0_conf_service);
+ M0_ASSERT(svc->cs_params != NULL);
+
+ for (param = svc->cs_params; *param != NULL; ++param) {
+ if (m0_streq(*param, "origin:in-volatile"))
+ return true;
+ else if (m0_streq(*param, "origin:in-persistent"))
+ return false;
+ }
+
+ M0_IMPOSSIBLE("Service origin is not defined in the config?");
+}
+
+static void drlink_coro_fom_tick(struct m0_co_context *context)
+{
+ int rc = 0;
+ struct drlink_fom *drf = M0_AMB(drf, context, df_co);
+ struct m0_fom *fom = &drf->df_gen;
+ const char *reason = "Unknown";
+ struct m0_conf_obj *obj = NULL;
+ struct m0_confc *confc = m0_reqh2confc(m0_fom2reqh(fom));
+ bool always_reconnect = false;
M0_CO_REENTER(context,
struct m0_long_lock_link llink;
@@ -455,8 +505,10 @@ static void drlink_coro_fom_tick(struct m0_co_context *context)
*/
m0_mutex_unlock(&drf->df_svc->dos_generic.rs_mutex);
- if (rc != 0)
+ if (rc != 0) {
+ reason = "Cannot find-or-add remote process";
goto out;
+ }
m0_long_lock_link_init(&F(llink), fom, &F(llock_addb2));
@@ -466,27 +518,73 @@ static void drlink_coro_fom_tick(struct m0_co_context *context)
DRF_LOCKING));
M0_ASSERT(m0_long_is_write_locked(&F(proc)->dop_llock, fom));
- if (dpr_state_infer(F(proc)) == DPR_TRANSIENT) {
+ m0_conf_cache_lock(&confc->cc_cache);
+ obj = m0_conf_cache_lookup(&confc->cc_cache, &F(proc)->dop_rserv_fid);
+ M0_ASSERT(obj != NULL);
+ if (has_volatile_param(obj) && obj->co_ha_state != M0_NC_ONLINE) {
+ M0_LOG(M0_DEBUG, "Force state transition %s -> %s for " FID_F,
+ m0_ha_state2str(obj->co_ha_state),
+		       m0_ha_state2str(M0_NC_ONLINE),
+ FID_P(&F(proc)->dop_rserv_fid));
+ obj->co_ha_state = M0_NC_ONLINE;
+ m0_chan_broadcast(&obj->co_ha_chan);
+ always_reconnect = true;
+ }
+ m0_conf_cache_unlock(&confc->cc_cache);
+
+ /* Reconnect if the session was canceled */
+ if (m0_rpc_link_is_connected(&F(proc)->dop_rlink) &&
+ F(proc)->dop_rlink.rlk_sess.s_cancelled) {
+ always_reconnect = true;
+ }
+
+ /* XXX:
+	 * At this moment we cannot detect client failures.
+ * Because of that, we cannot detect the case where
+ * the client drops RPC items because it cannot
+ * find the corresponding connection.
+ * As a workaround, we force drlink to re-connect
+ * whenever it tries to send a message.
+ */
+
+ if (always_reconnect) {
+ if (m0_rpc_link_is_connected(&F(proc)->dop_rlink)) {
+ M0_CO_FUN(context, co_rlink_do(context, F(proc),
+ DRF_DISCONNECTING));
+ }
+
rc = dtm0_process_rlink_reinit(F(proc), drf);
- if (rc != 0)
+ if (rc != 0) {
+ reason = "Cannot reinit RPC link.";
goto unlock;
+ }
/*
* TODO handle network failure after link is connected, but
* before the message is successfully sent
*/
- M0_CO_FUN(context, co_rpc_link_connect(context,
- &F(proc)->dop_rlink,
- fom, DRF_CONNECTING));
+ M0_CO_FUN(context, co_rlink_do(context, F(proc),
+ DRF_CONNECTING));
+ } else {
+ if (!m0_rpc_link_is_connected(&F(proc)->dop_rlink)) {
+ rc = dtm0_process_rlink_reinit(F(proc), drf);
+ if (rc != 0) {
+ reason = "Cannot reinit RPC link.";
+ goto unlock;
+ }
+ M0_CO_FUN(context, co_rlink_do(context, F(proc),
+ DRF_CONNECTING));
+ }
}
- if (dpr_state_infer(F(proc)) == DPR_FAILED)
- goto unlock;
+ M0_ASSERT(ergo(m0_rpc_link_is_connected(&F(proc)->dop_rlink),
+ !F(proc)->dop_rlink.rlk_sess.s_cancelled));
- M0_ASSERT(dpr_state_infer(F(proc)) == DPR_ONLINE);
m0_fom_phase_set(fom, DRF_SENDING);
rc = dtm0_process_rlink_send(F(proc), drf);
- if (rc != 0)
+ if (rc != 0) {
+ reason = "Failed to post a message to RPC layer.";
goto unlock;
+ }
/* Safety: FOP (and item) can be released only in ::drlink_fom_fini. */
drlink_addb_drf2item_relate(drf);
@@ -497,15 +595,27 @@ static void drlink_coro_fom_tick(struct m0_co_context *context)
DRF_WAITING_FOR_REPLY));
m0_co_op_reset(&drf->df_co_op);
rc = m0_rpc_item_error(&drf->df_rfop->f_item);
+ if (rc != 0) {
+		reason = "RPC item error";
+ goto unlock;
+ }
}
unlock:
+ if (drf->df_rfop != NULL) {
+ m0_fop_put_lock(drf->df_rfop);
+ drf->df_rfop = NULL;
+ }
+
m0_long_write_unlock(&F(proc)->dop_llock, &F(llink));
m0_long_lock_link_fini(&F(llink));
out:
/* TODO handle the error */
- if (rc != 0)
+ if (rc != 0) {
+		M0_LOG(M0_ERROR, "Failed to send a message to " FID_F ", rc=%d, "
+ "reason=%s", FID_P(&drf->df_tgt), rc, reason);
m0_fom_phase_move(fom, rc, DRF_FAILED);
+ }
if (drf->df_op != NULL)
m0_be_op_done(drf->df_op);
diff --git a/dtm0/dtx.c b/dtm0/dtx.c
index b9825b769cd..f81009e0bde 100644
--- a/dtm0/dtx.c
+++ b/dtm0/dtx.c
@@ -46,15 +46,7 @@ static struct m0_sm_state_descr dtx_states[] = {
},
[M0_DDS_INPROGRESS] = {
.sd_name = "inprogress",
- .sd_allowed = M0_BITS(M0_DDS_EXECUTED, M0_DDS_FAILED),
- },
- [M0_DDS_EXECUTED] = {
- .sd_name = "executed",
- .sd_allowed = M0_BITS(M0_DDS_EXECUTED_ALL),
- },
- [M0_DDS_EXECUTED_ALL] = {
- .sd_name = "executed-all",
- .sd_allowed = M0_BITS(M0_DDS_STABLE),
+ .sd_allowed = M0_BITS(M0_DDS_STABLE, M0_DDS_FAILED),
},
[M0_DDS_STABLE] = {
.sd_name = "stable",
@@ -72,11 +64,9 @@ static struct m0_sm_state_descr dtx_states[] = {
static struct m0_sm_trans_descr dtx_trans[] = {
{ "populated", M0_DDS_INIT, M0_DDS_INPROGRESS },
- { "executed", M0_DDS_INPROGRESS, M0_DDS_EXECUTED },
- { "exec-all", M0_DDS_EXECUTED, M0_DDS_EXECUTED_ALL },
- { "exec-fail", M0_DDS_INPROGRESS, M0_DDS_FAILED },
- { "stable", M0_DDS_EXECUTED_ALL, M0_DDS_STABLE },
- { "prune", M0_DDS_STABLE, M0_DDS_DONE }
+ { "stabilised", M0_DDS_INPROGRESS, M0_DDS_STABLE },
+ { "stab-fail", M0_DDS_INPROGRESS, M0_DDS_FAILED },
+ { "prune-it", M0_DDS_STABLE, M0_DDS_DONE }
};
struct m0_sm_conf m0_dtx_sm_conf = {
@@ -109,7 +99,6 @@ static int dtx_log_insert(struct m0_dtm0_dtx *dtx)
struct m0_dtm0_log_rec *record = M0_AMB(record, dtx, dlr_dtx);
int rc;
- M0_PRE(m0_dtm0_tx_desc_state_eq(&dtx->dd_txd, M0_DTPS_INPROGRESS));
M0_PRE(dtx->dd_dtms != NULL);
log = dtx->dd_dtms->dos_log;
M0_PRE(log != NULL);
@@ -285,12 +274,20 @@ static int dtx_fid_assign(struct m0_dtm0_dtx *dtx,
static int dtx_close(struct m0_dtm0_dtx *dtx)
{
int rc;
+ int i;
+ struct m0_dtm0_tx_pa *pa;
M0_ENTRY("dtx=%p", dtx);
M0_PRE(dtx != NULL);
M0_PRE(m0_sm_group_is_locked(dtx->dd_sm.sm_grp));
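+
+	/*
+	 * The EXECUTED/EXECUTED_ALL dtx states are gone (see dtx_states
+	 * above), so all participants are bumped straight to EXECUTED
+	 * when the dtx is closed.
+	 */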
+ for (i = 0; i < dtx->dd_txd.dtd_ps.dtp_nr; ++i) {
+ pa = &dtx->dd_txd.dtd_ps.dtp_pa[i];
+ pa->p_state = max_check(pa->p_state,
+ (uint32_t)M0_DTPS_EXECUTED);
+ }
+
/*
* TODO:REDO: We may want to capture the fop contents here.
* See ::fol_record_pack and ::m0_fop_encdec for the details.
@@ -299,14 +296,11 @@ static int dtx_close(struct m0_dtm0_dtx *dtx)
*/
rc = dtx_log_insert(dtx);
- M0_ASSERT(rc == 0);
+ if (rc == 0)
+ m0_sm_state_set(&dtx->dd_sm, M0_DDS_INPROGRESS);
+ else
+ m0_sm_move(&dtx->dd_sm, rc, M0_DDS_FAILED);
- /*
- * Once a dtx is closed, the FOP (or FOPs) has to be serialized
- * into the log, so that we should no longer hold any references to it.
- */
- dtx->dd_fop = NULL;
- m0_sm_state_set(&dtx->dd_sm, M0_DDS_INPROGRESS);
return M0_RC(rc);
}
@@ -322,24 +316,6 @@ static void dtx_done(struct m0_dtm0_dtx *dtx)
M0_LEAVE();
}
-static void dtx_exec_all_ast_cb(struct m0_sm_group *grp, struct m0_sm_ast *ast)
-{
- struct m0_dtm0_dtx *dtx = ast->sa_datum;
-
- M0_ENTRY("dtx=%p", dtx);
-
- M0_ASSERT(dtx->dd_sm.sm_state == M0_DDS_EXECUTED_ALL);
- M0_ASSERT(dtx->dd_nr_executed == dtx->dd_txd.dtd_ps.dtp_nr);
-
- if (m0_dtm0_tx_desc_state_exists(&dtx->dd_txd, M0_DTPS_PERSISTENT)) {
- m0_sm_state_set(&dtx->dd_sm, M0_DDS_STABLE);
- M0_LOG(M0_DEBUG, "dtx " DTID0_F "is stable (EXEC_ALL)",
- DTID0_P(&dtx->dd_txd.dtd_id));
- }
-
- M0_LEAVE();
-}
-
static void dtx_persistent_ast_cb(struct m0_sm_group *grp,
struct m0_sm_ast *ast)
{
@@ -355,21 +331,20 @@ static void dtx_persistent_ast_cb(struct m0_sm_group *grp,
m0_mutex_lock(&log->dl_lock);
rec = m0_be_dtm0_log_find(log, &txd->dtd_id);
+ dtx = rec == NULL ? NULL : &rec->dlr_dtx;
- if (rec != NULL) {
+ if (dtx != NULL && dtx->dd_sm.sm_state < M0_DDS_STABLE) {
dtx = &rec->dlr_dtx;
m0_dtm0_tx_desc_apply(&dtx->dd_txd, txd);
dtx_log_update(dtx);
- if (dtx->dd_sm.sm_state == M0_DDS_EXECUTED_ALL &&
- m0_dtm0_tx_desc_state_exists(&dtx->dd_txd,
- M0_DTPS_PERSISTENT)) {
- M0_ASSERT(dtx->dd_nr_executed ==
- dtx->dd_txd.dtd_ps.dtp_nr);
- M0_LOG(M0_DEBUG, "dtx " DTID0_F "is stable (PMA)",
- DTID0_P(&dtx->dd_txd.dtd_id));
- m0_sm_state_set(&dtx->dd_sm, M0_DDS_STABLE);
- }
+		M0_LOG(M0_DEBUG, "dtx " DTID0_F " is stable.",
+ DTID0_P(&dtx->dd_txd.dtd_id));
+ /*
+ * At this moment, DTX is considered to be stable if
+ * at least one Pmsg was received.
+ */
+ m0_sm_state_set(&dtx->dd_sm, M0_DDS_STABLE);
}
m0_mutex_unlock(&log->dl_lock);
@@ -404,9 +379,8 @@ M0_INTERNAL void m0_dtm0_dtx_pmsg_post(struct m0_be_dtm0_log *log,
m0_mutex_lock(&log->dl_lock);
rec = m0_be_dtm0_log_find(log, &txd->dtd_id);
dtx_sm_grp = rec != NULL ? rec->dlr_dtx.dd_sm.sm_grp : NULL;
- m0_mutex_unlock(&log->dl_lock);
- if (rec != NULL) {
+ if (dtx_sm_grp != NULL) {
M0_ASSERT(fop->f_opaque != NULL);
pma = fop->f_opaque;
*pma = (struct m0_dtm0_pmsg_ast) {
@@ -423,61 +397,8 @@ M0_INTERNAL void m0_dtm0_dtx_pmsg_post(struct m0_be_dtm0_log *log,
m0_fop_get(fop);
m0_sm_ast_post(dtx_sm_grp, &pma->p_ast);
}
- M0_LEAVE();
-}
-
-static void dtx_executed(struct m0_dtm0_dtx *dtx, uint32_t idx)
-{
- struct m0_dtm0_tx_pa *pa;
-
- M0_ENTRY("dtx=%p, idx=%"PRIu32, dtx, idx);
-
- M0_PRE(dtx != NULL);
- M0_PRE(m0_sm_group_is_locked(dtx->dd_sm.sm_grp));
-
- pa = &dtx->dd_txd.dtd_ps.dtp_pa[idx];
-
- M0_ASSERT(pa->p_state >= M0_DTPS_INPROGRESS);
-
- pa->p_state = max_check(pa->p_state, (uint32_t)M0_DTPS_EXECUTED);
-
- dtx->dd_nr_executed++;
-
- if (dtx->dd_sm.sm_state < M0_DDS_EXECUTED) {
- M0_ASSERT(dtx->dd_sm.sm_state == M0_DDS_INPROGRESS);
- m0_sm_state_set(&dtx->dd_sm, M0_DDS_EXECUTED);
- }
-
- if (dtx->dd_nr_executed == dtx->dd_txd.dtd_ps.dtp_nr) {
- M0_ASSERT(dtx->dd_sm.sm_state == M0_DDS_EXECUTED);
- M0_ASSERT_INFO(dtds_forall(&dtx->dd_txd, >= M0_DTPS_EXECUTED),
- "Non-executed PAs should not exist "
- "at this point.");
- m0_sm_state_set(&dtx->dd_sm, M0_DDS_EXECUTED_ALL);
-
- /*
- * EXECUTED and STABLE should not be triggered within the
- * same ast tick. This ast helps us to enforce it.
- * XXX: there is a catch22-like problem with DIX states:
- * DIX request should already be in "FINAL" state when
- * the corresponding dtx reaches STABLE. However, the dtx
- * cannot transit from EXECUTED to STABLE (through EXEC_ALL)
- * if DIX reached FINAL already (the list of CAS rops has been
- * destroyed). So that the EXEC_ALL ast cuts this knot by
- * scheduling the transition EXEC_ALL -> STABLE in a separate
- * tick where DIX request reached FINAL.
- */
- dtx->dd_exec_all_ast = (struct m0_sm_ast) {
- .sa_cb = dtx_exec_all_ast_cb,
- .sa_datum = dtx,
- };
- m0_sm_ast_post(dtx->dd_sm.sm_grp, &dtx->dd_exec_all_ast);
- }
-
- m0_mutex_lock(&dtx->dd_dtms->dos_log->dl_lock);
- dtx_log_update(dtx);
- m0_mutex_unlock(&dtx->dd_dtms->dos_log->dl_lock);
+ m0_mutex_unlock(&log->dl_lock);
M0_LEAVE();
}
@@ -516,19 +437,12 @@ M0_INTERNAL void m0_dtx0_fop_assign(struct m0_dtx *dtx,
dtx_fop_assign(dtx->tx_dtx, pa_idx, pa_fop);
}
-
M0_INTERNAL int m0_dtx0_close(struct m0_dtx *dtx)
{
M0_PRE(dtx != NULL);
return dtx_close(dtx->tx_dtx);
}
-M0_INTERNAL void m0_dtx0_executed(struct m0_dtx *dtx, uint32_t pa_idx)
-{
- M0_PRE(dtx != NULL);
- dtx_executed(dtx->tx_dtx, pa_idx);
-}
-
M0_INTERNAL void m0_dtx0_done(struct m0_dtx *dtx)
{
struct m0_be_dtm0_log *log;
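
With EXECUTED and EXECUTED_ALL removed, the client-visible dtx lifecycle
shrinks to INIT -> INPROGRESS (on close) -> STABLE (on the first PERSISTENT
message) or FAILED. A hypothetical caller waiting for stability, assuming the
generic m0_sm_timedwait() helper and that the dtx group lock is held:

    rc = m0_dtx0_close(dtx);
    if (rc == 0)
            rc = m0_sm_timedwait(&dtx->tx_dtx->dd_sm,
                                 M0_BITS(M0_DDS_STABLE, M0_DDS_FAILED),
                                 M0_TIME_NEVER);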
diff --git a/dtm0/dtx.h b/dtm0/dtx.h
index ac3dfc8448d..f8907579b7b 100644
--- a/dtm0/dtx.h
+++ b/dtm0/dtx.h
@@ -33,10 +33,6 @@ enum m0_dtm0_dtx_state {
M0_DDS_INIT,
/* dtx has a valid tx record. */
M0_DDS_INPROGRESS,
- /* dtx got one reply. */
- M0_DDS_EXECUTED,
- /* dtx got all replies. */
- M0_DDS_EXECUTED_ALL,
/* dtx got enough PERSISTENT messages. */
M0_DDS_STABLE,
/* dtx can be released when this state reached. */
@@ -61,8 +57,6 @@ struct m0_dtm0_dtx {
struct m0_sm dd_sm;
struct m0_dtm0_tx_desc dd_txd;
struct m0_dtm0_service *dd_dtms;
- uint32_t dd_nr_executed;
- struct m0_sm_ast dd_exec_all_ast;
/*
* XXX: The implementation is very simple and it relies on the idea
@@ -129,13 +123,6 @@ M0_INTERNAL void m0_dtx0_fop_assign(struct m0_dtx *dtx,
*/
M0_INTERNAL int m0_dtx0_close(struct m0_dtx *dtx);
-/**
- * Notifies DTM0 that DTX is executed on the particular participant.
- * @param dtx A DTX that is executed on the particular participant.
- * @param pa_idx Index of the participant.
- */
-M0_INTERNAL void m0_dtx0_executed(struct m0_dtx *dtx, uint32_t pa_idx);
-
/**
* Marks a transaction as "no longer in-use".
* The user does not have exclusive ownership on a dtx after it has
diff --git a/dtm0/fop.c b/dtm0/fop.c
index 3fd5b1ace6d..7b88c6bdefd 100644
--- a/dtm0/fop.c
+++ b/dtm0/fop.c
@@ -24,6 +24,7 @@
#include "lib/trace.h" /* M0_LOG */
#include "cas/cas.h"
#include "cas/cas_xc.h"
+#include "dix/fid_convert.h" /* m0_dix_fid__device_id_set */
#include "dtm0/fop.h"
#include "dtm0/fop_xc.h"
#include "dtm0/addb2.h"
@@ -58,7 +59,8 @@ static int dtm0_fom_create(struct m0_fop *fop, struct m0_fom **out,
struct m0_reqh *reqh);
static void dtm0_fom_fini(struct m0_fom *fom);
static size_t dtm0_fom_locality(const struct m0_fom *fom);
-static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req,
+static int dtm0_cas_fop_prepare(struct m0_reqh *reqh,
+ struct dtm0_req_fop *req,
struct m0_fop_type *cas_fopt,
struct m0_fop **cas_fop_out);
static int dtm0_cas_fom_spawn(
@@ -119,6 +121,7 @@ struct m0_sm_state_descr dtm0_phases[] = {
.sd_name = "dtm0-entry",
.sd_allowed = M0_BITS(M0_FOPH_DTM0_LOGGING,
M0_FOPH_DTM0_TO_CAS,
+ M0_FOPH_DTM0_CAS_DONE,
M0_FOPH_SUCCESS,
M0_FOPH_FAILURE)
},
@@ -150,6 +153,8 @@ struct m0_sm_trans_descr dtm0_phases_trans[] = {
{"dtm0-to-cas", M0_FOPH_DTM0_ENTRY, M0_FOPH_DTM0_TO_CAS},
+ {"dtm0-empty-rec", M0_FOPH_DTM0_ENTRY, M0_FOPH_DTM0_CAS_DONE},
+
{"dtm0-to-cas-fail", M0_FOPH_DTM0_TO_CAS, M0_FOPH_FAILURE},
{"dtm0-cas-done", M0_FOPH_DTM0_TO_CAS, M0_FOPH_DTM0_CAS_DONE},
@@ -202,8 +207,10 @@ M0_INTERNAL int m0_dtm0_fop_init(void)
.rpc_flags = M0_RPC_ITEM_TYPE_REPLY,
.fom_ops = &dtm0_req_fom_type_ops);
- return m0_fop_type_addb2_instrument(&dtm0_req_fop_fopt) ?:
+ return m0_fop_type_addb2_instrument(&dtm0_req_fop_fopt);
+ /*
m0_fop_type_addb2_instrument(&dtm0_redo_fop_fopt);
+ */
}
@@ -358,7 +365,7 @@ M0_INTERNAL int m0_dtm0_on_committed(struct m0_fom *fom,
if (m0_fid_eq(target, source))
target = &txd->dtd_id.dti_fid;
- rc = m0_dtm0_req_post(dtms, NULL, &req, target, fom, false);
+ rc = m0_dtm0_req_post(dtms, NULL, &req, target, fom, true);
if (rc != 0) {
M0_LOG(M0_WARN, "Failed to send PERSISTENT msg "
FID_F " -> " FID_F " (%d).",
@@ -532,7 +539,35 @@ static int dtm0_cas_fom_spawn(
#endif
}
-static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req,
+static void dtm0_cas_sdev_id_set(
+ struct m0_reqh *reqh,
+ const struct m0_reqh_service_type *stype,
+ struct m0_cas_op *cas_op)
+{
+#ifndef __KERNEL__
+ struct m0_cas_id *cid = &cas_op->cg_id;
+ uint32_t sdev_id;
+
+ M0_PRE(reqh != NULL);
+ M0_PRE(stype != NULL);
+ M0_PRE(cas_op != NULL);
+
+ M0_ENTRY();
+
+ sdev_id = m0_cas_svc_device_id_get(stype, reqh);
+ M0_LOG(M0_DEBUG, "Replace device ID: %d -> %d",
+ m0_dix_fid_cctg_device_id(&cid->ci_fid), sdev_id);
+ m0_dix_fid__device_id_set(&cid->ci_fid, sdev_id);
+
+ M0_LEAVE();
+#else
+ /* CAS service is not compiled for kernel. */
+ return;
+#endif
+}
+
+static int dtm0_cas_fop_prepare(struct m0_reqh *reqh,
+ struct dtm0_req_fop *req,
struct m0_fop_type *cas_fopt,
struct m0_fop **cas_fop_out)
{
@@ -540,6 +575,8 @@ static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req,
struct m0_cas_op *cas_op;
struct m0_fop *cas_fop;
+ M0_ENTRY();
+
*cas_fop_out = NULL;
M0_ALLOC_PTR(cas_op);
@@ -552,9 +589,12 @@ static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req,
&M0_XCODE_OBJ(m0_cas_op_xc, cas_op),
req->dtr_payload.b_addr,
req->dtr_payload.b_nob);
- if (rc == 0)
- m0_fop_init(cas_fop, cas_fopt, cas_op, &m0_fop_release);
- else
+ if (rc == 0) {
+ dtm0_cas_sdev_id_set(
+ reqh, cas_fopt->ft_fom_type.ft_rstype, cas_op);
+ m0_fop_init(cas_fop, cas_fopt, cas_op,
+ &m0_fop_release);
+ } else
M0_LOG(M0_ERROR, "Could not decode the REDO payload");
}
@@ -565,7 +605,7 @@ static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req,
m0_free(cas_fop);
}
- return rc;
+ return M0_RC(rc);
}
static int dtm0_rmsg_fom_tick(struct m0_fom *fom)
@@ -576,6 +616,12 @@ static int dtm0_rmsg_fom_tick(struct m0_fom *fom)
struct dtm0_fom *dfom = M0_AMB(dfom, fom, dtf_fom);
struct dtm0_req_fop *req = m0_fop_data(fom->fo_fop);
struct m0_fop *cas_fop = NULL;
+ struct m0_dtm0_service *svc = m0_dtm0_fom2service(fom);
+ struct m0_dtm0_recovery_machine *m = &svc->dos_remach;
+ const struct m0_dtm0_tx_desc *txd = &req->dtr_txr;
+
+ M0_PRE(ergo(m0_dtm0_tx_desc_is_none(txd),
+ !!(req->dtr_flags & M0_BITS(M0_DMF_EOL))));
M0_ENTRY("fom %p phase %d", fom, phase);
@@ -584,7 +630,9 @@ static int dtm0_rmsg_fom_tick(struct m0_fom *fom)
result = m0_fom_tick_generic(fom);
break;
case M0_FOPH_DTM0_ENTRY:
- m0_fom_phase_set(fom, M0_FOPH_DTM0_TO_CAS);
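+		/* An EOL-only REDO carries no tx descriptor: skip CAS. */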
+ m0_fom_phase_set(fom,
+ m0_dtm0_tx_desc_is_none(txd) ?
+ M0_FOPH_DTM0_CAS_DONE : M0_FOPH_DTM0_TO_CAS);
break;
case M0_FOPH_DTM0_TO_CAS:
/* REDO_END()s from all recovering processes received, send
@@ -593,7 +641,8 @@ static int dtm0_rmsg_fom_tick(struct m0_fom *fom)
cs_ha_process_event(m0_cs_ctx_get(m0_fom_reqh(fom)),
M0_CONF_HA_PROCESS_DTM_RECOVERED);
*/
- rc = dtm0_cas_fop_prepare(req, &cas_put_fopt, &cas_fop);
+ rc = dtm0_cas_fop_prepare(dfom->dtf_fom.fo_service->rs_reqh,
+ req, &cas_put_fopt, &cas_fop);
if (rc == 0) {
rc = dtm0_cas_fom_spawn(dfom, cas_fop,
&dtm0_cas_done_cb);
@@ -621,6 +670,25 @@ static int dtm0_rmsg_fom_tick(struct m0_fom *fom)
M0_FOPH_FAILURE);
} else {
m0_fom_phase_set(fom, M0_FOPH_SUCCESS);
+ /*
+			 * TODO: make it async when the recovery machine
+			 * starts using a larger sliding window or the number
+			 * of participants gets bigger than the length of
+			 * the EOL queue.
+			 * At this moment EOLQ_MAX_LEN is 100, and we may
+			 * have at most 3 concurrent EOLs. That leaves room
+			 * for 97 pending HA state transitions, which is
+			 * far more than enough for a 3-node cluster with
+			 * a single failure.
+ */
+ /*
+ * TODO: Consider propagating FOM or its sm_id
+ * (directly or indirectly) down to the recovery
+ * machine, so that we may relate REDO FOM
+ * and the corresponding recovery FOM.
+ */
+ M0_BE_OP_SYNC(op,
+ m0_dtm0_recovery_machine_redo_post(m, req, &op));
}
break;
default:
diff --git a/dtm0/fop.h b/dtm0/fop.h
index 38f00cc15bf..357aac4a572 100644
--- a/dtm0/fop.h
+++ b/dtm0/fop.h
@@ -47,16 +47,34 @@ enum m0_dtm0s_msg {
DTM_EXECUTE,
DTM_EXECUTED,
DTM_PERSISTENT,
- DTM_REDO
+ DTM_REDO,
} M0_XCA_ENUM;
+enum m0_dtm0_msg_flags {
+ M0_DMF_EOL,
+ M0_DMF_EVICTION,
+};
+
/** A DTM0 message sent as an RPC request to remote DTM0 services. */
struct dtm0_req_fop {
uint32_t dtr_msg M0_XCA_FENUM(m0_dtm0s_msg);
struct m0_dtm0_tx_desc dtr_txr;
struct m0_buf dtr_payload;
+ uint64_t dtr_flags;
+ /**
+ * The participant (DTM0 service) that sent this message.
+ * The initiator is set for DTM_REDO messages.
+ */
+ struct m0_fid dtr_initiator;
} M0_XCA_RECORD M0_XCA_DOMAIN(rpc);
+
+#define REDO_F "redo={ txid=" DTID0_F ", ini=" FID_F ", is_eol=%d }"
+#define REDO_P(_redo) \
+ DTID0_P(&(_redo)->dtr_txr.dtd_id), \
+ FID_P(&(_redo)->dtr_initiator), \
+ !!((_redo)->dtr_flags & M0_BITS(M0_DMF_EOL))
+
struct dtm0_rep_fop {
/** Status code of dtm operation. */
int32_t dr_rc;
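
The new dtr_flags field is a bitmask built with M0_BITS() over
m0_dtm0_msg_flags (the enumerators are bit indices, not masks), and
REDO_F/REDO_P exist purely for logging. A hypothetical snippet, only to
illustrate the conventions:

    struct dtm0_req_fop req = {
            .dtr_msg   = DTM_REDO,
            .dtr_flags = M0_BITS(M0_DMF_EOL),
    };

    M0_LOG(M0_DEBUG, "sending " REDO_F, REDO_P(&req));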
diff --git a/dtm0/it/all2all/.gitignore b/dtm0/it/all2all/.gitignore
new file mode 100644
index 00000000000..5e548163dff
--- /dev/null
+++ b/dtm0/it/all2all/.gitignore
@@ -0,0 +1,4 @@
+m0trace.*
+addb_*/
+addb-stobs-*/
+m0crate.yaml
diff --git a/dtm0/it/all2all/all2all b/dtm0/it/all2all/all2all
index f659b62b9d7..b6db8f8985a 100755
--- a/dtm0/it/all2all/all2all
+++ b/dtm0/it/all2all/all2all
@@ -15,6 +15,10 @@ LOOP_IMG_DIR=$TEST_ROOT
CLIENT_PID=
M0D_DIR_COMMON=$MOTR_VAR_DIR/m0d-0x720000000000000
ADDB_DUMP_DIR="/tmp/a2a-addb-out"
+CTGDUMP="$MOTR_ROOT/cas/m0ctgdump"
+DINDEX="1:5"
+CTGDUMP_DIR="/tmp/a2a-ctgdump"
+
M0D_ENDPOINTS=()
M0D_FIDS_DEC=()
@@ -23,10 +27,16 @@ M0D_PIDS=()
M0D_CLI_FID_DEC=
M0D_CLI_FID_HEX=
+M0D_CLI_EP=
POOL_WIDTH=4
IPOOL_WIDTH=2
+# m0d that got killed
+VICTIM=1
+# m0ds that are alive
+WITNESSES=(0 2)
+
. ${MOTR_ROOT}/scripts/addb-py/chronometry/common/common_funcs
@@ -37,7 +47,12 @@ function stop_cluster()
function bootstrap_cluster()
{
- hctl bootstrap --mkfs $CURRENT_CDF
+ # XXX: For some reason Hare does not like localhost.
+ hctl bootstrap --mkfs <(sed "s/localhost/$(hostname -f)/" $CURRENT_CDF)
+ if [[ $? -ne 0 ]]; then
+ _err "Cluster bootstrap failed"
+ exit 1
+ fi
}
function get_m0d_pids()
@@ -46,7 +61,7 @@ function get_m0d_pids()
local pid
for fid in ${M0D_FIDS_HEX[@]} ; do
- pid=$(ps ax | grep m0d | grep $fid | awk '{ print $1; }')
+ pid=$(pgrep -a lt-m0d | grep $fid | awk '{ print $1; }')
M0D_PIDS+=($pid)
pids+="$pid "
done
@@ -60,10 +75,12 @@ function create_m0crate_cfg()
local svcs_json_out=$(echo $hctl_json_out | jq -r '.nodes[] | .svcs[]')
local PROF=$(echo $hctl_json_out | jq -r '.profiles[] | .fid')
- local MOTR_LOCAL_ADDR=$(echo $svcs_json_out | jq -r 'select( .name | contains("m0_client")) | .ep')
- local PROCESS_FID=$(echo $svcs_json_out | jq -r 'select( .name | contains("m0_client")) | .fid')
+ local MOTR_LOCAL_ADDR=$(echo $svcs_json_out | jq -r 'select( .name | contains("motr_client")) | .ep')
+ local PROCESS_FID=$(echo $svcs_json_out | jq -r 'select( .name | contains("motr_client")) | .fid')
local MOTR_HA_ADDR=$(echo $svcs_json_out | jq -r 'select( .name | contains("hax")) | .ep')
+	M0D_CLI_EP="$MOTR_LOCAL_ADDR"
+
local M0CRATE_CFG_TMP=m0crate_cfg.tmp
cp $M0CRATE_CFG_IN $M0CRATE_CFG_TMP
sed -i "s/###__PROF__###/$PROF/g" $M0CRATE_CFG_TMP
@@ -77,64 +94,175 @@ function get_params_for_ha_msgs()
{
local svcs_json_out=$(hctl status --json | jq -r '.nodes[] | .svcs[]')
local svc_json_out=$(echo $svcs_json_out | jq -r 'select( .name | contains("ioservice"))')
- local cli_json_out=$(echo $svcs_json_out | jq -r 'select( .name | contains("m0_client"))')
- M0D_ENDPOINTS=($(echo $svc_json_out | jq -r '.ep' | sed -E 's/.*@tcp[:](.*)/\1/'))
+ local cli_json_out=$(echo $svcs_json_out | jq -r 'select( .name | contains("motr_client"))')
+ M0D_ENDPOINTS=($(echo $svc_json_out | jq -r '.ep'))
M0D_FIDS_HEX=($(echo $svc_json_out | jq -r '.fid' | sed -E 's/0x720+([0-9][:]0x[A-Za-z0-9]+)/\1/'))
M0D_FIDS_DEC=($(echo $svc_json_out | jq -r '.fid' | sed -E 's/0x720+([0-9][:])(0x[A-Za-z0-9]+)/printf "%s%d" \1 \2/e'))
M0D_CLI_FID_DEC=$(echo $cli_json_out | jq -r '.fid' | sed -E 's/0x720+([0-9][:])(0x[A-Za-z0-9]+)/printf "%s%d" \1 \2/e')
M0D_CLI_FID_HEX=$(echo $cli_json_out | jq -r '.fid' | sed -E 's/0x720+([0-9][:]0x[A-Za-z0-9]+)/\1/')
}
-function ha_msg_send_transient()
+# params
+# (i) whom to send the message (endpoint). Use "cli" for the client; 0, 1, 2, etc. for m0ds.
+# (ii) who has changed its state. Use "cli" or 0, 1, 2.
+# (iii) what happened. Use "transient", "online" etc.
+function ha_msg_send()
{
- # Here we send "TRANSIENT" messages to trigger start of
- # HA messages handling on the m0d side as dtm0 doesn't
- # handle them until "TRANSIENT" received due to incomplete
- # implementation on the Hare side.
- for i in $(seq 0 $((${#M0D_ENDPOINTS[@]}-1))) ; do
- for j in $(seq 0 $((${#M0D_FIDS_DEC[@]}-1))) ; do
- if [[ $i -ne $j ]]; then
- $MOTR_ST_UTILS_DIR/ha_msg_send.sh "${M0D_ENDPOINTS[$i]}" "^r|${M0D_FIDS_DEC[$j]}" "transient"
- break
- fi
- done
- done
+ local whom="$1"
+ local who="$2"
+ local what="$3"
+
+ if [[ "x$whom" == "xcli" ]]; then
+		whom="${M0D_CLI_EP}"
+ else
+ whom="${M0D_ENDPOINTS[$whom]}"
+ fi
+
+ if [[ "x$who" == "xcli" ]]; then
+ who="${M0D_CLI_FID_DEC}"
+ else
+ who="${M0D_FIDS_DEC[$who]}"
+ fi
+
+ _info "ha_msg_send: whom='${whom}', who='${who}', what='${what}'"
+ $MOTR_ST_UTILS_DIR/ha_msg_send.sh "$whom" "^r|$who" "$what"
}
-function ha_msg_send_online()
+function ha_msg_send_cli()
{
- # Here we send "ONLINE" messages to trigger connections logic.
- for i in $(seq 0 $((${#M0D_ENDPOINTS[@]}-1))) ; do
- for j in $(seq 0 $((${#M0D_FIDS_DEC[@]}-1))) ; do
- if [[ $i -ne $j ]]; then
- $MOTR_ST_UTILS_DIR/ha_msg_send.sh "${M0D_ENDPOINTS[$i]}" "^r|${M0D_FIDS_DEC[$j]}" "online"
- fi
- done
- done
+ local whom="cli"
+ local who="$1"
+ local what="$2"
+ ha_msg_send $whom $who $what
}
-function ha_msg_send_cli_online()
+# params
+# (i) who Who changed its state?
+# (ii) what What is the new state?
+function ha_msg_send_m0ds()
{
- # Here we send "ONLINE" messages to connect servers to client.
- for i in $(seq 0 $((${#M0D_ENDPOINTS[@]}-1))) ; do
- $MOTR_ST_UTILS_DIR/ha_msg_send.sh "${M0D_ENDPOINTS[$i]}" "^r|${M0D_CLI_FID_DEC}" "online"
+ local who="$1"
+ local what="$2"
+
+ if [[ ${#M0D_ENDPOINTS[@]} -ne ${#M0D_FIDS_DEC[@]} ]]; then
+ echo "The number of endpoints is not equal to the number of fids. How is that possible?"
+ exit 1
+ fi
+
+ for i in $(seq 0 $((${#M0D_ENDPOINTS[@]}-1))); do
+ ha_msg_send $i $who $what
done
}
-function expected_trace_lines_num()
+function ha_msg_send_all()
+{
+ local who="$1"
+ local what="$2"
+ ha_msg_send_cli "$who" "$what"
+ ha_msg_send_m0ds "$who" "$what"
+}
+
+# param
+# (i) trace_path Path to the trace file
+# (ii) pattern String to grep.
+# (iii) exp_cnt Expected number of lines that match the pattern.
+function expect_trace_lines()
+{
+ local trace_path="$1"
+ local pattern="$2"
+ local exp_cnt="$3"
+ local cnt
+
+ _info "expect trace: path=$trace_path, pattern=$pattern, exp_cnt=$exp_cnt"
+ cnt=$($MOTR_ROOT/utils/trace/m0trace -i "$trace_path*" | grep -c "$pattern")
+ if [[ $cnt -ne $exp_cnt ]]; then
+ _info "Unexpected number of trace lines: $cnt != $exp_cnt"
+ return 1
+ fi
+
+ _info "Found"
+ return 0
+}
+
+function expect_trace_lines_from_m0d()
+{
+ local index="$1"
+ local pattern="$2"
+ local exp_cnt="$3"
+ local path="${M0D_DIR_COMMON}${M0D_FIDS_HEX[index]}/m0trace.${M0D_PIDS[index]}"
+
+ if expect_trace_lines "$path" "$pattern" "$exp_cnt"; then
+ return 0;
+ fi
+
+ return 1;
+}
+
+function expect_trace_lines_from_m0ds()
{
local pattern="$1"
local exp_cnt=$2
- local cnt
for i in ${!M0D_PIDS[@]} ; do
- cnt=$($MOTR_ROOT/utils/trace/m0trace -i "${M0D_DIR_COMMON}${M0D_FIDS_HEX[i]}/m0trace.${M0D_PIDS[i]}" | grep "$pattern" | wc -l)
- if [[ $cnt -ne $exp_cnt ]]; then
+ if ! expect_trace_lines_from_m0d "$i" "$pattern" "$exp_cnt"; then
+ _info "Not found in ${M0D_PIDS[i]}"
return 1
fi
done
- return 0
+ return 0;
+}
+
+function expect_trace_lines_from_cli()
+{
+ local pattern="$1"
+ local exp_cnt="$2"
+ local path="$PWD/m0trace.$CLIENT_PID"
+ local cnt
+
+ if ! expect_trace_lines "$path" "$pattern" "$exp_cnt"; then
+ return 1;
+ fi
+
+ return 0;
+}
+
+function expect_trace_lines_from_all()
+{
+ local pattern="$1"
+ local exp_cnt="$2"
+
+ if expect_trace_lines_from_m0ds "$pattern" "$exp_cnt" && \
+ expect_trace_lines_from_cli "$pattern" "$exp_cnt"; then
+ return 0;
+ fi
+
+ return 1
+}
+
+function addb2dump()
+{
+ local inpfile=$1
+ local outfile=$2
+ local a2d=$MOTR_ROOT/utils/m0addb2dump
+ local size
+
+ _info "Dumping ${inpfile} -> ${outfile} ..."
+
+ if ! $a2d -f "${inpfile}" > "${outfile}"; then
+ if [[ "$(stat -c %s ${inpfile})" == "0" ]]; then
+ _info "Skipping empty ADDB file."
+ return 0;
+ else
+			_info "ADDB file is not empty, but it cannot be dumped"
+ return 1;
+ fi
+ else
+ echo "OK."
+ return 0;
+ fi
+
}
function addb_dump()
@@ -143,7 +271,6 @@ function addb_dump()
local outfile
local inpfile
local fid
- local a2d=$MOTR_ROOT/utils/m0addb2dump
rm -fR "${outdir}"
mkdir "${outdir}"
@@ -152,15 +279,17 @@ function addb_dump()
fid=$(echo "${M0D_FIDS_HEX[i]}" | awk -F'x' '{ print $2; }')
outfile="${outdir}/addb_${fid}.dump"
inpfile="${M0D_DIR_COMMON}${M0D_FIDS_HEX[i]}/addb-stobs-${M0D_PIDS[i]}/o/100000000000000:2"
- _info "Dumping ${inpfile} -> ${outfile} ..."
- $a2d -f "${inpfile}" > "${outfile}"
+ addb2dump ${inpfile} ${outfile}
done
- inpfile="$PWD/addb_${CLIENT_PID}/o/100000000000000:2"
- fid=$(echo "$M0D_CLI_FID_HEX" | awk -F'x' '{ print $2; }')
- outfile="${outdir}/addb_${fid}.dump"
- _info "Dumping ${inpfile} -> ${outfile} ..."
- $a2d -f "${inpfile}" > "${outfile}"
+ if [[ "x$CLIENT_PID" != "x" ]]; then
+ inpfile="$PWD/addb_${CLIENT_PID}/o/100000000000000:2"
+ fid=$(echo "$M0D_CLI_FID_HEX" | awk -F'x' '{ print $2; }')
+ outfile="${outdir}/addb_${fid}.dump"
+ addb2dump ${inpfile} ${outfile}
+ else
+ _info "Skipping client addb dumps."
+ fi
}
function processes_status_check()
@@ -186,10 +315,135 @@ function fail()
exit 1
}
-function main()
+function client_run()
+{
+ _info "Launching the client..."
+ $MOTR_ROOT/motr/m0crate/m0crate -S $M0CRATE_CFG &
+ CLIENT_PID=$!
+ _info "Client pid: ${CLIENT_PID}"
+}
+
+function client_run_gdb()
+{
+ _info "Launching the client under gdb ..."
+ libtool --mode=execute gdb --args $MOTR_ROOT/motr/m0crate/m0crate -S $M0CRATE_CFG
+}
+
+function client_wait()
+{
+ wait ${CLIENT_PID}
+ rc="$?"
+ if [[ "$rc" -gt 127 ]]; then
+ _err "Client terminated, rc = $rc"
+ exit 1
+ fi
+ return "$rc"
+}
+
+function m0kv_async()
+{
+ local args
+ args=$(python3 -c "import yaml; \
+ conf = yaml.load(open('$M0CRATE_CFG'), Loader=yaml.FullLoader); \
+ m = conf['MOTR_CONFIG']; \
+ print('-l %s -h %s -p %s -f %s' % (m['MOTR_LOCAL_ADDR'], m['MOTR_HA_ADDR'], m['PROF'], m['PROCESS_FID']));")
+
+ _info "m0kv $args $@"
+ ${MOTR_UTILS_DIR}/m0kv $args $@ &
+ CLIENT_PID=$!
+}
+
+function m0kv()
+{
+ m0kv_async $@
+ client_wait
+}
+
+function m0kv_run_sync()
+{
+ m0kv index create "${DINDEX}"
+ m0kv -s index put "${DINDEX}" mykey myvalue
+ m0kv -s index get "${DINDEX}" mykey
+ m0kv index drop "${DINDEX}"
+}
+
+function client_run_sync()
+{
+ client_run
+ client_wait
+}
+
+function ctgdump()
+{
+ local svc_num=$1
+ local proc_fid="0x720000000000000${M0D_FIDS_HEX[$svc_num]}"
+ local index=$DINDEX
+ local svc_ep="${M0D_ENDPOINTS[$svc_num]}"
+ local stobs="/var/motr/m0d-${proc_fid}/stobs"
+ local db="/var/motr/m0d-${proc_fid}/db"
+ local rest="-m 524288 -q 16 -w 8 -U -r 134217728 -c /etc/motr/confd.xc"
+ local out_dir=$CTGDUMP_DIR
+ local fid_hex=$(echo "${M0D_FIDS_HEX[$svc_num]}" | awk -F'x' '{ print $2; }')
+ local out_file="${out_dir}/cas-${fid_hex}.dump"
+ local cmd=$CTGDUMP
+
+ local cmd_args="-e libfab:${svc_ep} -A linuxstob:addb-stobs -f ${proc_fid} -T ad -S ${stobs} -D ${db} ${rest} str ${index}"
+
+ mkdir -p $out_dir
+
+ _info "$cmd $cmd_args"
+ $cmd $cmd_args | sort > "$out_file"
+}
+
+function ctg_eq()
+{
+ local left=$1
+ local right=$2
+ local left_hex=$(echo "${M0D_FIDS_HEX[$left]}" | awk -F'x' '{ print $2; }')
+ local right_hex=$(echo "${M0D_FIDS_HEX[$right]}" | awk -F'x' '{ print $2; }')
+
+	local left_file="${CTGDUMP_DIR}/cas-${left_hex}.dump"
+	local right_file="${CTGDUMP_DIR}/cas-${right_hex}.dump"
+
+ if ! diff -u $left_file $right_file; then
+ _info "Catalogues are different. Run 'diff -u $left_file $right_file' to see the difference."
+ return 1
+ else
+ local nr_records=$(wc -l $left_file | cut -f1 -d' ')
+ _info "Compared ${left_hex} with ${right_hex}. Found ${nr_records} identical records."
+ return 0;
+ fi
+}
+
+function check_ctg_consistency()
+{
+ local ref=$VICTIM
+ local targets=(${WITNESSES[@]})
+
+ rm -fR "$CTGDUMP_DIR"
+
+ for i in $(seq 0 $((${#M0D_FIDS_HEX[@]}-1))); do
+ ctgdump $i
+ done
+
+ for i in ${targets[@]}; do
+ if ! ctg_eq $ref $i; then
+ local ref_pid=${M0D_PIDS[$ref]}
+ local tgt_pid=${M0D_PIDS[$i]}
+			_info "Inconsistency detected between processes $ref_pid and $tgt_pid."
+ exit 1
+ fi
+ done
+}
+
+
+# Phase where we power on the cluster.
+function boot_phase()
{
local cli_pid
+ _info "Phase recovery:boot"
+
${MOTR_UTILS_DIR}/m0setup --init-loop-only -s 1 -d ${TEST_ROOT} --pool-width ${POOL_WIDTH} --ipool-width ${IPOOL_WIDTH}
_info "Bootstrapping the cluster using Hare..."
@@ -200,16 +454,112 @@ function main()
_info "Create m0crate configuration..."
create_m0crate_cfg
+}
- _info "Run the client..."
- $MOTR_ROOT/motr/m0crate/m0crate -S $M0CRATE_CFG &
- cli_pid=$!
- wait ${cli_pid}
- _info "Client pid: ${cli_pid}"
- CLIENT_PID=${cli_pid}
- stop_cluster
+# Phase where we ensure that all m0ds are online
+# from DTM0 perspective.
+function online_phase()
+{
+ _info "Phase recovery:online"
+
+	_info "Wait until every m0d has started"
+ while ! expect_trace_lines_from_m0ds "ALL2ALL_STARTED" 1; do
+ :;
+ done
+
+ _info "Wait until every m0d has recovered"
+ while ! expect_trace_lines_from_m0ds "ALL2ALL_DTM_RECOVERED" 1; do
+ :;
+ done
+}
+
+# Phase where we perform recovery.
+function recovery_phase()
+{
+ # 2 * (3 + 1) where:
+ # 2 is the number of witnesses;
+ # 3 is the number of records (mykey1, mykey2, mykey3);
+ # 1 is the EOL.
+ local nr_redo=8
+ local redo_pattern="m0_dtm0_recovery_machine_redo_post > in-redo"
+ local dtx_done_pattern="dtx_done >"
+ local m0kv_wait_file="/tmp/m0kv_wait"
+
+ _info "Phase recovery:recovery"
+ local svc_name="m0d@0x720000000000000${M0D_FIDS_HEX[VICTIM]}.service"
+
+ _info "Create an index"
+ m0kv index create "${DINDEX}"
+
+ _info "Populate the index"
+ m0kv -s index put "${DINDEX}" mykey1 myvalue1
+
+ _info "PUT one key, hang on"
+ m0kv_async -s index put "${DINDEX}" mykey2 myvalue2 \
+ wait "${m0kv_wait_file}" put "${DINDEX}" mykey3 myvalue3
+
+ _info "Wait until mykey2 reached the persistent storage on at least one node."
+ while ! expect_trace_lines_from_cli "$dtx_done_pattern" 1; do
+ :;
+ done
+
+ _info "Kill the victim (${M0D_PIDS[VICTIM]})."
+ kill -9 "${M0D_PIDS[VICTIM]}"
+
+ # XXX We may use the pool machine state change to identify this point.
+ # However, it may not be enough: we need to get to the point
+	# where the event was consumed _fully_ (i.e., Motr sent "ack" back to Hare).
+ _info "Wait a few seconds to ensure everyone learned about it."
+ sleep 10
+
+ # Let the client create a non-fully replicated record.
+ touch "$m0kv_wait_file"
+
+ client_wait
+
+ # TODO: Run ctgcmp to ensure mykey3 does not exist in the victim's storage.
+
+ _info "Resurrect the victim: $svc_name"
+ systemctl start "$svc_name"
+
+ # Victim's PID should be updated.
+ M0D_PIDS=()
+ get_m0d_pids
+ _info "Wait until the victim gets started."
+ while ! expect_trace_lines_from_m0d ${VICTIM} "ALL2ALL_STARTED" 1; do
+ :;
+ done
+
+ _info "Wait until the victim gets recovered."
+ while ! expect_trace_lines_from_m0d ${VICTIM} "ALL2ALL_DTM_RECOVERED" 1; do
+ :;
+ done
+
+ _info "Ensure we got enough REDO messages."
+ while ! expect_trace_lines_from_m0d ${VICTIM} "$redo_pattern" $nr_redo; do
+ :;
+ done
+
+ # TODO: Run ctgcmp to ensure mykey3 exists in the victim's storage.
+}
+
+# Phase where we are gathering artifacts
+# and shutting down the cluster.
+function shutdown_phase()
+{
+ _info "Phase recovery:shutdown"
+
+ _info "m0d pids: ${M0D_PIDS[@]}"
+
+ stop_cluster
addb_dump
+ check_ctg_consistency
_info "Checking processes exit status..."
processes_status_check || {
@@ -220,4 +570,144 @@ function main()
_info "TEST STATUS: PASSED"
}
-main
+function recovery_cycle()
+{
+ _info "Starting the recovery cycle"
+
+ boot_phase && \
+ online_phase && \
+ recovery_phase && \
+ shutdown_phase
+}
+
+function simple_boot_cycle()
+{
+ _info "Starting the simple boot cycle"
+
+ boot_phase &&
+ client_run_sync &&
+ shutdown_phase
+}
+
+function print_help()
+{
+ echo -en "
+ The script allows you to check one of the following cases:
+ '$0 ss' - 'Simple' bootstrap/shutdown of the 3-process cluster.
+ '$0 rec' - 'Recovery procedures' where the cluster is trying to
+ recover a failed participant.
+ Each case is called a 'cycle', and there are several
+	phases within each cycle. You can use these phases to manually
+ run pieces of the cycles or individual commands. For example:
+ $0 rec boot
+ $0 rec m0kv index create \"1:5\"
+ $0 rec m0kv -s index put \"1:5\" mykey myvalue
+ $0 rec m0kv -s index put \"1:5\" mykey myvalue
+ $0 rec m0kv -s index get \"1:5\" mykey
+ $0 rec m0kv -s index next \"1:5\" \"\\\0\" 1
+ $0 rec m0kv -s index del \"1:5\" mykey
+ $0 rec stop
+
+ Debugging and testing:
+ 1. Use DTM0 UTs to ensure recovery machine works
+ with your changes:
+ sudo ./utils/m0run -d -- m0ut -t dtm0-ut
+ 2. Use 'dtm0-remach' gdb command to print backtraces
+ of the coroutines:
+	   $ m0trace -i <m0d trace file> | grep recovery_machine_init
+	   .... m=0xabcd
+	   $ gdb -p <m0d PID>
+ (gdb) p (struct m0_dtm0_recovery_machine *) 0xabcd
+ \$1 = (struct m0_dtm0_recovery_machine *) 0xabcd
+ (gdb) dtm0-remach $1
+ < ... prints information about machine ...>
+
+ Known issues:
+	1. FOM HUNG warnings for the recovery FOMs. A recovery FOM
+	has a single state for waiting on any kind of event, so it
+	may stay there long enough to trigger this warning.
+ \n";
+}
+
+# Prints various variables used by the
+# script. The function is used only
+# for debugging the script.
+function print_env()
+{
+ get_params_for_ha_msgs
+ get_m0d_pids
+ echo "M0D_ENDPOINTS: ${M0D_ENDPOINTS[@]}"
+ echo "M0D_FIDS_DEC: ${M0D_FIDS_DEC[@]}"
+ echo "M0D_FIDS_HEX: ${M0D_FIDS_HEX[@]}"
+ echo "M0D_PIDS: ${M0D_PIDS[@]}"
+
+ echo "M0D_CLI_FID_DEC: ${M0D_CLI_FID_DEC}"
+ echo "M0D_CLI_FID_HEX: ${M0D_CLI_FID_HEX}"
+ echo "MOD_CLI_EP: ${MOD_CLI_EP}"
+}
+
+# params:
+# (i) cycle_name Name of the test (cycle)
+# (ii) phase_name Name of a phase in the cycle.
+function main()
+{
+ local cycle_name="$1"
+ local phase_name="$2"
+
+ # Run the whole cycle if phase was not
+ # specified.
+ if [ "x$phase_name" == "x" ]; then
+ case $cycle_name in
+ "ss")
+ simple_boot_cycle;;
+ "rec")
+ recovery_cycle;;
+ "-h")
+ print_help;;
+ "--help")
+ print_help;;
+ "--print-env")
+ print_env;;
+ *)
+			# Run the ordinary all2all cycle by default.
+ simple_boot_cycle;;
+ esac
+ else
+ if [[ "x$cycle_name" == "xrec" ]]; then
+ case $phase_name in
+ "boot")
+ boot_phase;;
+ "online")
+ online_phase;;
+ "recovery")
+ recovery_phase;;
+ "shutdown")
+ shutdown_phase;;
+ "m0kv")
+ shift;
+ shift;
+ m0kv $@;;
+ "stop")
+ killall -9 lt-m0ham || true;
+ killall -9 lt-m0d || true;
+ killall -9 hax || true;
+ killall -9 lt-m0crate || true;
+ killall -9 lt-m0kv || true;
+ killall -9 lt-m0mkfs || true;
+ hctl shutdown;;
+ *)
+ echo "Wrong phase: $phase_name"
+			echo "Use one of the following:"
+			echo "    boot online recovery shutdown m0kv stop"
+ exit 1;;
+ esac
+ else
+ echo "Unsupported cycle: $cycle_name."
+		echo "Use 'rec' or 'ss' (recovery, simple)."
+ exit 1
+ fi
+ fi
+
+}
+
+main $@
diff --git a/dtm0/it/all2all/cdf.yaml b/dtm0/it/all2all/cdf.yaml
index 85122b73bf1..aea11c4a3ff 100644
--- a/dtm0/it/all2all/cdf.yaml
+++ b/dtm0/it/all2all/cdf.yaml
@@ -2,25 +2,26 @@ nodes:
- hostname: localhost
data_iface: eth0
data_iface_type: tcp
+ transport_type: libfab
m0_servers:
- runs_confd: true
io_disks:
data: []
- io_disks:
data:
- - /dev/loop0
- - /dev/loop1
+ - path: /dev/loop0
+ - path: /dev/loop1
- io_disks:
data:
- - /dev/loop2
- - /dev/loop3
+ - path: /dev/loop2
+ - path: /dev/loop3
- io_disks:
data:
- - /dev/loop4
- - /dev/loop5
+ - path: /dev/loop4
+ - path: /dev/loop5
m0_clients:
- s3: 0
- other: 1
+ - name: motr_client
+ instances: 1
pools:
- name: SNS pool
@@ -32,6 +33,7 @@ pools:
- name: DIX pool
type: dix # optional; supported values: "sns" (default), "dix", "md"
data_units: 1
- parity_units: 1
+ parity_units: 2
+ spare_units: 0
allowed_failures: { site: 0, rack: 0, encl: 0, ctrl: 1, disk: 1 }
diff --git a/dtm0/linux_kernel/stubs.c b/dtm0/linux_kernel/stubs.c
index da6518142e8..6b5d629633e 100644
--- a/dtm0/linux_kernel/stubs.c
+++ b/dtm0/linux_kernel/stubs.c
@@ -57,3 +57,54 @@ M0_INTERNAL int m0_dtm0_req_post(struct m0_dtm0_service *svc,
(void) wait_for_ack;
return 0;
}
+
+#include "dtm0/recovery.h"
+
+M0_INTERNAL int m0_drm_domain_init(void)
+{
+ return 0;
+}
+
+M0_INTERNAL void m0_drm_domain_fini(void)
+{
+
+}
+
+M0_INTERNAL void
+m0_dtm0_recovery_machine_init(struct m0_dtm0_recovery_machine *m,
+ const struct m0_dtm0_recovery_machine_ops *ops,
+ struct m0_dtm0_service *svc)
+{
+ (void) m;
+ (void) svc;
+ (void) ops;
+}
+
+M0_INTERNAL int
+m0_dtm0_recovery_machine_start(struct m0_dtm0_recovery_machine *m)
+{
+ (void) m;
+ return 0;
+}
+
+M0_INTERNAL void
+m0_dtm0_recovery_machine_stop(struct m0_dtm0_recovery_machine *m)
+{
+ (void) m;
+}
+
+M0_INTERNAL void
+m0_dtm0_recovery_machine_fini(struct m0_dtm0_recovery_machine *m)
+{
+ (void) m;
+}
+
+M0_INTERNAL void
+m0_dtm0_recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m,
+ struct dtm0_req_fop *redo,
+ struct m0_be_op *op)
+{
+ (void) m;
+ (void) redo;
+ (void) op;
+}
diff --git a/dtm0/recovery.c b/dtm0/recovery.c
new file mode 100644
index 00000000000..1a60a36f10b
--- /dev/null
+++ b/dtm0/recovery.c
@@ -0,0 +1,2203 @@
+/* -*- C -*- */
+/*
+ * Copyright (c) 2022 Seagate Technology LLC and/or its Affiliates
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For any questions about this software or licensing,
+ * please email opensource@seagate.com or cortx-questions@seagate.com.
+ *
+ */
+
+
+/**
+ @page dtm0br-dld DLD of DTM0 basic recovery
+
+ - @ref DTM0BR-ovw
+ - @ref DTM0BR-def
+ - @ref DTM0BR-req
+ - @ref DTM0BR-depends
+ - @ref DTM0BR-highlights
+ - @subpage DTM0BR-fspec "Functional Specification"
+ - @ref DTM0BR-lspec
+ - @ref DTM0BR-lspec-comps
+ - @ref DTM0BR-lspec-rem-fom
+ - @ref DTM0BR-lspec-loc-fom
+ - @ref DTM0BR-lspec-evc-fom
+ - @ref DTM0BR-lspec-state
+ - @ref DTM0BR-lspec-thread
+ - @ref DTM0BR-lspec-numa
+ - @ref DTM0BR-conformance
+ - @ref DTM0BR-usecases
+ - @ref DTM0BR-ref
+ - @ref DTM0BR-impl-plan
+--->>>
+Max: [* defect *] ut and st sections are missing.
+IvanA: Agree. I will add it once we are more-or-less confident in use-cases and
+ requirements.
+<<<---
+--->>>
+Max: [* style *] usecases section should go to .h file, along with the rest of
+ the sections in .h in DLD template.
+<<<---
+
+
+
+ @section DTM0BR-ovw Overview
+ All specifications must start with an Overview section that
+ briefly describes the document and provides any additional
+ instructions or hints on how to best read the specification.
+
+  The document describes how the DTM0 service is supposed to restore
+ consistency of the data replicated across a certain kind of Motr-based
+ cluster. The term "basic" implies that consistency is restored only
+ for a limited number of use-cases. In the rest of the document,
+ the term "basic" is omitted for simplicity (there are no other kinds of
+ DTM0-based recovery at the moment).
+
+--->>>
+Max: [* defect *] This is an obvious definition, which has very small value.
+ Please make an overview which would bring as much useful information to the
+ reader as it can in just a few lines.
+IvanA: [fixed] I tried to stick to the point, and avoid any "new" terms in
+ the paragraph.
+<<<---
+
+
+
+ @section DTM0BR-def Definitions
+ Mandatory.
+ The DLD shall provide definitions of the terms and concepts
+ introduced by the design, as well as the relevant terms used by the
+ specification but described elsewhere. References to the
+ M0 Glossary and the component's HLD are permitted and encouraged.
+ Agreed upon terminology should be incorporated in the glossary.
+
+--->>>
+Max: [* question *] Are those all the terms that are used here? If no, please
+ add the rest of the terms. Please also consider adding terms that are being
+ used, but which are not in DTM0 HLD.
+IvanA: [fixed] No, it seems like they are not used. I removed this part.
+ I'll populate the "new terms" first, and if there are any redundant
+ information, I'll add references to the HLD.
+<<<---
+
+ Terms and definitions used in the document:
+ - User service is a Motr service that is capable of replicating
+ its data ("user data") across the cluster. The document is focused on
+ CAS (see @ref cas-dld) but this term could be applied to any Motr service
+ that supports CRDT[2] and has behavior similar to CAS.
+ Note, this document does not differentiate actual "clients" and "servers".
+ For example, the Motr client (including DIX) is also considered to be a
+ "User service".
+  - DTM0 service is a Motr service that helps a user service restore
+ consistency of its replicated data.
+  - Recoverable process is a Motr process that has exactly one user
+ service and exactly one DTM0 service in it. Note that in UT environment,
+ a single OS process may have several user/DTM0 services, thus it may
+ have multiple recoverable processes.
+--->>>
+IvanA: [* question *] @Max, "Recoverable process" does not sound good to me.
+ What do you think about "DTM0 process"? or "DTM0-backed process"?
+ I just want to avoid confusion with confd and any other Motr process that
+ does not have a DTM0 service in it. Any ideas on that?
+<<<---
+  - Recovery procedures is a broad term used to describe DTM0 services'
+    reactions to HA notifications about states of DTM0 services. In other
+    words, it refers to the actions performed by DTM0 services that
+ help to restore consistency of replicated data.
+ - Recovery machine is a state machine running within a DTM0 service
+ that performs recovery procedures. Each recoverable process
+ has a single instance of recovery machine.
+ - Recovery FOM is a long-lived FOM responsible for reaction to
+ HA-provided state changes of a recoverable process. The FOM knows the
+ id (FID) of this process (Recovery FOM id). If the recovery
+ FOM id matches with the id of the process it is running on, then
+ this FOM is called "local". Otherwise, it is called "remote".
+ If a Motr cluster has N recoverable processes in the configuration then
+ each recoverable process has N recovery FOMs (one per remote counterpart
+ plus one local FOM).
+ - Recovery FOM role is a sub-set of states of a recovery FOM. Each
+ recovery FOM may have one of the following roles: remote recovery,
+ local recovery, eviction. The term "role" is often omitted.
+ - Recovery FOM reincarnation is a transition between the roles
+ of a recovery FOM. Reincarnation happens as a reaction to HA notifications
+ about state changes of the corresponding process.
+ - Remote recovery role defines a sub-set of states of a recovery FOM
+ that performs recovery of a remote process (by sending REDO messages).
+ - Local recovery role defines a sub-set of states of a recovery FOM
+ that is responsible for starting and stopping of recovery procedures on
+ the local recoverable process.
+ - Eviction role defines a sub-set of states of a recovery FOM that
+ restores consistency of up to N-1 (N is number of recoverable processes
+ in the configuration) recoverable processes after one of the recoverable
+ processes of the cluster experienced a permanent failure (see HLD[1]).
+ - W-request is a FOP sent from one user service to another that
+ causes modifications of persistent storage. Such a request always
+ contains DTM0-related meta-data (transaction descriptor). W-requests
+ are used to replicate data across the cluster. They are stored in DTM0
+ log, and replayed by DTM0 service in form of REDO messages. In case of
+    CAS, W-requests are PUT and DEL operations.
+ - R-request is a user service FOP that does not modify persistent
+ storage. R-requests are allowed to be sent only to the processes that
+ have ONLINE state. In case of CAS, R-requests are GET and NEXT operations.
+--->>>
+Max: [* defect *] The terms are not defined. Please define them.
+IvanA: I populated the list. I'll update the rest of the document
+ in a separate round of fixes.
+<<<---
+
+
+ @section DTM0BR-req Requirements
+ Mandatory.
+ The DLD shall state the requirements that it attempts to meet.
+
+ Recovery machine shall meet the requirements defined in the DTM0 HLD[1].
+--->>>
+Max: [* question *] All of them? If no, please put a list here. If so, please
+ put the list anyway.
+<<<---
+ Additionally, it has the following list of low-level requirements:
+--->>>
+Max: [* question *] Why do we have those requirements? Please provide a way to
+ track them to the top-level requirements or requirements for other DTM0
+ subcomponents.
+<<<---
+
+ - @b R.DTM0BR.HA-Aligned State transitions of the recovery SM shall be
+ aligned with the state transitions of Motr processes delivered by
+ the HA subsystem to the recovery machine.
+ - @b R.DTM0BR.Basic Recovery machine shall support only a subset of
+ possible use-cases. The subset is defined in @ref DTM0BR-usecases.
+--->>>
+Max: [* defect *] Usecases are not defined.
+<<<---
+ - @b R.DTM0BR.No-Batching Recovery machine shall replay logs in
+ a synchronous manner one-by-one.
+--->>>
+Max: [* question *] One-by-one log or one-by-one log record? In the latter case:
+ what about records from logs on different nodes?
+<<<---
+--->>>
+Max: [* question *] How those logs or records are ordered (to be able to process
+ them one-by-one)?
+<<<---
+--->>>
+Max: [* question *] What about performance: is it possible for this design to
+ meet performance requirements? In any case please clearly state if it is
+ and why.
+<<<---
+
+
+ @section DTM0BR-depends Dependencies
+ Mandatory. Identify other components on which this specification
+ depends.
+
+ The basic recovery machine depends on the following components:
+ - Hare. This component is supposed to own an ordered log of all HA events
+ in the system. Note, this dependency is optional for a certain set
+ of use-cases where events are delivered in the right way even without
+    being backed up by a log.
+--->>>
+Max: [* defect *] This is not true at the moment. Please specify if it's
+ possible to implement this design without this being true or what is the
+ plan to deal with this.
+IvanA: [fixed] I added a note that this thing is optional for some cases.
+<<<---
+ - DTM0 service. The service provides access to DTM0 log and DTM0 RPC link.
+ - DTM0 log. The log is used to fill in REDO messages.
+ - DTM0 RPC link. The link is used as transport for REDO messages.
+ - Motr HA/conf. HA states of Motr conf objects provide the information
+ about state transitions of local and remote Motr processes.
+ - Motr conf. The conf module provides the list of remote DTM0 services.
+
+
+ @section DTM0BR-highlights Design Highlights
+ Mandatory. This section briefly summarizes the key design
+ decisions that are important for understanding the functional and
+ logical specifications, and enumerates topics that need special
+ attention.
+
+ Each DTM0 service has a recovery machine. Each recovery machine contains
+  N recovery FOMs (one per counterpart + one local). Recovery FOMs
+ listen to various events (HA events, DTM0 log events, RPC replies) and
+ act accordingly (sending out M0_CONF_HA_PROCESS_* events and REDO messages).
+
+ @verbatim
+
+ +-------------------------------------------+
+ | Recoverable process, process fid = 1 |
+ | +---------------------------------------+ |
+ | | DTM0 Service | |
+ | | | |
+ | | +-----------------------------------+ | |
+ | | | Recovery machine | | |
+ | | | | | |
+ | | | +--------+ +--------+ +--------+ | | |
+ | | | | R_FOM1 | | R_FOM2 | | R_FOM3 | | | |
+ | | | | FID=1 | | FID=2 | | FID=3 | | | |
+ | | | +--------+ +--------+ +--------+ | | |
+ | | | | | |
+ | | +-----------------------------------+ | |
+ | +---------------------------------------+ |
+ +-------------------------------------------+
+
+ Recovery machines of one of the processes of a 3-process cluster.
+ The other two processes have the same set of FOMs. FID=N means that
+ the id of this recovery FOM is N. The first one (R_FOM1) is the local
+ recovery FOM of this DTM0 service. The other two (R_FOM2, R_FOM3) are
+ remote recovery FOMs that would send REDOs to the processes 2 and 3.
+
+ @endverbatim
+
+ Each recovery FOM may await on external events:
+
+ @verbatim
+
+ +-----------------------------------------------------+
+ | Recovery FOM inputs |
+ | |
+ | Polling |
+ | ---------------------------------------------- |
+ | | HA queue | | DTM0 log | | RPC replies | |
+ | | | | watcher | | | |
+ | | /|\ | | | | /|\ | |
+ | | | | | /|\ | | | | |
+ | | | | | | | | | | |
+ | | conf obj | | DTM0 log | | DTM0 RPC link | |
+ +-----------------------------------------------------+
+
+ @endverbatim
+
+ For example, when a recovery FOM performs recovery of a remote
+ process, it may await on HA queue (to halt sending of REDOs), on
+ DTM0 log watcher (to learn if there are new entries in the log),
+ or on DTM0 RPC link (to learn if next REDO message can be sent).
+
+ Whenever a recovery FOM receives an event from the HA queue, it gets
+ "reincarnated" to serve its particular role in recovery procedures
+ (see "Recovery FOM reincarnation" definition).
+ Incarnation of a recovery FOM is just a sub-state-machine of the FOM.
+ For example, here is the full state machine of R_FOM2:
+
+ @verbatim
+
+ INIT -> AWAIT_HEQ (HA event queue)
+
+ AWAIT_HEQ -> DECIDE_WHAT_TO_DO
+ DECIDE_WHAT_TO_DO -> AWAIT_HEQ
+ DECIDE_WHAT_TO_DO -> RESET
+ DECIDE_WHAT_TO_DO -> FINAL
+
+ RESET -> SEND_REDO
+
+ SEND_REDO -> AWAIT_HEQ_OR_LOG
+ SEND_REDO -> AWAIT_HEQ_OR_RPC
+
+ AWAIT_HEQ_OR_LOG -> SEND_REDO
+ AWAIT_HEQ_OR_LOG -> HEQ
+
+ AWAIT_HEQ_OR_RPC -> SEND_REDO
+ AWAIT_HEQ_OR_RPC -> HEQ
+
+ INIT --------------> AWAIT_HEQ
+ | /|\
+ \|/ |
+ DECIDE_WHAT_TO_DO --------------> FINAL
+ | /|\
+ | | (remote recovery
+ | | sub-states)
+ //====================|=====|===================================//
+ // RESET <-----------+ +---< HEQ (Transitions that //
+ // | may happen as //
+ // | a reaction to //
+ // | +------------------------+ RECOVERING HA event)//
+ // \|/ \|/ | //
+ // SEND_REDO ---------------> AWAIT_ON_HEQ_OR_RPC ---> HEQ //
+ // | /|\ //
+ // \|/ | //
+ // AWAIT_ON_HEQ_OR_LOG -----> HEQ //
+ //==============================================================//
+
+ ... ...
+ | | (eviction sub-states)
+ //====================|====|====================================//
+ // RESET <----------+ +----< HEQ (Transitions that //
+ // | may happen as a //
+ // | +------------------+ a reaction to FAILED //
+ // \|/ \|/ | HA event) //
+ // SEND_REDO -----> AWAIT_ON_HEQ_OR_RPC ----> HEQ //
+ //==============================================================//
+ @endverbatim
+
+ The first frame highlighted with "===//" shows the sub-state machine of
+  a recovery FOM that is used to recover a participant that entered
+ RECOVERING HA state. If the same participant enters FAILED state
+ (permanent failure) then another set of sub-states would be involved
+ in recovery procedures (the second frame).
+
+ On the diagram above, the transition DECIDE_WHAT_TO_DO -> RESET
+ marks the point where recovery FOM enters a new incarnation,
+ the transition HEQ -> DECIDE_WHAT_TO_DO marks the end of the life of this
+ incarnation. The set of sub-states is called "role".
+ For example, if a recovery FOM is somewhere inside the first frame
+ (remote recovery) then we say that it has "remote recovery role" and it
+ performs "remote recovery". Correspondingly, the second frame describes
+ "eviction role" or just "eviction".
+
+ The notion of incarnation is used to emphasise that a recovery FOM must
+  always "clean up" its volatile state before processing the next HA event.
+ For example, it may await on a pending RPC reply or it may have to reset
+ DTM0 log iterator.
+
+  The local recovery FOM (R_FOM1 in the example) is just a volatile
+ state that captures information about certain points in the log and sends
+ out ::M0_CONF_HA_PROCESS_RECOVERED when it believes that the local process
+ has been fully recovered (see the recovery stop condition below).
+ The local recovery FOM cannot change its role (it cannot recover itself
+ from transient or permanent failures).
+
+  UPD: The design described above has actually been implemented using
+  coroutines, not an FSM, so these states do not appear in the code.
+  However, the coroutines follow the same spirit as outlined above:
+  a coroutine executes all actions needed for a given incarnation,
+  cleans up, and only after that reads the next event from the HEQ
+  (HA event queue).
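+
+  The following minimal sketch shows the shape of that per-FOM coroutine
+  loop (the names are illustrative, not the actual identifiers used in
+  the code below):
+
+  @verbatim
+  while (heq_get(&event) != QUEUE_ENDED) {
+          role = role_of(event);          // pick the new incarnation
+          switch (role) {
+          case REMOTE_RECOVERY:           // counterpart is RECOVERING
+                  replay_log_until_next_heq_event();
+                  break;
+          case EVICTION:                  // counterpart is FAILED
+                  replay_records_of_failed_participant();
+                  break;
+          case LOCAL_RECOVERY:            // this process is RECOVERING
+                  await_recovery_stop_condition();
+                  break;
+          }
+          cleanup();    // await pending replies, reset the log iterator
+  }
+  @endverbatim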
+
+
+
+ @section DTM0BR-lspec Logical Specification
+ Mandatory. This section describes the internal design of the component,
+ explaining how the functional specification is met. Sub-components and
+ diagrams of their interaction should go into this section. The section has
+ mandatory subsections created using the Doxygen @@subsection command. The
+ designer should feel free to use additional sub-sectioning if needed, though
+ if there is significant additional sub-sectioning, provide a table of
+ contents here.
+
+ - @ref DTM0BR-lspec-comps
+ - @ref DTM0BR-lspec-rem-fom
+ - @ref DTM0BR-lspec-loc-fom
+ - @ref DTM0BR-lspec-evc-fom
+ - @ref DTM0BR-lspec-state
+ - @ref DTM0BR-lspec-thread
+ - @ref DTM0BR-lspec-numa
+
+  @subsection DTM0BR-lspec-comps Component Overview
+ Mandatory.
+ This section describes the internal logical decomposition.
+ A diagram of the interaction between internal components and
+ between external consumers and the internal components is useful.
+
+ The following diagram shows connections between recovery machine and
+ the other components of the system:
+
+ @dot
+ digraph {
+ rankdir = LR;
+ bgcolor="lightblue"
+ node[shape=box];
+ label = "DTM0 Basic recovery machine components and dependencies";
+ subgraph cluster_recovery_sm {
+ label = "Recovery SM";
+ bgcolor="lightyellow"
+ node[shape=box];
+ rfom[label="Recovery FOM"];
+ E1[shape=none label="⋮" fontsize=30]
+ rfom_another[label="Another Recovery FOM"];
+ }
+
+ subgraph cluster_ha_conf {
+ node[shape=box];
+ label ="HA and Conf";
+ conf_obj[label="Conf object"];
+ ha[label="Motr HA"];
+ }
+
+ drlink[label="DTM0 RPC link"];
+ dlog[label="DTM0 log"];
+
+ conf_obj -> rfom[label="HA decisions"];
+ rfom -> ha [label="Process events"];
+ rfom -> drlink [label="REDO msgs"];
+ dlog -> rfom[label="records"];
+ }
+ @enddot
+--->>>
+Max: [* defect *] Communication with other recovery foms in the same process and
+ in different processes are missing. Please add.
+<<<---
+
+  The following sequence diagram shows an example of linearisation of
+  HA decision processing:
+ @msc
+ ha,rfom, remote;
+ ha -> rfom [label="RECOVERING"];
+ rfom -> remote [label="REDO(1)"];
+ ha -> rfom [label="TRANSIENT"];
+  remote -> rfom [label="REDO(1).reply(-ETIMEDOUT)"];
+ ha -> rfom [label="RECOVERING"];
+ rfom -> rfom [label="Cancel recovery"];
+ rfom -> remote [label="REDO(1)"];
+ remote -> rfom [label="REDO(1).reply(ok)"];
+ @endmsc
+
+  In this example the recovery machine receives a sequence of events
+  (RECOVERING, TRANSIENT, RECOVERING), and reacts to them:
+  - Starts replaying the log by sending REDO(1).
+  - Awaits the reply.
+  - Cancels recovery.
+  - Re-starts the replay by sending REDO(1) again.
+  - Gets the reply.
+--->>>
+Max: [* defect *] One example is not enough to describe the algorithm. Please
+ explain how it's done exactly. Maybe not in this section, but somewhere in DLD.
+<<<---
+
+ @subsection DTM0BR-lspec-rem-fom Remote recovery FOM
+ Such sections briefly describes the purpose and design of each
+ sub-component.
+--->>>
+Max: [* defect *] State machine is there, but what the fom actually does is
+ missing. Please add.
+<<<---
+
+ When a recovery FOM detects a state transition to RECOVERING of a remote
+ participant, it transits to the sub-SM called "Remote recovery FOM".
+ In other words, it re-incarnates as "Remote recovery FOM" as soon as
+ the previous incarnation is ready to reach its "terminal" state.
+
+  Remote recovery FOM reacts to the following kinds of events:
+ - Process state transitions (from HA subsystem);
+ - DTM0 log getting new records;
+ - RPC replies (from DTM0 RPC link component).
+
+ Remote recovery FOM has the following states and transitions:
+
+ @verbatim
+ REMOTE_RECOVERY(initial) -> WAITING_ON_HA_OR_RPC
+ REMOTE_RECOVERY(initial) -> WAITING_ON_HA_OR_LOG
+ WAITING_ON_HA_OR_RPC -> WAITING_ON_HA_OR_LOG
+ WAITING_ON_HA_OR_LOG -> WAITING_ON_HA_OR_RPC
+ WAITING_ON_HA_OR_RPC -> NEXT_HA_STATE (terminal)
+ WAITING_ON_HA_OR_LOG -> NEXT_HA_STATE (terminal)
+ WAITING_ON_HA_OR_RPC -> SHUTDOWN (terminal)
+ WAITING_ON_HA_OR_LOG -> SHUTDOWN (terminal)
+
+ Where:
+ REMOTE_RECOVERY is a local initial state (local to the sub-SM);
+ NEXT_HA_STATE is a local terminal state;
+ SHUTDOWN is a global terminal state.
+ @endverbatim
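+
+  In terms of actions, the states above can be read as the following
+  sketch (grounded in the diagrams in this document; the wording is
+  illustrative, not the actual code):
+
+  @verbatim
+  REMOTE_RECOVERY:      reset the DTM0 log iterator and send the first
+                        REDO message over the DTM0 RPC link;
+  WAITING_ON_HA_OR_RPC: await the RPC reply; a reply triggers sending
+                        of the next REDO message;
+  WAITING_ON_HA_OR_LOG: the log has been drained; await new log records
+                        to replay;
+  NEXT_HA_STATE:        an HA event has arrived; the current incarnation
+                        ends.
+  @endverbatim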
+
+ @subsection DTM0BR-lspec-loc-fom Local recovery FOM
+ Such sections briefly describes the purpose and design of each
+ sub-component.
+
+  Local recovery is used to ensure fairness of the overall recovery procedure.
+--->>>
+Max: [* defect *] Definition of "fairness" in this context is missing. Please
+ add.
+<<<---
+--->>>
+Max: [* defect *] Definition of the "fairness" is "ensured" is missing. Please
+ add.
+<<<---
+  Whenever a participant learns that it has received all the log records
+  it missed, it sends the M0_CONF_HA_PROCESS_RECOVERED process event to
+  the HA subsystem.
+
+  The point where this event has to be sent is called the "recovery stop
+  condition". The local participant (i.e., the one that is in the
+  RECOVERING state) uses the information about new incoming W-requests
+  and about the most recent (txid-wise) log entries on the other
+  participants to decide whether recovery shall be stopped.
+
+ TODO: describe the details on when and how it should stop.
+--->>>
+Max: [* defect *] You're right, the details are missing.
+<<<---
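+
+  As a rough illustration only, the stop condition can be expressed over
+  the per-FOM knowledge described in this document (the field names are
+  taken from the implementation below, but the predicate itself is a
+  sketch, not the actual code):
+
+  @verbatim
+  // Recovery may stop when no remote participant can produce more
+  // REDOs: each one has either delivered its EOL or left the cluster.
+  stop = m0_tl_forall(rfom, rf, &m->rm_rfoms,
+                      rf->rf_is_local || rf->rf_last_known_eol ||
+                      M0_IN(rf->rf_last_known_ha_state,
+                            (M0_NC_TRANSIENT, M0_NC_FAILED)));
+  @endverbatim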
+
+
+ @subsection DTM0BR-lspec-evc-fom Eviction FOM
+ Such sections briefly describes the purpose and design of each
+ sub-component.
+--->>>
+Max: [* defect *] The purpose of the eviction fom is not described.
+<<<---
+
+ A recovery FOM re-incarnates as eviction FOM (i.e., enters the initial
+--->>>
+Max: [* defect *] A definition of "re-incarnation" is missing.
+<<<---
+  state of the corresponding sub-SM) when the HA subsystem notifies about
+--->>>
+Max: [* defect *] A definition of sub-SM is missing.
+<<<---
+  a permanent failure (FAILED) of the corresponding participant.
+
+ The local DTM0 log shall be scanned for any record where the FAILED
+--->>>
+Max: [* doc *] Please add a reference to the component which does the scanning.
+<<<---
+--->>>
+Max: [* doc *] Please explain how to scanning is done:
+ - sequentially?
+ - from least recent to most recent?
+ - one fom/thread/whatever or multiple? How the work is distributed?
+ - locks also have to be described somewhere in DLD.
+<<<---
+ participant participated. Such a record shall be replayed to the
+ other participants that are capable of receiving REDO messages.
+--->>>
+Max: [* defect *] What to do with the message if a participant is in TRANSIENT
+ state is not described.
+<<<---
+
+ When the log has been replayed completely, the eviction FOM notifies
+--->>>
+Max: [* defect *] Criteria of "log has been replayed completely" is missing.
+ Please add.
+<<<---
+ the HA subsystem about completion and leaves the sub-SM.
+--->>>
+Max: [* question *] How? Please also describe what is expected from HA.
+<<<---
+
+ TODO: GC of FAILED processes and handling of clients restart.
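+
+  An illustrative sketch of the eviction pass (pseudocode; the helper
+  names are hypothetical):
+
+  @verbatim
+  log_iter_init(&iter);
+  // Walk the records in which the FAILED participant participated.
+  while (log_iter_next(&iter, &record, origin = failed_svc) == 0) {
+          for (each participant p of record)
+                  if (p != failed_svc && can_receive_redo(p))
+                          send_redo_and_await_reply(p, &record);
+  }
+  log_iter_fini(&iter);
+  @endverbatim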
+
+ @subsection DTM0BR-lspec-state State Specification
+ Mandatory.
+ This section describes any formal state models used by the component,
+ whether externally exposed or purely internal.
+
+  The state of a recovery FOM is defined as a collection of the sub-SM
+  states and a few global states.
+
+ TODO: state diagram for overall recovery FOM.
+--->>>
+Max: [* doc *] Also distributed state diagram.
+<<<---
+
+ @subsection DTM0BR-lspec-thread Threading and Concurrency Model
+ Mandatory.
+ This section describes the threading and concurrency model.
+ It describes the various asynchronous threads of operation, identifies
+ the critical sections and synchronization primitives used
+ (such as semaphores, locks, mutexes and condition variables).
+
+  The recovery machine work is distributed across the following kinds of
+  threads:
+--->>>
+Max: [* defect *] Definition of "revolves" is missing.
+<<<---
+ - Locality threads (Recovery FOMs, DTM0 log updates).
+ - RPC machine thread (RPC replies).
+
+  Each recovery FOM is implemented using the Motr coroutines library.
+  Within the coroutine, ::m0_be_op and/or ::m0_co_op are used to await
+--->>>
+Max: [* question *] RE: "the coroutine": which one?
+<<<---
+ on external events.
+--->>>
+Max: [* suggestion *] A list of such events would help to understand why be/co
+ ops are needed.
+<<<---
+
+ DTM0 log is locked using a mutex (TBD: the log mutex or a separate mutex?)
+--->>>
+Max: [* question *] Entire DTM0 log is locked using a mutex? If no, please add
+ unambiguous sentence. Example: " is
+ protected with a mutex" or "a separate mutex protects concurrent access to
+ ".
+<<<---
+  whenever the recovery machine sets up or cleans up its watcher.
+--->>>
+Max: [* defect *] The watcher description/workflow/etc. is missing. Please add.
+<<<---
+
+ Interaction with RPC machine is wrapped by Motr RPC library and DTM0
+--->>>
+Max: [* typo *] s/Iteraction/Interaction/
+<<<---
+ RPC link component. It helps to unify the type of completion objects
+ across the FOM code.
+
+ TODO: describe be/co op poll/select.
+
+ @subsection DTM0BR-lspec-numa NUMA optimizations
+ Mandatory for components with programmatic interfaces.
+ This section describes if optimal behavior can be supported by
+ associating the utilizing thread to a single processor.
+
+  There are not so many shared resources outside of the DTM0 log and
+  the localities. Scaling and batching are outside of the scope
+--->>>
+Max: [* defect *] The number of localities is equal to the number of CPU cores
+in our default configuration, so whatever uses more than one locality has to
+have some kind of synchronisation.
+<<<---
+--->>>
+Max: [* defect *] It's not clear what "scaling" and "batching" mean in this
+ context. Please explain and/or add a reference where they are explained.
+<<<---
+ of this document.
+--->>>
+Max: [* defect *] FOM locality assignment is missing.
+<<<---
+
+
+ @section DTM0BR-conformance Conformance
+ Mandatory.
+ This section cites each requirement in the @ref DTM0BR-req section,
+ and explains briefly how the DLD meets the requirement.
+--->>>
+Max: [* defect *] Top-level requirements from HLD are missing.
+<<<---
+
+ - @b I.DTM0BR.HA-Aligned Recovery machine provides an event queue that
+    is being consumed in order by the corresponding FOM.
+--->>>
+Max: [* defect *] The word "queue" is present in this document only here. Please
+ describe the event queue and events somewhere in this DLD.
+<<<---
+--->>>
+Max: [* defect *] It's not clear how "HA subsystem" from the requirement is met
+ by the implementation.
+<<<---
+ - @b I.DTM0BR.Basic Recovery machine supports only basic log replay defined
+ by the remote recovery FOM and its counterpart.
+--->>>
+Max: [* defect *] "basic" term is not defined. Please define.
+<<<---
+ - @b I.DTM0BR.No-Batching Recovery machine achieves it by awaiting on
+ RPC replies for every REDO message sent.
+--->>>
+Max: [* defect *] Maybe I should wait for I.* for performance and availability
+ requirements, but as of now it's not clear how DTM recovery would catch up
+ with the rest of the nodes creating new DTM0 transactions in parallel with
+ DTM recovery.
+<<<---
+
+
+  @section DTM0BR-usecases Use-cases
+ Mandatory. This section describes use-cases for recovery machine.
+
+
+ TODO: Add use-cases.
+--->>>
+Max: [* defect *] Right, they are missing.
+<<<---
+
+
+
+ @section DLD-O Analysis
+ This section estimates the performance of the component, in terms of
+ resource (memory, processor, locks, messages, etc.) consumption,
+ ideally described in big-O notation.
+--->>>
+Max: [* doc *] Please fill this section with references to the requirements.
+<<<---
+
+
+  @section DTM0BR-ref References
+ Mandatory. Provide references to other documents and components that
+ are cited or used in the design.
+ In particular a link to the HLD for the DLD should be provided.
+
+  - [1] DTM0 HLD
+  - [2] CRDT
+
+
+  @section DTM0BR-impl-plan Implementation Plan
+ Mandatory. Describe the steps that should be taken to implement this
+ design.
+
+ - Develop pseudocode-based description of recovery activities.
+ - Identify and add missing concurrency primitives (select/poll for be op).
+ - Use the pseudocode and new primitives to implement a skeleton of the FOMs.
+ - Incorporate RECOVERING and TRANSIENT states with their
+ new meaning as soon as they are available in the upstream code.
+ - Populate the list of use-cases based on available tools.
+ - Populate the list of system tests.
+ - Improve the stop condition based on the use-cases and tests.
+ - Add implementation of eviction FOM.
+--->>>
+Max: [* defect *] No mention of tests being done. Please clearly state when and
+ how the tests are going to be done or where and when it would be possible
+ to find this information.
+<<<---
+ */
+
+
+
+#define M0_TRACE_SUBSYSTEM M0_TRACE_SUBSYS_DTM0
+#include "lib/trace.h"
+#include "dtm0/recovery.h" /* m0_dtm0_recovery_machine */
+#include "dtm0/service.h" /* m0_dtm0_service */
+#include "dtm0/drlink.h" /* m0_dtm0_req_post */
+#include "dtm0/fop.h" /* dtm0_req_fop */
+#include "be/op.h" /* m0_be_op */
+#include "be/queue.h" /* m0_be_queue */
+#include "conf/diter.h" /* diter */
+#include "conf/helpers.h" /* m0_confc_root_open */
+#include "conf/obj.h" /* m0_conf_obj */
+#include "fop/fom.h" /* m0_fom */
+#include "lib/coroutine.h" /* m0_co_context */
+#include "lib/memory.h" /* M0_ALLOC_PTR */
+#include "reqh/reqh.h" /* m0_reqh2confc */
+#include "rpc/rpc_opcodes.h" /* M0_DTM0_RECOVERY_FOM_OPCODE */
+#include "lib/string.h" /* m0_streq */
+#include "be/dtm0_log.h" /* m0_dtm0_log_rec */
+#include "motr/setup.h" /* m0_cs_reqh_context */
+#include "addb2/identifier.h" /* M0_AVI_FOM_TO_TX */
+#include "addb2/identifier.h"     /* M0_AVI_FOM_TO_TX */
+#include "dtm0/addb2.h"           /* M0_AVI_DRM_SM_STATE */
+#include "motr/client_internal.h" /* struct m0client::m0c_reqh */
+
+enum {
+ /*
+	 * Number of HA events that can be submitted by the HA subsystem
+ * at once; where:
+ * "HA event" means state transition of a single DTM0 process;
+	 *    "at once" means within the maximal duration of a tick of any FOM
+ * running on the same locality as the recovery FOM that
+ * handles HA events.
+ * XXX: The number was chosen randomly. Update the previous sentence
+ * if you want to change the number.
+ */
+ HEQ_MAX_LEN = 32,
+
+ /*
+ * Number of HA events and EOL messages that could be submitted
+ * at once.
+ * TODO: Modify this comment once we stop putting HA events in this
+ * queue (with help of be-op-or-set). It shall be EOL-only queue.
+ */
+ EOLQ_MAX_LEN = 100,
+};
+
+struct recovery_fom {
+ struct m0_fom rf_base;
+
+ /* Recovery machine instance that owns this FOM. */
+ struct m0_dtm0_recovery_machine *rf_m;
+
+ /** Subscription to conf obj HA state. */
+ struct m0_clink rf_ha_clink;
+
+ /** HA event queue populated by the clink and consumed by the FOM. */
+ struct m0_be_queue rf_heq;
+
+	/** Coroutine context of this recovery FOM. */
+	struct m0_co_context            rf_coro;
+
+ /** Linkage for m0_dtm0_recovery_machine::rm_foms */
+ struct m0_tlink rf_linkage;
+
+ /** Magic for rfom tlist entry. */
+ uint64_t rf_magic;
+
+	/** Queue of EOL items (and related HA events) consumed by the
+	 *  local recovery FOM. */
+	struct m0_be_queue              rf_eolq;
+
+	/** Iterator over the local DTM0 log used to replay records. */
+	struct m0_be_dtm0_log_iter      rf_log_iter;
+
+ /* Target DTM0 service FID (id of this FOM within the machine). */
+ struct m0_fid rf_tgt_svc;
+
+	/** FID of the process that hosts the target DTM0 service. */
+	struct m0_fid                   rf_tgt_proc;
+
+ /** Is target DTM0 service the service ::rf_m belongs to? */
+ bool rf_is_local;
+
+ /** Is target DTM0 volatile? */
+ bool rf_is_volatile;
+
+
+ /** The most recent HA state of this remote DTM0 service. */
+ enum m0_ha_obj_state rf_last_known_ha_state;
+
+ /**
+ * The most recent known state of the log on a remote DTM0 service.
+ * Note, it is impossible to guarantee that this stat is "in-sync"
+	 * Note, it is impossible to guarantee that this state is "in-sync"
+ */
+ bool rf_last_known_eol;
+};
+
+enum eolq_item_type {
+ EIT_EOL,
+ EIT_HA,
+ EIT_END,
+};
+
+struct eolq_item {
+ enum eolq_item_type ei_type;
+ struct m0_fid ei_source;
+ enum m0_ha_obj_state ei_ha_state;
+};
+
+/*
+ * A global variable to enable parts of the code that were added
+ * specifically for the integration (all2all) test script.
+ * Later on, they need to be removed (once we get a better
+ * way of testing).
+ */
+const bool ALL2ALL = true;
+
+M0_TL_DESCR_DEFINE(rfom, "recovery_fom",
+ static, struct recovery_fom, rf_linkage,
+ rf_magic, M0_DTM0_RMACH_MAGIC, M0_DTM0_RMACH_HEAD_MAGIC);
+M0_TL_DEFINE(rfom, static, struct recovery_fom);
+
+static int populate_foms (struct m0_dtm0_recovery_machine *m);
+static void unpopulate_foms(struct m0_dtm0_recovery_machine *m);
+
+static bool recovery_fom_ha_clink_cb(struct m0_clink *clink);
+
+static void recovery_fom_self_fini(struct m0_fom *fom);
+static int recovery_fom_tick(struct m0_fom *fom);
+
+static void recovery_machine_lock(struct m0_dtm0_recovery_machine *m);
+static void recovery_machine_unlock(struct m0_dtm0_recovery_machine *m);
+static struct recovery_fom *
+recovery_fom_local(struct m0_dtm0_recovery_machine *m);
+
+static bool ha_event_invariant(uint64_t event)
+{
+ return event < M0_NC_NR;
+}
+
+static void addb2_relate(const struct m0_sm *left, const struct m0_sm *right)
+{
+ M0_ADDB2_ADD(M0_AVI_FOM_TO_TX, m0_sm_id_get(left), m0_sm_id_get(right));
+}
+
+
+static struct m0_sm_state_descr recovery_machine_states[] = {
+ [M0_DRMS_INIT] = {
+ .sd_name = "M0_DRMS_INIT",
+ .sd_allowed = M0_BITS(M0_DRMS_STOPPED, M0_DRMS_STARTED),
+ .sd_flags = M0_SDF_INITIAL,
+ },
+ [M0_DRMS_STARTED] = {
+ .sd_name = "M0_DRMS_STARTED",
+ .sd_allowed = M0_BITS(M0_DRMS_STOPPED),
+ .sd_flags = 0,
+ },
+ [M0_DRMS_STOPPED] = {
+ .sd_name = "M0_DRMS_STOPPED",
+ .sd_allowed = 0,
+ .sd_flags = M0_SDF_TERMINAL,
+ },
+};
+
+static struct m0_sm_trans_descr recovery_machine_trans[] = {
+ { "started", M0_DRMS_INIT, M0_DRMS_STARTED },
+ { "stop-running", M0_DRMS_STARTED, M0_DRMS_STOPPED },
+ { "stop-idle", M0_DRMS_INIT, M0_DRMS_STOPPED },
+};
+
+struct m0_sm_conf m0_drm_sm_conf = {
+ .scf_name = "recovery_machine",
+ .scf_nr_states = ARRAY_SIZE(recovery_machine_states),
+ .scf_state = recovery_machine_states,
+ .scf_trans_nr = ARRAY_SIZE(recovery_machine_trans),
+ .scf_trans = recovery_machine_trans,
+};
+
+M0_INTERNAL void
+m0_dtm0_recovery_machine_init(struct m0_dtm0_recovery_machine *m,
+ const struct m0_dtm0_recovery_machine_ops *ops,
+ struct m0_dtm0_service *svc)
+{
+ M0_PRE(m != NULL);
+ M0_ENTRY("m=%p, svc=%p", m, svc);
+ M0_PRE(m0_sm_conf_is_initialized(&m0_drm_sm_conf));
+
+ m->rm_svc = svc;
+ if (ops)
+ m->rm_ops = ops;
+ else
+ m->rm_ops = &m0_dtm0_recovery_machine_default_ops;
+ rfom_tlist_init(&m->rm_rfoms);
+ m0_sm_group_init(&m->rm_sm_group);
+ m0_sm_init(&m->rm_sm, &m0_drm_sm_conf,
+ M0_DRMS_INIT, &m->rm_sm_group);
+ m0_sm_addb2_counter_init(&m->rm_sm);
+
+ M0_POST(m->rm_ops != NULL);
+ M0_LEAVE();
+}
+
+M0_INTERNAL void
+m0_dtm0_recovery_machine_fini(struct m0_dtm0_recovery_machine *m)
+{
+ M0_ENTRY("m=%p", m);
+ recovery_machine_lock(m);
+ M0_ASSERT(M0_IN(m->rm_sm.sm_state, (M0_DRMS_STOPPED, M0_DRMS_INIT)));
+ if (m->rm_sm.sm_state == M0_DRMS_INIT)
+ m0_sm_state_set(&m->rm_sm, M0_DRMS_STOPPED);
+ m0_sm_fini(&m->rm_sm);
+ recovery_machine_unlock(m);
+ /*
+ * Question: This uses the sm group lock and immediately finalises it,
+ * which is, in general, wrong (if there can be other threads waiting on
+ * the lock, then finalisation is incorrect, if no other threads are
+ * possible at this point, the lock-unlock are not needed). Please add a
+ * comment explaining why this is correct.
+ *
+ * Answer: There are indeed no other threads waiting on the lock.
+ * Lock-unlock is needed because m0_sm_fini requires it (otherwise
+ * asserts fail). The reason there are no other threads below:
+ *
+ * See ASSERT above: this function only runs when sm_state is
+ * M0_DRMS_INIT or M0_DRMS_STOPPED. When M0_DRMS_INIT, there are no
+ * other threads _yet_. When other threads are created -- state becomes
+ * M0_DRMS_STARTED (threads are recovery FOMs, see
+ * m0_dtm0_recovery_machine_start()). When M0_DRMS_STOPPED, there are
+ * no other threads _already_. We go to M0_DRMS_STOPPED when the last
+ * FOM is finalized -- see recovery_fom_self_fini().
+ */
+ m0_sm_group_fini(&m->rm_sm_group);
+ M0_ASSERT(rfom_tlist_is_empty(&m->rm_rfoms));
+ rfom_tlist_fini(&m->rm_rfoms);
+ M0_LEAVE();
+}
+
+M0_INTERNAL int
+m0_dtm0_recovery_machine_start(struct m0_dtm0_recovery_machine *m)
+{
+ struct recovery_fom *rf;
+ int rc;
+
+ /* TODO: Skip initialisation of recovery foms during mkfs. */
+ rc = populate_foms(m);
+ if (rc < 0)
+ return M0_RC(rc);
+
+ M0_ASSERT(rfom_tlist_is_empty(&m->rm_rfoms) ==
+ (m->rm_local_rfom == NULL));
+
+ m0_tl_for(rfom, &m->rm_rfoms, rf) {
+ m0_fom_queue(&rf->rf_base);
+ } m0_tlist_endfor;
+
+ if (m->rm_local_rfom != NULL) {
+ recovery_machine_lock(m);
+ m0_sm_state_set(&m->rm_sm, M0_DRMS_STARTED);
+ recovery_machine_unlock(m);
+ }
+
+ if (ALL2ALL)
+ M0_LOG(M0_DEBUG, "ALL2ALL_STARTED");
+
+ return M0_RC(rc);
+}
+
+M0_INTERNAL void
+m0_dtm0_recovery_machine_stop(struct m0_dtm0_recovery_machine *m)
+{
+ int rc;
+ struct recovery_fom *rf;
+
+ recovery_machine_lock(m);
+
+ m0_tl_for(rfom, &m->rm_rfoms, rf) {
+ m0_be_queue_lock(&rf->rf_heq);
+ M0_LOG(M0_DEBUG, "heq_end " FID_F, FID_P(&rf->rf_tgt_svc));
+ /*
+ * The recovery fom will finalize itself when the queue ends.
+ */
+ m0_be_queue_end(&rf->rf_heq);
+ m0_be_queue_unlock(&rf->rf_heq);
+ } m0_tlist_endfor;
+
+ /*
+ * This sm wait will release the sm lock before waiting if needed.
+ * The same lock is used as the sm lock and machine lock. Please see
+ * recovery_machine_lock() and m0_dtm0_recovery_machine_init().
+ * So any other thread that wants to acquire the recovery machine
+ * lock will have a chance to continue.
+ */
+ rc = m0_sm_timedwait(&m->rm_sm,
+ M0_BITS(M0_DRMS_INIT, M0_DRMS_STOPPED),
+ M0_TIME_NEVER);
+ M0_ASSERT_INFO(rc == 0, "rc=%d", rc);
+ recovery_machine_unlock(m);
+ unpopulate_foms(m);
+}
+
+static struct m0_reqh *
+m0_dtm0_recovery_machine_reqh(struct m0_dtm0_recovery_machine *m)
+{
+ return m->rm_svc->dos_generic.rs_reqh;
+}
+
+static const struct m0_fid *
+recovery_machine_local_id(const struct m0_dtm0_recovery_machine *m)
+{
+ return &m->rm_svc->dos_generic.rs_service_fid;
+}
+
+static int recovery_machine_log_iter_next(struct m0_dtm0_recovery_machine *m,
+ struct m0_be_dtm0_log_iter *iter,
+ const struct m0_fid *tgt_svc,
+ const struct m0_fid *origin_svc,
+ struct m0_dtm0_log_rec *record)
+{
+ M0_PRE(m->rm_ops->log_iter_next != NULL);
+ return m->rm_ops->log_iter_next(m, iter, tgt_svc, origin_svc, record);
+}
+
+static int recovery_machine_log_iter_init(struct m0_dtm0_recovery_machine *m,
+ struct m0_be_dtm0_log_iter *iter)
+{
+ M0_PRE(m->rm_ops->log_iter_init != NULL);
+ return m->rm_ops->log_iter_init(m, iter);
+
+}
+
+static void recovery_machine_log_iter_fini(struct m0_dtm0_recovery_machine *m,
+ struct m0_be_dtm0_log_iter *iter)
+{
+ M0_PRE(m->rm_ops->log_iter_fini != NULL);
+ m->rm_ops->log_iter_fini(m, iter);
+}
+
+static void recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m,
+ struct m0_fom *fom,
+ const struct m0_fid *tgt_proc,
+ const struct m0_fid *tgt_svc,
+ struct dtm0_req_fop *redo,
+ struct m0_be_op *op)
+{
+ M0_PRE(m->rm_ops->redo_post != NULL);
+ m->rm_ops->redo_post(m, fom, tgt_proc, tgt_svc, redo, op);
+}
+
+static void recovery_machine_recovered(struct m0_dtm0_recovery_machine *m,
+ const struct m0_fid *tgt_proc,
+ const struct m0_fid *tgt_svc)
+{
+ M0_PRE(m->rm_ops->ha_event_post != NULL);
+ m->rm_ops->ha_event_post(m, tgt_proc, tgt_svc,
+ M0_CONF_HA_PROCESS_DTM_RECOVERED);
+}
+
+static void recovery_machine_lock(struct m0_dtm0_recovery_machine *m)
+{
+ m0_sm_group_lock(&m->rm_sm_group);
+}
+
+static void recovery_machine_unlock(struct m0_dtm0_recovery_machine *m)
+{
+ m0_sm_group_unlock(&m->rm_sm_group);
+}
+
+enum recovery_fom_state {
+ RFS_INIT = M0_FOM_PHASE_INIT,
+ RFS_DONE = M0_FOM_PHASE_FINISH,
+ RFS_WAITING,
+ RFS_FAILED,
+ RFS_NR,
+};
+
+static struct m0_sm_state_descr recovery_fom_states[] = {
+ [RFS_INIT] = {
+ .sd_name = "RFS_INIT",
+ .sd_allowed = M0_BITS(RFS_WAITING, RFS_DONE, RFS_FAILED),
+ .sd_flags = M0_SDF_INITIAL,
+ },
+ /* terminal states */
+ [RFS_DONE] = {
+ .sd_name = "RFS_DONE",
+ .sd_allowed = 0,
+ .sd_flags = M0_SDF_TERMINAL,
+ },
+ /* failure states */
+ [RFS_FAILED] = {
+ .sd_name = "RFS_FAILED",
+ .sd_allowed = M0_BITS(RFS_DONE),
+ .sd_flags = M0_SDF_FAILURE,
+ },
+
+ /* intermediate states */
+ [RFS_WAITING] = {
+ .sd_name = "RFS_WAITING",
+ .sd_allowed = M0_BITS(RFS_DONE,
+ RFS_FAILED, RFS_WAITING),
+ },
+};
+
+static const struct m0_sm_conf recovery_fom_conf = {
+ .scf_name = "recovery_fom",
+ .scf_nr_states = ARRAY_SIZE(recovery_fom_states),
+ .scf_state = recovery_fom_states,
+};
+
+static size_t recovery_fom_locality(const struct m0_fom *fom)
+{
+ return 1;
+}
+
+static const struct m0_fom_ops recovery_fom_ops = {
+ .fo_fini = recovery_fom_self_fini,
+ .fo_tick = recovery_fom_tick,
+ .fo_home_locality = recovery_fom_locality
+};
+
+static struct m0_fom_type recovery_fom_type;
+static const struct m0_fom_type_ops recovery_fom_type_ops = {};
+
+M0_INTERNAL int m0_drm_domain_init(void)
+{
+ int rc = 0;
+
+ M0_PRE(!m0_sm_conf_is_initialized(&m0_drm_sm_conf));
+
+ m0_fom_type_init(&recovery_fom_type,
+ M0_DTM0_RECOVERY_FOM_OPCODE,
+ &recovery_fom_type_ops,
+ &dtm0_service_type,
+ &recovery_fom_conf);
+
+ m0_sm_conf_init(&m0_drm_sm_conf);
+ rc = m0_sm_addb2_init(&m0_drm_sm_conf,
+ M0_AVI_DRM_SM_STATE,
+ M0_AVI_DRM_SM_COUNTER);
+
+ M0_POST(m0_sm_conf_is_initialized(&m0_drm_sm_conf));
+
+ return M0_RC(rc);
+}
+
+M0_INTERNAL void m0_drm_domain_fini(void)
+{
+ if (m0_sm_conf_is_initialized(&m0_drm_sm_conf)) {
+ m0_sm_addb2_fini(&m0_drm_sm_conf);
+ m0_sm_conf_fini(&m0_drm_sm_conf);
+ M0_POST(!m0_sm_conf_is_initialized(&m0_drm_sm_conf));
+ }
+}
+
+static void eolq_post(struct m0_dtm0_recovery_machine *m,
+ struct eolq_item *item)
+{
+ struct recovery_fom *rf = recovery_fom_local(m);
+ m0_be_queue_lock(&rf->rf_eolq);
+ /*
+ * Do not post any events if we have already recovered this DTM0
+	 * service (the eolq's bq_the_end flag is the indicator here).
+ */
+ if (!rf->rf_eolq.bq_the_end) {
+ /*
+ * Assumption: the queue never gets full.
+ * XXX: We could panic in this case. Right now, the HA/REDO FOM
+ * should get stuck in the tick. Probably, panic is
+ * a better solution here.
+ */
+ M0_BE_OP_SYNC(op, M0_BE_QUEUE_PUT(&rf->rf_eolq, &op, item));
+ }
+ m0_be_queue_unlock(&rf->rf_eolq);
+}
+
+static void heq_post(struct recovery_fom *rf, enum m0_ha_obj_state state)
+{
+ uint64_t event = state;
+ m0_be_queue_lock(&rf->rf_heq);
+ /*
+ * Assumption: the queue never gets full.
+ * XXX: We could panic in this case. Right now, the HA FOM should get
+ * stuck in the tick. Probably, panic is a better solution
+ * here.
+ */
+ M0_BE_OP_SYNC(op, M0_BE_QUEUE_PUT(&rf->rf_heq, &op, &event));
+ m0_be_queue_unlock(&rf->rf_heq);
+
+ M0_LOG(M0_DEBUG, "heq_enq " FID_F " %s ",
+ FID_P(&rf->rf_tgt_svc),
+ m0_ha_state2str(state));
+
+ /*
+ * Recovery machine uses two kinds of queues: HEQ and EOLQ. HEQ is HA
+ * Events Queue, every recovery FOM has its own instance; it is used to
+ * track HA events of a specific participant (tied to this given rfom).
+ * EOLQ is EOL Queue, queue to track end-of-log events during recovery.
+ * When we receive EOL from all participants -- we know recovery is
+ * completed. There is a nuance though. If some remote participant
+ * goes TRANSIENT/FAILED during recovery, we do not need to wait for EOL
+ * from that side. The easiest way to deliver this information to local
+ * recovery FOM is to add an entry in EOLQ (since local RFOM already
+ * 'listens' on that queue). So that's what happens below. heq_post()
+ * is called when there is an HA event on some participant. We add the
+ * HA event itself in HEQ above, and then add it to EOLQ below. Since
+ * local recovery FOM is not expecting EOL from itself, we only do
+ * eolq_post() if HA event does not come from local participant.
+ */
+ if (!rf->rf_is_local)
+ eolq_post(rf->rf_m,
+ &(struct eolq_item) {
+ .ei_type = EIT_HA,
+ .ei_ha_state = state,
+ .ei_source = rf->rf_tgt_svc, });
+}
+
+static bool recovery_fom_ha_clink_cb(struct m0_clink *clink)
+{
+ struct recovery_fom *rf = M0_AMB(rf, clink, rf_ha_clink);
+ struct m0_conf_obj *proc_conf = container_of(clink->cl_chan,
+ struct m0_conf_obj,
+ co_ha_chan);
+ heq_post(rf, proc_conf->co_ha_state);
+ return false;
+}
+
+static int recovery_fom_init(struct recovery_fom *rf,
+ struct m0_dtm0_recovery_machine *m,
+ struct m0_conf_process *proc_conf,
+ const struct m0_fid *target,
+ bool is_volatile)
+{
+ int rc;
+ bool is_local = m0_fid_eq(recovery_machine_local_id(m), target);
+
+ M0_ENTRY("m=%p, rf=%p, tgt=" FID_F ", is_vol=%d, is_local=%d",
+ m, rf, FID_P(target), !!is_volatile, !!is_local);
+
+ M0_PRE(ergo(is_local, m->rm_local_rfom == NULL));
+
+ rc = m0_be_queue_init(&rf->rf_heq, &(struct m0_be_queue_cfg){
+ .bqc_q_size_max = HEQ_MAX_LEN,
+ /*
+ * Two producers:
+ * 1. Conf-obj HA state updates (heq_post()).
+ * 2. Stop-and-wait-when-finalising
+ * (m0_dtm0_recovery_machine_stop()).
+ */
+ .bqc_producers_nr_max = 2,
+ /*
+ * Single consumer - recovery machine FOM (recovery_fom_coro()).
+ */
+ .bqc_consumers_nr_max = 1,
+ .bqc_item_length = sizeof(uint64_t),
+ });
+ if (rc != 0)
+ return M0_ERR(rc);
+
+ if (is_local) {
+ rc = m0_be_queue_init(&rf->rf_eolq, &(struct m0_be_queue_cfg){
+ .bqc_q_size_max = EOLQ_MAX_LEN,
+ /*
+ * Consumers and producers are the same as for
+ * rf_heq above.
+ */
+ .bqc_producers_nr_max = 2,
+ .bqc_consumers_nr_max = 1,
+ .bqc_item_length = sizeof(struct eolq_item),
+ });
+ if (rc != 0) {
+ m0_be_queue_fini(&rf->rf_heq);
+ return M0_ERR(rc);
+ }
+ M0_ASSERT(!rf->rf_eolq.bq_the_end);
+ m->rm_local_rfom = rf;
+
+ /*
+ * This is local recovery FOM. We don't need to wait for EOL
+ * from ourselves.
+ */
+ rf->rf_last_known_eol = true;
+ /*
+ * Local recovery FOM is responsible for accepting REDO
+ * messages. We're starting up, we know for sure that we will
+ * go through RECOVERING phase, so we are defaulting to
+ * M0_NC_DTM_RECOVERING.
+ */
+ rf->rf_last_known_ha_state = M0_NC_DTM_RECOVERING;
+ }
+
+ rf->rf_m = m;
+ rf->rf_tgt_svc = *target;
+ rf->rf_tgt_proc = proc_conf->pc_obj.co_id;
+ rf->rf_is_local = is_local;
+ rf->rf_is_volatile = is_volatile;
+
+ rfom_tlink_init(rf);
+ m0_co_context_init(&rf->rf_coro);
+
+ m0_clink_init(&rf->rf_ha_clink, recovery_fom_ha_clink_cb);
+ m0_clink_add_lock(&proc_conf->pc_obj.co_ha_chan, &rf->rf_ha_clink);
+
+ M0_LOG(M0_DEBUG, "Subscribed to " FID_F " with initial state: %s",
+ FID_P(target), m0_ha_state2str(proc_conf->pc_obj.co_ha_state));
+
+ m0_fom_init(&rf->rf_base, &recovery_fom_type,
+ &recovery_fom_ops, NULL, NULL,
+ m0_dtm0_recovery_machine_reqh(m));
+
+ rfom_tlist_add_tail(&m->rm_rfoms, rf);
+
+ return M0_RC(rc);
+}
+
+/*
+ * Mark the queue as ended and drain it until the end.
+ */
+static void m0_be_queue__finish(struct m0_be_queue *bq, struct m0_buf *item)
+{
+ bool got = true;
+
+ m0_be_queue_lock(bq);
+ if (!bq->bq_the_end) {
+ m0_be_queue_end(bq);
+ while (got)
+ M0_BE_OP_SYNC(op, m0_be_queue_get(bq, &op, item, &got));
+ }
+ M0_POST(bq->bq_the_end);
+ m0_be_queue_unlock(bq);
+}
+#define M0_BE_QUEUE__FINISH(bq, item_type) ({ \
+ item_type item; \
+ m0_be_queue__finish(bq, &M0_BUF_INIT_PTR(&item)); \
+})
+
+static void recovery_fom_fini(struct recovery_fom *rf)
+{
+ M0_ENTRY("m=%p, rf= %p", rf->rf_m, rf);
+ m0_clink_del_lock(&rf->rf_ha_clink);
+ m0_clink_fini(&rf->rf_ha_clink);
+ m0_co_context_fini(&rf->rf_coro);
+ rfom_tlink_fini(rf);
+ if (rf->rf_is_local) {
+ rf->rf_m->rm_local_rfom = NULL;
+ m0_be_queue_fini(&rf->rf_eolq);
+ }
+ m0_be_queue_fini(&rf->rf_heq);
+ M0_LEAVE();
+}
+
+static void recovery_fom_self_fini(struct m0_fom *fom)
+{
+ struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+ struct m0_dtm0_recovery_machine *m = rf->rf_m;
+ bool is_stopped;
+
+ M0_ENTRY("fom=%p, m=%p, rf=%p", fom, m, rf);
+ recovery_machine_lock(m);
+ rfom_tlist_remove(rf);
+ is_stopped = rfom_tlist_is_empty(&m->rm_rfoms);
+ recovery_fom_fini(rf);
+ m0_fom_fini(fom);
+ m0_free(rf);
+ if (is_stopped)
+ m0_sm_move(&m->rm_sm, 0, M0_DRMS_STOPPED);
+ recovery_machine_unlock(m);
+ M0_LEAVE("is_stopped=%d", !!is_stopped);
+}
+
+static int recovery_fom_add(struct m0_dtm0_recovery_machine *m,
+ struct m0_conf_process *proc_conf,
+ const struct m0_fid *target,
+ bool is_volatile)
+{
+ struct recovery_fom *rf;
+ int rc;
+
+ M0_ALLOC_PTR(rf);
+ if (rf != NULL) {
+ recovery_machine_lock(m);
+ rc = recovery_fom_init(rf, m, proc_conf, target, is_volatile);
+ recovery_machine_unlock(m);
+ } else
+ rc = M0_ERR(-ENOMEM);
+
+ return M0_RC(rc);
+}
+
+static struct recovery_fom *
+recovery_fom_by_svc_find(struct m0_dtm0_recovery_machine *m,
+ const struct m0_fid *tgt_svc)
+{
+ return m0_tl_find(rfom, rf, &m->rm_rfoms,
+ m0_fid_eq(tgt_svc, &rf->rf_tgt_svc));
+}
+
+static struct recovery_fom *
+recovery_fom_by_svc_find_lock(struct m0_dtm0_recovery_machine *m,
+ const struct m0_fid *tgt_svc)
+{
+ struct recovery_fom *rf;
+ recovery_machine_lock(m);
+ rf = recovery_fom_by_svc_find(m, tgt_svc);
+ recovery_machine_unlock(m);
+ return rf;
+}
+
+static struct recovery_fom *
+recovery_fom_local(struct m0_dtm0_recovery_machine *m)
+{
+ M0_PRE(m != NULL);
+ return m->rm_local_rfom;
+}
+
+static void unpopulate_foms(struct m0_dtm0_recovery_machine *m)
+{
+ struct recovery_fom *rf;
+
+ M0_ENTRY("m=%p", m);
+
+ recovery_machine_lock(m);
+ /*
+ * TODO: assert that list is empty instead of this teardown
+ */
+ m0_tl_teardown(rfom, &m->rm_rfoms, rf) {
+ recovery_fom_fini(rf);
+ m0_free(rf);
+ }
+ recovery_machine_unlock(m);
+
+ M0_LEAVE();
+}
+
+static bool conf_obj_is_process(const struct m0_conf_obj *obj)
+{
+ return m0_conf_obj_type(obj) == &M0_CONF_PROCESS_TYPE;
+}
+
+static bool is_svc_volatile(const struct m0_confc *confc,
+ const struct m0_fid *svc_fid)
+{
+ struct m0_conf_service *svc;
+ struct m0_conf_obj *obj;
+ const char **param;
+
+ if (M0_FI_ENABLED("always_false"))
+ return false;
+
+ obj = m0_conf_cache_lookup(&confc->cc_cache, svc_fid);
+ M0_ASSERT(obj != NULL);
+
+ svc = M0_CONF_CAST(obj, m0_conf_service);
+
+ M0_ASSERT(svc->cs_params != NULL);
+
+ for (param = svc->cs_params; *param != NULL; ++param) {
+ if (m0_streq(*param, "origin:in-volatile"))
+ return true;
+ else if (m0_streq(*param, "origin:in-persistent"))
+ return false;
+ }
+
+ M0_IMPOSSIBLE("Service origin is not defined in the config?");
+}
+
+static int populate_foms(struct m0_dtm0_recovery_machine *m)
+{
+ struct m0_reqh_service *service = &m->rm_svc->dos_generic;
+ struct m0_confc *confc = m0_reqh2confc(service->rs_reqh);
+ struct m0_conf_obj *obj;
+ struct m0_conf_root *root;
+ struct m0_conf_diter it;
+ struct m0_conf_process *proc_conf;
+ struct m0_fid svc_fid;
+ int rc;
+ struct recovery_fom *rf;
+
+ M0_ENTRY("recovery machine=%p", m);
+
+	/* UT workaround. */
+ if (!m0_confc_is_inited(confc)) {
+ M0_LOG(M0_WARN, "confc is not initiated!");
+ return M0_RC(0);
+ }
+
+ rc = m0_confc_root_open(confc, &root) ?:
+ m0_conf_diter_init(&it, confc,
+ &root->rt_obj,
+ M0_CONF_ROOT_NODES_FID,
+ M0_CONF_NODE_PROCESSES_FID);
+ if (rc != 0)
+ goto out;
+
+ while ((rc = m0_conf_diter_next_sync(&it, conf_obj_is_process)) > 0) {
+ obj = m0_conf_diter_result(&it);
+ proc_conf = M0_CONF_CAST(obj, m0_conf_process);
+ rc = m0_conf_process2service_get(confc,
+ &proc_conf->pc_obj.co_id,
+ M0_CST_DTM0, &svc_fid);
+ if (rc != 0)
+ continue;
+
+ rc = recovery_fom_add(m, proc_conf, &svc_fid,
+ is_svc_volatile(confc, &svc_fid));
+ if (rc != 0)
+ break;
+ }
+
+ m0_conf_diter_fini(&it);
+
+ /*
+	 * This is a workaround to propagate the current HA state.
+ */
+ m0_confc_close(&root->rt_obj);
+ rc = m0_confc_root_open(confc, &root) ?:
+ m0_conf_diter_init(&it, confc,
+ &root->rt_obj,
+ M0_CONF_ROOT_NODES_FID,
+ M0_CONF_NODE_PROCESSES_FID);
+ if (rc != 0)
+ goto out;
+
+ while ((rc = m0_conf_diter_next_sync(&it, conf_obj_is_process)) > 0) {
+ obj = m0_conf_diter_result(&it);
+ proc_conf = M0_CONF_CAST(obj, m0_conf_process);
+ rc = m0_conf_process2service_get(confc,
+ &proc_conf->pc_obj.co_id,
+ M0_CST_DTM0, &svc_fid);
+ if (rc != 0)
+ continue;
+
+ rf = recovery_fom_by_svc_find(m, &svc_fid);
+ if (rf == NULL)
+ break;
+
+ if (!rf->rf_is_local)
+ heq_post(rf, proc_conf->pc_obj.co_ha_state);
+ }
+
+ m0_conf_diter_fini(&it);
+ /* end of workaround */
+
+out:
+ if (root != NULL)
+ m0_confc_close(&root->rt_obj);
+ if (rc != 0)
+ unpopulate_foms(m);
+ return M0_RC(rc);
+}
+
+static struct m0_co_context *CO(struct m0_fom *fom)
+{
+ struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+ return &rf->rf_coro;
+}
+
+#define F M0_CO_FRAME_DATA
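+/*
+ * Note on the coroutine style used below: variables that must survive a yield
+ * are declared inside M0_CO_REENTER() and accessed through F(name), which
+ * refers to the current coroutine frame. A minimal sketch:
+ *
+ *   M0_CO_REENTER(CO(fom), bool flag;);
+ *   F(flag) = true;
+ *   M0_CO_YIELD_RC(CO(fom), ...);   (F(flag) is still valid afterwards)
+ *   if (F(flag)) ...;
+ */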
+
+static void heq_await(struct m0_fom *fom, enum m0_ha_obj_state *out, bool *eoq)
+{
+ struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+
+ M0_CO_REENTER(CO(fom),
+ struct m0_be_op op;
+ bool got;
+ uint64_t state;);
+
+ F(got) = false;
+ F(state) = 0;
+ M0_SET0(&F(op));
+ m0_be_op_init(&F(op));
+ m0_be_queue_lock(&rf->rf_heq);
+ M0_BE_QUEUE_GET(&rf->rf_heq, &F(op), &F(state), &F(got));
+ m0_be_queue_unlock(&rf->rf_heq);
+ M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(op), fom, RFS_WAITING));
+ m0_be_op_fini(&F(op));
+
+ if (F(got)) {
+ M0_LOG(M0_DEBUG, "heq_deq " FID_F " %s" ,
+ FID_P(&rf->rf_tgt_svc),
+ m0_ha_state2str(F(state)));
+ M0_ASSERT(ha_event_invariant(F(state)));
+ *out = F(state);
+ } else {
+ M0_LOG(M0_DEBUG, "heq_deq " FID_F, FID_P(&rf->rf_tgt_svc));
+ *eoq = true;
+ }
+}
+
+static void eolq_await(struct m0_fom *fom, struct eolq_item *out)
+{
+ struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+
+ M0_CO_REENTER(CO(fom),
+ struct m0_be_op op;
+ bool got;
+ struct eolq_item item;);
+
+ F(got) = false;
+ M0_SET0(&F(item));
+ M0_SET0(&F(op));
+ m0_be_op_init(&F(op));
+ m0_be_queue_lock(&rf->rf_eolq);
+ M0_BE_QUEUE_GET(&rf->rf_eolq, &F(op), &F(item), &F(got));
+ m0_be_queue_unlock(&rf->rf_eolq);
+ M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(op), fom, RFS_WAITING));
+ m0_be_op_fini(&F(op));
+
+ *out = F(got) ? F(item) : (struct eolq_item) { .ei_type = EIT_END };
+}
+
+/**
+ * Restore missing transactions on a remote participant.
+ *
+ * Implements part of the recovery process: a healthy ONLINE participant
+ * iterates through the local DTM0 log and sends all needed REDOs to a
+ * remote peer.
+ */
+static void dtm0_restore(struct m0_fom *fom,
+ enum m0_ha_obj_state *out,
+ bool *eoq)
+{
+ struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+ struct m0_dtm0_log_rec record;
+ struct dtm0_req_fop redo;
+ int rc = 0;
+
+ M0_CO_REENTER(CO(fom),
+ struct m0_fid initiator;
+ struct m0_be_op reply_op;
+ bool next;
+ );
+
+ recovery_machine_log_iter_init(rf->rf_m, &rf->rf_log_iter);
+
+ /* XXX: race condition in the case where we are stopping the FOM. */
+ F(initiator) = recovery_fom_local(rf->rf_m)->rf_tgt_svc;
+
+ M0_SET0(&F(reply_op));
+ m0_be_op_init(&F(reply_op));
+
+ do {
+ M0_SET0(&record);
+ rc = recovery_machine_log_iter_next(rf->rf_m, &rf->rf_log_iter,
+ &rf->rf_tgt_svc, NULL,
+ &record);
+		/* Any non-zero value means that we should stop recovery. */
+ F(next) = rc == 0;
+ redo = (struct dtm0_req_fop) {
+ .dtr_msg = DTM_REDO,
+ .dtr_initiator = F(initiator),
+ .dtr_payload = record.dlr_payload,
+ .dtr_txr = record.dlr_txd,
+ .dtr_flags = F(next) ? 0 : M0_BITS(M0_DMF_EOL),
+ };
+
+ /*
+		 * TODO: there is an extra memcpy happening here -- the first
+		 * copy is done in recovery_machine_log_iter_next() (it clones
+		 * the record), the second one in recovery_machine_redo_post()
+		 * below. If this proves to be too inefficient, we can
+		 * eliminate the extra copies.
+ */
+ recovery_machine_redo_post(rf->rf_m, &rf->rf_base,
+ &rf->rf_tgt_proc, &rf->rf_tgt_svc,
+ &redo, &F(reply_op));
+
+ M0_LOG(M0_DEBUG, "out-redo: (m=%p) " REDO_F,
+ rf->rf_m, REDO_P(&redo));
+ M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(reply_op),
+ fom, RFS_WAITING));
+ m0_be_op_reset(&F(reply_op));
+ } while (F(next));
+
+ m0_be_op_fini(&F(reply_op));
+
+ recovery_machine_log_iter_fini(rf->rf_m, &rf->rf_log_iter);
+ M0_SET0(&rf->rf_log_iter);
+
+ M0_CO_FUN(CO(fom), heq_await(fom, out, eoq));
+}
+
+static void remote_recovery_fom_coro(struct m0_fom *fom)
+{
+ struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+
+ M0_CO_REENTER(CO(fom),
+ struct m0_be_op op;
+ bool eoq;
+ enum m0_ha_obj_state state;
+ void (*action)(struct m0_fom *fom,
+ enum m0_ha_obj_state *out, bool *eoq);
+ );
+
+ F(eoq) = false;
+ F(state) = M0_NC_UNKNOWN;
+
+ while (!F(eoq)) {
+ M0_LOG(M0_DEBUG, "remote recovery fom=%p, svc_fid=" FID_F ", "
+ " proc_fid=" FID_F " handles %s state.", fom,
+ FID_P(&rf->rf_tgt_svc), FID_P(&rf->rf_tgt_proc),
+ m0_ha_state2str(F(state)));
+
+ switch (F(state)) {
+ case M0_NC_DTM_RECOVERING:
+ F(action) = rf->rf_is_volatile ? heq_await :
+ dtm0_restore;
+ break;
+ case M0_NC_FAILED:
+ if (ALL2ALL)
+ M0_LOG(M0_WARN, "Eviction is not supported.");
+ else
+ M0_IMPOSSIBLE("Eviction is not supported.");
+ /*
+			 * Fall-through is intentional. Eviction code will
+			 * be added here in the future.
+ */
+ default:
+ F(action) = heq_await;
+ break;
+ }
+
+ M0_CO_FUN(CO(fom), F(action)(fom, &F(state), &F(eoq)));
+ }
+
+ m0_fom_phase_set(fom, RFS_DONE);
+}
+
+static bool was_log_replayed(struct recovery_fom *rf)
+{
+ bool outcome;
+ /*
+ * XXX: Clients are ignored by the recovery stop condition.
+ * Once HA and Motr are able to properly handle client
+	 * restart, clients can be brought back into this condition.
+ */
+ bool is_na = rf->rf_is_volatile;
+
+ /*
+	 * Some unit tests need to temporarily avoid the "ignore" described
+	 * above. In particular, there is a test where the client sends EOL to
+	 * the server, and we want it to be received and handled (e.g.
+	 * remach-reboot-server).
+ */
+ if (m0_dtm0_is_expecting_redo_from_client()) {
+ M0_LOG(M0_DEBUG, "Expect EOL from client");
+ is_na = false;
+ }
+
+ outcome = ergo(!rf->rf_is_local && !is_na,
+ M0_IN(rf->rf_last_known_ha_state,
+ (M0_NC_ONLINE, M0_NC_DTM_RECOVERING)) &&
+ rf->rf_last_known_eol);
+ return outcome;
+}
+
+static void rec_cond_trace(struct m0_dtm0_recovery_machine *m)
+{
+ struct recovery_fom *rf;
+
+ if (!m0_tl_exists(rfom, rf, &m->rm_rfoms, !rf->rf_is_local))
+ M0_LOG(M0_WARN, "Recovery cannot be completed because there "
+ "are no remote DTM0 services.");
+
+ m0_tl_for(rfom, &m->rm_rfoms, rf) {
+ M0_LOG(M0_DEBUG,
+ "id=" FID_F ", is_volatile=%d, "
+ "is_local=%d, state=%s, got_eol=%d => %d",
+ FID_P(&rf->rf_tgt_svc),
+ (int) rf->rf_is_volatile,
+ (int) rf->rf_is_local,
+ m0_ha_state2str(rf->rf_last_known_ha_state),
+ (int) rf->rf_last_known_eol,
+ (int) was_log_replayed(rf));
+ } m0_tl_endfor;
+}
+
+static bool is_local_recovery_completed(struct m0_dtm0_recovery_machine *m)
+{
+ M0_ENTRY();
+ rec_cond_trace(m);
+ return M0_RC(m0_tl_exists(rfom, r, &m->rm_rfoms, !r->rf_is_local) &&
+ m0_tl_forall(rfom, r, &m->rm_rfoms, was_log_replayed(r)));
+}
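+/*
+ * Informally, the condition in is_local_recovery_completed() above reads:
+ * local recovery is complete when there is at least one remote DTM0 service,
+ * and every service we track has either replayed its log (ONLINE or
+ * RECOVERING with EOL received) or is exempt from the condition (the local
+ * service, and volatile clients unless a UT overrides that).
+ */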
+
+static void remote_state_update(struct recovery_fom *rf,
+ const struct eolq_item *item)
+{
+ M0_ENTRY("rf=%p, " FID_F " state: %s, eol: %d",
+ rf, FID_P(&rf->rf_tgt_svc),
+ m0_ha_state2str(rf->rf_last_known_ha_state),
+ (int) rf->rf_last_known_eol);
+
+ switch (item->ei_type) {
+ case EIT_HA:
+ M0_LOG(M0_DEBUG, "new_state=%s",
+ m0_ha_state2str(item->ei_ha_state));
+ rf->rf_last_known_ha_state = item->ei_ha_state;
+ /* Clear the EOL flag if the remote is dead. */
+ if (M0_IN(item->ei_ha_state, (M0_NC_TRANSIENT, M0_NC_FAILED)))
+ rf->rf_last_known_eol = false;
+ break;
+ case EIT_EOL:
+ M0_LOG(M0_DEBUG, "new_eol=1");
+ rf->rf_last_known_eol = true;
+ break;
+ default:
+ M0_IMPOSSIBLE("Wrong eolq item type %d?", item->ei_type);
+ break;
+ }
+
+ M0_LEAVE();
+}
+
+static void local_recovery_fom_coro(struct m0_fom *fom)
+{
+ struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+ struct recovery_fom *remote_rf;
+
+ M0_CO_REENTER(CO(fom),
+ struct m0_be_op eolq_op;
+ bool eoq;
+ bool recovered;
+ struct eolq_item item;
+ enum m0_ha_obj_state state;);
+
+ F(eoq) = false;
+ /*
+ * A DTM0 service without persistent storage does not need
+ * REDOs.
+	 * mkfs does not require DTM0 support either.
+ */
+ F(recovered) = rf->rf_is_volatile;
+
+	/* Wait until the moment when we should start recovery. */
+ do {
+ M0_CO_FUN(CO(fom), heq_await(fom, &F(state), &F(eoq)));
+ if (F(eoq))
+ goto out;
+ } while (F(state) != M0_NC_DTM_RECOVERING);
+
+ while (!F(recovered)) {
+ M0_CO_FUN(CO(fom), eolq_await(fom, &F(item)));
+ M0_ASSERT(F(item).ei_type != EIT_END);
+
+ recovery_machine_lock(rf->rf_m);
+ remote_rf = recovery_fom_by_svc_find(rf->rf_m,
+ &F(item).ei_source);
+ if (remote_rf == NULL) {
+ /* XXX: machine is stopping? */
+ recovery_machine_unlock(rf->rf_m);
+ break;
+ }
+ M0_ASSERT(remote_rf != rf);
+ remote_state_update(remote_rf, &F(item));
+
+ F(recovered) = is_local_recovery_completed(rf->rf_m);
+ recovery_machine_unlock(rf->rf_m);
+ }
+
+ /*
+	 * At this point we do not expect any more EOL messages, so we "end"
+	 * the queue and flush it to enforce that.
+ */
+ M0_BE_QUEUE__FINISH(&rf->rf_eolq, typeof(F(item)));
+
+ /*
+	 * When F(recovered) is true, we know we have received all needed EOL
+	 * messages and have recovered everything. If it is false at this
+	 * point, it can only mean that the recovery machine is shutting down,
+	 * and there is nothing further for us to do.
+ */
+ if (F(recovered)) {
+ /*
+ * Emit "RECOVERED". It shall cause HA to tell us to transit
+ * from RECOVERING to ONLINE.
+ */
+ recovery_machine_recovered(rf->rf_m,
+ &rf->rf_tgt_proc, &rf->rf_tgt_svc);
+
+ do {
+ M0_CO_FUN(CO(fom), heq_await(fom, &F(state),
+ &F(eoq)));
+ M0_ASSERT(ergo(!F(eoq), M0_IN(F(state),
+ (M0_NC_TRANSIENT,
+ M0_NC_ONLINE))));
+ } while (!F(eoq));
+ }
+
+out:
+ /*
+	 * This is not a duplicate of the similar call above -- FINISH is
+	 * idempotent, and we want to ensure that it is called on all paths
+	 * leading out of this function.
+ */
+ M0_BE_QUEUE__FINISH(&rf->rf_eolq, typeof(F(item)));
+ m0_fom_phase_set(fom, RFS_DONE);
+}
+
+static void recovery_fom_coro(struct m0_fom *fom)
+{
+ struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+
+ M0_CO_REENTER(CO(fom));
+
+ addb2_relate(&rf->rf_m->rm_sm, &fom->fo_sm_phase);
+
+ if (rf->rf_is_local)
+ M0_CO_FUN(CO(fom), local_recovery_fom_coro(fom));
+ else
+ M0_CO_FUN(CO(fom), remote_recovery_fom_coro(fom));
+}
+
+static int recovery_fom_tick(struct m0_fom *fom)
+{
+ int rc;
+ M0_CO_START(CO(fom));
+ recovery_fom_coro(fom);
+ rc = M0_CO_END(CO(fom));
+ M0_POST(M0_IN(rc, (0, M0_FSO_AGAIN, M0_FSO_WAIT)));
+ return rc ?: M0_FSO_WAIT;
+}
+
+#undef F
+
+M0_INTERNAL void
+m0_ut_remach_heq_post(struct m0_dtm0_recovery_machine *m,
+ const struct m0_fid *tgt_svc,
+ enum m0_ha_obj_state state)
+{
+ struct recovery_fom *rf = recovery_fom_by_svc_find_lock(m, tgt_svc);
+ M0_ASSERT_INFO(rf != NULL,
+ "Trying to post HA event to a wrong service?");
+ heq_post(rf, state);
+}
+
+M0_INTERNAL void
+m0_ut_remach_populate(struct m0_dtm0_recovery_machine *m,
+ struct m0_conf_process *procs,
+ const struct m0_fid *svcs,
+ bool *is_volatile,
+ uint64_t objs_nr)
+{
+ uint64_t i;
+ int rc;
+
+ for (i = 0; i < objs_nr; ++i) {
+ rc = recovery_fom_add(m, procs + i, svcs + i, is_volatile[i]);
+ M0_ASSERT(rc == 0);
+ }
+}
+
+M0_INTERNAL void
+m0_dtm0_recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m,
+ struct dtm0_req_fop *redo,
+ struct m0_be_op *op)
+{
+ bool is_eol =
+ !!(redo->dtr_flags & M0_BITS(M0_DMF_EOL));
+ bool is_eviction =
+ !!(redo->dtr_flags & M0_BITS(M0_DMF_EVICTION));
+ const struct m0_fid *initiator = &redo->dtr_initiator;
+ struct eolq_item item = {};
+ struct recovery_fom *rf;
+
+ M0_ENTRY("in-redo (m=%p): " REDO_F, m, REDO_P(redo));
+
+ if (is_eol) {
+ M0_ASSERT_INFO(!is_eviction,
+ "TODO: Eviction is not handled yet.");
+
+ rf = recovery_fom_local(m);
+ if (rf != NULL) {
+ M0_ASSERT_INFO(equi(is_eviction, !rf->rf_is_local),
+ "Participant cannot evict itself.");
+ item = (struct eolq_item) {
+ .ei_type = EIT_EOL,
+ .ei_source = *initiator,
+ };
+ m0_be_queue_lock(&rf->rf_eolq);
+ M0_ASSERT_INFO(!rf->rf_eolq.bq_the_end,
+ "REDOs are not allowed if local recovery"
+ " has already been finished.");
+ M0_BE_QUEUE_PUT(&rf->rf_eolq, op, &item);
+ m0_be_queue_unlock(&rf->rf_eolq);
+ } else {
+ M0_LOG(M0_WARN,
+ "REDO received but svc is not RECOVERING yet");
+ m0_be_op_active(op);
+ m0_be_op_done(op);
+ }
+ } else {
+ M0_LOG(M0_DEBUG, "A non-EOL REDO was ignored.");
+ m0_be_op_active(op);
+ m0_be_op_done(op);
+ }
+
+ M0_LEAVE();
+}
+
+static int default_log_iter_init(struct m0_dtm0_recovery_machine *m,
+ struct m0_be_dtm0_log_iter *iter)
+{
+ m0_be_dtm0_log_iter_init(iter, m->rm_svc->dos_log);
+ return 0;
+}
+
+static void default_log_iter_fini(struct m0_dtm0_recovery_machine *m,
+ struct m0_be_dtm0_log_iter *iter)
+{
+ struct recovery_fom *rf = M0_AMB(rf, iter, rf_log_iter);
+ M0_ASSERT(rf->rf_m == m);
+ m0_be_dtm0_log_iter_fini(iter);
+}
+
+static bool participated(const struct m0_dtm0_log_rec *record,
+ const struct m0_fid *svc)
+{
+ return m0_exists(i, record->dlr_txd.dtd_ps.dtp_nr,
+ m0_fid_eq(&record->dlr_txd.dtd_ps.dtp_pa[i].p_fid,
+ svc));
+}
+
+static int default_log_iter_next(struct m0_dtm0_recovery_machine *m,
+ struct m0_be_dtm0_log_iter *iter,
+ const struct m0_fid *tgt_svc,
+ const struct m0_fid *origin_svc,
+ struct m0_dtm0_log_rec *record)
+{
+ struct m0_be_dtm0_log *log = m->rm_svc->dos_log;
+ int rc;
+
+ /* XXX: not supported yet */
+ M0_ASSERT(origin_svc == NULL);
+
+ m0_mutex_lock(&log->dl_lock);
+
+ /* Filter out records where tgt_svc is not a participant. */
+ do {
+ M0_SET0(record);
+ rc = m0_be_dtm0_log_iter_next(iter, record);
+ if (rc == 0) {
+ if (participated(record, tgt_svc))
+ break;
+ else
+ m0_dtm0_log_iter_rec_fini(record);
+ }
+ } while (rc == 0);
+
+ m0_mutex_unlock(&log->dl_lock);
+
+ /* XXX: error codes will be adjusted separately. */
+ switch (rc) {
+ case 0:
+ return 0;
+ default:
+ return M0_ERR(rc);
+ }
+}
+
+/*
+ * TODO: this was copy-pasted from setup.c (see cs_ha_process_event)!
+ * Export cs_ha_process_event instead of duplicating it here.
+ */
+static void server_process_event(struct m0_motr *cctx,
+ enum m0_conf_ha_process_event event)
+{
+ enum m0_conf_ha_process_type type;
+
+ type = cctx->cc_mkfs ? M0_CONF_HA_PROCESS_M0MKFS :
+ M0_CONF_HA_PROCESS_M0D;
+ if (cctx->cc_ha_is_started && !cctx->cc_no_conf &&
+ cctx->cc_motr_ha.mh_link != NULL) {
+ m0_conf_ha_process_event_post(&cctx->cc_motr_ha.mh_ha,
+ cctx->cc_motr_ha.mh_link,
+ &cctx->cc_reqh_ctx.rc_fid,
+ m0_process(), event, type);
+ }
+}
+
+/* TODO: copy-pasted from client_init.c (see ha_process_event)!
+ * Actually, clients should not send RECOVERED, but at the moment
+ * the SM on the Hare side cannot properly handle that.
+ */
+static void client_process_event(struct m0_client *m0c,
+ enum m0_conf_ha_process_event event)
+{
+ const enum m0_conf_ha_process_type type = M0_CONF_HA_PROCESS_OTHER;
+ if (m0c->m0c_motr_ha.mh_link != NULL)
+ m0_conf_ha_process_event_post(&m0c->m0c_motr_ha.mh_ha,
+ m0c->m0c_motr_ha.mh_link,
+ &m0c->m0c_process_fid,
+ m0_process(), event, type);
+}
+
+static void default_ha_event_post(struct m0_dtm0_recovery_machine *m,
+ const struct m0_fid *tgt_proc,
+ const struct m0_fid *tgt_svc,
+ enum m0_conf_ha_process_event event)
+{
+ struct m0_reqh *reqh;
+ struct m0_client *m0c;
+ (void) tgt_proc;
+ (void) tgt_svc;
+
+ if (ALL2ALL) {
+ M0_LOG(M0_DEBUG, "ALL2ALL_DTM_RECOVERED");
+ }
+
+ M0_ASSERT_INFO(m->rm_local_rfom != NULL,
+ "It is impossible to emit an HA event without local "
+ "recovery FOM up and running.");
+ reqh = m0_fom_reqh(&m->rm_local_rfom->rf_base);
+
+ if (!m->rm_local_rfom->rf_is_volatile) {
+ M0_ASSERT_INFO(m0_cs_reqh_context(reqh) != NULL,
+ "A fully-functional motr process must "
+ "have a reqh ctx.");
+ server_process_event(m0_cs_ctx_get(reqh), event);
+ } else {
+ m0c = M0_AMB(m0c, reqh, m0c_reqh);
+ client_process_event(m0c, event);
+ }
+}
+
+static void default_redo_post(struct m0_dtm0_recovery_machine *m,
+ struct m0_fom *fom,
+ const struct m0_fid *tgt_proc,
+ const struct m0_fid *tgt_svc,
+ struct dtm0_req_fop *redo,
+ struct m0_be_op *op)
+{
+ int rc;
+
+ rc = m0_dtm0_req_post(m->rm_svc, op, redo, tgt_svc, fom, true);
+ /*
+ * We assume "perfect" links between ONLINE/RECOVERING processes.
+ * If the link is not perfect then let's just kill the process
+ * that is not able to send out REDOs.
+ */
+ M0_ASSERT(rc == 0);
+}
+
+M0_INTERNAL const struct m0_dtm0_recovery_machine_ops
+ m0_dtm0_recovery_machine_default_ops = {
+ .log_iter_init = default_log_iter_init,
+ .log_iter_fini = default_log_iter_fini,
+ .log_iter_next = default_log_iter_next,
+
+ .redo_post = default_redo_post,
+ .ha_event_post = default_ha_event_post,
+};
+
+#undef M0_TRACE_SUBSYSTEM
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
+/*
+ * vim: tabstop=8 shiftwidth=8 noexpandtab textwidth=80 nowrap
+ */
diff --git a/dtm0/recovery.h b/dtm0/recovery.h
new file mode 100644
index 00000000000..d83f74482fa
--- /dev/null
+++ b/dtm0/recovery.h
@@ -0,0 +1,183 @@
+/* -*- C -*- */
+/*
+ * Copyright (c) 2021 Seagate Technology LLC and/or its Affiliates
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For any questions about this software or licensing,
+ * please email opensource@seagate.com or cortx-questions@seagate.com.
+ *
+ */
+
+
+#pragma once
+
+#ifndef __MOTR_DTM0_RECOVERY_H__
+#define __MOTR_DTM0_RECOVERY_H__
+
+#include "lib/tlist.h" /* m0_tl */
+#include "ha/note.h" /* m0_ha_obj_state */
+#include "conf/ha.h" /* m0_conf_ha_process_event */
+#include "sm/sm.h" /* m0_sm, m0_sm_group */
+
+/* imports */
+struct m0_dtm0_service;
+struct m0_conf_process;
+struct dtm0_req_fop;
+struct m0_be_dtm0_log_iter;
+struct m0_dtm0_log_rec;
+struct m0_be_op;
+struct m0_fom;
+
+/* exports */
+struct m0_dtm0_recovery_machine_ops;
+struct m0_dtm0_recovery_machine;
+struct recovery_fom;
+
+enum m0_dtm0_recovery_machine_states {
+ M0_DRMS_INIT,
+ M0_DRMS_STARTED,
+ M0_DRMS_STOPPED,
+ M0_DRMS_NR,
+};
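+/*
+ * Expected lifecycle (a sketch): M0_DRMS_INIT --(start())--> M0_DRMS_STARTED
+ * --(stop(), last recovery FOM finishes)--> M0_DRMS_STOPPED.
+ */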
+
+struct m0_dtm0_recovery_machine_ops {
+ /**
+ * Post a REDO message to the target DTM0 service.
+ *
+	 * The expectation is that the `redo` structure is either processed
+	 * immediately, before the function completes, or cloned and stored
+	 * for future use. The caller destroys the contents of the `redo`
+	 * structure right after the call to redo_post().
+ */
+ void (*redo_post)(struct m0_dtm0_recovery_machine *m,
+ struct m0_fom *fom,
+ const struct m0_fid *tgt_proc,
+ const struct m0_fid *tgt_svc,
+ struct dtm0_req_fop *redo,
+ struct m0_be_op *op);
+
+ /**
+ * Post a conf ha process event.
+ */
+ void (*ha_event_post)(struct m0_dtm0_recovery_machine *m,
+ const struct m0_fid *tgt_proc,
+ const struct m0_fid *tgt_svc,
+ enum m0_conf_ha_process_event event);
+
+ int (*log_iter_init)(struct m0_dtm0_recovery_machine *m,
+ struct m0_be_dtm0_log_iter *iter);
+
+ /**
+	 * Get the next log record (or -ENOENT) from the local DTM0 log.
+ * @param[in] tgt_svc DTM0 service this REDO shall be sent to.
+ * @param[in,opt] origin_svc DTM0 service to be selected. When
+ * this parameter is set to non-NULL,
+	 *                        the iterator is supposed to select
+	 *                        only the log records that originated
+	 *                        on this particular service.
+ */
+ int (*log_iter_next)(struct m0_dtm0_recovery_machine *m,
+ struct m0_be_dtm0_log_iter *iter,
+ const struct m0_fid *tgt_svc,
+ const struct m0_fid *origin_svc,
+ struct m0_dtm0_log_rec *record);
+
+ void (*log_iter_fini)(struct m0_dtm0_recovery_machine *m,
+ struct m0_be_dtm0_log_iter *iter);
+};
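+/*
+ * A typical replay loop over these ops, as the recovery FOM drives it
+ * (a simplified sketch; locking and REDO posting are elided):
+ *
+ *   rc = m->rm_ops->log_iter_init(m, &iter);
+ *   while (rc == 0) {
+ *           rc = m->rm_ops->log_iter_next(m, &iter, tgt_svc, NULL, &rec);
+ *           if (rc == 0)
+ *                   ;  (build a REDO from &rec and call ->redo_post())
+ *   }
+ *   m->rm_ops->log_iter_fini(m, &iter);
+ *
+ * -ENOENT from ->log_iter_next() marks the end of the log.
+ */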
+
+
+struct m0_dtm0_recovery_machine {
+ struct m0_dtm0_service *rm_svc;
+ struct m0_tl rm_rfoms;
+ const struct m0_dtm0_recovery_machine_ops *rm_ops;
+ struct recovery_fom *rm_local_rfom;
+
+ struct m0_sm_group rm_sm_group;
+ struct m0_sm rm_sm;
+};
+
+M0_EXTERN const struct m0_dtm0_recovery_machine_ops
+ m0_dtm0_recovery_machine_default_ops;
+
+M0_INTERNAL int m0_drm_domain_init(void);
+
+M0_INTERNAL void m0_drm_domain_fini(void);
+
+/**
+ * Initialise a recovery machine.
+ * @param ops Recovery machine operations (such as sending of REDOs, among
+ *            others) to be used instead of the default ops; pass NULL to use
+ *            the defaults. These ops may be used to alter the effects of
+ *            recovery machine decisions on the system (for example, in UTs).
+ */
+M0_INTERNAL void
+m0_dtm0_recovery_machine_init(struct m0_dtm0_recovery_machine *m,
+ const struct m0_dtm0_recovery_machine_ops *ops,
+ struct m0_dtm0_service *svc);
+
+M0_INTERNAL int
+m0_dtm0_recovery_machine_start(struct m0_dtm0_recovery_machine *m);
+
+M0_INTERNAL void
+m0_dtm0_recovery_machine_stop(struct m0_dtm0_recovery_machine *m);
+
+M0_INTERNAL void
+m0_dtm0_recovery_machine_fini(struct m0_dtm0_recovery_machine *m);
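+
+/*
+ * Typical usage (a sketch mirroring dtm0/service.c):
+ *
+ *   m0_dtm0_recovery_machine_init(&m, NULL, svc);  (NULL => default ops)
+ *   rc = m0_dtm0_recovery_machine_start(&m);
+ *   ...
+ *   m0_dtm0_recovery_machine_stop(&m);
+ *   m0_dtm0_recovery_machine_fini(&m);
+ */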
+
+/**
+ * Post a REDO message into the recovery machine.
+ *
+ * The recovery machine needs to know what REDO messages were received by the
+ * local Motr process; this helps it to properly advance the state of the
+ * local process. For example, the transition RECOVERING -> ONLINE may happen
+ * only when the process receives enough REDO messages with the EOL flag set
+ * (or simply EOL messages).
+ *
+ * Note: at the moment the recovery machine sends out REDO messages but does
+ * not apply incoming ones; that must be done elsewhere. However, there is an
+ * ongoing effort to change this by getting rid of self-sufficient REDO FOMs.
+ * Once these changes are done, the recovery machine as a module will be fully
+ * responsible for both sending and executing REDO messages.
+ */
+M0_INTERNAL void
+m0_dtm0_recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m,
+ struct dtm0_req_fop *redo,
+ struct m0_be_op *op);
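+
+/*
+ * Example (a sketch): posting an EOL-only REDO, similar to what
+ * dtm0_restore() sends at the end of log replay; `initiator_svc_fid`
+ * and `m` are hypothetical names standing for the sender's service fid
+ * and the target recovery machine:
+ *
+ *   struct m0_be_op op = {};
+ *   struct dtm0_req_fop redo = {
+ *           .dtr_msg       = DTM_REDO,
+ *           .dtr_initiator = *initiator_svc_fid,
+ *           .dtr_flags     = M0_BITS(M0_DMF_EOL),
+ *   };
+ *
+ *   m0_be_op_init(&op);
+ *   m0_dtm0_recovery_machine_redo_post(m, &redo, &op);
+ *   m0_be_op_wait(&op);
+ *   m0_be_op_fini(&op);
+ */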
+
+/* UT-related API */
+M0_INTERNAL void
+m0_ut_remach_heq_post(struct m0_dtm0_recovery_machine *m,
+ const struct m0_fid *tgt_svc,
+ enum m0_ha_obj_state state);
+M0_INTERNAL void
+m0_ut_remach_populate(struct m0_dtm0_recovery_machine *m,
+ struct m0_conf_process *procs,
+ const struct m0_fid *svcs,
+ bool *is_volatile,
+ uint64_t objs_nr);
+
+#endif /* __MOTR_DTM0_RECOVERY_H__ */
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
+/*
+ * vim: tabstop=8 shiftwidth=8 noexpandtab textwidth=80 nowrap
+ */
diff --git a/dtm0/service.c b/dtm0/service.c
index 75e979abcc3..a046a48cf28 100644
--- a/dtm0/service.c
+++ b/dtm0/service.c
@@ -179,8 +179,12 @@ static int dtm0_process_disconnect(struct dtm0_process *process)
if (M0_IS0(&process->dop_rlink))
return M0_RC(0);
- rc = m0_rpc_link_is_connected(&process->dop_rlink) ?
- m0_rpc_link_disconnect_sync(&process->dop_rlink, timeout) : 0;
+ if (m0_rpc_link_is_connected(&process->dop_rlink)) {
+ m0_rpc_conn_sessions_cancel(&process->dop_rlink.rlk_conn);
+ rc = m0_rpc_link_disconnect_sync(&process->dop_rlink, timeout);
+ } else
+ rc = 0;
+
if (M0_IN(rc, (0, -ETIMEDOUT, -ECANCELED))) {
/*
@@ -263,7 +267,7 @@ static int volatile_log_init(struct m0_dtm0_service *dtm0)
rc = m0_be_dtm0_log_init(dtm0->dos_log, NULL,
&dtm0->dos_clk_src, false);
if (rc != 0)
- m0_be_dtm0_log_free(&dtm0->dos_log);
+ m0_be_dtm0_log_free0(&dtm0->dos_log);
}
return rc;
}
@@ -330,10 +334,33 @@ static int dtm_service__origin_fill(struct m0_reqh_service *service)
return M0_RC_INFO(rc, "origin=%d", dtm0->dos_origin);
}
+/*
+ * Certain UTs manually control the lifetime of the recovery machine.
+ * When manual start-stop is disabled, the DTM0 service automatically
+ * starts and stops the machine.
+ */
+static bool is_manual_ss_enabled(void)
+{
+ return M0_FI_ENABLED("ut");
+}
+
static int dtm0_service_start(struct m0_reqh_service *service)
{
+ struct m0_dtm0_service *dtms = to_dtm(service);
+ int rc;
+
M0_PRE(service != NULL);
- return dtm_service__origin_fill(service);
+ rc = dtm_service__origin_fill(service);
+ if (rc != 0)
+ return M0_ERR(rc);
+
+ if (ENABLE_DTM0 && !is_manual_ss_enabled()) {
+ m0_dtm0_recovery_machine_init(&dtms->dos_remach,
+ NULL, dtms);
+ rc = m0_dtm0_recovery_machine_start(&dtms->dos_remach);
+ }
+
+ return M0_RC(rc);
}
static void dtm0_service_prepare_to_stop(struct m0_reqh_service *reqh_rs)
@@ -342,6 +369,8 @@ static void dtm0_service_prepare_to_stop(struct m0_reqh_service *reqh_rs)
M0_PRE(reqh_rs != NULL);
dtms = M0_AMB(dtms, reqh_rs, dos_generic);
+ if (ENABLE_DTM0 && !is_manual_ss_enabled())
+ m0_dtm0_recovery_machine_stop(&dtms->dos_remach);
dtm0_service_conns_term(dtms);
}
@@ -358,8 +387,11 @@ static void dtm0_service_stop(struct m0_reqh_service *service)
if (dtm0->dos_origin == DTM0_ON_VOLATILE && dtm0->dos_log != NULL) {
m0_be_dtm0_log_clear(dtm0->dos_log);
m0_be_dtm0_log_fini(dtm0->dos_log);
- m0_be_dtm0_log_free(&dtm0->dos_log);
+ m0_be_dtm0_log_free0(&dtm0->dos_log);
}
+
+ if (ENABLE_DTM0 && !is_manual_ss_enabled())
+ m0_dtm0_recovery_machine_fini(&dtm0->dos_remach);
}
static void dtm0_service_fini(struct m0_reqh_service *service)
@@ -377,12 +409,14 @@ M0_INTERNAL int m0_dtm0_stype_init(void)
M0_AVI_DTX0_SM_STATE, M0_AVI_DTX0_SM_COUNTER) ?:
m0_dtm0_fop_init() ?:
m0_reqh_service_type_register(&dtm0_service_type) ?:
- m0_dtm0_rpc_link_mod_init();
+ m0_dtm0_rpc_link_mod_init() ?:
+ m0_drm_domain_init();
}
M0_INTERNAL void m0_dtm0_stype_fini(void)
{
extern struct m0_sm_conf m0_dtx_sm_conf;
+ m0_drm_domain_fini();
m0_dtm0_rpc_link_mod_fini();
m0_reqh_service_type_unregister(&dtm0_service_type);
m0_dtm0_fop_fini();
@@ -411,6 +445,11 @@ m0_dtm0_service_find(const struct m0_reqh *reqh)
return rh_srv == NULL ? NULL : to_dtm(rh_srv);
}
+M0_INTERNAL bool m0_dtm0_is_expecting_redo_from_client(void)
+{
+ return M0_FI_ENABLED("ut");
+}
+
M0_INTERNAL bool m0_dtm0_in_ut(void)
{
return M0_FI_ENABLED("ut");
diff --git a/dtm0/service.h b/dtm0/service.h
index 087f0b91b4c..6801bd21e21 100644
--- a/dtm0/service.h
+++ b/dtm0/service.h
@@ -27,6 +27,7 @@
#include "reqh/reqh_service.h"
#include "dtm0/clk_src.h"
+#include "dtm0/recovery.h"
struct m0_be_dtm0_log;
struct dtm0_req_fop;
@@ -42,19 +43,20 @@ enum m0_dtm0_service_origin {
* DTM0 service structure
*/
struct m0_dtm0_service {
- struct m0_reqh_service dos_generic;
- struct m0_tl dos_processes;
- enum m0_dtm0_service_origin dos_origin;
- uint64_t dos_magix;
- struct m0_dtm0_clk_src dos_clk_src;
- struct m0_be_dtm0_log *dos_log;
- /*
+ struct m0_reqh_service dos_generic;
+ struct m0_tl dos_processes;
+ enum m0_dtm0_service_origin dos_origin;
+ uint64_t dos_magix;
+ struct m0_dtm0_clk_src dos_clk_src;
+ struct m0_be_dtm0_log *dos_log;
+ struct m0_dtm0_recovery_machine dos_remach;
+ /**
* A queue for DTM_TEST message for drlink UTs.
* The UTs are fully responsible for the queue init/fini/get.
* DTM_TEST fom puts dtm0_req_fop::dtr_txr::dtd_id::dti_fid to the
* queue.
*/
- struct m0_be_queue *dos_ut_queue;
+ struct m0_be_queue *dos_ut_queue;
};
extern struct m0_reqh_service_type dtm0_service_type;
@@ -89,6 +91,7 @@ m0_dtm0_service_find(const struct m0_reqh *reqh);
M0_INTERNAL struct m0_dtm0_service *m0_dtm0_fom2service(struct m0_fom *fom);
M0_INTERNAL bool m0_dtm0_in_ut(void);
+M0_INTERNAL bool m0_dtm0_is_expecting_redo_from_client(void);
#endif /* __MOTR_DTM0_SERVICE_H__ */
diff --git a/dtm0/ut/drlink.c b/dtm0/ut/drlink.c
index 7c3b5cd7064..a26aae6ae7a 100644
--- a/dtm0/ut/drlink.c
+++ b/dtm0/ut/drlink.c
@@ -60,6 +60,7 @@ void m0_dtm0_ut_drlink_simple(void)
int rc;
int i;
int j;
+ struct m0_be_queue *q;
M0_ALLOC_PTR(udh);
M0_ASSERT(udh != NULL);
@@ -93,16 +94,16 @@ void m0_dtm0_ut_drlink_simple(void)
for (i = 0; i < DTM0_UT_DRLINK_SIMPLE_POST_NR; ++i)
m0_be_op_init(&op[i]);
- M0_ALLOC_PTR(svc->dos_ut_queue);
- M0_UT_ASSERT(svc->dos_ut_queue != 0);
- rc = m0_be_queue_init(svc->dos_ut_queue,
- &(struct m0_be_queue_cfg){
+ M0_ALLOC_PTR(q);
+	M0_UT_ASSERT(q != NULL);
+ rc = m0_be_queue_init(q, &(struct m0_be_queue_cfg){
.bqc_q_size_max = DTM0_UT_DRLINK_SIMPLE_POST_NR,
.bqc_producers_nr_max = DTM0_UT_DRLINK_SIMPLE_POST_NR,
.bqc_consumers_nr_max = 1,
.bqc_item_length = sizeof fid[0],
});
M0_UT_ASSERT(rc == 0);
+ svc->dos_ut_queue = q;
for (i = 0; i < DTM0_UT_DRLINK_SIMPLE_POST_NR; ++i) {
rc = m0_dtm0_req_post(udh->udh_server_dtm0_service,
@@ -131,8 +132,6 @@ void m0_dtm0_ut_drlink_simple(void)
M0_UT_ASSERT(found);
}
m0_be_op_fini(&op_out);
- m0_be_queue_fini(svc->dos_ut_queue);
- m0_free(svc->dos_ut_queue);
for (i = 0; i < DTM0_UT_DRLINK_SIMPLE_POST_NR; ++i)
M0_UT_ASSERT(m0_fid_eq(&fid[i], &M0_FID0));
m0_free(fid);
@@ -144,6 +143,8 @@ void m0_dtm0_ut_drlink_simple(void)
m0_free(fop);
m0_ut_dtm0_helper_fini(udh);
+ m0_be_queue_fini(q);
+ m0_free(q);
m0_free(udh);
}
diff --git a/dtm0/ut/main.c b/dtm0/ut/main.c
index 5ce0bdd2e8e..349d4e36eb1 100644
--- a/dtm0/ut/main.c
+++ b/dtm0/ut/main.c
@@ -30,7 +30,9 @@
#include "ut/ut.h"
#include "cas/cas.h"
#include "cas/cas_xc.h"
+#include "dtm0/recovery.h"
+#include "dtm0/ut/helper.h"
enum {
NUM_CAS_RECS = 10,
@@ -99,18 +101,728 @@ static void cas_xcode_test(void)
buf, len);
M0_UT_ASSERT(rc == 0);
- m0_xcode_free_obj(&M0_XCODE_OBJ(m0_cas_op_xc, op_out));
+ m0_xcode_free_obj(&M0_XCODE_OBJ(m0_cas_op_xc, op_out));
+}
+
+enum ut_sides {
+ UT_SIDE_SRV,
+ UT_SIDE_CLI,
+ UT_SIDE_NR
+};
+
+enum ut_client_persistence {
+ UT_CP_UNSPECIFIED,
+ UT_CP_VOLATILE_CLIENT,
+ UT_CP_PERSISTENT_CLIENT,
+};
+
+struct m0_fid g_service_fids[UT_SIDE_NR];
+
+struct ut_remach {
+ bool use_real_log;
+ enum ut_client_persistence cp;
+
+ struct m0_ut_dtm0_helper udh;
+
+ struct m0_dtm0_service *svcs[UT_SIDE_NR];
+ struct m0_be_op recovered[UT_SIDE_NR];
+
+ /* Client-side stubs for conf objects. */
+ struct m0_conf_process cli_procs[UT_SIDE_NR];
+ struct m0_mutex cli_proc_guards[UT_SIDE_NR];
+};
+
+struct ha_thought {
+ enum ut_sides who;
+ enum m0_ha_obj_state what;
+};
+#define HA_THOUGHT(_who, _what) (struct ha_thought) { \
+ .who = _who, .what = _what \
+}
+
+static struct m0_dtm0_service *ut_remach_svc_get(struct ut_remach *um,
+ enum ut_sides side)
+{
+ M0_UT_ASSERT(side < UT_SIDE_NR);
+ M0_UT_ASSERT(um->svcs[side] != NULL);
+ return um->svcs[side];
+}
+
+static struct m0_dtm0_recovery_machine *ut_remach_get(struct ut_remach *um,
+ enum ut_sides side)
+{
+ return &ut_remach_svc_get(um, side)->dos_remach;
+}
+
+static struct m0_fid *ut_remach_fid_get(enum ut_sides side)
+{
+ M0_UT_ASSERT(side < UT_SIDE_NR);
+ return &g_service_fids[side];
+}
+
+static enum ut_sides ut_remach_side_get(const struct m0_fid *svc)
+{
+ enum ut_sides side;
+
+ for (side = 0; side < UT_SIDE_NR; ++side) {
+ if (m0_fid_eq(ut_remach_fid_get(side), svc))
+ break;
+ }
+
+ M0_UT_ASSERT(side < UT_SIDE_NR);
+ return side;
+}
+
+static struct ut_remach *ut_remach_from(struct m0_dtm0_recovery_machine *m,
+ const struct m0_fid *svc_fid)
+{
+ struct ut_remach *um = NULL;
+ struct m0_dtm0_service *svc = M0_AMB(svc, m, dos_remach);
+ struct m0_reqh *reqh = svc->dos_generic.rs_reqh;
+ struct m0_reqh_context *rx = svc->dos_generic.rs_reqh_ctx;
+ struct m0_rpc_client_ctx *cli_ctx;
+ struct m0_motr *motr_ctx;
+ struct m0_rpc_server_ctx *srv_ctx;
+ struct m0_ut_dtm0_helper *udh;
+ enum ut_sides side = ut_remach_side_get(svc_fid);
+
+ if (rx == NULL) {
+ M0_UT_ASSERT(side == UT_SIDE_CLI);
+ cli_ctx = M0_AMB(cli_ctx, reqh, rcx_reqh);
+ udh = M0_AMB(udh, cli_ctx, udh_cctx);
+ um = M0_AMB(um, udh, udh);
+ } else {
+ M0_UT_ASSERT(side == UT_SIDE_SRV);
+ motr_ctx = M0_AMB(motr_ctx, rx, cc_reqh_ctx);
+ srv_ctx = M0_AMB(srv_ctx, motr_ctx, rsx_motr_ctx);
+ udh = M0_AMB(udh, srv_ctx, udh_sctx);
+ um = M0_AMB(um, udh, udh);
+ }
+
+ M0_UT_ASSERT(um != NULL);
+ M0_UT_ASSERT(ut_remach_get(um, side) == m);
+ return um;
+}
+
+static void ut_remach_log_add_sync(struct ut_remach *um,
+ enum ut_sides side,
+ struct m0_dtm0_tx_desc *txd,
+ struct m0_buf *payload)
+{
+ struct m0_dtm0_recovery_machine *m = ut_remach_get(um, side);
+ struct m0_dtm0_service *svc = m->rm_svc;
+ struct m0_be_dtm0_log *log = svc->dos_log;
+ struct m0_be_tx *tx = NULL;
+ struct m0_be_seg *seg = log->dl_seg;
+ struct m0_be_tx_credit cred = {};
+ struct m0_be_ut_backend *ut_be;
+ int rc;
+
+ if (log->dl_is_persistent) {
+ M0_UT_ASSERT(svc->dos_generic.rs_reqh_ctx != NULL);
+ ut_be = &svc->dos_generic.rs_reqh_ctx->rc_be;
+ m0_be_dtm0_log_credit(M0_DTML_EXECUTED, txd, payload, seg,
+ NULL, &cred);
+ M0_ALLOC_PTR(tx);
+ M0_UT_ASSERT(tx != NULL);
+ m0_be_ut_tx_init(tx, ut_be);
+ m0_be_tx_prep(tx, &cred);
+ rc = m0_be_tx_open_sync(tx);
+ M0_UT_ASSERT(rc == 0);
+ }
+
+ m0_mutex_lock(&log->dl_lock);
+ rc = m0_be_dtm0_log_update(log, tx, txd, payload);
+ M0_UT_ASSERT(rc == 0);
+ m0_mutex_unlock(&log->dl_lock);
+
+ if (log->dl_is_persistent) {
+ m0_be_tx_close_sync(tx);
+ m0_be_tx_fini(tx);
+ m0_free(tx);
+ }
+}
+
+static void um_dummy_log_redo_post(struct m0_dtm0_recovery_machine *m,
+ struct m0_fom *fom,
+ const struct m0_fid *tgt_proc,
+ const struct m0_fid *tgt_svc,
+ struct dtm0_req_fop *redo,
+ struct m0_be_op *op)
+{
+ struct ut_remach *um = NULL;
+ struct m0_dtm0_recovery_machine *counterpart = NULL;
+
+ um = ut_remach_from(m, &redo->dtr_initiator);
+ counterpart = ut_remach_get(um, ut_remach_side_get(tgt_svc));
+ m0_dtm0_recovery_machine_redo_post(counterpart, redo, op);
+ M0_UT_ASSERT(m0_be_op_is_done(op));
+}
+
+static void um_real_log_redo_post(struct m0_dtm0_recovery_machine *m,
+ struct m0_fom *fom,
+ const struct m0_fid *tgt_proc,
+ const struct m0_fid *tgt_svc,
+ struct dtm0_req_fop *redo,
+ struct m0_be_op *op)
+{
+ struct ut_remach *um = NULL;
+ struct m0_dtm0_recovery_machine *counterpart = NULL;
+ enum ut_sides tgt_side = ut_remach_side_get(tgt_svc);
+ struct m0_dtm0_service *svc;
+ struct m0_be_ut_backend *ut_be;
+
+ um = ut_remach_from(m, &redo->dtr_initiator);
+ counterpart = ut_remach_get(um, tgt_side);
+
+ /* Empty REDOs are allowed only when EOL is set. */
+ M0_UT_ASSERT(ergo(m0_dtm0_tx_desc_is_none(&redo->dtr_txr),
+ !!(redo->dtr_flags & M0_BITS(M0_DMF_EOL))));
+
+ /* Emulate REDO FOM: update the log */
+ if (!m0_dtm0_tx_desc_is_none(&redo->dtr_txr))
+ ut_remach_log_add_sync(um, tgt_side, &redo->dtr_txr,
+ &redo->dtr_payload);
+
+ m0_dtm0_recovery_machine_redo_post(counterpart, redo, op);
+ M0_UT_ASSERT(m0_be_op_is_done(op));
+
+ /*
+	 * This is a sordid but simple way of making ::ut_remach_log_add_sync
+	 * work: the RPC client does not have a fully-functional context, so
+	 * the sm-based BE logic cannot progress because there is no BE
+	 * associated with the corresponding FOM (fom -> reqh -> context ->
+	 * be). However, both sides share the same set of localities, so we
+	 * can sit down right here and wait until everything is completed.
+	 * It might be slow and dangerous, but it is enough for a simple test.
+ */
+ if (!m0_dtm0_tx_desc_is_none(&redo->dtr_txr) &&
+ tgt_side == UT_SIDE_SRV) {
+ svc = counterpart->rm_svc;
+ ut_be = &svc->dos_generic.rs_reqh_ctx->rc_be;
+ m0_be_ut_backend_sm_group_asts_run(ut_be);
+ m0_be_ut_backend_thread_exit(ut_be);
+ }
+}
+
+static int um_dummy_log_iter_next(struct m0_dtm0_recovery_machine *m,
+ struct m0_be_dtm0_log_iter *iter,
+ const struct m0_fid *tgt_svc,
+ const struct m0_fid *origin_svc,
+ struct m0_dtm0_log_rec *record)
+{
+ M0_SET0(record);
+ return -ENOENT;
+}
+
+static int um_dummy_log_iter_init(struct m0_dtm0_recovery_machine *m,
+ struct m0_be_dtm0_log_iter *iter)
+{
+ (void) m;
+ (void) iter;
+ return 0;
+}
+
+static void um_dummy_log_iter_fini(struct m0_dtm0_recovery_machine *m,
+ struct m0_be_dtm0_log_iter *iter)
+{
+ (void) m;
+ (void) iter;
+ /* nothing to do */
+}
+
+void um_ha_event_post(struct m0_dtm0_recovery_machine *m,
+ const struct m0_fid *tgt_proc,
+ const struct m0_fid *tgt_svc,
+ enum m0_conf_ha_process_event event)
+{
+ struct ut_remach *um = ut_remach_from(m, tgt_svc);
+ enum ut_sides side = ut_remach_side_get(tgt_svc);
+
+ switch (event) {
+ case M0_CONF_HA_PROCESS_DTM_RECOVERED:
+ m0_be_op_done(&um->recovered[side]);
+ break;
+ default:
+ M0_UT_ASSERT(false);
+ }
+}
+
+/*
+ * Unicast an HA thought to a particular side.
+ */
+static void ut_remach_ha_tells(struct ut_remach *um,
+ const struct ha_thought *t,
+ enum ut_sides whom)
+{
+ m0_ut_remach_heq_post(ut_remach_get(um, whom),
+ ut_remach_fid_get(t->who), t->what);
+}
+
+/*
+ * Multicast an HA thought to all sides.
+ */
+static void ut_remach_ha_thinks(struct ut_remach *um,
+ const struct ha_thought *t)
+{
+ enum ut_sides side;
+
+ for (side = 0; side < UT_SIDE_NR; ++side)
+ ut_remach_ha_tells(um, t, side);
+}
+
+static const struct m0_dtm0_recovery_machine_ops*
+ut_remach_ops_get(struct ut_remach *um)
+{
+ static struct m0_dtm0_recovery_machine_ops dummy_log_ops = {};
+ static struct m0_dtm0_recovery_machine_ops real_log_ops = {};
+ static bool initialized = false;
+
+ if (!initialized) {
+ /* Dummy log operations */
+ dummy_log_ops = m0_dtm0_recovery_machine_default_ops;
+
+ dummy_log_ops.log_iter_next = um_dummy_log_iter_next;
+ dummy_log_ops.log_iter_init = um_dummy_log_iter_init;
+ dummy_log_ops.log_iter_fini = um_dummy_log_iter_fini;
+
+ dummy_log_ops.redo_post = um_dummy_log_redo_post;
+ dummy_log_ops.ha_event_post = um_ha_event_post;
+
+ /* Real log operations */
+ real_log_ops = m0_dtm0_recovery_machine_default_ops;
+
+ real_log_ops.redo_post = um_real_log_redo_post;
+ real_log_ops.ha_event_post = um_ha_event_post;
+ /*
+		 * Do not reassign the log ops, as we need to deal with the
+		 * real DTM0 log.
+ */
+
+ initialized = true;
+ }
+
+ return um->use_real_log ? &real_log_ops : &dummy_log_ops;
+}
+
+static void ut_srv_remach_init(struct ut_remach *um)
+{
+ m0_dtm0_recovery_machine_init(ut_remach_get(um, UT_SIDE_SRV),
+ ut_remach_ops_get(um),
+ ut_remach_svc_get(um, UT_SIDE_SRV));
+}
+
+static void ut_cli_remach_conf_obj_init(struct ut_remach *um)
+{
+ int i;
+
+ for (i = 0; i < UT_SIDE_NR; ++i) {
+ m0_mutex_init(&um->cli_proc_guards[i]);
+ m0_chan_init(&um->cli_procs[i].pc_obj.co_ha_chan,
+ &um->cli_proc_guards[i]);
+ }
+}
+
+static void ut_cli_remach_conf_obj_fini(struct ut_remach *um)
+{
+ int i;
+
+ for (i = 0; i < UT_SIDE_NR; ++i) {
+ m0_chan_fini_lock(&um->cli_procs[i].pc_obj.co_ha_chan);
+ m0_mutex_fini(&um->cli_proc_guards[i]);
+ }
+}
+
+static void ut_cli_remach_init(struct ut_remach *um)
+{
+ ut_cli_remach_conf_obj_init(um);
+
+ m0_dtm0_recovery_machine_init(ut_remach_get(um, UT_SIDE_CLI),
+ ut_remach_ops_get(um),
+ um->svcs[UT_SIDE_CLI]);
+}
+
+static void ut_srv_remach_fini(struct ut_remach *um)
+{
+ m0_dtm0_recovery_machine_fini(ut_remach_get(um, UT_SIDE_SRV));
+}
+
+static void ut_cli_remach_fini(struct ut_remach *um)
+{
+ m0_dtm0_recovery_machine_fini(ut_remach_get(um, UT_SIDE_CLI));
+ ut_cli_remach_conf_obj_fini(um);
+}
+
+static void ut_remach_start(struct ut_remach *um)
+{
+ enum ut_sides side;
+ int rc;
+ bool is_volatile[UT_SIDE_NR] = {
+ [UT_SIDE_SRV] = false,
+ [UT_SIDE_CLI] = um->cp == UT_CP_VOLATILE_CLIENT,
+ };
+
+ m0_ut_remach_populate(ut_remach_get(um, UT_SIDE_CLI), um->cli_procs,
+ g_service_fids, is_volatile, UT_SIDE_NR);
+ for (side = 0; side < UT_SIDE_NR; ++side) {
+ rc = m0_dtm0_recovery_machine_start(ut_remach_get(um, side));
+ M0_ASSERT(rc == 0);
+ }
+}
+
+static void ut_remach_stop(struct ut_remach *um)
+{
+ enum ut_sides side;
+ for (side = 0; side < UT_SIDE_NR; ++side)
+ m0_dtm0_recovery_machine_stop(ut_remach_get(um, side));
+}
+
+static void ut_remach_init(struct ut_remach *um)
+{
+ int i;
+
+ M0_UT_ASSERT(M0_IN(um->cp, (UT_CP_PERSISTENT_CLIENT,
+ UT_CP_VOLATILE_CLIENT)));
+
+ for (i = 0; i < ARRAY_SIZE(um->recovered); ++i) {
+ m0_be_op_init(um->recovered + i);
+ m0_be_op_active(um->recovered + i);
+ }
+
+ m0_fi_enable("m0_dtm0_in_ut", "ut");
+ m0_fi_enable("is_manual_ss_enabled", "ut");
+ m0_fi_enable("m0_dtm0_is_expecting_redo_from_client", "ut");
+ if (um->cp == UT_CP_PERSISTENT_CLIENT)
+ m0_fi_enable("is_svc_volatile", "always_false");
+
+ m0_ut_dtm0_helper_init(&um->udh);
+
+ g_service_fids[UT_SIDE_SRV] = um->udh.udh_server_dtm0_fid;
+ g_service_fids[UT_SIDE_CLI] = um->udh.udh_client_dtm0_fid;
+
+ M0_UT_ASSERT(um->udh.udh_server_dtm0_service != NULL);
+ um->svcs[UT_SIDE_SRV] = um->udh.udh_server_dtm0_service;
+ M0_UT_ASSERT(um->udh.udh_client_dtm0_service != NULL);
+ um->svcs[UT_SIDE_CLI] = um->udh.udh_client_dtm0_service;
+
+ ut_srv_remach_init(um);
+ ut_cli_remach_init(um);
+}
+
+static void ut_remach_fini(struct ut_remach *um)
+{
+ int i;
+
+ ut_cli_remach_fini(um);
+ ut_srv_remach_fini(um);
+ m0_ut_dtm0_helper_fini(&um->udh);
+ if (um->cp == UT_CP_PERSISTENT_CLIENT)
+ m0_fi_disable("is_svc_volatile", "always_false");
+ m0_fi_disable("m0_dtm0_is_expecting_redo_from_client", "ut");
+ m0_fi_disable("is_manual_ss_enabled", "ut");
+ m0_fi_disable("m0_dtm0_in_ut", "ut");
+ for (i = 0; i < ARRAY_SIZE(um->recovered); ++i) {
+ if (!m0_be_op_is_done(um->recovered + i))
+ m0_be_op_done(um->recovered + i);
+ m0_be_op_fini(um->recovered + i);
+ }
+
+ M0_SET_ARR0(g_service_fids);
+}
+
+static void ut_remach_reset_srv(struct ut_remach *um)
+{
+ int rc;
+ struct m0_dtm0_recovery_machine *m = ut_remach_get(um, UT_SIDE_SRV);
+
+ m0_dtm0_recovery_machine_stop(m);
+ m0_dtm0_recovery_machine_fini(m);
+ m0_dtm0_recovery_machine_init(m, ut_remach_ops_get(um),
+ ut_remach_svc_get(um, UT_SIDE_SRV));
+ rc = m0_dtm0_recovery_machine_start(m);
+ M0_UT_ASSERT(rc == 0);
+}
+
+static void ut_remach_log_gen_sync(struct ut_remach *um,
+ enum ut_sides side,
+ uint64_t ts_start,
+ uint64_t records_nr)
+{
+ struct m0_dtm0_tx_desc txd = {};
+ struct m0_buf payload = {};
+ int rc;
+ int i;
+
+ rc = m0_dtm0_tx_desc_init(&txd, 1);
+ M0_UT_ASSERT(rc == 0);
+ txd.dtd_ps.dtp_pa[0] = (struct m0_dtm0_tx_pa) {
+ .p_state = M0_DTPS_EXECUTED,
+ .p_fid = *ut_remach_fid_get(UT_SIDE_SRV),
+ };
+ txd.dtd_id = (struct m0_dtm0_tid) {
+ .dti_ts.dts_phys = 0,
+ .dti_fid = *ut_remach_fid_get(UT_SIDE_CLI),
+ };
+
+ for (i = 0; i < records_nr; ++i) {
+ txd.dtd_id.dti_ts.dts_phys = ts_start + i;
+ ut_remach_log_add_sync(um, side, &txd, &payload);
+ }
+
+ m0_dtm0_tx_desc_fini(&txd);
+}
+
+/*
+ * Ensures that DTM0 log A is a subset of DTM0 log B; and,
+ * optionally, that A has exactly "expected_records_nr" log records
+ * (if expected_records_nr < 0 then this check is omitted).
+ * Note: (tid, payload) pairs are used as comparison keys. The states
+ * of participants and the other fields are ignored.
+ */
+static void log_subset_verify(struct ut_remach *um,
+ int expected_records_nr,
+ enum ut_sides a_side,
+ enum ut_sides b_side)
+{
+ struct m0_be_dtm0_log *a_log =
+ ut_remach_get(um, a_side)->rm_svc->dos_log;
+ struct m0_be_dtm0_log *b_log =
+ ut_remach_get(um, b_side)->rm_svc->dos_log;
+ struct m0_be_dtm0_log_iter a_iter;
+ struct m0_dtm0_log_rec a_record;
+ struct m0_dtm0_log_rec *b_record;
+ struct m0_buf *a_buf;
+ struct m0_buf *b_buf;
+ struct m0_dtm0_tid *tid;
+ int rc;
+ uint64_t actual_records_nr = 0;
+
+ m0_mutex_lock(&a_log->dl_lock);
+ m0_mutex_lock(&b_log->dl_lock);
+
+ m0_be_dtm0_log_iter_init(&a_iter, a_log);
+
+ while (true) {
+ rc = m0_be_dtm0_log_iter_next(&a_iter, &a_record);
+ M0_UT_ASSERT(M0_IN(rc, (0, -ENOENT)));
+ if (rc == -ENOENT)
+ break;
+ M0_UT_ASSERT(rc == 0);
+ tid = &a_record.dlr_txd.dtd_id;
+ b_record = m0_be_dtm0_log_find(b_log, tid);
+ M0_UT_ASSERT(b_record != NULL);
+ a_buf = &a_record.dlr_payload;
+ b_buf = &b_record->dlr_payload;
+ M0_UT_ASSERT(equi(m0_buf_is_set(a_buf), m0_buf_is_set(b_buf)));
+ M0_UT_ASSERT(ergo(m0_buf_is_set(a_buf),
+ m0_buf_eq(a_buf, b_buf)));
+ m0_dtm0_log_iter_rec_fini(&a_record);
+ actual_records_nr++;
+ }
+
+ M0_UT_ASSERT(ergo(expected_records_nr >= 0,
+ expected_records_nr == actual_records_nr));
+
+ m0_mutex_unlock(&b_log->dl_lock);
+ m0_mutex_unlock(&a_log->dl_lock);
+}
+
+/* Case: ensure the machine initialises and finalises properly. */
+static void remach_init_fini(void)
+{
+ struct ut_remach um = { .cp = UT_CP_PERSISTENT_CLIENT };
+ ut_remach_init(&um);
+ ut_remach_fini(&um);
+}
+
+/* Case: Ensure the machine is able to start/stop. */
+static void remach_start_stop(void)
+{
+ struct ut_remach um = { .cp = UT_CP_PERSISTENT_CLIENT };
+ ut_remach_init(&um);
+ ut_remach_start(&um);
+ ut_remach_stop(&um);
+ ut_remach_fini(&um);
+}
+
+static void ut_remach_boot(struct ut_remach *um)
+{
+ const struct ha_thought starting[] = {
+ HA_THOUGHT(UT_SIDE_CLI, M0_NC_TRANSIENT),
+ HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT),
+
+ HA_THOUGHT(UT_SIDE_CLI, M0_NC_DTM_RECOVERING),
+ HA_THOUGHT(UT_SIDE_SRV, M0_NC_DTM_RECOVERING),
+ };
+ const struct ha_thought started[] = {
+ HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+ HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE),
+ };
+ int i;
+
+ ut_remach_init(um);
+ ut_remach_start(um);
+
+ for (i = 0; i < ARRAY_SIZE(starting); ++i)
+ ut_remach_ha_thinks(um, starting + i);
+
+ for (i = 0; i < ARRAY_SIZE(um->recovered); ++i)
+ m0_be_op_wait(um->recovered + i);
+
+ for (i = 0; i < ARRAY_SIZE(started); ++i)
+ ut_remach_ha_thinks(um, started + i);
+}
+
+static void ut_remach_shutdown(struct ut_remach *um)
+{
+ ut_remach_stop(um);
+ M0_UT_ASSERT(m0_be_op_is_done(&um->recovered[UT_SIDE_SRV]));
+ M0_UT_ASSERT(m0_be_op_is_done(&um->recovered[UT_SIDE_CLI]));
+ ut_remach_fini(um);
+}
+
+/* Use-case: graceful boot and shutdown of a 2-node cluster. */
+static void remach_boot_cluster(enum ut_client_persistence cp)
+{
+ struct ut_remach um = { .cp = cp };
+
+ ut_remach_boot(&um);
+ ut_remach_shutdown(&um);
+}
+
+static void remach_boot_cluster_ss(void)
+{
+ remach_boot_cluster(UT_CP_PERSISTENT_CLIENT);
+}
+
+static void remach_boot_cluster_cs(void)
+{
+ remach_boot_cluster(UT_CP_VOLATILE_CLIENT);
+}
+
+/* Use-case: reboot an ONLINE node. */
+static void remach_reboot_server(void)
+{
+ struct ut_remach um = { .cp = UT_CP_PERSISTENT_CLIENT };
+
+ ut_remach_boot(&um);
+
+ m0_be_op_reset(um.recovered + UT_SIDE_SRV);
+ m0_be_op_active(um.recovered + UT_SIDE_SRV);
+ ut_remach_reset_srv(&um);
+
+ ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT));
+ ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+ UT_SIDE_SRV);
+ ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV,
+ M0_NC_DTM_RECOVERING));
+ m0_be_op_wait(um.recovered + UT_SIDE_SRV);
+ ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE));
+
+ ut_remach_shutdown(&um);
+}
+
+/* Use-case: reboot a node right after it has started to recover. */
+static void remach_reboot_twice(void)
+{
+ struct ut_remach um = { .cp = UT_CP_PERSISTENT_CLIENT };
+
+ ut_remach_boot(&um);
+
+ m0_be_op_reset(um.recovered + UT_SIDE_SRV);
+ m0_be_op_active(um.recovered + UT_SIDE_SRV);
+ ut_remach_reset_srv(&um);
+
+ /*
+	 * Do not tell the client about the failure.
+	 * No REDOs will be sent, so we can see what happens when the
+	 * recovery machine has to be stopped while it is still waiting
+	 * for REDOs.
+ */
+ ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT),
+ UT_SIDE_SRV);
+ ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+ UT_SIDE_SRV);
+ ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_DTM_RECOVERING),
+ UT_SIDE_SRV);
+ ut_remach_reset_srv(&um);
+ M0_UT_ASSERT(!m0_be_op_is_done(&um.recovered[UT_SIDE_SRV]));
+
+ ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT));
+ ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+ UT_SIDE_SRV);
+ ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV,
+ M0_NC_DTM_RECOVERING));
+ m0_be_op_wait(um.recovered + UT_SIDE_SRV);
+ ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE));
+
+ ut_remach_shutdown(&um);
+}
+
+/* Use-case: replay an empty DTM0 log. */
+static void remach_boot_real_log(void)
+{
+ struct ut_remach um = {
+ .cp = UT_CP_PERSISTENT_CLIENT,
+ .use_real_log = true
+ };
+ ut_remach_boot(&um);
+ ut_remach_shutdown(&um);
+}
+
+/* Use-case: replay a non-empty client log to the server. */
+static void remach_real_log_replay(void)
+{
+ struct ut_remach um = {
+ .cp = UT_CP_PERSISTENT_CLIENT,
+ .use_real_log = true
+ };
+ /* cafe bell */
+ const uint64_t since = 0xCAFEBELL;
+ const uint64_t records_nr = 10;
+
+ ut_remach_boot(&um);
+
+ ut_remach_log_gen_sync(&um, UT_SIDE_CLI, since, records_nr);
+
+ m0_be_op_reset(um.recovered + UT_SIDE_SRV);
+ m0_be_op_active(um.recovered + UT_SIDE_SRV);
+ ut_remach_reset_srv(&um);
+
+ ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT));
+ ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+ UT_SIDE_SRV);
+ ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV,
+ M0_NC_DTM_RECOVERING));
+ m0_be_op_wait(um.recovered + UT_SIDE_SRV);
+ log_subset_verify(&um, records_nr, UT_SIDE_CLI, UT_SIDE_SRV);
+ ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE));
+
+ ut_remach_shutdown(&um);
}
extern void m0_dtm0_ut_drlink_simple(void);
extern void m0_dtm0_ut_domain_init_fini(void);
struct m0_ut_suite dtm0_ut = {
- .ts_name = "dtm0-ut",
- .ts_tests = {
- { "xcode", &cas_xcode_test },
- { "drlink-simple", &m0_dtm0_ut_drlink_simple },
- { "domain_init-fini", &m0_dtm0_ut_domain_init_fini },
+ .ts_name = "dtm0-ut",
+ .ts_tests = {
+ { "xcode", cas_xcode_test },
+ { "drlink-simple", &m0_dtm0_ut_drlink_simple },
+ { "domain_init-fini", &m0_dtm0_ut_domain_init_fini },
+ { "remach-init-fini", remach_init_fini },
+ { "remach-start-stop", remach_start_stop },
+ { "remach-boot-cluster-ss", remach_boot_cluster_ss },
+ { "remach-boot-cluster-cs", remach_boot_cluster_cs },
+ { "remach-reboot-server", remach_reboot_server },
+ { "remach-reboot-twice", remach_reboot_twice },
+ { "remach-boot-real-log", remach_boot_real_log },
+ { "remach-real-log-replay", remach_real_log_replay },
{ NULL, NULL },
}
};
@@ -124,3 +836,6 @@ struct m0_ut_suite dtm0_ut = {
* scroll-step: 1
* End:
*/
+/*
+ * vim: tabstop=8 shiftwidth=8 noexpandtab textwidth=80 nowrap
+ */
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 309160c73e5..fcd43532e96 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -200,10 +200,11 @@ M0_INTERNAL void m0_bitmap_onwire_fini(struct m0_bitmap_onwire *ow_map)
M0_INTERNAL void m0_bitmap_store(const struct m0_bitmap *im_map,
struct m0_bitmap_onwire *ow_map)
{
- size_t s = M0_BITMAP_WORDS(im_map->b_nr);
+ size_t s;
M0_PRE(im_map != NULL && ow_map != NULL);
M0_PRE(im_map->b_words != NULL);
+ s = M0_BITMAP_WORDS(im_map->b_nr);
M0_PRE(s == ow_map->bo_size);
memcpy(ow_map->bo_words, im_map->b_words,
diff --git a/lib/user_space/timer.c b/lib/user_space/timer.c
index de098357149..8c72f3115ef 100644
--- a/lib/user_space/timer.c
+++ b/lib/user_space/timer.c
@@ -52,6 +52,10 @@ static const m0_time_t zero_time = M0_MKTIME(0, 0);
/** Clock source for M0_TIMER_HARD. @see timer_posix_set() */
static int clock_source_timer = -1;
+/* Counters of allocated and freed HARD timers. */
+static unsigned long long hard_timer_alloc = 0;
+static unsigned long long hard_timer_free = 0;
+
/**
Typed list of m0_timer_tid structures.
*/
@@ -193,9 +197,19 @@ static int timer_posix_init(struct m0_timer *timer)
}
rc = timer_create(clock_source_timer, &se, &ptimer);
/* preserve timer->t_ptimer if timer_create() isn't succeeded */
- if (rc == 0)
+ if (rc == 0) {
timer->t_ptimer = ptimer;
- return M0_RC(rc);
+ hard_timer_alloc++;
+ } else {
+ rc = M0_ERR(-errno);
+ M0_LOG(M0_ERROR, "Failed to allocate HARD timer (%d), "
+ "alloc=%lld, free=%lld, inuse=%lld",
+ rc,
+ hard_timer_alloc,
+ hard_timer_free,
+ hard_timer_alloc - hard_timer_free);
+ }
+	return rc;
}
/**
@@ -210,6 +224,7 @@ static void timer_posix_fini(timer_t posix_timer)
* timer_delete() can fail iff timer->t_ptimer isn't valid timer ID.
*/
M0_ASSERT_INFO(rc == 0, "rc = %d", rc);
+ hard_timer_free++;
}
static m0_time_t timer_time_to_realtime(m0_time_t expire)
diff --git a/lib/ut/cookie.c b/lib/ut/cookie.c
index 2c9ab7c0e33..45e1ebdb7e3 100644
--- a/lib/ut/cookie.c
+++ b/lib/ut/cookie.c
@@ -141,7 +141,7 @@ static void addr_sanity(const uint64_t *addr, bool sane, bool aligned)
void test_cookie(void)
{
int insane_cnt = 0;
- uint64_t automatic;
+ uint64_t automatic = 0;
uint64_t *dynamic;
uint64_t i;
struct m0_cookie cookie_test;
diff --git a/motr/idx_dix.c b/motr/idx_dix.c
index bcf71de729a..7bee805a564 100644
--- a/motr/idx_dix.c
+++ b/motr/idx_dix.c
@@ -71,6 +71,14 @@ struct dix_req {
struct m0_op_idx *idr_oi;
struct m0_sm_ast idr_ast;
struct m0_clink idr_clink;
+
+ /**
+ * Clink that connects STABLE/FAILURE states of DTX with this dix_req.
+ * Note, the clink is always initialised but not always attached
+ * to a dtx.
+ */
+ struct m0_clink idr_dtx_clink;
+
/**
* Starting key for NEXT operation.
* It's allocated internally and key value is copied from user.
@@ -614,6 +622,7 @@ static int dix_mreq_create(struct m0_op_idx *oi,
M0_ALLOC_PTR(req);
if (req == NULL)
return M0_ERR(-ENOMEM);
+ m0_clink_init(&req->idr_dtx_clink, dixreq_clink_dtx_cb);
if (idx_is_distributed(oi)) {
m0_dix_meta_req_init(&req->idr_mreq, op_dixc(oi),
oi->oi_sm_grp);
@@ -651,14 +660,13 @@ static int dix_req_create(struct m0_op_idx *oi,
M0_ALLOC_PTR(req);
if (req != NULL) {
+ m0_clink_init(&req->idr_dtx_clink, dixreq_clink_dtx_cb);
if (idx_is_distributed(oi)) {
m0_dix_req_init(&req->idr_dreq, op_dixc(oi),
oi->oi_sm_grp);
to_dix_map(&oi->oi_oc.oc_op, &req->idr_dreq);
req->idr_dreq.dr_dtx = oi->oi_dtx;
- m0_clink_init(&req->idr_clink,
- oi->oi_dtx != NULL ?
- dixreq_clink_dtx_cb : dixreq_clink_cb);
+ m0_clink_init(&req->idr_clink, dixreq_clink_cb);
/* Store oi for dix callbacks to update SYNC records. */
if (M0_IN(oi->oi_oc.oc_op.op_code,
@@ -690,6 +698,7 @@ static void dix_req_destroy(struct dix_req *req)
} else {
m0_cas_req_fini(&req->idr_creq);
}
+ m0_clink_fini(&req->idr_dtx_clink);
m0_free(req);
M0_LEAVE();
}
@@ -718,6 +727,7 @@ static void dixreq_completed_post(struct dix_req *req, int rc)
struct m0_op_idx *oi = req->idr_oi;
M0_ENTRY();
+ M0_ASSERT(req->idr_ast.sa_cb != dixreq_completed_ast);
oi->oi_ar.ar_rc = rc;
req->idr_ast.sa_cb = dixreq_completed_ast;
req->idr_ast.sa_datum = req;
@@ -826,108 +836,42 @@ static bool dix_meta_req_clink_cb(struct m0_clink *cl)
return false;
}
-static void dixreq_stable_ast(struct m0_sm_group *grp, struct m0_sm_ast *ast)
-{
- struct dix_req *req = ast->sa_datum;
- struct m0_op_idx *oi = req->idr_oi;
- int rc = oi->oi_ar.ar_rc;
-
- M0_ENTRY();
- oi->oi_ar.ar_ast.sa_cb = (rc == 0) ? idx_op_ast_stable : NULL;
- M0_ASSERT(grp == oi->oi_sm_grp);
- dix_req_destroy(req);
- idx_op_ast_stable(oi->oi_sm_grp, &oi->oi_ar.ar_ast);
- M0_LEAVE();
-}
-
-static void dixreq_stable_post(struct dix_req *req, int rc)
+static bool dix_req_is_completed(struct dix_req *dix_req)
{
- struct m0_op_idx *oi = req->idr_oi;
+ struct m0_dix_req *dreq = &dix_req->idr_dreq;
+ struct m0_op_idx *oi = dix_req->idr_oi;
+ struct m0_dtx *dtx = oi->oi_dtx;
+ bool is_executed =
+ M0_IN(dreq->dr_sm.sm_state, (DIXREQ_FAILURE, DIXREQ_FINAL));
+ bool is_stable = dtx == NULL ||
+ M0_IN(m0_dtx0_sm_state(dtx), (M0_DDS_STABLE, M0_DDS_FAILED));
- M0_ENTRY();
- oi->oi_ar.ar_rc = rc;
- req->idr_ast.sa_cb = dixreq_stable_ast;
- req->idr_ast.sa_datum = req;
- M0_ASSERT_INFO(req->idr_ast.sa_next == NULL,
- "Stable() ast cannot be armed before Executed() "
- "is completed. Ensure EXECUTED_ALL -> STABLE transition"
- "does not happen within the same ast tick");
- m0_sm_ast_post(oi->oi_sm_grp, &req->idr_ast);
- M0_LEAVE();
-}
-
-static void dixreq_executed_post(struct dix_req *req, int rc)
-{
- struct m0_op_idx *oi = req->idr_oi;
-
- M0_ENTRY();
-
- M0_ASSERT_INFO(rc == 0, "TODO: Failures are not handled here.");
- oi->oi_ar.ar_rc = rc;
-
- /* XXX: DTX cannot be canceled (as per the current design),
- * so that once we got a reply, we prohibit any kind of cancelation.
- * The originator should m0_panic itself in case if something needs
- * to be canceled. It will be re-started and continue its execution.
- */
- oi->oi_in_completion = true;
- oi->oi_ar.ar_ast.sa_cb = idx_op_ast_executed;
- M0_ASSERT(req->idr_dreq.dr_dtx->tx_dtx->dd_sm.sm_grp == oi->oi_sm_grp);
- M0_ASSERT(m0_sm_group_is_locked(oi->oi_sm_grp));
- idx_op_ast_executed(oi->oi_sm_grp, &oi->oi_ar.ar_ast);
- M0_LEAVE();
+ M0_ENTRY("is_executed=%d, is_stable=%d", !!is_executed, !!is_stable);
+ return M0_RC(is_executed && is_stable);
}
static bool dixreq_clink_dtx_cb(struct m0_clink *cl)
{
- struct dix_req *dix_req = M0_AMB(dix_req, cl, idr_clink);
+ struct dix_req *dix_req = M0_AMB(dix_req, cl, idr_dtx_clink);
struct m0_op_idx *oi = dix_req->idr_oi;
- struct m0_sm *req_sm = M0_AMB(req_sm, cl->cl_chan, sm_chan);
struct m0_dix_req *dreq = &dix_req->idr_dreq;
- struct m0_dtx *dtx = oi->oi_dtx;
- enum m0_dtm0_dtx_state state;
- int i;
-
- M0_PRE(M0_IN(oi->oi_oc.oc_op.op_code, (M0_IC_PUT, M0_IC_DEL)));
- M0_PRE(dtx != NULL);
-
- state = m0_dtx0_sm_state(dtx);
- if (!M0_IN(state, (M0_DDS_EXECUTED_ALL, M0_DDS_STABLE, M0_DDS_FAILED)))
+ /*
+ * Filter out the situations where dtx exists but m0_op_idx
+ * is not yet attached to dix_req.
+ */
+ if (oi == NULL)
return false;
- switch (state) {
- case M0_DDS_EXECUTED_ALL:
- /* TODO: we have a single kv pair; probably, it does not have
- * to be a loop.
- */
- for (i = 0; i < m0_dix_req_nr(dreq); i++) {
- oi->oi_rcs[i] = dreq->dr_items[i].dxi_rc;
- }
- /* XXX: We cannot use m0_dix_generic_rc here because the
- * precondition fails in this case. At this point error
- * handling is not covered here, and probably the error
- * code needs to be first propogated from DIX to DTX and
- * then it needs to be passed here as dtx.dd_sm.sm_rc.
- */
- dixreq_executed_post(dix_req, dreq->dr_sm.sm_rc);
- break;
- case M0_DDS_STABLE:
- M0_ASSERT_INFO(m0_dix_generic_rc(dreq) == 0,
- "TODO: DIX failures are not supported.");
-
- M0_ASSERT_INFO(m0_forall(idx, m0_dix_req_nr(dreq),
- m0_dix_item_rc(dreq, idx) == 0),
- "TODO: failed executions of individual items "
- "are not supported yet.");
-
- dixreq_stable_post(dix_req, m0_dix_generic_rc(dreq));
+ M0_PRE(M0_IN(oi->oi_oc.oc_op.op_code, (M0_IC_PUT, M0_IC_DEL)));
+ M0_PRE(oi->oi_dtx != NULL);
+
+ if (M0_IN(m0_dtx0_sm_state(oi->oi_dtx),
+ (M0_DDS_STABLE, M0_DDS_FAILED)) && m0_clink_is_armed(cl))
m0_clink_del(cl);
- break;
- case M0_DDS_FAILED:
- M0_IMPOSSIBLE("DTX failures are not supported so far.");
- default:
- M0_IMPOSSIBLE("Only Executed and Stable are allowed so far.");
+
+ if (dix_req_is_completed(dix_req)) {
+ dixreq_completed_post(dix_req, m0_dix_generic_rc(dreq));
}
return false;
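The rework above replaces the per-state switch with a single completion predicate: the operation finishes only when the DIX request has reached a terminal state and the dtx, if any, is STABLE or FAILED; whichever clink observes the second condition posts the completion. A compact model of that rule with stand-in types (none of these names are Motr's):

#include <stdbool.h>
#include <stdio.h>

struct fake_req {
        bool has_dtx;       /* a dtx was attached to the request */
        bool dreq_terminal; /* models DIXREQ_FAILURE/DIXREQ_FINAL */
        bool dtx_settled;   /* models M0_DDS_STABLE/M0_DDS_FAILED */
        bool completed;     /* completion already posted */
};

static bool req_is_completed(const struct fake_req *r)
{
        return r->dreq_terminal && (!r->has_dtx || r->dtx_settled);
}

/* Called from either event source; posts completion at most once. */
static void on_event(struct fake_req *r)
{
        if (!r->completed && req_is_completed(r)) {
                r->completed = true;
                printf("completion posted\n");
        }
}

int main(void)
{
        struct fake_req r = { .has_dtx = true };

        r.dreq_terminal = true;
        on_event(&r);           /* no-op: dtx still pending */
        r.dtx_settled = true;
        on_event(&r);           /* both conditions hold: posts once */
        return 0;
}

The same order-independence holds in the real code: dixreq_clink_cb() and dixreq_clink_dtx_cb() both evaluate dix_req_is_completed(), and the new assertion in dixreq_completed_post() catches an accidental double post.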
@@ -985,15 +929,20 @@ static bool dixreq_clink_cb(struct m0_clink *cl)
}
}
- dixreq_completed_post(dix_req, rc);
+ if (dix_req_is_completed(dix_req))
+ dixreq_completed_post(dix_req, rc);
return false;
}
static void dix_req_immed_failure(struct dix_req *req, int rc)
{
+ struct m0_op_idx *oi = req->idr_oi;
+
M0_ENTRY();
M0_PRE(rc != 0);
m0_clink_del(&req->idr_clink);
+ if (oi->oi_dtx != NULL)
+ m0_clink_del(&req->idr_dtx_clink);
dixreq_completed_post(req, rc);
M0_LEAVE();
}
@@ -1119,9 +1068,10 @@ static void dix_dreq_prepare(struct dix_req *req,
struct m0_op_idx *oi)
{
dix_build(oi, dix);
- m0_clink_add(oi->oi_dtx != NULL ?
- &oi->oi_dtx->tx_dtx->dd_sm.sm_chan :
- &req->idr_dreq.dr_sm.sm_chan, &req->idr_clink);
+ m0_clink_add(&req->idr_dreq.dr_sm.sm_chan, &req->idr_clink);
+ if (oi->oi_dtx != NULL)
+ m0_clink_add(&oi->oi_dtx->tx_dtx->dd_sm.sm_chan,
+ &req->idr_dtx_clink);
}
static void dix_put_ast(struct m0_sm_group *grp, struct m0_sm_ast *ast)
@@ -1271,8 +1221,13 @@ M0_INTERNAL int m0__idx_cancel(struct m0_op_idx *oi)
}
static void dix_set_idx_flags(struct m0_op_idx *oi)
-{
- if (ENABLE_DTM0)
+{
+ /* XXX:
+ * "if (ENABLE_DTM0)" was added here at some point, but the change
+ * was never properly tested. Keep the flag disabled until the
+ * code is covered with tests.
+ */
+ if (0)
oi->oi_flags |= M0_OIF_SKIP_LAYOUT;
if (!(oi->oi_flags & M0_OIF_SKIP_LAYOUT))
diff --git a/motr/m0kv/index.c b/motr/m0kv/index.c
index 294ceacd15f..a6158a0b3c3 100644
--- a/motr/m0kv/index.c
+++ b/motr/m0kv/index.c
@@ -30,6 +30,7 @@
#define M0_TRACE_SUBSYSTEM M0_TRACE_SUBSYS_CLIENT
#include <stdio.h>      /* FILE */
#include <uuid/uuid.h>  /* uuid_generate */
+#include <unistd.h>     /* access(), sleep() */
#include "lib/assert.h" /* M0_ASSERT */
#include "lib/errno.h"
#include "lib/memory.h"
@@ -79,6 +80,7 @@ static int instance_init(struct params *params)
.cc_params = params,
.cc_conf = {
.mc_is_oostore = true,
+ .mc_is_addb_init = true,
.mc_is_read_verify = false,
.mc_local_addr = params->cp_local_addr,
.mc_ha_addr = params->cp_ha_addr,
@@ -161,6 +163,25 @@ static int genv(char *filename, int cnt, int size)
return 0;
}
+static int wait_file(const char *expected_file)
+{
+ int rc;
+
+ m0_console_printf("Awaiting: 'touch %s' \n", expected_file);
+
+ while (access(expected_file, F_OK ) != 0) {
+ rc = -errno;
+ M0_ASSERT_INFO(rc == -ENOENT, "Wrong file or directory? "
+ "file=%s, rc=%d", expected_file, rc);
+ sleep(1);
+ }
+
+ m0_console_printf("File detected: '%s'\n", expected_file);
+ (void) remove(expected_file);
+
+ return 0;
+}
+
static void log_hex_val(const char *tag, void *buf, int size)
{
int i;
@@ -274,6 +295,9 @@ static int cmd_exec(struct index_cmd *cmd)
case GENV:
rc = genv(cmd->ic_filename, cmd->ic_cnt, cmd->ic_len);
break;
+ case WLF:
+ rc = wait_file(cmd->ic_filename);
+ break;
default:
rc = M0_ERR(-EINVAL);
M0_ASSERT(0);
diff --git a/motr/m0kv/index.h b/motr/m0kv/index.h
index e24d067eb22..f75b66ccc86 100644
--- a/motr/m0kv/index.h
+++ b/motr/m0kv/index.h
@@ -43,7 +43,8 @@ enum {
GET, /* Get record. */
NXT, /* Next record. */
GENF, /* Generate FID-file. */
- GENV /* Generate VAL-file. */
+ GENV, /* Generate VAL-file. */
+ WLF, /* Wait for a file to appear. */
};
enum {
diff --git a/motr/m0kv/index_parser.c b/motr/m0kv/index_parser.c
index 8865e614f2a..cc975fad001 100644
--- a/motr/m0kv/index_parser.c
+++ b/motr/m0kv/index_parser.c
@@ -61,6 +61,7 @@ static const struct command_descr commands[] = {
{ GENF, "genf", "genf CNT FILE, generate file with several FID" },
{ GENV, "genv", "genv CNT SIZE FILE, generate file with several "
"KEY_PARAM/VAL_PARAM. Note: SIZE > 16" },
+ { WLF, "wait", "wait FILE, await a file to appear" },
};
static int command_id(const char *name)
@@ -362,6 +363,11 @@ static int command_assign(struct index_cmd *cmd, int *argc, char ***argv)
++*params;
*argc -= 3;
break;
+ case WLF:
+ cmd->ic_filename = **params;
+ ++*params;
+ --*argc;
+ break;
default:
M0_IMPOSSIBLE("Wrong command");
}
@@ -409,6 +415,9 @@ static bool command_is_valid(struct index_cmd *cmd)
cmd->ic_cnt != 0 &&
cmd->ic_len != 0;
break;
+ case WLF:
+ rc = cmd->ic_filename != NULL;
+ break;
default:
M0_IMPOSSIBLE("Wrong command.");
}
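As a usage illustration only: the new verb lets a command sequence block until an external actor creates a file, which is handy for synchronising test scripts. The endpoints, profile and index fid below are placeholders, and the exact invocation depends on the cluster configuration:

# hypothetical m0kv sequence; blocks until 'touch /tmp/redo.done'
m0kv -l <local-ep> -h <ha-ep> -p <profile> -f <process-fid> \
     index put "<index-fid>" somekey someval \
           wait /tmp/redo.done \
           get "<index-fid>" somekey

The process prints "Awaiting: 'touch /tmp/redo.done'" and polls once per second; touching the file from another shell unblocks it, and wait_file() removes the file before the next command runs.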
diff --git a/motr/magic.h b/motr/magic.h
index ecc23779ed2..8b4eebc85d2 100644
--- a/motr/magic.h
+++ b/motr/magic.h
@@ -430,6 +430,10 @@ enum m0_magic_satchel {
M0_DTM_CAT_MAGIX = 0x33acce551b1e4277,
/* m0_dtm::d_excited::t_magic (flooded baboo) */
M0_DTM_EXC_MAGIX = 0x33f100dedbab0077,
+ /* recovery_fom::rf_magic (solidified 66) */
+ M0_DTM0_RMACH_MAGIC = 0x335011D1F1ED6677,
+ /* rfom_tl::td_head_magic (seeded cobble) */
+ M0_DTM0_RMACH_HEAD_MAGIC = 0x335EEDEDC0BB1E77,
/* Failure Domains */
/* m0_fd_perm_cache::fpc_magic (fascia doodia) */
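Both new values follow the magic-number convention visible in this enum: a constant 33...77 guard pair around a hex-speak mnemonic. Decoding the second one, for example:

0x335EEDEDC0BB1E77
  33............77    guard bytes shared by the magics in this enum
    5EEDED            "seeded" (5 = s, E = e, D = d)
          C0BB1E      "cobble" (C = c, 0 = o, B = b, 1 = l, E = e)

Likewise, 0x335011D1F1ED6677 reads "solidified" (5011D1F1ED) followed by 66, matching the "(solidified 66)" hint in its comment.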
diff --git a/motr/ut/idx_dix.c b/motr/ut/idx_dix.c
index 11d89873dac..252bef10863 100644
--- a/motr/ut/idx_dix.c
+++ b/motr/ut/idx_dix.c
@@ -1118,7 +1118,7 @@ static void dtm0_ut_cas_op_prepare(const struct m0_fid *cfid,
}
}
-static void dtm0_ut_send_redo(const struct m0_fid *ifid,
+static void dtm0_ut_send_redo(const struct m0_fid *ifid, uint32_t sdev_id,
uint64_t *key, uint64_t *val)
{
int rc;
@@ -1142,8 +1142,6 @@ static void dtm0_ut_send_redo(const struct m0_fid *ifid,
* Ivan Alekhin.
*/
struct m0_fom zero_fom_to_be_deleted = {};
- /* Extreme hack to convert index fid to component catalogue fid. */
- uint32_t sdev_idx = 10;
m0_dtm0_clk_src_init(&dcs, M0_DTM0_CS_PHYS);
m0_dtm0_clk_src_now(&dcs, &now);
@@ -1162,7 +1160,7 @@ static void dtm0_ut_send_redo(const struct m0_fid *ifid,
.dti_fid = cli_dtm0_fid
};
- m0_dix_fid_convert_dix2cctg(ifid, &cctg_fid, sdev_idx);
+ m0_dix_fid_convert_dix2cctg(ifid, &cctg_fid, sdev_id);
dtm0_ut_cas_op_prepare(&cctg_fid, &cas_op, &cas_rec, key, val, &txr);
@@ -1216,7 +1214,59 @@ static void dtm0_ut_read_and_check(uint64_t key, uint64_t val)
m0_free0(&rcs);
}
-static void st_dtm0_r(void)
+static uint32_t dtm0_ut_cas_sdev_id_get(void)
+{
+ struct m0_fid srv_cas_fid;
+ struct m0_fid srv_proc_fid;
+ struct m0_confc *confc = m0_reqh2confc(&ut_m0c->m0c_reqh);
+ struct m0_conf_obj *obj;
+ struct m0_conf_service *service;
+ struct m0_conf_obj *sdevs_dir;
+ struct m0_conf_sdev *sdev;
+ bool found = false;
+ uint32_t sdev_id = 0;
+ int rc;
+
+ rc = m0_fid_sscanf(ut_m0_config.mc_process_fid, &srv_proc_fid);
+ M0_UT_ASSERT(rc == 0);
+ rc = m0_conf_process2service_get(confc, &srv_proc_fid,
+ M0_CST_CAS, &srv_cas_fid);
+ M0_UT_ASSERT(rc == 0);
+
+ obj = m0_conf_cache_lookup(&confc->cc_cache, &srv_cas_fid);
+ M0_ASSERT(obj != NULL);
+
+ service = M0_CONF_CAST(obj, m0_conf_service);
+ M0_ASSERT(service != NULL);
+
+ sdevs_dir = &service->cs_sdevs->cd_obj;
+ rc = m0_confc_open_sync(&sdevs_dir, sdevs_dir, M0_FID0);
+ M0_ASSERT(rc == 0);
+
+ obj = NULL;
+ while ((rc = m0_confc_readdir_sync(sdevs_dir, &obj)) > 0) {
+ sdev = M0_CONF_CAST(obj, m0_conf_sdev);
+ if (!found) {
+ sdev_id = sdev->sd_dev_idx;
+ found = true;
+ } else {
+ /*
+ * A single device attached to the CAS service is the
+ * standard configuration for now; several attached
+ * devices are not supported in this UT.
+ */
+ M0_IMPOSSIBLE("Several CAS devices are not supported.");
+ }
+ }
+
+ m0_confc_close(sdevs_dir);
+
+ M0_ASSERT(found);
+
+ return sdev_id;
+}
+
+static void st_dtm0_r_common(uint32_t sdev_id)
{
m0_time_t rem;
uint64_t key = 111;
@@ -1227,7 +1277,7 @@ static void st_dtm0_r(void)
idx_setup();
exec_one_by_one(1, M0_IC_PUT);
- dtm0_ut_send_redo(&duc.duc_ifid, &key, &val);
+ dtm0_ut_send_redo(&duc.duc_ifid, sdev_id, &key, &val);
/* XXX dirty hack, but now we don't have completion notification */
rem = 2ULL * M0_TIME_ONE_SECOND;
@@ -1238,6 +1288,27 @@ static void st_dtm0_r(void)
idx_teardown();
}
+static void st_dtm0_r(void)
+{
+ /* CAS sdev id from the configuration. */
+ uint32_t sdev_id = dtm0_ut_cas_sdev_id_get();
+ st_dtm0_r_common(sdev_id);
+}
+
+static void st_dtm0_r_wrong_sdev(void)
+{
+ /*
+ * A random CAS sdev id that the REDO handler is expected to fix.
+ * In a real system, participants send REDO messages with their
+ * own CAS device IDs during recovery, so the REDO handler of the
+ * process being recovered needs to look up the device ID attached
+ * to its local CAS service and set it in the operation's CAS ID.
+ */
+ uint32_t sdev_id = dtm0_ut_cas_sdev_id_get() + 12345;
+ st_dtm0_r_common(sdev_id);
+}
+
struct m0_ut_suite ut_suite_mt_idx_dix = {
.ts_name = "idx-dix-mt",
.ts_owners = "Anatoliy",
@@ -1252,6 +1323,7 @@ struct m0_ut_suite ut_suite_mt_idx_dix = {
{ "dtm0_e_then_s", st_dtm0_e_then_s, "Ivan" },
{ "dtm0_c", st_dtm0_c, "Ivan" },
{ "dtm0_r", st_dtm0_r, "Sergey" },
+ { "dtm0_r_wrong_sdev", st_dtm0_r_wrong_sdev, "Sergey" },
{ NULL, NULL }
}
};
diff --git a/rpc/conn.c b/rpc/conn.c
index 57b2ddefd56..27aff8dbd79 100644
--- a/rpc/conn.c
+++ b/rpc/conn.c
@@ -1349,10 +1349,18 @@ static bool rpc_conn__on_service_event_cb(struct m0_clink *clink)
M0_PRE(m0_rpc_conn2svc(conn) == obj);
M0_LOG(M0_DEBUG, "obj->co_ha_state = %d", obj->co_ha_state);
/*
+ * At this moment, M0_NC_TRANSIENT means that some volatile state
+ * associated with that process was lost (its memory, packets,
+ * connections -- anything).
+ * In this case, the best thing we can do is to cancel
+ * all the sessions and let the user re-connect.
+ * M0_NC_FAILED means the same thing plus some persistent state
+ * was lost.
+ * Previous outdated comment:
* Ignore M0_NC_TRANSIENT state to keep items re-sending until service
* gets M0_NC_ONLINE or becomes M0_NC_FAILED finally.
*/
- if (obj->co_ha_state == M0_NC_FAILED)
+ if (M0_IN(obj->co_ha_state, (M0_NC_FAILED, M0_NC_TRANSIENT)))
m0_rpc_conn_sessions_cancel(conn);
/**
* @todo See if to __conn_ha_unsubscribe() right now, but not wait until
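The policy in the updated comment reduces to a small state-to-action mapping; a stand-in sketch (the enum and names are illustrative, not the Motr source), showing that TRANSIENT and FAILED now take the same cancellation path:

#include <stdio.h>

enum ha_state { NC_ONLINE, NC_TRANSIENT, NC_FAILED };

static void on_service_ha_state(enum ha_state state)
{
        switch (state) {
        case NC_TRANSIENT: /* volatile state lost: memory, packets, ... */
        case NC_FAILED:    /* ... plus some persistent state lost */
                printf("cancel all sessions; the user re-connects\n");
                break;
        case NC_ONLINE:
                printf("keep re-sending items\n");
                break;
        }
}

int main(void)
{
        on_service_ha_state(NC_TRANSIENT);
        return 0;
}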
diff --git a/rpc/rpc_opcodes.h b/rpc/rpc_opcodes.h
index 0310fa2123d..cb0ba05b60f 100644
--- a/rpc/rpc_opcodes.h
+++ b/rpc/rpc_opcodes.h
@@ -358,6 +358,7 @@ enum M0_RPC_OPCODES {
M0_ISCSERVICE_EXEC_OPCODE = 1072,
M0_DTM0_RLINK_OPCODE = 1073,
M0_FDMI_SOURCE_DOCK_TIMER_OPCODE = 1074,
+ M0_DTM0_RECOVERY_FOM_OPCODE = 1075,
M0_OPCODES_NR = 2048
} M0_XCA_ENUM;
diff --git a/scripts/gdb/gdb-extensions b/scripts/gdb/gdb-extensions
index 40790d4d9e6..c40ef48b9ad 100644
--- a/scripts/gdb/gdb-extensions
+++ b/scripts/gdb/gdb-extensions
@@ -113,3 +113,37 @@ define runever
end
end
end
+
+define coro-bt
+ set $c = ((struct m0_co_context *) ($arg0))
+ set $i = 0
+ while ($i < $c->mc_frame)
+ list *$c->mc_stack[$i]
+ set $i = $i + 1
+ end
+end
+
+document coro-bt
+ Prints the backtrace of a coroutine context.
+end
+
+define rfom-print
+ set $s = ((struct recovery_fom *) ($arg0))
+ printf "p=%p, v=%d, l=%d, ", $s, (int) $s->rf_is_volatile, (int) $s->rf_is_local
+ printf "eol=%d, ", (int) $s->rf_last_known_eol
+ printf "ha=%d, end=%d, ", (int) $s->rf_last_known_ha_state, (int) $s->rf_eolq.bq_the_end
+ printf "coro=%p \n", ($s->rf_coro.mc_stack[$s->rf_coro.mc_frame - 1])
+ coro-bt &$s->rf_coro
+end
+
+define drm-print
+ set $r = (struct m0_dtm0_recovery_machine *) ($arg0)
+ m0-list-print &($r->rm_rfoms) struct recovery_fom rf_linkage rfom-print
+end
+
+document drm-print
+ Prints parts of DTM0 recovery machine state in human-readable format.
+ The only argument is a pointer to a recovery machine.
+ The function prints some properties of recovery FOMs and the
+ backtraces of their coroutines.
+end
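A usage sketch for the helpers above (the address and variable name are illustrative):

    (gdb) source scripts/gdb/gdb-extensions
    (gdb) drm-print &recovery_machine
    (gdb) rfom-print 0x7f2a40012340

drm-print walks rm_rfoms via m0-list-print and applies rfom-print to each recovery FOM, which in turn prints the coroutine backtrace through coro-bt.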
diff --git a/scripts/install/opt/seagate/cortx/motr/bin/motr_mini_prov.py b/scripts/install/opt/seagate/cortx/motr/bin/motr_mini_prov.py
index 3db5129eba0..696c416b12a 100644
--- a/scripts/install/opt/seagate/cortx/motr/bin/motr_mini_prov.py
+++ b/scripts/install/opt/seagate/cortx/motr/bin/motr_mini_prov.py
@@ -59,6 +59,7 @@
CMD_RETRY_COUNT = 5
MEM_THRESHOLD = 4*1024*1024*1024
CVG_COUNT_KEY = "num_cvg"
+MOTR_M0D_MIN_RPC_RECVQ_LEN = 64
class MotrError(Exception):
""" Generic Exception with error code and output """
@@ -494,6 +495,7 @@ def update_copy_motr_config_file(self):
("MOTR_M0D_DATA_DIR", f"{MOTR_M0D_DATA_DIR}"),
("MOTR_M0D_CONF_XC", f"{MOTR_M0D_CONF_XC}"),
("MOTR_M0D_ADDB_STOB_DIR", f"{MOTR_M0D_ADDB_STOB_DIR}"),
+ ("MOTR_M0D_MIN_RPC_RECVQ_LEN", f"{MOTR_M0D_MIN_RPC_RECVQ_LEN}"),
("MOTR_M0D_TRACE_DIR", f"{MOTR_M0D_TRACE_DIR}")]
update_config_file(self, f"{MOTR_SYS_CFG}", config_kvs)
# Copy config file to new path
diff --git a/utils/m0hagen b/utils/m0hagen
index 08210b4466e..966e370add9 100755
--- a/utils/m0hagen
+++ b/utils/m0hagen
@@ -295,7 +295,7 @@ yaml.add_implicit_resolver(Fid.yaml_tag, Fid.re_conf,
# enum m0_ha_obj_state
ha_states = ['unknown', 'online', 'failed', 'transient', 'repair', 'repaired',
- 'rebalance']
+ 'rebalance', 'dtm-recovering']
class HAMsgNVec(XCodeable):